From e60aa787f43f1e469632b8719e3e3ab1d9e7b720 Mon Sep 17 00:00:00 2001 From: Jonathan Lassoff Date: Mon, 11 Apr 2022 13:06:28 +1000 Subject: xfs: Simplify XFS logging methods. Rather than have a constructor to define many nearly-identical functions, use preprocessor macros to pass down a kernel logging level to a common function. Signed-off-by: Jonathan Lassoff Reviewed-by: Chris Down Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_message.c | 54 ++++++++++++++++++++++------------------------------ fs/xfs/xfs_message.h | 43 +++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 52 deletions(-) diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index bc66d95c8d4c..9ceebd4c9ff1 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -27,37 +27,29 @@ __xfs_printk( printk("%sXFS: %pV\n", level, vaf); } -#define define_xfs_printk_level(func, kern_level) \ -void func(const struct xfs_mount *mp, const char *fmt, ...) \ -{ \ - struct va_format vaf; \ - va_list args; \ - int level; \ - \ - va_start(args, fmt); \ - \ - vaf.fmt = fmt; \ - vaf.va = &args; \ - \ - __xfs_printk(kern_level, mp, &vaf); \ - va_end(args); \ - \ - if (!kstrtoint(kern_level, 0, &level) && \ - level <= LOGLEVEL_ERR && \ - xfs_error_level >= XFS_ERRLEVEL_HIGH) \ - xfs_stack_trace(); \ -} \ - -define_xfs_printk_level(xfs_emerg, KERN_EMERG); -define_xfs_printk_level(xfs_alert, KERN_ALERT); -define_xfs_printk_level(xfs_crit, KERN_CRIT); -define_xfs_printk_level(xfs_err, KERN_ERR); -define_xfs_printk_level(xfs_warn, KERN_WARNING); -define_xfs_printk_level(xfs_notice, KERN_NOTICE); -define_xfs_printk_level(xfs_info, KERN_INFO); -#ifdef DEBUG -define_xfs_printk_level(xfs_debug, KERN_DEBUG); -#endif +void +xfs_printk_level( + const char *kern_level, + const struct xfs_mount *mp, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + int level; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(kern_level, mp, &vaf); + + va_end(args); + + if (!kstrtoint(kern_level, 0, &level) && + level <= LOGLEVEL_ERR && + xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} void xfs_alert_tag( diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index bb9860ec9a93..a281b1cc13d5 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -6,33 +6,34 @@ struct xfs_mount; -extern __printf(2, 3) -void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); extern __printf(3, 4) -void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); -extern __printf(2, 3) -void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); +void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp, + const char *fmt, ...); +#define xfs_emerg(mp, fmt, ...) \ + xfs_printk_level(KERN_EMERG, mp, fmt, ##__VA_ARGS__) +#define xfs_alert(mp, fmt, ...) \ + xfs_printk_level(KERN_ALERT, mp, fmt, ##__VA_ARGS__) +#define xfs_crit(mp, fmt, ...) \ + xfs_printk_level(KERN_CRIT, mp, fmt, ##__VA_ARGS__) +#define xfs_err(mp, fmt, ...) \ + xfs_printk_level(KERN_ERR, mp, fmt, ##__VA_ARGS__) +#define xfs_warn(mp, fmt, ...) \ + xfs_printk_level(KERN_WARNING, mp, fmt, ##__VA_ARGS__) +#define xfs_notice(mp, fmt, ...) \ + xfs_printk_level(KERN_NOTICE, mp, fmt, ##__VA_ARGS__) +#define xfs_info(mp, fmt, ...) 
\ + xfs_printk_level(KERN_INFO, mp, fmt, ##__VA_ARGS__) #ifdef DEBUG -extern __printf(2, 3) -void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); +#define xfs_debug(mp, fmt, ...) \ + xfs_printk_level(KERN_DEBUG, mp, fmt, ##__VA_ARGS__) #else -static inline __printf(2, 3) -void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) -{ -} +#define xfs_debug(mp, fmt, ...) do {} while (0) #endif +extern __printf(3, 4) +void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); + #define xfs_printk_ratelimited(func, dev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ -- cgit From e270356944ccba0a9c23b4d0751d2e029184ad21 Mon Sep 17 00:00:00 2001 From: Jonathan Lassoff Date: Mon, 11 Apr 2022 13:06:39 +1000 Subject: xfs: Add XFS messages to printk index In order for end users to quickly react to new issues that come up in production, it is proving useful to leverage the printk indexing system. This printk index enables kernel developers to use calls to printk() with changeable format strings (as they always have; no change of expectations), while enabling end users to examine format strings to detect changes. Since end users are using regular expressions to match messages printed through printk(), being able to detect changes in chosen format strings from release to release provides a useful signal to review printk()-matching regular expressions for any necessary updates. So that detailed XFS messages are captured by this printk index, this patch wraps the xfs_<level> and xfs_alert_tag functions.
Signed-off-by: Jonathan Lassoff Reviewed-by: Chris Down Reviewed-by: Petr Mladek Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_message.c | 2 +- fs/xfs/xfs_message.h | 29 ++++++++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 9ceebd4c9ff1..22c2adff1260 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -52,7 +52,7 @@ xfs_printk_level( } void -xfs_alert_tag( +_xfs_alert_tag( const struct xfs_mount *mp, int panic_tag, const char *fmt, ...) diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index a281b1cc13d5..035ee3d244ac 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -10,29 +10,40 @@ extern __printf(3, 4) void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp, const char *fmt, ...); +#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \ +({ \ + printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \ + xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \ +}) #define xfs_emerg(mp, fmt, ...) \ - xfs_printk_level(KERN_EMERG, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__) #define xfs_alert(mp, fmt, ...) \ - xfs_printk_level(KERN_ALERT, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__) #define xfs_crit(mp, fmt, ...) \ - xfs_printk_level(KERN_CRIT, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__) #define xfs_err(mp, fmt, ...) \ - xfs_printk_level(KERN_ERR, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__) #define xfs_warn(mp, fmt, ...) \ - xfs_printk_level(KERN_WARNING, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__) #define xfs_notice(mp, fmt, ...) \ - xfs_printk_level(KERN_NOTICE, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__) #define xfs_info(mp, fmt, ...) 
\ - xfs_printk_level(KERN_INFO, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__) #ifdef DEBUG #define xfs_debug(mp, fmt, ...) \ - xfs_printk_level(KERN_DEBUG, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__) #else #define xfs_debug(mp, fmt, ...) do {} while (0) #endif +#define xfs_alert_tag(mp, tag, fmt, ...) \ +({ \ + printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \ + _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \ +}) + extern __printf(3, 4) -void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); +void _xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); #define xfs_printk_ratelimited(func, dev, fmt, ...) \ do { \ -- cgit From 3b0d9fd369ea48419ccb578e0bafa4c54df63ba6 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Wed, 7 Oct 2020 15:00:03 +0530 Subject: xfs: Move extent count limits to xfs_format.h Maximum values associated with extent counters i.e. Maximum extent length, Maximum data extents and Maximum xattr extents are dictated by the on-disk format. Hence move these definitions over to xfs_format.h. Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_format.h | 7 +++++++ fs/xfs/libxfs/xfs_types.h | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index d665c04e69dd..d75e5b16da7e 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -869,6 +869,13 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_BTREE, "btree" }, \ { XFS_DINODE_FMT_UUID, "uuid" } +/* + * Max values for extlen, extnum, aextnum. + */ +#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ +#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ +#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ + /* * Inode minimum and maximum sizes. 
*/ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b6da06b40989..794a54cbd0de 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -56,13 +56,6 @@ typedef void * xfs_failaddr_t; #define NULLFSINO ((xfs_ino_t)-1) #define NULLAGINO ((xfs_agino_t)-1) -/* - * Max values for extlen, extnum, aextnum. - */ -#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ -#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ -#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ - /* * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. -- cgit From 95f0b95e2b686ceaa3f465e9fa079f22e0fe7665 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Mon, 9 Aug 2021 12:05:22 +0530 Subject: xfs: Define max extent length based on on-disk format definition The maximum extent length depends on maximum block count that can be stored in a BMBT record. Hence this commit defines MAXEXTLEN based on BMBT_BLOCKCOUNT_BITLEN. While at it, the commit also renames MAXEXTLEN to XFS_MAX_BMBT_EXTLEN. Suggested-by: Darrick J. Wong Reviewed-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_bmap.c | 57 +++++++++++++++++++++--------------------- fs/xfs/libxfs/xfs_format.h | 5 ++-- fs/xfs/libxfs/xfs_inode_buf.c | 4 +-- fs/xfs/libxfs/xfs_trans_resv.c | 11 ++++---- fs/xfs/scrub/bmap.c | 2 +- fs/xfs/xfs_bmap_util.c | 14 ++++++----- fs/xfs/xfs_iomap.c | 28 ++++++++++----------- 8 files changed, 64 insertions(+), 59 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index b52ed339727f..f2a918ed7b8a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2511,7 +2511,7 @@ __xfs_free_extent_later( ASSERT(bno != NULLFSBLOCK); ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); ASSERT(!isnullstartblock(bno)); agno = XFS_FSB_TO_AGNO(mp, bno); agbno = XFS_FSB_TO_AGBNO(mp, bno); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 74198dd82b03..00b8e6e1c404 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1452,7 +1452,7 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -1470,13 +1470,13 @@ xfs_bmap_add_extent_delay_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= 
BMAP_RIGHT_CONTIG; error = 0; @@ -2000,7 +2000,7 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -2018,13 +2018,13 @@ xfs_bmap_add_extent_unwritten_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; /* @@ -2510,15 +2510,15 @@ xfs_bmap_add_extent_hole_delay( */ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN))) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) state |= BMAP_RIGHT_CONTIG; /* @@ -2661,17 +2661,17 @@ xfs_bmap_add_extent_hole_real( left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && - 
left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && new->br_startblock + new->br_blockcount == right.br_startblock && new->br_state == right.br_state && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN)) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -2906,15 +2906,15 @@ xfs_bmap_extsize_align( /* * For large extent hint sizes, the aligned extent might be larger than - * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls - * the length back under MAXEXTLEN. The outer allocation loops handle - * short allocation just fine, so it is safe to do this. We only want to - * do it when we are forced to, though, because it means more allocation - * operations are required. + * XFS_MAX_BMBT_EXTLEN. In that case, reduce the size by an extsz so + * that it pulls the length back under XFS_MAX_BMBT_EXTLEN. The outer + * allocation loops handle short allocation just fine, so it is safe to + * do this. We only want to do it when we are forced to, though, because + * it means more allocation operations are required.
*/ - while (align_alen > MAXEXTLEN) + while (align_alen > XFS_MAX_BMBT_EXTLEN) align_alen -= extsz; - ASSERT(align_alen <= MAXEXTLEN); + ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN); /* * If the previous block overlaps with this proposed allocation @@ -3004,9 +3004,9 @@ xfs_bmap_extsize_align( return -EINVAL; } else { ASSERT(orig_off >= align_off); - /* see MAXEXTLEN handling above */ + /* see XFS_MAX_BMBT_EXTLEN handling above */ ASSERT(orig_end <= align_off + align_alen || - align_alen + extsz > MAXEXTLEN); + align_alen + extsz > XFS_MAX_BMBT_EXTLEN); } #ifdef DEBUG @@ -3971,7 +3971,7 @@ xfs_bmapi_reserve_delalloc( * Cap the alloc length. Keep track of prealloc so we know whether to * tag the inode before we return. */ - alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); if (prealloc && alen >= len) @@ -4104,7 +4104,7 @@ xfs_bmapi_allocate( if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) bma->prev.br_startoff = NULLFILEOFF; } else { - bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN); if (!bma->eof) bma->length = XFS_FILBLKS_MIN(bma->length, bma->got.br_startoff - bma->offset); @@ -4424,8 +4424,8 @@ xfs_bmapi_write( * xfs_extlen_t and therefore 32 bits. Hence we have to * check for 32-bit overflows and handle them here.
*/ - if (len > (xfs_filblks_t)MAXEXTLEN) - bma.length = MAXEXTLEN; + if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN) + bma.length = XFS_MAX_BMBT_EXTLEN; else bma.length = len; @@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc( bma.ip = ip; bma.wasdel = true; bma.offset = bma.got.br_startoff; - bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, + XFS_MAX_BMBT_EXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); /* @@ -4641,7 +4642,7 @@ xfs_bmapi_remap( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(len > 0); - ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); + ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_NORMAP))); @@ -5641,7 +5642,7 @@ xfs_bmse_can_merge( if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || - (left->br_blockcount + got->br_blockcount > MAXEXTLEN)) + (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN)) return false; return true; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index d75e5b16da7e..66594853a88b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -870,9 +870,8 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_UUID, "uuid" } /* - * Max values for extlen, extnum, aextnum. + * Max values for extnum and aextnum. 
*/ -#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ #define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ #define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ @@ -1603,6 +1602,8 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) #define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) +#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK)) + /* * bmbt records have a file offset (block) field that is 54 bits wide, so this * is the largest xfs_fileoff_t that we ever expect to see. diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index cae9708c8587..87781a5d5a45 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -639,7 +639,7 @@ xfs_inode_validate_extsize( if (extsize_bytes % blocksize_bytes) return __this_address; - if (extsize > MAXEXTLEN) + if (extsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) @@ -696,7 +696,7 @@ xfs_inode_validate_cowextsize( if (cowextsize_bytes % mp->m_sb.sb_blocksize) return __this_address; - if (cowextsize > MAXEXTLEN) + if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (cowextsize > mp->m_sb.sb_agblocks / 2) diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6f83d9b306ee..8e1d09e8cc9a 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -199,8 +199,8 @@ xfs_calc_inode_chunk_res( /* * Per-extent log reservation for the btree changes involved in freeing or * allocating a realtime extent. We have to be able to log as many rtbitmap - * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, - * as well as the realtime summary block. + * blocks as needed to mark inuse XFS_MAX_BMBT_EXTLEN blocks' worth of realtime + * extents, as well as the realtime summary block.
*/ static unsigned int xfs_rtalloc_log_count( @@ -210,7 +210,7 @@ xfs_rtalloc_log_count( unsigned int blksz = XFS_FSB_TO_B(mp, 1); unsigned int rtbmp_bytes; - rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY; return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; } @@ -247,7 +247,7 @@ xfs_rtalloc_log_count( * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size - * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: ((XFS_MAX_BMBT_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 1 block * the allocation btrees: 2 trees * (2 * max depth - 1) * block size * And the bmap_finish transaction can free bmap blocks in a join (t3): @@ -299,7 +299,8 @@ xfs_calc_write_reservation( * the agf for each of the ags: 2 * sector size * the agfl for each of the ags: 2 * sector size * the super block to reflect the freed blocks: sector size - * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: + * 2 exts * ((XFS_MAX_BMBT_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 2 exts * 1 block * worst case split in allocation btrees per extent assuming 2 extents: * 2 exts * 2 trees * (2 * max depth - 1) * block size diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index a4cbbc346f60..c357593e0a02 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -350,7 +350,7 @@ xchk_bmap_iextent( irec->br_startoff); /* Make sure the extent points to a valid place.
*/ - if (irec->br_blockcount > MAXEXTLEN) + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (info->is_rt && diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index eb2e387ba528..18c1b99311a8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -119,14 +119,14 @@ retry: */ ralen = ap->length / mp->m_sb.sb_rextsize; /* - * If the old value was close enough to MAXEXTLEN that + * If the old value was close enough to XFS_MAX_BMBT_EXTLEN that * we rounded up to it, cut it back so it's valid again. * Note that if it's a really large request (bigger than - * MAXEXTLEN), we don't hear about that number, and can't + * XFS_MAX_BMBT_EXTLEN), we don't hear about that number, and can't * adjust the starting point to match it. */ - if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) - ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN) + ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize; /* * Lock out modifications to both the RT bitmap and summary inodes @@ -839,9 +839,11 @@ xfs_alloc_file_space( * count, hence we need to limit the number of blocks we are * trying to reserve to avoid an overflow. We can't allocate * more than @nimaps extents, and an extent is limited on disk - * to MAXEXTLEN (21 bits), so use that to enforce the limit. + * to XFS_MAX_BMBT_EXTLEN (21 bits), so use that to enforce the + * limit.
*/ - resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + resblks = min_t(xfs_fileoff_t, (e - s), + (XFS_MAX_BMBT_EXTLEN * nimaps)); if (unlikely(rt)) { dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resblks; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index e552ce541ec2..87e1cf5060bd 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -402,7 +402,7 @@ xfs_iomap_prealloc_size( */ plen = prev.br_blockcount; while (xfs_iext_prev_extent(ifp, &ncur, &got)) { - if (plen > MAXEXTLEN / 2 || + if (plen > XFS_MAX_BMBT_EXTLEN / 2 || isnullstartblock(got.br_startblock) || got.br_startoff + got.br_blockcount != prev.br_startoff || got.br_startblock + got.br_blockcount != prev.br_startblock) @@ -414,23 +414,23 @@ xfs_iomap_prealloc_size( /* * If the size of the extents is greater than half the maximum extent * length, then use the current offset as the basis. This ensures that - * for large files the preallocation size always extends to MAXEXTLEN - * rather than falling short due to things like stripe unit/width - * alignment of real extents. + * for large files the preallocation size always extends to + * XFS_MAX_BMBT_EXTLEN rather than falling short due to things like stripe + * unit/width alignment of real extents. */ alloc_blocks = plen * 2; - if (alloc_blocks > MAXEXTLEN) + if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) alloc_blocks = XFS_B_TO_FSB(mp, offset); qblocks = alloc_blocks; /* - * MAXEXTLEN is not a power of two value but we round the prealloc down - * to the nearest power of two value after throttling. To prevent the - * round down from unconditionally reducing the maximum supported - * prealloc size, we round up first, apply appropriate throttling, - * round down and cap the value to MAXEXTLEN. + * XFS_MAX_BMBT_EXTLEN is not a power of two value but we round the prealloc + * down to the nearest power of two value after throttling.
To prevent + * the round down from unconditionally reducing the maximum supported + * prealloc size, we round up first, apply appropriate throttling, round + * down and cap the value to XFS_MAX_BMBT_EXTLEN. */ - alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN), alloc_blocks); freesp = percpu_counter_read_positive(&mp->m_fdblocks); @@ -478,14 +478,14 @@ xfs_iomap_prealloc_size( */ if (alloc_blocks) alloc_blocks = rounddown_pow_of_two(alloc_blocks); - if (alloc_blocks > MAXEXTLEN) - alloc_blocks = MAXEXTLEN; + if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) + alloc_blocks = XFS_MAX_BMBT_EXTLEN; /* * If we are still trying to allocate more space than is * available, squash the prealloc hard. This can happen if we * have a large file on a small filesystem and the above - * lowspace thresholds are smaller than MAXEXTLEN. + * lowspace thresholds are smaller than XFS_MAX_BMBT_EXTLEN. */ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; -- cgit From 9feb8f19665c8ba051c6a81aa7897149e7748e1e Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Thu, 27 Aug 2020 15:09:10 +0530 Subject: xfs: Introduce xfs_iext_max_nextents() helper xfs_iext_max_nextents() returns the maximum number of extents possible for one of data, cow or attribute fork. This helper will be extended further in a future commit when maximum extent counts associated with data/attribute forks are increased. Reviewed-by: Darrick J.
Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 9 ++++----- fs/xfs/libxfs/xfs_inode_buf.c | 8 +++----- fs/xfs/libxfs/xfs_inode_fork.c | 2 +- fs/xfs/libxfs/xfs_inode_fork.h | 8 ++++++++ 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 00b8e6e1c404..a713bc7242a4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -74,13 +74,12 @@ xfs_bmap_compute_maxlevels( * ATTR2 we have to assume the worst case scenario of a minimum size * available. */ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; + maxleafents = xfs_iext_max_nextents(whichfork); + if (whichfork == XFS_DATA_FORK) sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); - } else { - maxleafents = MAXAEXTNUM; + else sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); - } + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 87781a5d5a45..b1c37a82ddce 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -337,6 +337,7 @@ xfs_dinode_verify_fork( int whichfork) { uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t max_extents; switch (XFS_DFORK_FORMAT(dip, whichfork)) { case XFS_DINODE_FMT_LOCAL: @@ -358,12 +359,9 @@ xfs_dinode_verify_fork( return __this_address; break; case XFS_DINODE_FMT_BTREE: - if (whichfork == XFS_ATTR_FORK) { - if (di_nextents > MAXAEXTNUM) - return __this_address; - } else if (di_nextents > MAXEXTNUM) { + max_extents = xfs_iext_max_nextents(whichfork); + if (di_nextents > max_extents) return __this_address; - } break; default: return __this_address; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9149f4f796fc..e136c29a0ec1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -744,7 +744,7 @@ xfs_iext_count_may_overflow( if 
(whichfork == XFS_COW_FORK) return 0; - max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM; + max_exts = xfs_iext_max_nextents(whichfork); if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) max_exts = 10; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 3d64a3acb0ed..2605f7ff8fc1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -133,6 +133,14 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) return ifp->if_format; } +static inline xfs_extnum_t xfs_iext_max_nextents(int whichfork) +{ + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) + return MAXEXTNUM; + + return MAXAEXTNUM; +} + struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format, xfs_extnum_t nextents); struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); -- cgit From bb1d50494cbdd9c5991ddc7feeeb14982872b2a8 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Fri, 26 Feb 2021 11:24:31 +0530 Subject: xfs: Use xfs_extnum_t instead of basic data types xfs_extnum_t is the type to use to declare variables which have values obtained from xfs_dinode->di_[a]nextents. This commit replaces basic types (e.g. uint32_t) with xfs_extnum_t for such variables. Reviewed-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_inode_buf.c | 2 +- fs/xfs/libxfs/xfs_inode_fork.c | 2 +- fs/xfs/scrub/inode.c | 2 +- fs/xfs/xfs_trace.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a713bc7242a4..cc15981b1793 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -54,7 +54,7 @@ xfs_bmap_compute_maxlevels( { int level; /* btree level */ uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ + xfs_extnum_t maxleafents; /* max leaf entries possible */ int maxrootrecs; /* max records in root block */ int minleafrecs; /* min records in leaf block */ int minnoderecs; /* min records in node block */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index b1c37a82ddce..7cad307840b3 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -336,7 +336,7 @@ xfs_dinode_verify_fork( struct xfs_mount *mp, int whichfork) { - uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); xfs_extnum_t max_extents; switch (XFS_DFORK_FORMAT(dip, whichfork)) { diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index e136c29a0ec1..a17c4d87520a 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -105,7 +105,7 @@ xfs_iformat_extents( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); int state = xfs_bmap_fork_to_state(whichfork); - int nex = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t nex = XFS_DFORK_NEXTENTS(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); struct xfs_iext_cursor icur; struct xfs_bmbt_rec *dp; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index eac15af7b08c..87925761e174 100644 --- a/fs/xfs/scrub/inode.c +++ 
b/fs/xfs/scrub/inode.c @@ -232,7 +232,7 @@ xchk_dinode( size_t fork_recs; unsigned long long isize; uint64_t flags2; - uint32_t nextents; + xfs_extnum_t nextents; prid_t prid; uint16_t flags; uint16_t mode; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b141ef78c755..16a91b4f97bd 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2169,7 +2169,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __field(int, which) __field(xfs_ino_t, ino) __field(int, format) - __field(int, nex) + __field(xfs_extnum_t, nex) __field(int, broot_size) __field(int, fork_off) ), -- cgit From dd95a6ce31d6441dfd5fd3aa5d7208b0fc61782f Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Thu, 27 Aug 2020 15:34:34 +0530 Subject: xfs: Introduce xfs_dfork_nextents() helper This commit replaces the macro XFS_DFORK_NEXTENTS() with the helper function xfs_dfork_nextents(). As of this commit, xfs_dfork_nextents() returns the same value as XFS_DFORK_NEXTENTS(). A future commit which extends inode's extent counter fields will add more logic to this helper. This commit also replaces direct accesses to xfs_dinode->di_[a]nextents with calls to xfs_dfork_nextents(). No functional changes have been made. Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_format.h | 4 ---- fs/xfs/libxfs/xfs_inode_buf.c | 17 ++++++++++++----- fs/xfs/libxfs/xfs_inode_fork.c | 8 ++++---- fs/xfs/libxfs/xfs_inode_fork.h | 32 ++++++++++++++++++++++++++++++++ fs/xfs/scrub/inode.c | 18 ++++++++++-------- 5 files changed, 58 insertions(+), 21 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 66594853a88b..b5e9256d6d32 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -924,10 +924,6 @@ enum xfs_dinode_fmt { ((w) == XFS_DATA_FORK ? \ (dip)->di_format : \ (dip)->di_aformat) -#define XFS_DFORK_NEXTENTS(dip,w) \ - ((w) == XFS_DATA_FORK ? 
\ - be32_to_cpu((dip)->di_nextents) : \ - be16_to_cpu((dip)->di_anextents)) /* * For block and character special files the 32bit dev_t is stored at the diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 7cad307840b3..f0e063835318 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -336,9 +336,11 @@ xfs_dinode_verify_fork( struct xfs_mount *mp, int whichfork) { - xfs_extnum_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t di_nextents; xfs_extnum_t max_extents; + di_nextents = xfs_dfork_nextents(dip, whichfork); + switch (XFS_DFORK_FORMAT(dip, whichfork)) { case XFS_DINODE_FMT_LOCAL: /* @@ -405,6 +407,9 @@ xfs_dinode_verify( uint16_t flags; uint64_t flags2; uint64_t di_size; + xfs_extnum_t nextents; + xfs_extnum_t naextents; + xfs_filblks_t nblocks; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return __this_address; @@ -435,10 +440,12 @@ xfs_dinode_verify( if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) return __this_address; + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + /* Fork checks carried over from xfs_iformat_fork */ - if (mode && - be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks)) + if (mode && nextents + naextents > nblocks) return __this_address; if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) @@ -495,7 +502,7 @@ xfs_dinode_verify( default: return __this_address; } - if (dip->di_anextents) + if (naextents) return __this_address; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index a17c4d87520a..1cf48cee45e3 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -105,7 +105,7 @@ xfs_iformat_extents( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); int state = xfs_bmap_fork_to_state(whichfork); - xfs_extnum_t nex = 
XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); struct xfs_iext_cursor icur; struct xfs_bmbt_rec *dp; @@ -230,7 +230,7 @@ xfs_iformat_data_fork( * depend on it. */ ip->i_df.if_format = dip->di_format; - ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents); + ip->i_df.if_nextents = xfs_dfork_data_extents(dip); switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -295,14 +295,14 @@ xfs_iformat_attr_fork( struct xfs_inode *ip, struct xfs_dinode *dip) { + xfs_extnum_t naextents = xfs_dfork_attr_extents(dip); int error = 0; /* * Initialize the extent count early, as the per-format routines may * depend on it. */ - ip->i_afp = xfs_ifork_alloc(dip->di_aformat, - be16_to_cpu(dip->di_anextents)); + ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents); switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_LOCAL: diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 2605f7ff8fc1..7ed2ecb51bca 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -141,6 +141,38 @@ static inline xfs_extnum_t xfs_iext_max_nextents(int whichfork) return MAXAEXTNUM; } +static inline xfs_extnum_t +xfs_dfork_data_extents( + struct xfs_dinode *dip) +{ + return be32_to_cpu(dip->di_nextents); +} + +static inline xfs_extnum_t +xfs_dfork_attr_extents( + struct xfs_dinode *dip) +{ + return be16_to_cpu(dip->di_anextents); +} + +static inline xfs_extnum_t +xfs_dfork_nextents( + struct xfs_dinode *dip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return xfs_dfork_data_extents(dip); + case XFS_ATTR_FORK: + return xfs_dfork_attr_extents(dip); + default: + ASSERT(0); + break; + } + + return 0; +} + struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format, xfs_extnum_t nextents); struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 87925761e174..51820b40ab1c 
100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -233,6 +233,7 @@ xchk_dinode( unsigned long long isize; uint64_t flags2; xfs_extnum_t nextents; + xfs_extnum_t naextents; prid_t prid; uint16_t flags; uint16_t mode; @@ -390,8 +391,10 @@ xchk_dinode( xchk_inode_extsize(sc, dip, ino, mode, flags); + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + /* di_nextents */ - nextents = be32_to_cpu(dip->di_nextents); fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); switch (dip->di_format) { case XFS_DINODE_FMT_EXTENTS: @@ -411,7 +414,7 @@ xchk_dinode( /* di_forkoff */ if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); - if (dip->di_anextents != 0 && dip->di_forkoff == 0) + if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) xchk_ino_set_corrupt(sc, ino); @@ -423,19 +426,18 @@ xchk_dinode( xchk_ino_set_corrupt(sc, ino); /* di_anextents */ - nextents = be16_to_cpu(dip->di_anextents); fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); switch (dip->di_aformat) { case XFS_DINODE_FMT_EXTENTS: - if (nextents > fork_recs) + if (naextents > fork_recs) xchk_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_BTREE: - if (nextents <= fork_recs) + if (naextents <= fork_recs) xchk_ino_set_corrupt(sc, ino); break; default: - if (nextents != 0) + if (naextents != 0) xchk_ino_set_corrupt(sc, ino); } @@ -513,14 +515,14 @@ xchk_inode_xref_bmap( &nextents, &count); if (!xchk_should_check_xref(sc, &error, NULL)) return; - if (nextents < be32_to_cpu(dip->di_nextents)) + if (nextents < xfs_dfork_data_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, &nextents, &acount); if (!xchk_should_check_xref(sc, &error, NULL)) return; - if (nextents != be16_to_cpu(dip->di_anextents)) + if (nextents != 
xfs_dfork_attr_extents(dip)) xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); /* Check nblocks against the inode. */ -- cgit From 1e7384f93db57c2135a9fa176e27b1c72ad860e3 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Tue, 16 Nov 2021 07:20:39 +0000 Subject: xfs: Use basic types to define xfs_log_dinode's di_nextents and di_anextents A future commit will increase the width of xfs_extnum_t in order to facilitate larger per-inode extent counters. Hence this patch now uses basic types to define xfs_log_dinode->[di_nextents|di_anextents]. Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index b322db523d65..fd66e70248f7 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -396,8 +396,8 @@ struct xfs_log_dinode { xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + uint32_t di_nextents; /* number of extents in data fork */ + uint16_t di_anextents; /* number of extents in attribute fork*/ uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ -- cgit From 755c38ffe1a5937d8fa03419018f49f3a23fa9a7 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Tue, 16 Nov 2021 07:28:40 +0000 Subject: xfs: Promote xfs_extnum_t and xfs_aextnum_t to 64 and 32-bits respectively A future commit will introduce a 64-bit on-disk data extent counter and a 32-bit on-disk attr extent counter.
This commit promotes xfs_extnum_t and xfs_aextnum_t to 64 and 32-bits in order to correctly handle in-core versions of these quantities. Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 4 ++-- fs/xfs/libxfs/xfs_inode_fork.c | 4 ++-- fs/xfs/libxfs/xfs_inode_fork.h | 2 +- fs/xfs/libxfs/xfs_types.h | 4 ++-- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_trace.h | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index cc15981b1793..9f38e33d6ce2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -83,7 +83,7 @@ xfs_bmap_compute_maxlevels( maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + maxblocks = howmany_64(maxleafents, minleafrecs); for (level = 1; maxblocks > 1; level++) { if (maxblocks <= maxrootrecs) maxblocks = 1; @@ -467,7 +467,7 @@ error0: if (bp_release) xfs_trans_brelse(NULL, bp); error_norelse: - xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + xfs_warn(mp, "%s: BAD after btree leaves for %llu extents", __func__, i); xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 1cf48cee45e3..004b205d87b8 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -117,8 +117,8 @@ xfs_iformat_extents( * we just bail out rather than crash in kmem_alloc() or memcpy() below. 
*/ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { - xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", - (unsigned long long) ip->i_ino, nex); + xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).", + ip->i_ino, nex); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(1)", dip, sizeof(*dip), __this_address); diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7ed2ecb51bca..4a8b77d425df 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -21,9 +21,9 @@ struct xfs_ifork { void *if_root; /* extent tree root */ char *if_data; /* inline file data */ } if_u1; + xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ - xfs_extnum_t if_nextents; /* # of extents in this fork */ }; /* diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 794a54cbd0de..373f64a492a4 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. 
group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ -typedef int32_t xfs_extnum_t; /* # of extents in a file */ -typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef uint64_t xfs_extnum_t; /* # of extents in a file */ +typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9de6205fe134..adc1355ce853 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3515,8 +3515,8 @@ xfs_iflush( if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: detected corrupt incore inode %Lu, " - "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, + "%s: detected corrupt incore inode %llu, " + "total extents = %llu nblocks = %lld, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), ip->i_nblocks, ip); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 16a91b4f97bd..fe6cb2951233 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2182,7 +2182,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), - TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, " "broot size %d, forkoff 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, -- cgit From 919819f5e18097e6e888764c30625b1288d416c5 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Tue, 16 Nov 2021 08:39:32 +0000 Subject: xfs: Introduce XFS_SB_FEAT_INCOMPAT_NREXT64 and associated per-fs feature bit XFS_SB_FEAT_INCOMPAT_NREXT64 incompat feature bit will be 
set on filesystems which support large per-inode extent counters. This commit defines the new incompat feature bit and the corresponding per-fs feature bit (along with inline functions to work on it). Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_format.h | 1 + fs/xfs/libxfs/xfs_sb.c | 3 +++ fs/xfs/xfs_mount.h | 2 ++ 3 files changed, 6 insertions(+) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b5e9256d6d32..64ff0c310696 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -372,6 +372,7 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ #define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ +#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ XFS_SB_FEAT_INCOMPAT_SPINODES| \ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index f4e84aa1d50a..bd632389ae92 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -124,6 +124,9 @@ xfs_sb_version_to_features( features |= XFS_FEAT_BIGTIME; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) features |= XFS_FEAT_NEEDSREPAIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) + features |= XFS_FEAT_NREXT64; + return features; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f6dc19de8322..98ceccdbcf51 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -276,6 +276,7 @@ typedef struct xfs_mount { #define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ #define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ +#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 
creation */ @@ -338,6 +339,7 @@ __XFS_HAS_FEAT(realtime, REALTIME) __XFS_HAS_FEAT(inobtcounts, INOBTCNT) __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) +__XFS_HAS_FEAT(large_extent_counts, NREXT64) /* * Mount features -- cgit From 7c05aa9d9d2014937c8dacbd514bca2592b11f48 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Tue, 16 Nov 2021 07:56:54 +0000 Subject: xfs: Introduce XFS_FSOP_GEOM_FLAGS_NREXT64 XFS_FSOP_GEOM_FLAGS_NREXT64 indicates that the current filesystem instance supports 64-bit per-inode extent counters. Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_fs.h | 1 + fs/xfs/libxfs/xfs_sb.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 505533c43a92..1f7238db35cc 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ #define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ +#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ /* * Minimum and maximum sizes need for growth checks. 
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index bd632389ae92..e292a1914a5b 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1138,6 +1138,8 @@ xfs_fs_geometry( } else { geo->logsectsize = BBSIZE; } + if (xfs_has_large_extent_counts(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); -- cgit From 9b7d16e34bbebc0398b1dd4f2d64ae6793fdc5ea Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Tue, 16 Nov 2021 09:04:43 +0000 Subject: xfs: Introduce XFS_DIFLAG2_NREXT64 and associated helpers This commit adds the new per-inode flag XFS_DIFLAG2_NREXT64 to indicate that an inode supports 64-bit extent counters. This flag is also enabled by default on newly created inodes when the corresponding filesystem has large extent counter feature bit (i.e. XFS_FEAT_NREXT64) set. Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_format.h | 11 ++++++++++- fs/xfs/libxfs/xfs_ialloc.c | 2 ++ fs/xfs/xfs_inode.h | 5 +++++ fs/xfs/xfs_inode_item_recover.c | 7 +++++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 64ff0c310696..57b24744a7c2 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -991,15 +991,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ +#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_NREXT64 (1 << 
XFS_DIFLAG2_NREXT64_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { @@ -1007,6 +1009,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); } +static inline bool xfs_dinode_has_large_extent_counts( + const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); +} + /* * Inode number format: * low inopblog bits - offset in block diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b418fe0c0679..cdf8b63fcb22 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2772,6 +2772,8 @@ xfs_ialloc_setup_geometry( igeo->new_diflags2 = 0; if (xfs_has_bigtime(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; + if (xfs_has_large_extent_counts(mp)) + igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64; /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 740ab13d1aa2..aeab09882702 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -218,6 +218,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME; } +static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) +{ + return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; +} + /* * Return the buftarg used for data allocations on a given inode. 
*/ diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 239dd2e3384e..44b90614859e 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -142,6 +142,13 @@ xfs_log_dinode_to_disk_ts( return ts; } +static inline bool xfs_log_dinode_has_large_extent_counts( + const struct xfs_log_dinode *ld) +{ + return ld->di_version >= 3 && + (ld->di_flags2 & XFS_DIFLAG2_NREXT64); +} + STATIC void xfs_log_dinode_to_disk( struct xfs_log_dinode *from, -- cgit From 0c35e7ba18508e9344a1f27b412924bc8b34eba8 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Tue, 16 Nov 2021 09:20:01 +0000 Subject: xfs: Use uint64_t to count maximum blocks that can be used by BMBT Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9f38e33d6ce2..b317226fb4ba 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -52,9 +52,9 @@ xfs_bmap_compute_maxlevels( xfs_mount_t *mp, /* file system mount structure */ int whichfork) /* data or attr fork */ { - int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ + uint64_t maxblocks; /* max blocks at this level */ xfs_extnum_t maxleafents; /* max leaf entries possible */ + int level; /* btree level */ int maxrootrecs; /* max records in root block */ int minleafrecs; /* min records in leaf block */ int minnoderecs; /* min records in node block */ @@ -88,7 +88,7 @@ xfs_bmap_compute_maxlevels( if (maxblocks <= maxrootrecs) maxblocks = 1; else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + maxblocks = howmany_64(maxblocks, minnoderecs); } mp->m_bm_maxlevels[whichfork] = level; ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk()); -- cgit From df9ad5cc7a524048ea7ff983d6feeb6d8c47a761 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Tue, 16 Nov 2021 
09:54:37 +0000 Subject: xfs: Introduce macros to represent new maximum extent counts for data/attr forks This commit defines new macros to represent maximum extent counts allowed by filesystems which have support for large per-inode extent counters. Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 9 ++++----- fs/xfs/libxfs/xfs_bmap_btree.c | 9 +++++++-- fs/xfs/libxfs/xfs_format.h | 24 ++++++++++++++++++++++-- fs/xfs/libxfs/xfs_inode_buf.c | 4 +++- fs/xfs/libxfs/xfs_inode_fork.c | 3 ++- fs/xfs/libxfs/xfs_inode_fork.h | 21 +++++++++++++++++---- 6 files changed, 55 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b317226fb4ba..1254d4d4821e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -61,10 +61,8 @@ xfs_bmap_compute_maxlevels( int sz; /* root block size */ /* - * The maximum number of extents in a file, hence the maximum number of - * leaf entries, is controlled by the size of the on-disk extent count, - * either a signed 32-bit number for the data fork, or a signed 16-bit - * number for the attr fork. + * The maximum number of extents in a fork, hence the maximum number of + * leaf entries, is controlled by the size of the on-disk extent count. * * Note that we can no longer assume that if we are in ATTR1 that the * fork offset of all the inodes will be @@ -74,7 +72,8 @@ xfs_bmap_compute_maxlevels( * ATTR2 we have to assume the worst case scenario of a minimum size * available. 
*/ - maxleafents = xfs_iext_max_nextents(whichfork); + maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), + whichfork); if (whichfork == XFS_DATA_FORK) sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); else diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 453309fc85f2..2b77d45c215f 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -597,7 +597,11 @@ xfs_bmbt_maxrecs( return xfs_bmbt_block_maxrecs(blocklen, leaf); } -/* Compute the max possible height for block mapping btrees. */ +/* + * Calculate the maximum possible height of the btree that the on-disk format + * supports. This is used for sizing structures large enough to support every + * possible configuration of a filesystem that might get mounted. + */ unsigned int xfs_bmbt_maxlevels_ondisk(void) { @@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void) minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2; /* One extra level for the inode root. */ - return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1; + return xfs_btree_compute_maxlevels(minrecs, + XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1; } /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 57b24744a7c2..eb85bc9b229b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -872,9 +872,29 @@ enum xfs_dinode_fmt { /* * Max values for extnum and aextnum. + * + * The original on-disk extent counts were held in signed fields, resulting in + * maximum extent counts of 2^31 and 2^15 for the data and attr forks + * respectively. Similarly the maximum extent length is limited to 2^21 blocks + * by the 21-bit wide blockcount field of a BMBT extent record. + * + * The newly introduced data fork extent counter can hold a 64-bit value, + * however the maximum number of extents in a file is also limited to 2^54 + * extents by the 54-bit wide startoff field of a BMBT extent record. 
+ * + * It is further limited by the maximum supported file size of 2^63 + * *bytes*. This leads to a maximum extent count for maximally sized filesystem + * blocks (64kB) of: + * + * 2^63 bytes / 2^16 bytes per block = 2^47 blocks + * + * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence + * 2^48 was chosen as the maximum data fork extent count. */ -#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ -#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ +#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1)) +#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1)) /* * Inode minimum and maximum sizes. diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f0e063835318..e0d3140c3622 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -361,7 +361,9 @@ xfs_dinode_verify_fork( return __this_address; break; case XFS_DINODE_FMT_BTREE: - max_extents = xfs_iext_max_nextents(whichfork); + max_extents = xfs_iext_max_nextents( + xfs_dinode_has_large_extent_counts(dip), + whichfork); if (di_nextents > max_extents) return __this_address; break; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 004b205d87b8..bb5d841aac58 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -744,7 +744,8 @@ xfs_iext_count_may_overflow( if (whichfork == XFS_COW_FORK) return 0; - max_exts = xfs_iext_max_nextents(whichfork); + max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip), + whichfork); if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) max_exts = 10; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 4a8b77d425df..967837a88860 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ 
b/fs/xfs/libxfs/xfs_inode_fork.h @@ -133,12 +133,25 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) return ifp->if_format; } -static inline xfs_extnum_t xfs_iext_max_nextents(int whichfork) +static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts, + int whichfork) { - if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) - return MAXEXTNUM; + switch (whichfork) { + case XFS_DATA_FORK: + case XFS_COW_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_DATA_FORK_LARGE; + return XFS_MAX_EXTCNT_DATA_FORK_SMALL; + + case XFS_ATTR_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + return XFS_MAX_EXTCNT_ATTR_FORK_SMALL; - return MAXAEXTNUM; + default: + ASSERT(0); + return 0; + } } static inline xfs_extnum_t -- cgit From 8314bca03a1aa94ad8e7551f13c5664968200e41 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Tue, 8 Mar 2022 09:19:46 +0000 Subject: xfs: Replace numbered inode recovery error messages with descriptive ones This commit also prints inode fields with invalid values instead of printing addresses of inode and buffer instances. Reviewed-by: Dave Chinner Reviewed-by: Darrick J. 
Wong Signed-off-by: Chandan Babu R Suggested-by: Dave Chinner --- fs/xfs/xfs_inode_item_recover.c | 52 +++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 44b90614859e..96b222e18b0f 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -324,13 +324,12 @@ xlog_recover_inode_commit_pass2( if (unlikely(S_ISREG(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR( + "Bad log dinode data fork format for regular file", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad regular inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); + "Bad inode 0x%llx, data fork format 0x%x", + in_f->ilf_ino, ldip->di_format); error = -EFSCORRUPTED; goto out_release; } @@ -338,49 +337,42 @@ xlog_recover_inode_commit_pass2( if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE) && (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR( + "Bad log dinode data fork format for directory", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad dir inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); + "Bad inode 0x%llx, data fork format 0x%x", + in_f->ilf_ino, ldip->di_format); error = -EFSCORRUPTED; goto out_release; } } if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + 
XFS_CORRUPTION_ERROR("Bad log dinode extent counts", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", - __func__, item, dip, bp, in_f->ilf_ino, - ldip->di_nextents + ldip->di_anextents, + "Bad inode 0x%llx, nextents 0x%x, anextents 0x%x, nblocks 0x%llx", + in_f->ilf_ino, ldip->di_nextents, ldip->di_anextents, ldip->di_nblocks); error = -EFSCORRUPTED; goto out_release; } if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR("Bad log dinode fork offset", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, - item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); + "Bad inode 0x%llx, di_forkoff 0x%x", + in_f->ilf_ino, ldip->di_forkoff); error = -EFSCORRUPTED; goto out_release; } isize = xfs_log_dinode_size(mp); if (unlikely(item->ri_buf[1].i_len > isize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); + XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW, + mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "%s: Bad inode log record length %d, rec ptr "PTR_FMT, - __func__, item->ri_buf[1].i_len, item); + "Bad inode 0x%llx log dinode size 0x%x", + in_f->ilf_ino, item->ri_buf[1].i_len); error = -EFSCORRUPTED; goto out_release; } -- cgit From 52a4a14842ef940e5bab1c949e5adc8f027327dc Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Tue, 8 Mar 2022 09:34:28 +0000 Subject: xfs: Introduce per-inode 64-bit extent counters This commit introduces new fields in the on-disk inode format to support 64-bit data fork extent counters and 32-bit attribute fork extent counters. 
The new fields will be used only when an inode has XFS_DIFLAG2_NREXT64 flag set. Otherwise we continue to use the regular 32-bit data fork extent counters and 16-bit attribute fork extent counters. Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R Suggested-by: Dave Chinner --- fs/xfs/libxfs/xfs_format.h | 33 +++++++++++++-- fs/xfs/libxfs/xfs_inode_buf.c | 49 ++++++++++++++++++++-- fs/xfs/libxfs/xfs_inode_fork.h | 6 +++ fs/xfs/libxfs/xfs_log_format.h | 33 +++++++++++++-- fs/xfs/xfs_inode_item.c | 23 +++++++++-- fs/xfs/xfs_inode_item_recover.c | 90 ++++++++++++++++++++++++++++++++++------- 6 files changed, 204 insertions(+), 30 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index eb85bc9b229b..82b404c99b80 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -792,16 +792,41 @@ struct xfs_dinode { __be32 di_nlink; /* number of links to file */ __be16 di_projid_lo; /* lower part of owner's project id */ __be16 di_projid_hi; /* higher part owner's project id */ - __u8 di_pad[6]; /* unused, zeroed space */ - __be16 di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + __be64 di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + __be64 di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. 
*/ + struct { + __u8 di_v2_pad[6]; + __be16 di_flushiter; + }; + }; xfs_timestamp_t di_atime; /* time last accessed */ xfs_timestamp_t di_mtime; /* time last modified */ xfs_timestamp_t di_ctime; /* time created/inode modified */ __be64 di_size; /* number of bytes in file */ __be64 di_nblocks; /* # of direct & btree blocks used */ __be32 di_extsize; /* basic/minimum extent size for file */ - __be32 di_nextents; /* number of extents in data fork */ - __be16 di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + __be32 di_nextents; + __be16 di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + __be32 di_big_anextents; + __be16 di_nrext64_pad; + } __packed; + } __packed; __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ __s8 di_aformat; /* format of attr fork's data */ __be32 di_dmevmask; /* DMIG event mask */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index e0d3140c3622..ee8d4eb7d048 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -279,6 +279,25 @@ xfs_inode_to_disk_ts( return ts; } +static inline void +xfs_inode_to_disk_iext_counters( + struct xfs_inode *ip, + struct xfs_dinode *to) +{ + if (xfs_inode_has_large_extent_counts(ip)) { + to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); + to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp)); + /* + * We might be upgrading the inode to use larger extent counters + * than was previously used. Hence zero the unused field. 
+ */ + to->di_nrext64_pad = cpu_to_be16(0); + } else { + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); + } +} + void xfs_inode_to_disk( struct xfs_inode *ip, @@ -296,7 +315,6 @@ xfs_inode_to_disk( to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); - memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); @@ -307,8 +325,6 @@ xfs_inode_to_disk( to->di_size = cpu_to_be64(ip->i_disk_size); to->di_nblocks = cpu_to_be64(ip->i_nblocks); to->di_extsize = cpu_to_be32(ip->i_extsize); - to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); - to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); to->di_forkoff = ip->i_forkoff; to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_flags = cpu_to_be16(ip->i_diflags); @@ -323,11 +339,14 @@ xfs_inode_to_disk( to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_version = 2; to->di_flushiter = cpu_to_be16(ip->i_flushiter); + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_inode_to_disk_iext_counters(ip, to); } static xfs_failaddr_t @@ -398,6 +417,24 @@ xfs_dinode_verify_forkoff( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_nrext64( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) { + if (!xfs_has_large_extent_counts(mp)) + return __this_address; + if (dip->di_nrext64_pad != 0) + return __this_address; + } else if (dip->di_version >= 3) { + if (dip->di_v3_pad != 0) + return __this_address; + } + + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -442,6 +479,10 @@ xfs_dinode_verify( if 
((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) return __this_address; + fa = xfs_dinode_verify_nrext64(mp, dip); + if (fa) + return fa; + nextents = xfs_dfork_data_extents(dip); naextents = xfs_dfork_attr_extents(dip); nblocks = be64_to_cpu(dip->di_nblocks); diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 967837a88860..fd5c3c2d77e0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -158,6 +158,9 @@ static inline xfs_extnum_t xfs_dfork_data_extents( struct xfs_dinode *dip) { + if (xfs_dinode_has_large_extent_counts(dip)) + return be64_to_cpu(dip->di_big_nextents); + return be32_to_cpu(dip->di_nextents); } @@ -165,6 +168,9 @@ static inline xfs_extnum_t xfs_dfork_attr_extents( struct xfs_dinode *dip) { + if (xfs_dinode_has_large_extent_counts(dip)) + return be32_to_cpu(dip->di_big_anextents); + return be16_to_cpu(dip->di_anextents); } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index fd66e70248f7..12234a880e94 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -388,16 +388,41 @@ struct xfs_log_dinode { uint32_t di_nlink; /* number of links to file */ uint16_t di_projid_lo; /* lower part of owner's project id */ uint16_t di_projid_hi; /* higher part of owner's project id */ - uint8_t di_pad[6]; /* unused, zeroed space */ - uint16_t di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + uint64_t di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + uint64_t di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. 
*/ + struct { + uint8_t di_v2_pad[6]; /* V2 inode zeroed space */ + uint16_t di_flushiter; /* V2 inode incremented on flush */ + }; + }; xfs_log_timestamp_t di_atime; /* time last accessed */ xfs_log_timestamp_t di_mtime; /* time last modified */ xfs_log_timestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - uint32_t di_nextents; /* number of extents in data fork */ - uint16_t di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + uint32_t di_nextents; + uint16_t di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + uint32_t di_big_anextents; + uint16_t di_nrext64_pad; + } __packed; + } __packed; uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9e6ef55cf29e..00733a18ccdc 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -359,6 +359,21 @@ xfs_copy_dm_fields_to_log_dinode( } } +static inline void +xfs_inode_to_log_dinode_iext_counters( + struct xfs_inode *ip, + struct xfs_log_dinode *to) +{ + if (xfs_inode_has_large_extent_counts(ip)) { + to->di_big_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_big_anextents = xfs_ifork_nextents(ip->i_afp); + to->di_nrext64_pad = 0; + } else { + to->di_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_anextents = xfs_ifork_nextents(ip->i_afp); + } +} + static void xfs_inode_to_log_dinode( struct xfs_inode *ip, @@ -374,7 +389,6 @@ xfs_inode_to_log_dinode( to->di_projid_lo = ip->i_projid & 0xffff; to->di_projid_hi = ip->i_projid >> 16; - memset(to->di_pad, 0, 
sizeof(to->di_pad)); memset(to->di_pad3, 0, sizeof(to->di_pad3)); to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); @@ -386,8 +400,6 @@ xfs_inode_to_log_dinode( to->di_size = ip->i_disk_size; to->di_nblocks = ip->i_nblocks; to->di_extsize = ip->i_extsize; - to->di_nextents = xfs_ifork_nextents(&ip->i_df); - to->di_anextents = xfs_ifork_nextents(ip->i_afp); to->di_forkoff = ip->i_forkoff; to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_flags = ip->i_diflags; @@ -407,11 +419,14 @@ xfs_inode_to_log_dinode( to->di_lsn = lsn; memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_inode_to_log_dinode_iext_counters(ip, to); } /* diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 96b222e18b0f..6d44f5fd6d7e 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -149,6 +149,22 @@ static inline bool xfs_log_dinode_has_large_extent_counts( (ld->di_flags2 & XFS_DIFLAG2_NREXT64); } +static inline void +xfs_log_dinode_to_disk_iext_counters( + struct xfs_log_dinode *from, + struct xfs_dinode *to) +{ + if (xfs_log_dinode_has_large_extent_counts(from)) { + to->di_big_nextents = cpu_to_be64(from->di_big_nextents); + to->di_big_anextents = cpu_to_be32(from->di_big_anextents); + to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad); + } else { + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + } + +} + STATIC void xfs_log_dinode_to_disk( struct xfs_log_dinode *from, @@ -165,7 +181,6 @@ xfs_log_dinode_to_disk( to->di_nlink = cpu_to_be32(from->di_nlink); to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - 
memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); @@ -174,8 +189,6 @@ xfs_log_dinode_to_disk( to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); to->di_forkoff = from->di_forkoff; to->di_aformat = from->di_aformat; to->di_dmevmask = cpu_to_be32(from->di_dmevmask); @@ -191,12 +204,66 @@ xfs_log_dinode_to_disk( to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_flushiter = cpu_to_be16(from->di_flushiter); + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); + } + + xfs_log_dinode_to_disk_iext_counters(from, to); +} + +STATIC int +xlog_dinode_verify_extent_counts( + struct xfs_mount *mp, + struct xfs_log_dinode *ldip) +{ + xfs_extnum_t nextents; + xfs_aextnum_t anextents; + + if (xfs_log_dinode_has_large_extent_counts(ldip)) { + if (!xfs_has_large_extent_counts(mp) || + (ldip->di_nrext64_pad != 0)) { + XFS_CORRUPTION_ERROR( + "Bad log dinode large extent count format", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, large extent counts %d, padding 0x%x", + ldip->di_ino, xfs_has_large_extent_counts(mp), + ldip->di_nrext64_pad); + return -EFSCORRUPTED; + } + + nextents = ldip->di_big_nextents; + anextents = ldip->di_big_anextents; + } else { + if (ldip->di_version == 3 && ldip->di_v3_pad != 0) { + XFS_CORRUPTION_ERROR( + "Bad log dinode di_v3_pad", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, di_v3_pad 
0x%llx", + ldip->di_ino, ldip->di_v3_pad); + return -EFSCORRUPTED; + } + + nextents = ldip->di_nextents; + anextents = ldip->di_anextents; + } + + if (unlikely(nextents + anextents > ldip->di_nblocks)) { + XFS_CORRUPTION_ERROR("Bad log dinode extent counts", + XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); + xfs_alert(mp, + "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx", + ldip->di_ino, xfs_has_large_extent_counts(mp), nextents, + anextents, ldip->di_nblocks); + return -EFSCORRUPTED; } + + return 0; } STATIC int @@ -347,16 +414,11 @@ xlog_recover_inode_commit_pass2( goto out_release; } } - if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ - XFS_CORRUPTION_ERROR("Bad log dinode extent counts", - XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); - xfs_alert(mp, - "Bad inode 0x%llx, nextents 0x%x, anextents 0x%x, nblocks 0x%llx", - in_f->ilf_ino, ldip->di_nextents, ldip->di_anextents, - ldip->di_nblocks); - error = -EFSCORRUPTED; + + error = xlog_dinode_verify_extent_counts(mp, ldip); + if (error) goto out_release; - } + if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { XFS_CORRUPTION_ERROR("Bad log dinode fork offset", XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); -- cgit From f3bf67c6c6fe863b7946ac0c2214a147dc50523d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 12 Apr 2022 06:49:40 +1000 Subject: xfs: Use generic_file_open() Remove the open-coded check of O_LARGEFILE. This changes the errno to be the same as other filesystems; it was changed generically in 2.6.24 but that fix skipped XFS. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5bddb1e9e0b3..c5541d062d0d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1167,12 +1167,10 @@ xfs_file_open( struct inode *inode, struct file *file) { - if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) - return -EFBIG; if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; - return 0; + return generic_file_open(inode, file); } STATIC int -- cgit From f34061f554feba68e12b7a73008c350d2a9afd0c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 12 Apr 2022 06:49:41 +1000 Subject: xfs: pass explicit mount pointer to rtalloc query functions Pass an explicit xfs_mount pointer to the rtalloc query functions so that they can support transactionless queries. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_rtbitmap.c | 9 +++++---- fs/xfs/scrub/rtbitmap.c | 9 +++++---- fs/xfs/xfs_fsmap.c | 6 +++--- fs/xfs/xfs_rtalloc.h | 7 ++++--- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5740ba664867..fa180ab66b73 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1008,6 +1008,7 @@ xfs_rtfree_extent( /* Find all the free records within a given range. 
*/ int xfs_rtalloc_query_range( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, @@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range( void *priv) { struct xfs_rtalloc_rec rec; - struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; xfs_rtblock_t high_key; @@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range( rec.ar_startext = rtstart; rec.ar_extcount = rtend - rtstart + 1; - error = fn(tp, &rec, priv); + error = fn(mp, tp, &rec, priv); if (error) break; } @@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( + struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) @@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all( struct xfs_rtalloc_rec keys[2]; keys[0].ar_startext = 0; - keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1; + keys[1].ar_startext = mp->m_sb.sb_rextents - 1; keys[0].ar_extcount = keys[1].ar_extcount = 0; - return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); + return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv); } /* Is the given extent all free? */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 8fa012057405..0a3bde64c675 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -40,6 +40,7 @@ xchk_setup_rt( /* Scrub a free extent record from the realtime bitmap. 
*/ STATIC int xchk_rtbitmap_rec( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -48,10 +49,10 @@ xchk_rtbitmap_rec( xfs_rtblock_t startblock; xfs_rtblock_t blockcount; - startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize; - blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize; + startblock = rec->ar_startext * mp->m_sb.sb_rextsize; + blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; - if (!xfs_verify_rtext(sc->mp, startblock, blockcount)) + if (!xfs_verify_rtext(mp, startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -114,7 +115,7 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 10e1cb71439e..bb23199f65c3 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -450,11 +450,11 @@ xfs_getfsmap_logdev( /* Transform a rtbitmap "record" into a fsmap */ STATIC int xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { - struct xfs_mount *mp = tp->t_mountp; struct xfs_getfsmap_info *info = priv; struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; @@ -535,7 +535,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( do_div(alow.ar_startext, mp->m_sb.sb_rextsize); if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; - error = xfs_rtalloc_query_range(tp, &alow, &ahigh, + error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) goto err; @@ -547,7 +547,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( info->last = true; ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext); - error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, 
info); + error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info); if (error) goto err; err: diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 91b00289509b..539d134f4f25 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -22,6 +22,7 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); @@ -123,11 +124,11 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); -int xfs_rtalloc_query_range(struct xfs_trans *tp, +int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_trans *tp, +int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); @@ -140,7 +141,7 @@ int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) # define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) -# define xfs_rtalloc_query_all(t,f,p) (ENOSYS) +# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS) # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) # define xfs_verify_rtbno(m, r) (false) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) -- cgit From 5a605fd6cb1da0ec9cb6e54c06bcf58f706d2f83 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 12 Apr 2022 06:49:42 +1000 Subject: xfs: recalculate free rt extents after log recovery I've been observing periodic corruption reports from xfs_scrub involving the free rt extent counter (frextents) while running xfs/141. 
That test uses an error injection knob to induce a torn write to the log, and after an arbitrary number of recovery mounts, frextents will count fewer free rt extents than can be found in the rtbitmap. The root cause of the problem is a combination of the misuse of sb_frextents in the incore mount to reflect both incore reservations made by running transactions as well as the actual count of free rt extents on disk. The following sequence can reproduce the undercount: Thread 1: xfs_trans_alloc(rtextents=3), xfs_mod_frextents(-3). Thread 2: xfs_attr_set(), xfs_bmap_attr_addfork(), xfs_add_attr2(), xfs_log_sb(), xfs_sb_to_disk(), xfs_trans_commit(). Note that thread 1 subtracts 3 from sb_frextents even though it never commits to using that space. Thread 2 writes the undercounted value to the ondisk superblock and logs it to the xattr transaction, which is then flushed to disk. At next mount, log recovery will find the logged superblock and write that back into the filesystem. At the end of log recovery, we reread the superblock and install the recovered undercounted frextents value into the incore superblock. From that point on, we've effectively leaked thread 1's transaction reservation. The correct fix for this is to separate the incore reservation from the ondisk usage, but that's a matter for the next patch. Because the kernel has been logging superblocks with undercounted frextents for a very long time and we don't demand that sysadmins run xfs_repair after a crash, fix the undercount by recomputing frextents after log recovery. Gating this on log recovery is a reasonable balance (I think) between correcting the problem and slowing down every mount attempt. Note that xfs_repair will fix undercounted frextents. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 41 ++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_rtalloc.c | 37 +++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_rtalloc.h | 2 ++ 3 files changed, 71 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c5f153c3693f..53e130f803b1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -468,6 +468,8 @@ STATIC int xfs_check_summary_counts( struct xfs_mount *mp) { + int error = 0; + /* * The AG0 superblock verifier rejects in-progress filesystems, * so we should never see the flag set this far into mounting. @@ -506,11 +508,32 @@ xfs_check_summary_counts( * superblock to be correct and we don't need to do anything here. * Otherwise, recalculate the summary counters. */ - if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) && - !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) - return 0; + if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) || + xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) { + error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); + if (error) + return error; + } + + /* + * Older kernels misused sb_frextents to reflect both incore + * reservations made by running transactions and the actual count of + * free rt extents in the ondisk metadata. Transactions committed + * during runtime can therefore contain a superblock update that + * undercounts the number of free rt extents tracked in the rt bitmap. + * A clean unmount record will have the correct frextents value since + * there can be no other transactions running at that point. + * + * If we're mounting the rt volume after recovering the log, recompute + * frextents from the rtbitmap file to fix the inconsistency. 
+ */ + if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + error = xfs_rtalloc_reinit_frextents(mp); + if (error) + return error; + } - return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); + return 0; } /* @@ -784,11 +807,6 @@ xfs_mountfs( goto out_inodegc_shrinker; } - /* Make sure the summary counts are ok. */ - error = xfs_check_summary_counts(mp); - if (error) - goto out_log_dealloc; - /* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); xfs_blockgc_start(mp); @@ -844,6 +862,11 @@ xfs_mountfs( goto out_rele_rip; } + /* Make sure the summary counts are ok. */ + error = xfs_check_summary_counts(mp); + if (error) + goto out_rtunmount; + /* * If this is a read-only mount defer the superblock updates until * the next remount into writeable mode. Otherwise we would never diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b8c79ee791af..76f50e75f99c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1284,6 +1284,43 @@ xfs_rtmount_init( return 0; } +static int +xfs_rtalloc_count_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + uint64_t *valp = priv; + + *valp += rec->ar_extcount; + return 0; +} + +/* + * Reinitialize the number of free realtime extents from the realtime bitmap. + * Callers must ensure that there is no other activity in the filesystem. + */ +int +xfs_rtalloc_reinit_frextents( + struct xfs_mount *mp) +{ + uint64_t val = 0; + int error; + + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, + &val); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); + if (error) + return error; + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_frextents = val; + spin_unlock(&mp->m_sb_lock); + return 0; +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. 
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 539d134f4f25..62c7ad79cbb6 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -135,6 +135,7 @@ bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, bool *is_free); +int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) @@ -145,6 +146,7 @@ int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) # define xfs_verify_rtbno(m, r) (false) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) +# define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ -- cgit From 2229276c5283264b8c2241c1ed972bbb136cab22 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 12 Apr 2022 06:49:42 +1000 Subject: xfs: use a separate frextents counter for rt extent reservations As mentioned in the previous commit, the kernel misuses sb_frextents in the incore mount to reflect both incore reservations made by running transactions as well as the actual count of free rt extents on disk. This results in the superblock being written to the log with an underestimate of the number of rt extents that are marked free in the rtbitmap. Teaching XFS to recompute frextents after log recovery avoids operational problems in the current mount, but it doesn't solve the problem of us writing undercounted frextents which are then recovered by an older kernel that doesn't have that fix. Create an incore percpu counter to mirror the ondisk frextents. This new counter will track transaction reservations and the only time we will touch the incore super counter (i.e. the one that gets logged) is when those transactions commit updates to the rt bitmap. 
This is in contrast to the lazysbcount counters (e.g. fdblocks), where we know that log recovery will always fix any incorrect counter that we log. As a bonus, we only take m_sb_lock at transaction commit time. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.c | 5 +++++ fs/xfs/xfs_fsops.c | 5 +---- fs/xfs/xfs_icache.c | 9 ++++++--- fs/xfs/xfs_mount.c | 50 +++++++++++++++++++++----------------------------- fs/xfs/xfs_mount.h | 19 ++++++++++++++++--- fs/xfs/xfs_rtalloc.c | 1 + fs/xfs/xfs_super.c | 14 ++++++++++++-- fs/xfs/xfs_trans.c | 43 +++++++++++++++++++++++++++++++++++++------ 8 files changed, 99 insertions(+), 47 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index f4e84aa1d50a..8dd7186ef9df 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -911,6 +911,11 @@ xfs_log_sb( * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. + * + * Do not update sb_frextents here because it is not part of the lazy + * sb counters, despite having a percpu counter. It is always kept + * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() + * and hence we don't need to update it here. 
*/ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 68f74549fa22..a0d7aa7fbbff 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -349,10 +349,7 @@ xfs_fs_counts( cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - xfs_fdblocks_unavailable(mp); - - spin_lock(&mp->m_sb_lock); - cnt->freertx = mp->m_sb.sb_frextents; - spin_unlock(&mp->m_sb_lock); + cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); } /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index bffd6eb0b298..5269354b1b69 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - uint64_t freertx; if (!XFS_IS_REALTIME_INODE(ip)) return false; - freertx = READ_ONCE(mp->m_sb.sb_frextents); - return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT]; + if (__percpu_counter_compare(&mp->m_frextents, + mp->m_low_rtexts[XFS_LOWSP_5_PCNT], + XFS_FDBLOCKS_BATCH) < 0) + return true; + + return false; } #else # define xfs_inodegc_want_queue_rt_file(ip) (false) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 53e130f803b1..0c0bcbd4949d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1110,24 +1110,33 @@ xfs_fs_writable( return true; } +/* Adjust m_fdblocks or m_frextents. */ int -xfs_mod_fdblocks( +xfs_mod_freecounter( struct xfs_mount *mp, + struct percpu_counter *counter, int64_t delta, bool rsvd) { int64_t lcounter; long long res_used; + uint64_t set_aside = 0; s32 batch; - uint64_t set_aside; + bool has_resv_pool; + + ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); + has_resv_pool = (counter == &mp->m_fdblocks); + if (rsvd) + ASSERT(has_resv_pool); if (delta > 0) { /* * If the reserve pool is depleted, put blocks back into it * first. Most of the time the pool is full. 
*/ - if (likely(mp->m_resblks == mp->m_resblks_avail)) { - percpu_counter_add(&mp->m_fdblocks, delta); + if (likely(!has_resv_pool || + mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(counter, delta); return 0; } @@ -1139,7 +1148,7 @@ xfs_mod_fdblocks( } else { delta -= res_used; mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(&mp->m_fdblocks, delta); + percpu_counter_add(counter, delta); } spin_unlock(&mp->m_sb_lock); return 0; @@ -1153,7 +1162,7 @@ xfs_mod_fdblocks( * then make everything serialise as we are real close to * ENOSPC. */ - if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1170,9 +1179,10 @@ xfs_mod_fdblocks( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); - if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, + if (has_resv_pool) + set_aside = xfs_fdblocks_unavailable(mp); + percpu_counter_add_batch(counter, delta, batch); + if (__percpu_counter_compare(counter, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; @@ -1183,8 +1193,8 @@ xfs_mod_fdblocks( * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - percpu_counter_add(&mp->m_fdblocks, -delta); - if (!rsvd) + percpu_counter_add(counter, -delta); + if (!has_resv_pool || !rsvd) goto fdblocks_enospc; lcounter = (long long)mp->m_resblks_avail + delta; @@ -1201,24 +1211,6 @@ fdblocks_enospc: return -ENOSPC; } -int -xfs_mod_frextents( - struct xfs_mount *mp, - int64_t delta) -{ - int64_t lcounter; - int ret = 0; - - spin_lock(&mp->m_sb_lock); - lcounter = mp->m_sb.sb_frextents + delta; - if (lcounter < 0) - ret = -ENOSPC; - else - mp->m_sb.sb_frextents = lcounter; - spin_unlock(&mp->m_sb_lock); - return ret; -} - /* * Used to free the superblock along various error paths. 
*/ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f6dc19de8322..a6b8efb2df52 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -183,6 +183,8 @@ typedef struct xfs_mount { struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ struct percpu_counter m_fdblocks; /* free block counter */ + struct percpu_counter m_frextents; /* free rt extent counter */ + /* * Count of data device blocks reserved for delayed allocations, * including indlen blocks. Does not include allocated CoW staging @@ -494,9 +496,20 @@ xfs_fdblocks_unavailable( return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); } -extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, - bool reserved); -extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); +int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, + int64_t delta, bool rsvd); + +static inline int +xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved) +{ + return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved); +} + +static inline int +xfs_mod_frextents(struct xfs_mount *mp, int64_t delta) +{ + return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false); +} extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 76f50e75f99c..997e4a9d27d3 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1318,6 +1318,7 @@ xfs_rtalloc_reinit_frextents( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_frextents = val; spin_unlock(&mp->m_sb_lock); + percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); return 0; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 54be9d64093e..3a5088646294 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -843,9 +843,11 @@ xfs_fs_statfs( if (XFS_IS_REALTIME_MOUNT(mp) && (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { + s64 
freertx; + statp->f_blocks = sbp->sb_rblocks; - statp->f_bavail = statp->f_bfree = - sbp->sb_frextents * sbp->sb_rextsize; + freertx = percpu_counter_sum_positive(&mp->m_frextents); + statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize; } return 0; @@ -1015,8 +1017,14 @@ xfs_init_percpu_counters( if (error) goto free_fdblocks; + error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); + if (error) + goto free_delalloc; + return 0; +free_delalloc: + percpu_counter_destroy(&mp->m_delalloc_blks); free_fdblocks: percpu_counter_destroy(&mp->m_fdblocks); free_ifree: @@ -1033,6 +1041,7 @@ xfs_reinit_percpu_counters( percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); + percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); } static void @@ -1045,6 +1054,7 @@ xfs_destroy_percpu_counters( ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); + percpu_counter_destroy(&mp->m_frextents); } static int diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 0ac717aad380..6d9df2e9b267 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -498,10 +498,31 @@ xfs_trans_apply_sb_deltas( be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta); } - if (tp->t_frextents_delta) - be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta); - if (tp->t_res_frextents_delta) - be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta); + /* + * Updating frextents requires careful handling because it does not + * behave like the lazysb counters because we cannot rely on log + * recovery in older kernels to recompute the value from the rtbitmap. + * This means that the ondisk frextents must be consistent with the + * rtbitmap. 
+ * + * Therefore, log the frextents change to the ondisk superblock and + * update the incore superblock so that future calls to xfs_log_sb + * write the correct value ondisk. + * + * Don't touch m_frextents because it includes incore reservations, + * and those are handled by the unreserve function. + */ + if (tp->t_frextents_delta || tp->t_res_frextents_delta) { + struct xfs_mount *mp = tp->t_mountp; + int64_t rtxdelta; + + rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta; + + spin_lock(&mp->m_sb_lock); + be64_add_cpu(&sbp->sb_frextents, rtxdelta); + mp->m_sb.sb_frextents += rtxdelta; + spin_unlock(&mp->m_sb_lock); + } if (tp->t_dblocks_delta) { be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); @@ -614,7 +635,12 @@ xfs_trans_unreserve_and_mod_sb( if (ifreedelta) percpu_counter_add(&mp->m_ifree, ifreedelta); - if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) + if (rtxdelta) { + error = xfs_mod_frextents(mp, rtxdelta); + ASSERT(!error); + } + + if (!(tp->t_flags & XFS_TRANS_SB_DIRTY)) return; /* apply remaining deltas */ @@ -622,7 +648,12 @@ xfs_trans_unreserve_and_mod_sb( mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta; mp->m_sb.sb_icount += idelta; mp->m_sb.sb_ifree += ifreedelta; - mp->m_sb.sb_frextents += rtxdelta; + /* + * Do not touch sb_frextents here because we are dealing with incore + * reservation. sb_frextents is not part of the lazy sb counters so it + * must be consistent with the ondisk rtbitmap and must never include + * incore reservations. 
+ */ mp->m_sb.sb_dblocks += tp->t_dblocks_delta; mp->m_sb.sb_agcount += tp->t_agcount_delta; mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; -- cgit From 83a21c18441f75aec64548692b52d34582b98a6a Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Tue, 29 Mar 2022 06:14:00 +0000 Subject: xfs: Directory's data fork extent counter can never overflow The maximum file size that can be represented by the data fork extent counter in the worst case occurs when all extents are 1 block in length and each block is 1KB in size. With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and with 1KB sized blocks, a file can reach up to, (2^31) * 1KB = 2TB This is much larger than the theoretical maximum size of a directory i.e. XFS_DIR2_SPACE_SIZE * 3 = ~96GB. Since a directory's inode can never overflow its data fork extent counter, this commit removes all the overflow checks associated with it. xfs_dinode_verify() now performs a rough check to verify if a directory's data fork is larger than 96GB. Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 20 --------------- fs/xfs/libxfs/xfs_da_btree.h | 1 + fs/xfs/libxfs/xfs_da_format.h | 1 + fs/xfs/libxfs/xfs_dir2.c | 8 ++++++ fs/xfs/libxfs/xfs_format.h | 13 ++++++++++ fs/xfs/libxfs/xfs_inode_buf.c | 3 +++ fs/xfs/libxfs/xfs_inode_fork.h | 13 ---------- fs/xfs/xfs_inode.c | 55 ++---------------------------------------- fs/xfs/xfs_symlink.c | 5 ---- 9 files changed, 28 insertions(+), 91 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1254d4d4821e..4fab0c92ab70 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5147,26 +5147,6 @@ xfs_bmap_del_extent_real( * Deleting the middle of the extent. */ - /* - * For directories, -ENOSPC is returned since a directory entry - * remove operation must not fail due to low extent count - * availability. 
-ENOSPC will be handled by higher layers of XFS - * by letting the corresponding empty Data/Free blocks to linger - * until a future remove operation. Dabtree blocks would be - * swapped with the last block in the leaf space and then the - * new last block will be unmapped. - * - * The above logic also applies to the source directory entry of - * a rename operation. - */ - error = xfs_iext_count_may_overflow(ip, whichfork, 1); - if (error) { - ASSERT(S_ISDIR(VFS_I(ip)->i_mode) && - whichfork == XFS_DATA_FORK); - error = -ENOSPC; - goto done; - } - old = got; got.br_blockcount = del->br_startoff - got.br_startoff; diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 0faf7d9ac241..7f08f6de48bf 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -30,6 +30,7 @@ struct xfs_da_geometry { unsigned int free_hdr_size; /* dir2 free header size */ unsigned int free_max_bests; /* # of bests entries in dir2 free */ xfs_dablk_t freeblk; /* blockno of free data v2 */ + xfs_extnum_t max_extents; /* Max. extents in corresponding fork */ xfs_dir2_data_aoff_t data_first_offset; size_t data_entry_offset; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5a49caa5c9df..95354b7ab7f5 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) * Directory address space divided into sections, * spaces separated by 32GB. 
*/ +#define XFS_DIR2_MAX_SPACES 3 #define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) #define XFS_DIR2_DATA_SPACE 0 #define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 5f1e4799e8fa..3cd51fa3837b 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -150,6 +150,8 @@ xfs_da_mount( dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >> + mp->m_sb.sb_blocklog; dageo->magicpct = (dageo->blksize * 37) / 100; /* set up attribute geometry - single fsb only */ @@ -161,6 +163,12 @@ xfs_da_mount( dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + + if (xfs_has_large_extent_counts(mp)) + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + else + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + dageo->magicpct = (dageo->blksize * 37) / 100; return 0; } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 82b404c99b80..43de892d0305 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -915,6 +915,19 @@ enum xfs_dinode_fmt { * * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence * 2^48 was chosen as the maximum data fork extent count. + * + * The maximum file size that can be represented by the data fork extent counter + * in the worst case occurs when all extents are 1 block in length and each + * block is 1KB in size. + * + * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and + * with 1KB sized blocks, a file can reach upto, + * 1KB * (2^31) = 2TB + * + * This is much larger than the theoretical maximum size of a directory + * i.e. 
XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB. + * + * Hence, a directory inode can never overflow its data fork extent counter. */ #define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1)) #define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1)) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index ee8d4eb7d048..74b82ec80f8e 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -491,6 +491,9 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) + return __this_address; + if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) return __this_address; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index fd5c3c2d77e0..6f9d69f8896e 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -39,19 +39,6 @@ struct xfs_ifork { */ #define XFS_IEXT_PUNCH_HOLE_CNT (1) -/* - * Directory entry addition can cause the following, - * 1. Data block can be added/removed. - * A new extent can cause extent count to increase by 1. - * 2. Free disk block can be added/removed. - * Same behaviour as described above for Data block. - * 3. Dabtree blocks. - * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new - * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH. - */ -#define XFS_IEXT_DIR_MANIP_CNT(mp) \ - ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount) - /* * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to * be added. 
One extra extent for dabtree in case a local attr is diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index adc1355ce853..20f15a0393e1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1024,11 +1024,6 @@ xfs_create( xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; - error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry @@ -1242,11 +1237,6 @@ xfs_link( if (error) goto std_return; - error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto error_return; - /* * If we are using project inheritance, we only allow hard link * creation in our tree when the project IDs are the same; else @@ -3210,35 +3200,6 @@ retry: /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. - * - * Extent count overflow check: - * - * From the perspective of src_dp, a rename operation is essentially a - * directory entry remove operation. Hence the only place where we check - * for extent count overflow for src_dp is in - * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns - * -ENOSPC when it detects a possible extent count overflow and in - * response, the higher layers of directory handling code do the - * following: - * 1. Data/Free blocks: XFS lets these blocks linger until a - * future remove operation removes them. - * 2. Dabtree blocks: XFS swaps the blocks with the last block in the - * Leaf space and unmaps the last block. - * - * For target_dp, there are two cases depending on whether the - * destination directory entry exists or not. - * - * When destination directory entry does not exist (i.e. 
target_ip == - * NULL), extent count overflow check is performed only when transaction - * has a non-zero sized space reservation associated with it. With a - * zero-sized space reservation, XFS allows a rename operation to - * continue only when the directory has sufficient free space in its - * data/leaf/free space blocks to hold the new entry. - * - * When destination directory entry exists (i.e. target_ip != NULL), all - * we need to do is change the inode number associated with the already - * existing entry. Hence there is no need to perform an extent count - * overflow check. */ if (target_ip == NULL) { /* @@ -3249,12 +3210,6 @@ retry: error = xfs_dir_canenter(tp, target_dp, target_name); if (error) goto out_trans_cancel; - } else { - error = xfs_iext_count_may_overflow(target_dp, - XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; } } else { /* @@ -3422,18 +3377,12 @@ retry: * inode number of the whiteout inode rather than removing it * altogether. */ - if (wip) { + if (wip) error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, spaceres); - } else { - /* - * NOTE: We don't need to check for extent count overflow here - * because the dir remove name code will leave the dir block in - * place if the extent count would overflow. - */ + else error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, spaceres); - } if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index affbedf78160..4145ba872547 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -226,11 +226,6 @@ xfs_symlink( goto out_trans_cancel; } - error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, - XFS_IEXT_DIR_MANIP_CNT(mp)); - if (error) - goto out_trans_cancel; - /* * Allocate an inode for the symlink. 
*/ -- cgit From 4f86bb4b66c999ad9ddcfd49fec93992eeba2715 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Wed, 9 Mar 2022 07:49:36 +0000 Subject: xfs: Conditionally upgrade existing inodes to use large extent counters This commit enables upgrading existing inodes to use large extent counters provided that underlying filesystem's superblock has large extent counter feature enabled. Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 3 +++ fs/xfs/libxfs/xfs_bmap.c | 6 ++++-- fs/xfs/libxfs/xfs_format.h | 11 +++++++++++ fs/xfs/libxfs/xfs_inode_fork.c | 24 ++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_fork.h | 2 ++ fs/xfs/xfs_bmap_item.c | 2 ++ fs/xfs/xfs_bmap_util.c | 13 +++++++++++++ fs/xfs/xfs_dquot.c | 3 +++ fs/xfs/xfs_iomap.c | 5 +++++ fs/xfs/xfs_reflink.c | 5 +++++ fs/xfs/xfs_rtalloc.c | 3 +++ 11 files changed, 75 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 23523b802539..2815cfbbae70 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -776,6 +776,9 @@ xfs_attr_set( if (args->value || xfs_inode_hasattr(dp)) { error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(args->trans, dp, + XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); if (error) goto out_trans_cancel; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 4fab0c92ab70..82d5467ddf2c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4524,14 +4524,16 @@ xfs_bmapi_convert_delalloc( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); error = xfs_iext_count_may_overflow(ip, whichfork, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, 0); - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, 
&bma.icur, &bma.got) || bma.got.br_startoff > offset_fsb) { /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 43de892d0305..3beaa819b790 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -934,6 +934,17 @@ enum xfs_dinode_fmt { #define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1)) #define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1)) +/* + * When we upgrade an inode to the large extent counts, the maximum value by + * which the extent count can increase is bound by the change in size of the + * on-disk field. No upgrade operation should ever be adding more than a few + * tens of extents, so if we get a really large value it is a sign of a code bug + * or corruption. + */ +#define XFS_MAX_EXTCNT_UPGRADE_NR \ + min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \ + XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL) + /* * Inode minimum and maximum sizes. */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index bb5d841aac58..9aee4a1e2fe9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -756,3 +756,27 @@ xfs_iext_count_may_overflow( return 0; } + +/* + * Upgrade this inode's extent counter fields to be able to handle a potential + * increase in the extent count by nr_to_add. Normally this is the same + * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG. 
+ */ +int +xfs_iext_count_upgrade( + struct xfs_trans *tp, + struct xfs_inode *ip, + uint nr_to_add) +{ + ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); + + if (!xfs_has_large_extent_counts(ip->i_mount) || + xfs_inode_has_large_extent_counts(ip) || + XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + return -EFBIG; + + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 6f9d69f8896e..4f68c1f20beb 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -275,6 +275,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip); int xfs_ifork_verify_local_attr(struct xfs_inode *ip); int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, int nr_to_add); +int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, + uint nr_to_add); /* returns true if the fork has extents but they are not read in yet. 
*/ static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 761dde155099..593ac29cffc7 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -506,6 +506,8 @@ xfs_bui_item_recover( iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 18c1b99311a8..52be58372c63 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -859,6 +859,9 @@ xfs_alloc_file_space( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto error; @@ -914,6 +917,8 @@ xfs_unmap_extent( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -1195,6 +1200,8 @@ xfs_insert_file_space( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; @@ -1423,6 +1430,9 @@ xfs_swap_extent_rmap( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } @@ -1431,6 +1441,9 @@ xfs_swap_extent_rmap( error = xfs_iext_count_may_overflow(tip, XFS_DATA_FORK, XFS_IEXT_SWAP_RMAP_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_SWAP_RMAP_CNT); if (error) goto out; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 5afedcbc78c7..eb211e0ede5d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -322,6 
+322,9 @@ xfs_dquot_disk_alloc( error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, quotip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto err_cancel; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 87e1cf5060bd..5a393259a3a3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -251,6 +251,8 @@ xfs_iomap_write_direct( return error; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, nr_exts); if (error) goto out_trans_cancel; @@ -555,6 +557,9 @@ xfs_iomap_write_unwritten( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_WRITE_UNWRITTEN_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 54e68e5693fd..1ae6d3434ad2 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -620,6 +620,9 @@ xfs_reflink_end_cow_extent( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_REFLINK_END_COW_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_REFLINK_END_COW_CNT); if (error) goto out_cancel; @@ -1121,6 +1124,8 @@ xfs_reflink_remap_extent( ++iext_delta; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto out_cancel; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b8c79ee791af..3e587e85d5bf 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -806,6 +806,9 @@ xfs_growfs_rt_alloc( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; -- cgit From 5b35d922c5279804be87cab60e4810403038488b Mon Sep 
17 00:00:00 2001 From: Chandan Babu R Date: Wed, 9 Mar 2022 12:34:04 +0000 Subject: xfs: Decouple XFS_IBULK flags from XFS_IWALK flags A future commit will add a new XFS_IBULK flag which will not have a corresponding XFS_IWALK flag. In preparation for the change, this commit separates XFS_IBULK_* flags from XFS_IWALK_* flags. Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_itable.c | 6 +++++- fs/xfs/xfs_itable.h | 2 +- fs/xfs/xfs_iwalk.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c08c79d9e311..71ed4905f206 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -256,6 +256,7 @@ xfs_bulkstat( .breq = breq, }; struct xfs_trans *tp; + unsigned int iwalk_flags = 0; int error; if (breq->mnt_userns != &init_user_ns) { @@ -279,7 +280,10 @@ xfs_bulkstat( if (error) goto out; - error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags, + if (breq->flags & XFS_IBULK_SAME_AG) + iwalk_flags |= XFS_IWALK_SAME_AG; + + error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags, xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); out: diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 7078d10c9b12..5ee1d3f44ce9 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -17,7 +17,7 @@ struct xfs_ibulk { }; /* Only iterate within the same AG as startino */ -#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) +#define XFS_IBULK_SAME_AG (1U << 0) /* * Advance the user buffer pointer by one record of the given size. If the diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h index 37a795f03267..83699089755e 100644 --- a/fs/xfs/xfs_iwalk.h +++ b/fs/xfs/xfs_iwalk.h @@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, unsigned int inode_records, bool poll, void *data); /* Only iterate inodes within the same AG as @startino. 
*/ -#define XFS_IWALK_SAME_AG (0x1) +#define XFS_IWALK_SAME_AG (1U << 0) #define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG) -- cgit From c3c4ecb529c5a1f0590cffb70649d407ee79b8a8 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Wed, 9 Mar 2022 12:58:37 +0000 Subject: xfs: Enable bulkstat ioctl to support 64-bit per-inode extent counters The following changes are made to enable userspace to obtain 64-bit extent counters, 1. Carve out a new 64-bit field xfs_bulkstat->bs_extents64 from xfs_bulkstat->bs_pad[] to hold 64-bit extent counter. 2. Define the new flag XFS_BULK_IREQ_NREXT64 for userspace to indicate that it is capable of receiving 64-bit extent counters. Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Suggested-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_fs.h | 24 ++++++++++++++++++------ fs/xfs/xfs_ioctl.c | 3 +++ fs/xfs/xfs_itable.c | 9 ++++++++- fs/xfs/xfs_itable.h | 3 +++ 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 1f7238db35cc..1d2682ccdb78 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -378,7 +378,7 @@ struct xfs_bulkstat { uint32_t bs_extsize_blks; /* extent size hint, blocks */ uint32_t bs_nlink; /* number of links */ - uint32_t bs_extents; /* number of extents */ + uint32_t bs_extents; /* 32-bit data fork extent counter */ uint32_t bs_aextents; /* attribute number of extents */ uint16_t bs_version; /* structure version */ uint16_t bs_forkoff; /* inode fork offset in bytes */ @@ -387,8 +387,9 @@ struct xfs_bulkstat { uint16_t bs_checked; /* checked inode metadata */ uint16_t bs_mode; /* type and mode */ uint16_t bs_pad2; /* zeroed */ + uint64_t bs_extents64; /* 64-bit data fork extent counter */ - uint64_t bs_pad[7]; /* zeroed */ + uint64_t bs_pad[6]; /* zeroed */ }; #define XFS_BULKSTAT_VERSION_V1 (1) @@ -460,17 +461,28 @@ struct xfs_bulk_ireq { * Only return results from the specified @agno. 
If @ino is zero, start * with the first inode of @agno. */ -#define XFS_BULK_IREQ_AGNO (1 << 0) +#define XFS_BULK_IREQ_AGNO (1U << 0) /* * Return bulkstat information for a single inode, where @ino value is a * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_* * values below. Not compatible with XFS_BULK_IREQ_AGNO. */ -#define XFS_BULK_IREQ_SPECIAL (1 << 1) +#define XFS_BULK_IREQ_SPECIAL (1U << 1) -#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ - XFS_BULK_IREQ_SPECIAL) +/* + * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign + * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use + * xfs_bulkstat->bs_extents for returning data fork extent count and set + * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and + * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than + * XFS_MAX_EXTCNT_DATA_FORK_SMALL. + */ +#define XFS_BULK_IREQ_NREXT64 (1U << 2) + +#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ + XFS_BULK_IREQ_SPECIAL | \ + XFS_BULK_IREQ_NREXT64) /* Operate on the root directory inode. 
*/ #define XFS_BULK_IREQ_SPECIAL_ROOT (1) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 83481005317a..e9eadc7337ce 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -813,6 +813,9 @@ xfs_bulk_ireq_setup( if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) return -ECANCELED; + if (hdr->flags & XFS_BULK_IREQ_NREXT64) + breq->flags |= XFS_IBULK_NREXT64; + return 0; } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 71ed4905f206..f74c9fff72bb 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -64,6 +64,7 @@ xfs_bulkstat_one_int( struct xfs_inode *ip; /* incore inode pointer */ struct inode *inode; struct xfs_bulkstat *buf = bc->buf; + xfs_extnum_t nextents; int error = -EINVAL; if (xfs_internal_inum(mp, ino)) @@ -102,7 +103,13 @@ xfs_bulkstat_one_int( buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize_blks = ip->i_extsize; - buf->bs_extents = xfs_ifork_nextents(&ip->i_df); + + nextents = xfs_ifork_nextents(&ip->i_df); + if (!(bc->breq->flags & XFS_IBULK_NREXT64)) + buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL); + else + buf->bs_extents64 = nextents; + xfs_bulkstat_health(ip, buf); buf->bs_aextents = xfs_ifork_nextents(ip->i_afp); buf->bs_forkoff = XFS_IFORK_BOFF(ip); diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 5ee1d3f44ce9..e2d0eba43f35 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -19,6 +19,9 @@ struct xfs_ibulk { /* Only iterate within the same AG as startino */ #define XFS_IBULK_SAME_AG (1U << 0) +/* Fill out the bs_extents64 field if set. */ +#define XFS_IBULK_NREXT64 (1U << 1) + /* * Advance the user buffer pointer by one record of the given size. If the * buffer is now full, return the appropriate error code. 
-- cgit From 973ac0eb3a7dfedecd385bd2b48b12e62a0492f2 Mon Sep 17 00:00:00 2001 From: Chandan Babu R Date: Wed, 11 Aug 2021 10:33:20 +0530 Subject: xfs: Add XFS_SB_FEAT_INCOMPAT_NREXT64 to the list of supported flags This commit enables XFS module to work with fs instances having 64-bit per-inode extent counters by adding XFS_SB_FEAT_INCOMPAT_NREXT64 flag to the list of supported incompat feature flags. Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_format.h | 3 ++- fs/xfs/xfs_super.c | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 3beaa819b790..398fd98bf29e 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -378,7 +378,8 @@ xfs_sb_has_ro_compat_feature( XFS_SB_FEAT_INCOMPAT_SPINODES| \ XFS_SB_FEAT_INCOMPAT_META_UUID| \ XFS_SB_FEAT_INCOMPAT_BIGTIME| \ - XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) + XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \ + XFS_SB_FEAT_INCOMPAT_NREXT64) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 54be9d64093e..d35536a08f82 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1639,6 +1639,10 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_large_extent_counts(mp)) + xfs_warn(mp, + "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; -- cgit From 2d9ac4319b9959bf3195fedf88bdfd224c67593b Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 21 Apr 2022 08:47:54 +1000 Subject: xfs: simplify local variable assignment in file write code Get the struct inode pointer from iocb->ki_filp->f_mapping->host directly and the other variables are unnecessary, so simplify the local variables assignment. Signed-off-by: Kaixu Xia Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c5541d062d0d..442bfaed202e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -694,9 +694,7 @@ xfs_file_buffered_write( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; bool cleared_space = false; @@ -767,9 +765,7 @@ xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from); -- cgit From 735fbf67df56f402e9baa079a5560ebe8fa049c1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:33:23 +1000 Subject: xfs: factor out the CIL transaction header building It is static code deep in the middle of the CIL push logic. Factor it out into a helper so that it is clear and easy to modify separately. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_cil.c | 61 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ba57323bfdce..93f966c191a3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -868,6 +868,41 @@ xlog_cil_write_commit_record( return error; } +struct xlog_cil_trans_hdr { + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; +}; + +/* + * Build a checkpoint transaction header to begin the journal transaction. 
We + * need to account for the space used by the transaction header here as it is + * not accounted for in xlog_write(). + */ +static void +xlog_cil_build_trans_hdr( + struct xfs_cil_ctx *ctx, + struct xlog_cil_trans_hdr *hdr, + struct xfs_log_vec *lvhdr, + int num_iovecs) +{ + struct xlog_ticket *tic = ctx->ticket; + + memset(hdr, 0, sizeof(*hdr)); + + hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + hdr->thdr.th_type = XFS_TRANS_CHECKPOINT; + hdr->thdr.th_tid = tic->t_tid; + hdr->thdr.th_num_items = num_iovecs; + hdr->lhdr.i_addr = &hdr->thdr; + hdr->lhdr.i_len = sizeof(xfs_trans_header_t); + hdr->lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= hdr->lhdr.i_len + sizeof(struct xlog_op_header); + + lvhdr->lv_niovecs = 1; + lvhdr->lv_iovecp = &hdr->lhdr; + lvhdr->lv_next = ctx->lv_chain; +} + /* * Push the Committed Item List to the log. * @@ -892,11 +927,9 @@ xlog_cil_push_work( struct xlog *log = cil->xc_log; struct xfs_log_vec *lv; struct xfs_cil_ctx *new_ctx; - struct xlog_ticket *tic; int num_iovecs; int error = 0; - struct xfs_trans_header thdr; - struct xfs_log_iovec lhdr; + struct xlog_cil_trans_hdr thdr; struct xfs_log_vec lvhdr = { NULL }; xfs_csn_t push_seq; bool push_commit_stable; @@ -1025,24 +1058,8 @@ xlog_cil_push_work( * Build a checkpoint transaction header and write it to the log to * begin the transaction. We need to account for the space used by the * transaction header here as it is not accounted for in xlog_write(). - * - * The LSN we need to pass to the log items on transaction commit is - * the LSN reported by the first log vector write. If we use the commit - * record lsn then we can move the tail beyond the grant write head. 
*/ - tic = ctx->ticket; - thdr.th_magic = XFS_TRANS_HEADER_MAGIC; - thdr.th_type = XFS_TRANS_CHECKPOINT; - thdr.th_tid = tic->t_tid; - thdr.th_num_items = num_iovecs; - lhdr.i_addr = &thdr; - lhdr.i_len = sizeof(xfs_trans_header_t); - lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; - tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); - - lvhdr.lv_niovecs = 1; - lvhdr.lv_iovecp = &lhdr; - lvhdr.lv_next = ctx->lv_chain; + xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs); error = xlog_cil_write_chain(ctx, &lvhdr); if (error) @@ -1052,7 +1069,7 @@ xlog_cil_push_work( if (error) goto out_abort_free_ticket; - xfs_log_ticket_ungrant(log, tic); + xfs_log_ticket_ungrant(log, ctx->ticket); /* * If the checkpoint spans multiple iclogs, wait for all previous iclogs @@ -1116,7 +1133,7 @@ out_skip: return; out_abort_free_ticket: - xfs_log_ticket_ungrant(log, tic); + xfs_log_ticket_ungrant(log, ctx->ticket); ASSERT(xlog_is_shutdown(log)); if (!ctx->commit_iclog) { xlog_cil_committed(ctx); -- cgit From 6eaed95e21a0872692246e63cb45542d0f62c922 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:33:48 +1000 Subject: xfs: only CIL pushes require a start record So move the one-off start record writing in xlog_write() out into the static header that the CIL push builds to write into the log initially. This simplifies the xlog_write() logic a lot. pahole on x86-64 confirms that the xlog_cil_trans_hdr is correctly 32 bit aligned and packed for copying the log op and transaction headers directly into the log as a single log region copy. struct xlog_cil_trans_hdr { struct xlog_op_header oph[2]; /* 0 24 */ struct xfs_trans_header thdr; /* 24 16 */ struct xfs_log_iovec lhdr[2]; /* 40 32 */ /* size: 72, cachelines: 2, members: 3 */ /* last cacheline: 8 bytes */ }; A wart is needed to handle the fact that the length of the region the opheader points to doesn't include the opheader length. 
Hence if we embed the opheader, we have to subtract the opheader length from the length written into the opheader by the generic copying code. This will eventually go away when everything is converted to embedded opheaders. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 90 ++++++++++++++++++++++++++-------------------------- fs/xfs/xfs_log_cil.c | 43 +++++++++++++++++++++---- 2 files changed, 81 insertions(+), 52 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 499e15b24215..f6fa5426278b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2221,9 +2221,9 @@ xlog_print_trans( } /* - * Calculate the potential space needed by the log vector. We may need a start - * record, and each region gets its own struct xlog_op_header and may need to be - * double word aligned. + * Calculate the potential space needed by the log vector. If this is a start + * transaction, the caller has already accounted for both opheaders in the start + * transaction, so we don't need to account for them here. 
*/ static int xlog_write_calc_vec_length( @@ -2236,9 +2236,6 @@ xlog_write_calc_vec_length( int len = 0; int i; - if (optype & XLOG_START_TRANS) - headers++; - for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) @@ -2254,24 +2251,20 @@ xlog_write_calc_vec_length( } } + /* Don't account for regions with embedded ophdrs */ + if (optype && headers > 0) { + if (optype & XLOG_START_TRANS) { + ASSERT(headers >= 2); + headers -= 2; + } + } + ticket->t_res_num_ophdrs += headers; len += headers * sizeof(struct xlog_op_header); return len; } -static void -xlog_write_start_rec( - struct xlog_op_header *ophdr, - struct xlog_ticket *ticket) -{ - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - ophdr->oh_clientid = ticket->t_clientid; - ophdr->oh_len = 0; - ophdr->oh_flags = XLOG_START_TRANS; - ophdr->oh_res2 = 0; -} - static xlog_op_header_t * xlog_write_setup_ophdr( struct xlog *log, @@ -2467,9 +2460,11 @@ xlog_write( * If this is a commit or unmount transaction, we don't need a start * record to be written. We do, however, have to account for the * commit or unmount header that gets written. Hence we always have - * to account for an extra xlog_op_header here. + * to account for an extra xlog_op_header here for commit and unmount + * records. */ - ticket->t_curr_res -= sizeof(struct xlog_op_header); + if (optype & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. 
Need to up reservation"); @@ -2510,7 +2505,7 @@ xlog_write( int copy_len; int copy_off; bool ordered = false; - bool wrote_start_rec = false; + bool added_ophdr = false; /* ordered log vectors have no regions to write */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { @@ -2524,25 +2519,24 @@ xlog_write( ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); /* - * Before we start formatting log vectors, we need to - * write a start record. Only do this for the first - * iclog we write to. + * The XLOG_START_TRANS has embedded ophdrs for the + * start record and transaction header. They will always + * be the first two regions in the lv chain. */ if (optype & XLOG_START_TRANS) { - xlog_write_start_rec(ptr, ticket); - xlog_write_adv_cnt(&ptr, &len, &log_offset, - sizeof(struct xlog_op_header)); - optype &= ~XLOG_START_TRANS; - wrote_start_rec = true; - } - - ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype); - if (!ophdr) - return -EIO; + ophdr = reg->i_addr; + if (index) + optype &= ~XLOG_START_TRANS; + } else { + ophdr = xlog_write_setup_ophdr(log, ptr, + ticket, optype); + if (!ophdr) + return -EIO; - xlog_write_adv_cnt(&ptr, &len, &log_offset, + xlog_write_adv_cnt(&ptr, &len, &log_offset, sizeof(struct xlog_op_header)); - + added_ophdr = true; + } len += xlog_write_setup_copy(ticket, ophdr, iclog->ic_size-log_offset, reg->i_len, @@ -2551,13 +2545,22 @@ xlog_write( &partial_copy_len); xlog_verify_dest_ptr(log, ptr); + + /* + * Wart: need to update length in embedded ophdr not + * to include it's own length. + */ + if (!added_ophdr) { + ophdr->oh_len = cpu_to_be32(copy_len - + sizeof(struct xlog_op_header)); + } /* * Copy region. * - * Unmount records just log an opheader, so can have - * empty payloads with no data region to copy. Hence we - * only copy the payload if the vector says it has data - * to copy. + * Commit and unmount records just log an opheader, so + * we can have empty payloads with no data region to + * copy. 
Hence we only copy the payload if the vector + * says it has data to copy. */ ASSERT(copy_len >= 0); if (copy_len > 0) { @@ -2565,12 +2568,9 @@ xlog_write( xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); } - copy_len += sizeof(struct xlog_op_header); - record_cnt++; - if (wrote_start_rec) { + if (added_ophdr) copy_len += sizeof(struct xlog_op_header); - record_cnt++; - } + record_cnt++; data_cnt += contwr ? copy_len : 0; error = xlog_write_copy_finish(log, iclog, optype, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 93f966c191a3..5ff046e82912 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -869,14 +869,22 @@ xlog_cil_write_commit_record( } struct xlog_cil_trans_hdr { + struct xlog_op_header oph[2]; struct xfs_trans_header thdr; - struct xfs_log_iovec lhdr; + struct xfs_log_iovec lhdr[2]; }; /* * Build a checkpoint transaction header to begin the journal transaction. We * need to account for the space used by the transaction header here as it is * not accounted for in xlog_write(). + * + * This is the only place we write a transaction header, so we also build the + * log opheaders that indicate the start of a log transaction and wrap the + * transaction header. We keep the start record in it's own log vector rather + * than compacting them into a single region as this ends up making the logic + * in xlog_write() for handling empty opheaders for start, commit and unmount + * records much simpler. 
*/ static void xlog_cil_build_trans_hdr( @@ -886,20 +894,41 @@ xlog_cil_build_trans_hdr( int num_iovecs) { struct xlog_ticket *tic = ctx->ticket; + __be32 tid = cpu_to_be32(tic->t_tid); memset(hdr, 0, sizeof(*hdr)); + /* Log start record */ + hdr->oph[0].oh_tid = tid; + hdr->oph[0].oh_clientid = XFS_TRANSACTION; + hdr->oph[0].oh_flags = XLOG_START_TRANS; + + /* log iovec region pointer */ + hdr->lhdr[0].i_addr = &hdr->oph[0]; + hdr->lhdr[0].i_len = sizeof(struct xlog_op_header); + hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER; + + /* log opheader */ + hdr->oph[1].oh_tid = tid; + hdr->oph[1].oh_clientid = XFS_TRANSACTION; + hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header)); + + /* transaction header in host byte order format */ hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC; hdr->thdr.th_type = XFS_TRANS_CHECKPOINT; hdr->thdr.th_tid = tic->t_tid; hdr->thdr.th_num_items = num_iovecs; - hdr->lhdr.i_addr = &hdr->thdr; - hdr->lhdr.i_len = sizeof(xfs_trans_header_t); - hdr->lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; - tic->t_curr_res -= hdr->lhdr.i_len + sizeof(struct xlog_op_header); - lvhdr->lv_niovecs = 1; - lvhdr->lv_iovecp = &hdr->lhdr; + /* log iovec region pointer */ + hdr->lhdr[1].i_addr = &hdr->oph[1]; + hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) + + sizeof(struct xfs_trans_header); + hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR; + + tic->t_curr_res -= hdr->lhdr[0].i_len + hdr->lhdr[1].i_len; + + lvhdr->lv_niovecs = 2; + lvhdr->lv_iovecp = &hdr->lhdr[0]; lvhdr->lv_next = ctx->lv_chain; } -- cgit From ffa04c1f2cb047d6a44c3570bfb6e1ca5ba7f489 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:34:04 +1000 Subject: xfs: embed the xlog_op_header in the unmount record Remove another case where xlog_write() has to prepend an opheader to a log transaction. 
The unmount record + ophdr is smaller than the minimum amount of space guaranteed to be free in an iclog (2 * sizeof(ophdr)) and so we don't have to care about an unmount record being split across 2 iclogs. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f6fa5426278b..00e9af9b246a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -901,12 +901,22 @@ xlog_write_unmount_record( struct xlog *log, struct xlog_ticket *ticket) { - struct xfs_unmount_log_format ulf = { - .magic = XLOG_UNMOUNT_TYPE, + struct { + struct xlog_op_header ophdr; + struct xfs_unmount_log_format ulf; + } unmount_rec = { + .ophdr = { + .oh_clientid = XFS_LOG, + .oh_tid = cpu_to_be32(ticket->t_tid), + .oh_flags = XLOG_UNMOUNT_TRANS, + }, + .ulf = { + .magic = XLOG_UNMOUNT_TYPE, + }, }; struct xfs_log_iovec reg = { - .i_addr = &ulf, - .i_len = sizeof(ulf), + .i_addr = &unmount_rec, + .i_len = sizeof(unmount_rec), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { @@ -914,8 +924,12 @@ xlog_write_unmount_record( .lv_iovecp = &reg, }; + BUILD_BUG_ON((sizeof(struct xlog_op_header) + + sizeof(struct xfs_unmount_log_format)) != + sizeof(unmount_rec)); + /* account for space used by record data */ - ticket->t_curr_res -= sizeof(ulf); + ticket->t_curr_res -= sizeof(unmount_rec); return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS); } @@ -2253,6 +2267,8 @@ xlog_write_calc_vec_length( /* Don't account for regions with embedded ophdrs */ if (optype && headers > 0) { + if (optype & XLOG_UNMOUNT_TRANS) + headers--; if (optype & XLOG_START_TRANS) { ASSERT(headers >= 2); headers -= 2; @@ -2458,12 +2474,11 @@ xlog_write( /* * If this is a commit or unmount transaction, we don't need a start - * record to be written. 
We do, however, have to account for the - * commit or unmount header that gets written. Hence we always have - * to account for an extra xlog_op_header here for commit and unmount - * records. + * record to be written. We do, however, have to account for the commit + * header that gets written. Hence we always have to account for an + * extra xlog_op_header here for commit records. */ - if (optype & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + if (optype & XLOG_COMMIT_TRANS) ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, @@ -2527,6 +2542,8 @@ xlog_write( ophdr = reg->i_addr; if (index) optype &= ~XLOG_START_TRANS; + } else if (optype & XLOG_UNMOUNT_TRANS) { + ophdr = reg->i_addr; } else { ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype); @@ -2557,7 +2574,7 @@ xlog_write( /* * Copy region. * - * Commit and unmount records just log an opheader, so + * Commit records just log an opheader, so * we can have empty payloads with no data region to * copy. Hence we only copy the payload if the vector * says it has data to copy. -- cgit From 54021b624261fe5b429d7ab4d081c3b2cca153a8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:34:15 +1000 Subject: xfs: embed the xlog_op_header in the commit record Remove the final case where xlog_write() has to prepend an opheader to a log transaction. Similar to the start record, the commit record is just an empty opheader with a XLOG_COMMIT_TRANS type, so we can just make this the payload for the region being passed to xlog_write() and remove the special handling in xlog_write() for the commit record. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 22 ++++++---------------- fs/xfs/xfs_log_cil.c | 11 +++++++++-- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 00e9af9b246a..c23a15c56ef1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2267,11 +2267,10 @@ xlog_write_calc_vec_length( /* Don't account for regions with embedded ophdrs */ if (optype && headers > 0) { - if (optype & XLOG_UNMOUNT_TRANS) - headers--; + headers--; if (optype & XLOG_START_TRANS) { - ASSERT(headers >= 2); - headers -= 2; + ASSERT(headers >= 1); + headers--; } } @@ -2472,14 +2471,6 @@ xlog_write( int data_cnt = 0; int error = 0; - /* - * If this is a commit or unmount transaction, we don't need a start - * record to be written. We do, however, have to account for the commit - * header that gets written. Hence we always have to account for an - * extra xlog_op_header here for commit records. - */ - if (optype & XLOG_COMMIT_TRANS) - ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); @@ -2536,14 +2527,13 @@ xlog_write( /* * The XLOG_START_TRANS has embedded ophdrs for the * start record and transaction header. They will always - * be the first two regions in the lv chain. + * be the first two regions in the lv chain. Commit and + * unmount records also have embedded ophdrs. 
*/ - if (optype & XLOG_START_TRANS) { + if (optype) { ophdr = reg->i_addr; if (index) optype &= ~XLOG_START_TRANS; - } else if (optype & XLOG_UNMOUNT_TRANS) { - ophdr = reg->i_addr; } else { ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 5ff046e82912..53dc5add5359 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -844,9 +844,14 @@ xlog_cil_write_commit_record( struct xfs_cil_ctx *ctx) { struct xlog *log = ctx->cil->xc_log; + struct xlog_op_header ophdr = { + .oh_clientid = XFS_TRANSACTION, + .oh_tid = cpu_to_be32(ctx->ticket->t_tid), + .oh_flags = XLOG_COMMIT_TRANS, + }; struct xfs_log_iovec reg = { - .i_addr = NULL, - .i_len = 0, + .i_addr = &ophdr, + .i_len = sizeof(struct xlog_op_header), .i_type = XLOG_REG_TYPE_COMMIT, }; struct xfs_log_vec vec = { @@ -862,6 +867,8 @@ xlog_cil_write_commit_record( if (error) return error; + /* account for space used by record data */ + ctx->ticket->t_curr_res -= reg.i_len; error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); if (error) xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); -- cgit From c7610dceed39d978ef1ee0f2ab5a3c8d2d54d120 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:34:33 +1000 Subject: xfs: log tickets don't need log client id We currently set the log ticket client ID when we reserve a transaction. This client ID is only ever written to the log by a CIL checkpoint or unmount records, and so anything using a high level transaction allocated through xfs_trans_alloc() does not need a log ticket client ID to be set. For the CIL checkpoint, the client ID written to the journal is always XFS_TRANSACTION, and for the unmount record it is always XFS_LOG, and nothing else writes to the log. All of these operations tell xlog_write() exactly what they need to write to the log (the optype) and build their own opheaders for start, commit and unmount records. 
Hence we no longer need to set the client id in either the log ticket or the xfs_trans. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_log_format.h | 1 - fs/xfs/xfs_log.c | 47 +++++++----------------------------------- fs/xfs/xfs_log.h | 14 +++++-------- fs/xfs/xfs_log_cil.c | 2 +- fs/xfs/xfs_log_priv.h | 10 ++------- fs/xfs/xfs_trans.c | 6 ++---- 6 files changed, 18 insertions(+), 62 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index b322db523d65..2b89141ae81a 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -69,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr) /* Log Clients */ #define XFS_TRANSACTION 0x69 -#define XFS_VOLUME 0x2 #define XFS_LOG 0xaa #define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c23a15c56ef1..1f6e7092ec76 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -434,10 +434,9 @@ out_error: int xfs_log_reserve( struct xfs_mount *mp, - int unit_bytes, - int cnt, + int unit_bytes, + int cnt, struct xlog_ticket **ticp, - uint8_t client, bool permanent) { struct xlog *log = mp->m_log; @@ -445,15 +444,13 @@ xfs_log_reserve( int need_bytes; int error = 0; - ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? 
tic->t_unit_res * tic->t_cnt @@ -947,7 +944,7 @@ xlog_unmount_write( struct xlog_ticket *tic = NULL; int error; - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); + error = xfs_log_reserve(mp, 600, 1, &tic, 0); if (error) goto out_err; @@ -2282,35 +2279,13 @@ xlog_write_calc_vec_length( static xlog_op_header_t * xlog_write_setup_ophdr( - struct xlog *log, struct xlog_op_header *ophdr, - struct xlog_ticket *ticket, - uint flags) + struct xlog_ticket *ticket) { ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_clientid = XFS_TRANSACTION; ophdr->oh_res2 = 0; - - /* are we copying a commit or unmount record? */ - ophdr->oh_flags = flags; - - /* - * We've seen logs corrupted with bad transaction client ids. This - * makes sure that XFS doesn't generate them on. Turn this into an EIO - * and shut down the filesystem. - */ - switch (ophdr->oh_clientid) { - case XFS_TRANSACTION: - case XFS_VOLUME: - case XFS_LOG: - break; - default: - xfs_warn(log->l_mp, - "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT, - ophdr->oh_clientid, ticket); - return NULL; - } - + ophdr->oh_flags = 0; return ophdr; } @@ -2535,11 +2510,7 @@ xlog_write( if (index) optype &= ~XLOG_START_TRANS; } else { - ophdr = xlog_write_setup_ophdr(log, ptr, - ticket, optype); - if (!ophdr) - return -EIO; - + ophdr = xlog_write_setup_ophdr(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, sizeof(struct xlog_op_header)); added_ophdr = true; @@ -3598,7 +3569,6 @@ xlog_ticket_alloc( struct xlog *log, int unit_bytes, int cnt, - char client, bool permanent) { struct xlog_ticket *tic; @@ -3616,7 +3586,6 @@ xlog_ticket_alloc( tic->t_cnt = cnt; tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); - tic->t_clientid = client; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index dc1b77b92fc1..09b8fe9994f2 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -117,15 +117,11 @@ int 
xfs_log_mount_finish(struct xfs_mount *mp); void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); -void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_reserve(struct xfs_mount *mp, - int length, - int count, - struct xlog_ticket **ticket, - uint8_t clientid, - bool permanent); -int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); -void xfs_log_unmount(struct xfs_mount *mp); +void xfs_log_space_wake(struct xfs_mount *mp); +int xfs_log_reserve(struct xfs_mount *mp, int length, int count, + struct xlog_ticket **ticket, bool permanent); +int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); +void xfs_log_unmount(struct xfs_mount *mp); bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 53dc5add5359..2403a7bbb913 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -37,7 +37,7 @@ xlog_cil_ticket_alloc( { struct xlog_ticket *tic; - tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0); + tic = xlog_ticket_alloc(log, 0, 1, 0); /* * set the current reservation to zero so we know to steal the basic diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 401cdc400980..6f247561588b 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -164,7 +164,6 @@ typedef struct xlog_ticket { int t_unit_res; /* unit reservation in bytes : 4 */ char t_ocnt; /* original count : 1 */ char t_cnt; /* current count : 1 */ - char t_clientid; /* who does this belong to; : 1 */ char t_flags; /* properties of reservation : 1 */ /* reservation array fields */ @@ -509,13 +508,8 @@ extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); extern struct kmem_cache *xfs_log_ticket_cache; -struct xlog_ticket * -xlog_ticket_alloc( - struct xlog *log, - int unit_bytes, - 
int count, - char client, - bool permanent); +struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, + int count, bool permanent); static inline void xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 0ac717aad380..8b9d36b19ae4 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -194,11 +194,9 @@ xfs_trans_reserve( ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); error = xfs_log_regrant(mp, tp->t_ticket); } else { - error = xfs_log_reserve(mp, - resp->tr_logres, + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, - &tp->t_ticket, XFS_TRANSACTION, - permanent); + &tp->t_ticket, permanent); } if (error) -- cgit From 3c352bef8335a8d9d8f14bc0bd533df023280a72 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:34:49 +1000 Subject: xfs: move log iovec alignment to preparation function To include log op headers directly into the log iovec regions that the ophdrs wrap, we need to move the buffer alignment code from xlog_finish_iovec() to xlog_prepare_iovec(). This is because the xlog_op_header is only 12 bytes long, and we need the buffer that the caller formats their data into to be 8 byte aligned. Hence once we start prepending the ophdr in xlog_prepare_iovec(), we are going to need to manage the padding directly to ensure that the buffer pointer returned is correctly aligned. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Reviewed-by: Brian Foster Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 09b8fe9994f2..d1fc43476166 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -21,6 +21,16 @@ struct xfs_log_vec { #define XFS_LOG_VEC_ORDERED (-1) +/* + * We need to make sure the buffer pointer returned is naturally aligned for the + * biggest basic data type we put into it. We have already accounted for this + * padding when sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + */ static inline void * xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, uint type) @@ -34,6 +44,9 @@ xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, vec = &lv->lv_iovecp[0]; } + if (!IS_ALIGNED(lv->lv_buf_len, sizeof(uint64_t))) + lv->lv_buf_len = round_up(lv->lv_buf_len, sizeof(uint64_t)); + vec->i_type = type; vec->i_addr = lv->lv_buf + lv->lv_buf_len; @@ -43,20 +56,10 @@ xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, return vec->i_addr; } -/* - * We need to make sure the next buffer is naturally aligned for the biggest - * basic data type we put into it. We already accounted for this padding when - * sizing the buffer. - * - * However, this padding does not get written into the log, and hence we have to - * track the space used by the log vectors separately to prevent log space hangs - * due to inaccurate accounting (i.e. a leak) of the used log space through the - * CIL context ticket. 
- */ static inline void xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) { - lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + lv->lv_buf_len += len; lv->lv_bytes += len; vec->i_len = len; } -- cgit From 8d547cf9d2392585204075243f29022a619550f2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:34:59 +1000 Subject: xfs: reserve space and initialise xlog_op_header in item formatting Current xlog_write() adds op headers to the log manually for every log item region that is in the vector passed to it. While xlog_write() needs to stamp the transaction ID into the ophdr, we already know its length, flags, clientid, etc at CIL commit time. This means the only time that xlog write really needs to format and reserve space for a new ophdr is when a region is split across two iclogs. Adding the opheader and accounting for it as part of the normal formatted item region means we simplify the accounting of space used by a transaction and we don't have to special case reserving of space for the ophdrs in xlog_write(). It also means we can largely initialise the ophdr in transaction commit instead of xlog_write, making the xlog_write formatting inner loop much tighter. xlog_prepare_iovec() is now too large to stay as an inline function, so we move it out of line and into xfs_log.c. Object sizes: text data bss dec hex filename 1125934 305951 484 1432369 15db31 fs/xfs/built-in.a.before 1123360 305951 484 1429795 15d123 fs/xfs/built-in.a.after So the code is roughly 2.5kB smaller with xlog_prepare_iovec() now out of line, even though it grew in size itself. Signed-off-by: Dave Chinner Reviewed-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 115 ++++++++++++++++++++++++++++++++++----------------- fs/xfs/xfs_log.h | 42 ++++--------------- fs/xfs/xfs_log_cil.c | 25 ++++++----- 3 files changed, 99 insertions(+), 83 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1f6e7092ec76..23ca073de6df 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -90,6 +90,62 @@ xlog_iclogs_empty( static int xfs_log_cover(struct xfs_mount *); +/* + * We need to make sure the buffer pointer returned is naturally aligned for the + * biggest basic data type we put into it. We have already accounted for this + * padding when sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + * + * We also add space for the xlog_op_header that describes this region in the + * log. This prepends the data region we return to the caller to copy their data + * into, so do all the static initialisation of the ophdr now. Because the ophdr + * is not 8 byte aligned, we have to be careful to ensure that we align the + * start of the buffer such that the region we return to the call is 8 byte + * aligned and packed against the tail of the ophdr. 
+ */ +void * +xlog_prepare_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + struct xlog_op_header *oph; + uint32_t len; + void *buf; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + len = lv->lv_buf_len + sizeof(struct xlog_op_header); + if (!IS_ALIGNED(len, sizeof(uint64_t))) { + lv->lv_buf_len = round_up(len, sizeof(uint64_t)) - + sizeof(struct xlog_op_header); + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + oph = vec->i_addr; + oph->oh_clientid = XFS_TRANSACTION; + oph->oh_res2 = 0; + oph->oh_flags = 0; + + buf = vec->i_addr + sizeof(struct xlog_op_header); + ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t))); + + *vecp = vec; + return buf; +} + static void xlog_grant_sub_space( struct xlog *log, @@ -2232,9 +2288,9 @@ xlog_print_trans( } /* - * Calculate the potential space needed by the log vector. If this is a start - * transaction, the caller has already accounted for both opheaders in the start - * transaction, so we don't need to account for them here. + * Calculate the potential space needed by the log vector. All regions contain + * their own opheaders and they are accounted for in region space so we don't + * need to add them to the vector length here. 
*/ static int xlog_write_calc_vec_length( @@ -2261,18 +2317,7 @@ xlog_write_calc_vec_length( xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); } } - - /* Don't account for regions with embedded ophdrs */ - if (optype && headers > 0) { - headers--; - if (optype & XLOG_START_TRANS) { - ASSERT(headers >= 1); - headers--; - } - } - ticket->t_res_num_ophdrs += headers; - len += headers * sizeof(struct xlog_op_header); return len; } @@ -2282,7 +2327,6 @@ xlog_write_setup_ophdr( struct xlog_op_header *ophdr, struct xlog_ticket *ticket) { - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); ophdr->oh_clientid = XFS_TRANSACTION; ophdr->oh_res2 = 0; ophdr->oh_flags = 0; @@ -2500,21 +2544,25 @@ xlog_write( ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); /* - * The XLOG_START_TRANS has embedded ophdrs for the - * start record and transaction header. They will always - * be the first two regions in the lv chain. Commit and - * unmount records also have embedded ophdrs. + * Regions always have their ophdr at the start of the + * region, except for: + * - a transaction start which has a start record ophdr + * before the first region ophdr; and + * - the previous region didn't fully fit into an iclog + * so needs a continuation ophdr to prepend the region + * in this new iclog. */ - if (optype) { - ophdr = reg->i_addr; - if (index) - optype &= ~XLOG_START_TRANS; - } else { + ophdr = reg->i_addr; + if (optype && index) { + optype &= ~XLOG_START_TRANS; + } else if (partial_copy) { ophdr = xlog_write_setup_ophdr(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, sizeof(struct xlog_op_header)); added_ophdr = true; } + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + len += xlog_write_setup_copy(ticket, ophdr, iclog->ic_size-log_offset, reg->i_len, @@ -2532,20 +2580,11 @@ xlog_write( ophdr->oh_len = cpu_to_be32(copy_len - sizeof(struct xlog_op_header)); } - /* - * Copy region. 
- * - * Commit records just log an opheader, so - * we can have empty payloads with no data region to - * copy. Hence we only copy the payload if the vector - * says it has data to copy. - */ - ASSERT(copy_len >= 0); - if (copy_len > 0) { - memcpy(ptr, reg->i_addr + copy_off, copy_len); - xlog_write_adv_cnt(&ptr, &len, &log_offset, - copy_len); - } + + ASSERT(copy_len > 0); + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); + if (added_ophdr) copy_len += sizeof(struct xlog_op_header); record_cnt++; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index d1fc43476166..816f44d7dc81 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -21,44 +21,18 @@ struct xfs_log_vec { #define XFS_LOG_VEC_ORDERED (-1) -/* - * We need to make sure the buffer pointer returned is naturally aligned for the - * biggest basic data type we put into it. We have already accounted for this - * padding when sizing the buffer. - * - * However, this padding does not get written into the log, and hence we have to - * track the space used by the log vectors separately to prevent log space hangs - * due to inaccurate accounting (i.e. a leak) of the used log space through the - * CIL context ticket. 
- */ -static inline void * -xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, - uint type) -{ - struct xfs_log_iovec *vec = *vecp; - - if (vec) { - ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); - vec++; - } else { - vec = &lv->lv_iovecp[0]; - } - - if (!IS_ALIGNED(lv->lv_buf_len, sizeof(uint64_t))) - lv->lv_buf_len = round_up(lv->lv_buf_len, sizeof(uint64_t)); - - vec->i_type = type; - vec->i_addr = lv->lv_buf + lv->lv_buf_len; - - ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); - - *vecp = vec; - return vec->i_addr; -} +void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type); static inline void xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) { + struct xlog_op_header *oph = vec->i_addr; + + /* opheader tracks payload length, logvec tracks region length */ + oph->oh_len = cpu_to_be32(len); + + len += sizeof(struct xlog_op_header); lv->lv_buf_len += len; lv->lv_bytes += len; vec->i_len = len; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 2403a7bbb913..5ccbb6bd4655 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -214,13 +214,20 @@ xlog_cil_alloc_shadow_bufs( } /* - * We 64-bit align the length of each iovec so that the start - * of the next one is naturally aligned. We'll need to - * account for that slack space here. Then round nbytes up - * to 64-bit alignment so that the initial buffer alignment is - * easy to calculate and verify. + * We 64-bit align the length of each iovec so that the start of + * the next one is naturally aligned. We'll need to account for + * that slack space here. + * + * We also add the xlog_op_header to each region when + * formatting, but that's not accounted to the size of the item + * at this point. Hence we'll need an addition number of bytes + * for each vector to hold an opheader. 
+ * + * Then round nbytes up to 64-bit alignment so that the initial + * buffer alignment is easy to calculate and verify. */ - nbytes += niovecs * sizeof(uint64_t); + nbytes += niovecs * + (sizeof(uint64_t) + sizeof(struct xlog_op_header)); nbytes = round_up(nbytes, sizeof(uint64_t)); /* @@ -465,11 +472,6 @@ xlog_cil_insert_items( spin_lock(&cil->xc_cil_lock); - /* account for space used by new iovec headers */ - iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t); - len += iovhdr_res; - ctx->nvecs += diff_iovecs; - /* attach the transaction to the CIL if it has any busy extents */ if (!list_empty(&tp->t_busy)) list_splice_init(&tp->t_busy, &ctx->busy_extents); @@ -501,6 +503,7 @@ xlog_cil_insert_items( } tp->t_ticket->t_curr_res -= len; ctx->space_used += len; + ctx->nvecs += diff_iovecs; /* * If we've overrun the reservation, dump the tx details before we move -- cgit From c5141320c42b08b99b7c4b250ac9675d7c7ed3a7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:35:09 +1000 Subject: xfs: log ticket region debug is largely useless xlog_tic_add_region() is used to trace the regions being added to a log ticket to provide information in the situation where a ticket reservation overrun occurs. The information gathered is stored in the ticket, and dumped if xlog_print_tic_res() is called. For a front end struct xfs_trans overrun, the ticket only contains reservation tracking information - the ticket is never handed to the log so has no regions attached to it. The overrun debug information in this case comes from xlog_print_trans(), which walks the items attached to the transaction and dumps their attached formatted log vectors directly. It also dumps the ticket state, but that only contains reservation accounting and nothing else. Hence xlog_print_tic_res() never dumps region or overrun information from this path.
xlog_tic_add_region() is actually called from xlog_write(), which means it is being used to track the regions seen in a CIL checkpoint log vector chain. In looking at CIL behaviour recently, I've seen 32MB checkpoints regularly exceed 250,000 regions in the LV chain. The log ticket debug code can track *15* regions. IOWs, if there is a ticket overrun in the CIL code, the ticket region tracking code is going to be completely useless for determining what went wrong. The only thing it can tell us is how much of an overrun occurred, and we really don't need extra debug information in the log ticket to tell us that. Indeed, the main place we call xlog_tic_add_region() is also adding up the number of regions and the space used so that xlog_write() knows how much will be written to the log. This is exactly the same information that the log ticket is storing once we take away the useless region tracking array. Hence xlog_tic_add_region() is not useful, but can be called 250,000 times a CIL push... Just strip all that debug "information" out of the log ticket and only have it report reservation space information when an overrun occurs. This also reduces the size of a log ticket down by about 150 bytes... Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J.
Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 107 +++----------------------------------------------- fs/xfs/xfs_log_priv.h | 20 ---------- 2 files changed, 6 insertions(+), 121 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 23ca073de6df..4e3fc28c12f5 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -378,30 +378,6 @@ xlog_grant_head_check( return error; } -static void -xlog_tic_reset_res(xlog_ticket_t *tic) -{ - tic->t_res_num = 0; - tic->t_res_arr_sum = 0; - tic->t_res_num_ophdrs = 0; -} - -static void -xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) -{ - if (tic->t_res_num == XLOG_TIC_LEN_MAX) { - /* add to overflow and start again */ - tic->t_res_o_flow += tic->t_res_arr_sum; - tic->t_res_num = 0; - tic->t_res_arr_sum = 0; - } - - tic->t_res_arr[tic->t_res_num].r_len = len; - tic->t_res_arr[tic->t_res_num].r_type = type; - tic->t_res_arr_sum += len; - tic->t_res_num++; -} - bool xfs_log_writable( struct xfs_mount *mp) @@ -451,8 +427,6 @@ xfs_log_regrant( xlog_grant_push_ail(log, tic->t_unit_res); tic->t_curr_res = tic->t_unit_res; - xlog_tic_reset_res(tic); - if (tic->t_cnt > 0) return 0; @@ -2178,63 +2152,11 @@ xlog_print_tic_res( struct xfs_mount *mp, struct xlog_ticket *ticket) { - uint i; - uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); - - /* match with XLOG_REG_TYPE_* in xfs_log.h */ -#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str - static char *res_type_str[] = { - REG_TYPE_STR(BFORMAT, "bformat"), - REG_TYPE_STR(BCHUNK, "bchunk"), - REG_TYPE_STR(EFI_FORMAT, "efi_format"), - REG_TYPE_STR(EFD_FORMAT, "efd_format"), - REG_TYPE_STR(IFORMAT, "iformat"), - REG_TYPE_STR(ICORE, "icore"), - REG_TYPE_STR(IEXT, "iext"), - REG_TYPE_STR(IBROOT, "ibroot"), - REG_TYPE_STR(ILOCAL, "ilocal"), - REG_TYPE_STR(IATTR_EXT, "iattr_ext"), - REG_TYPE_STR(IATTR_BROOT, "iattr_broot"), - REG_TYPE_STR(IATTR_LOCAL, "iattr_local"), - REG_TYPE_STR(QFORMAT, 
"qformat"), - REG_TYPE_STR(DQUOT, "dquot"), - REG_TYPE_STR(QUOTAOFF, "quotaoff"), - REG_TYPE_STR(LRHEADER, "LR header"), - REG_TYPE_STR(UNMOUNT, "unmount"), - REG_TYPE_STR(COMMIT, "commit"), - REG_TYPE_STR(TRANSHDR, "trans header"), - REG_TYPE_STR(ICREATE, "inode create"), - REG_TYPE_STR(RUI_FORMAT, "rui_format"), - REG_TYPE_STR(RUD_FORMAT, "rud_format"), - REG_TYPE_STR(CUI_FORMAT, "cui_format"), - REG_TYPE_STR(CUD_FORMAT, "cud_format"), - REG_TYPE_STR(BUI_FORMAT, "bui_format"), - REG_TYPE_STR(BUD_FORMAT, "bud_format"), - }; - BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); -#undef REG_TYPE_STR - xfs_warn(mp, "ticket reservation summary:"); - xfs_warn(mp, " unit res = %d bytes", - ticket->t_unit_res); - xfs_warn(mp, " current res = %d bytes", - ticket->t_curr_res); - xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", - ticket->t_res_arr_sum, ticket->t_res_o_flow); - xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", - ticket->t_res_num_ophdrs, ophdr_spc); - xfs_warn(mp, " ophdr + reg = %u bytes", - ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); - xfs_warn(mp, " num regions = %u", - ticket->t_res_num); - - for (i = 0; i < ticket->t_res_num; i++) { - uint r_type = ticket->t_res_arr[i].r_type; - xfs_warn(mp, "region[%u]: %s - %u bytes", i, - ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 
- "bad-rtype" : res_type_str[r_type]), - ticket->t_res_arr[i].r_len); - } + xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); + xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res); + xfs_warn(mp, " original count = %d", ticket->t_ocnt); + xfs_warn(mp, " remaining count = %d", ticket->t_cnt); } /* @@ -2299,7 +2221,6 @@ xlog_write_calc_vec_length( uint optype) { struct xfs_log_vec *lv; - int headers = 0; int len = 0; int i; @@ -2308,17 +2229,9 @@ xlog_write_calc_vec_length( if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) continue; - headers += lv->lv_niovecs; - - for (i = 0; i < lv->lv_niovecs; i++) { - struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; - - len += vecp->i_len; - xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); - } + for (i = 0; i < lv->lv_niovecs; i++) + len += lv->lv_iovecp[i].i_len; } - ticket->t_res_num_ophdrs += headers; - return len; } @@ -2377,7 +2290,6 @@ xlog_write_setup_copy( /* account for new log op header */ ticket->t_curr_res -= sizeof(struct xlog_op_header); - ticket->t_res_num_ophdrs++; return sizeof(struct xlog_op_header); } @@ -3025,9 +2937,6 @@ restart: */ if (log_offset == 0) { ticket->t_curr_res -= log->l_iclog_hsize; - xlog_tic_add_region(ticket, - log->l_iclog_hsize, - XLOG_REG_TYPE_LRHEADER); head->h_cycle = cpu_to_be32(log->l_curr_cycle); head->h_lsn = cpu_to_be64( xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); @@ -3107,7 +3016,6 @@ xfs_log_ticket_regrant( xlog_grant_sub_space(log, &log->l_write_head.grant, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); trace_xfs_log_ticket_regrant_sub(log, ticket); @@ -3118,7 +3026,6 @@ xfs_log_ticket_regrant( trace_xfs_log_ticket_regrant_exit(log, ticket); ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); } xfs_log_ticket_put(ticket); @@ -3628,8 +3535,6 @@ xlog_ticket_alloc( if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; - xlog_tic_reset_res(tic); - return tic; } diff --git a/fs/xfs/xfs_log_priv.h 
b/fs/xfs/xfs_log_priv.h index 6f247561588b..b588bb3d02cf 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -142,19 +142,6 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 -/* Ticket reservation region accounting */ -#define XLOG_TIC_LEN_MAX 15 - -/* - * Reservation region - * As would be stored in xfs_log_iovec but without the i_addr which - * we don't care about. - */ -typedef struct xlog_res { - uint r_len; /* region length :4 */ - uint r_type; /* region's transaction type :4 */ -} xlog_res_t; - typedef struct xlog_ticket { struct list_head t_queue; /* reserve/write queue */ struct task_struct *t_task; /* task that owns this ticket */ @@ -165,13 +152,6 @@ typedef struct xlog_ticket { char t_ocnt; /* original count : 1 */ char t_cnt; /* current count : 1 */ char t_flags; /* properties of reservation : 1 */ - - /* reservation array fields */ - uint t_res_num; /* num in array : 4 */ - uint t_res_num_ophdrs; /* num op hdrs : 4 */ - uint t_res_arr_sum; /* array sum : 4 */ - uint t_res_o_flow; /* sum overflow : 4 */ - xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ } xlog_ticket_t; /* -- cgit From d80fc2914f9125a723d9af7038b1592fa8d1ea96 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:35:19 +1000 Subject: xfs: pass lv chain length into xlog_write() The caller of xlog_write() usually has a close accounting of the aggregated vector length contained in the log vector chain passed to xlog_write(). There is no need to iterate the chain to calculate the length of the data in xlog_write_calc_vec_length() if the caller is already iterating that chain to build it. Passing in the vector length avoids doing an extra chain iteration, which can be a significant amount of work given that large CIL commits can have hundreds of thousands of vectors attached to the chain. Signed-off-by: Dave Chinner Reviewed-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 35 +++++------------------------------ fs/xfs/xfs_log_cil.c | 25 +++++++++++++++++-------- fs/xfs/xfs_log_priv.h | 2 +- 3 files changed, 23 insertions(+), 39 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4e3fc28c12f5..e03a9419f5cf 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -958,7 +958,8 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(unmount_rec); - return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS); + return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS, + reg.i_len); } /* @@ -2209,32 +2210,6 @@ xlog_print_trans( } } -/* - * Calculate the potential space needed by the log vector. All regions contain - * their own opheaders and they are accounted for in region space so we don't - * need to add them to the vector length here. - */ -static int -xlog_write_calc_vec_length( - struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector, - uint optype) -{ - struct xfs_log_vec *lv; - int len = 0; - int i; - - for (lv = log_vector; lv; lv = lv->lv_next) { - /* we don't write ordered log vectors */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) - continue; - - for (i = 0; i < lv->lv_niovecs; i++) - len += lv->lv_iovecp[i].i_len; - } - return len; -} - static xlog_op_header_t * xlog_write_setup_ophdr( struct xlog_op_header *ophdr, @@ -2388,13 +2363,14 @@ xlog_write( struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, - uint optype) + uint optype, + uint32_t len) + { struct xlog_in_core *iclog = NULL; struct xfs_log_vec *lv = log_vector; struct xfs_log_iovec *vecp = lv->lv_iovecp; int index = 0; - int len; int partial_copy = 0; int partial_copy_len = 0; int contwr = 0; @@ -2409,7 +2385,6 @@ xlog_write( xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } - len = xlog_write_calc_vec_length(ticket, log_vector, optype); while 
(lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 5ccbb6bd4655..dad2b527c3ff 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -825,7 +825,8 @@ restart: static int xlog_cil_write_chain( struct xfs_cil_ctx *ctx, - struct xfs_log_vec *chain) + struct xfs_log_vec *chain, + uint32_t chain_len) { struct xlog *log = ctx->cil->xc_log; int error; @@ -833,7 +834,8 @@ xlog_cil_write_chain( error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD); if (error) return error; - return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS); + return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS, + chain_len); } /* @@ -872,7 +874,8 @@ xlog_cil_write_commit_record( /* account for space used by record data */ ctx->ticket->t_curr_res -= reg.i_len; - error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); + error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS, + reg.i_len); if (error) xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; @@ -935,11 +938,12 @@ xlog_cil_build_trans_hdr( sizeof(struct xfs_trans_header); hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR; - tic->t_curr_res -= hdr->lhdr[0].i_len + hdr->lhdr[1].i_len; - lvhdr->lv_niovecs = 2; lvhdr->lv_iovecp = &hdr->lhdr[0]; + lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len; lvhdr->lv_next = ctx->lv_chain; + + tic->t_curr_res -= lvhdr->lv_bytes; } /* @@ -966,7 +970,8 @@ xlog_cil_push_work( struct xlog *log = cil->xc_log; struct xfs_log_vec *lv; struct xfs_cil_ctx *new_ctx; - int num_iovecs; + int num_iovecs = 0; + int num_bytes = 0; int error = 0; struct xlog_cil_trans_hdr thdr; struct xfs_log_vec lvhdr = { NULL }; @@ -1047,7 +1052,6 @@ xlog_cil_push_work( * by the flush lock. 
*/ lv = NULL; - num_iovecs = 0; while (!list_empty(&cil->xc_cil)) { struct xfs_log_item *item; @@ -1061,6 +1065,10 @@ xlog_cil_push_work( lv = item->li_lv; item->li_lv = NULL; num_iovecs += lv->lv_niovecs; + + /* we don't write ordered log vectors */ + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) + num_bytes += lv->lv_bytes; } /* @@ -1099,8 +1107,9 @@ xlog_cil_push_work( * transaction header here as it is not accounted for in xlog_write(). */ xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs); + num_bytes += lvhdr.lv_bytes; - error = xlog_cil_write_chain(ctx, &lvhdr); + error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes); if (error) goto out_abort_free_ticket; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b588bb3d02cf..a070a2c827ce 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -503,7 +503,7 @@ void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, - uint optype); + uint optype, uint32_t len); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); -- cgit From decb545fc081a1f03d4216831afb82482e8b6342 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Apr 2022 10:35:53 +1000 Subject: xfs: change the type of ic_datap Turn ic_datap from a char into a void pointer given that it points to arbitrary data. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. 
Wong [dgc: also remove (char *) cast in xlog_alloc_log()] Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 7 +++---- fs/xfs/xfs_log_priv.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e03a9419f5cf..4cac816023a4 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1644,7 +1644,7 @@ xlog_alloc_log( iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); INIT_LIST_HEAD(&iclog->ic_callbacks); - iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; + iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -3664,7 +3664,7 @@ xlog_verify_iclog( if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); + idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3687,8 +3687,7 @@ xlog_verify_iclog( if (field_offset & 0x1ff) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((uintptr_t)&ophead->oh_len - - (uintptr_t)iclog->ic_datap); + idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index a070a2c827ce..20d818c14003 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -190,7 +190,7 @@ typedef struct xlog_in_core { u32 ic_offset; enum xlog_iclog_state ic_state; unsigned int ic_flags; - char *ic_datap; /* pointer to iclog data */ + void *ic_datap; /* pointer to iclog data */ struct list_head ic_callbacks; /* reference counts need their own cacheline */ -- cgit From db357078b0423e0a86a633780cbca3f01c54885d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 
21 Apr 2022 10:36:05 +1000 Subject: xfs: introduce xlog_write_full() Introduce an optimised version of xlog_write() that is used when the entire write will fit in a single iclog. This greatly simplifies the implementation of writing a log vector chain into an iclog, and sets the ground work for a much more understandable xlog_write() implementation. This incorporates some factoring and simplifications proposed by Christoph Hellwig. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4cac816023a4..d1f4d9031df3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2210,6 +2210,58 @@ xlog_print_trans( } } +static inline void +xlog_write_iovec( + struct xlog_in_core *iclog, + uint32_t *log_offset, + void *data, + uint32_t write_len, + int *bytes_left, + uint32_t *record_cnt, + uint32_t *data_cnt) +{ + ASSERT(*log_offset % sizeof(int32_t) == 0); + ASSERT(write_len % sizeof(int32_t) == 0); + + memcpy(iclog->ic_datap + *log_offset, data, write_len); + *log_offset += write_len; + *bytes_left -= write_len; + (*record_cnt)++; + *data_cnt += write_len; +} + +/* + * Write log vectors into a single iclog which is guaranteed by the caller + * to have enough space to write the entire log vector into. + */ +static void +xlog_write_full( + struct xfs_log_vec *lv, + struct xlog_ticket *ticket, + struct xlog_in_core *iclog, + uint32_t *log_offset, + uint32_t *len, + uint32_t *record_cnt, + uint32_t *data_cnt) +{ + int index; + + ASSERT(*log_offset + *len <= iclog->ic_size); + + /* + * Ordered log vectors have no regions to write so this + * loop will naturally skip them. 
+ */ + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *reg = &lv->lv_iovecp[index]; + struct xlog_op_header *ophdr = reg->i_addr; + + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + xlog_write_iovec(iclog, log_offset, reg->i_addr, + reg->i_len, len, record_cnt, data_cnt); + } +} + static xlog_op_header_t * xlog_write_setup_ophdr( struct xlog_op_header *ophdr, @@ -2374,8 +2426,8 @@ xlog_write( int partial_copy = 0; int partial_copy_len = 0; int contwr = 0; - int record_cnt = 0; - int data_cnt = 0; + uint32_t record_cnt = 0; + uint32_t data_cnt = 0; int error = 0; if (ticket->t_curr_res < 0) { @@ -2395,7 +2447,6 @@ xlog_write( return error; ASSERT(log_offset <= iclog->ic_size - 1); - ptr = iclog->ic_datap + log_offset; /* * If we have a context pointer, pass it the first iclog we are @@ -2407,10 +2458,22 @@ xlog_write( ctx = NULL; } + /* If this is a single iclog write, go fast... */ + if (!contwr && lv == log_vector) { + while (lv) { + xlog_write_full(lv, ticket, iclog, &log_offset, + &len, &record_cnt, &data_cnt); + lv = lv->lv_next; + } + data_cnt = 0; + break; + } + /* * This loop writes out as many regions as can fit in the amount * of space which was allocated by xlog_state_get_iclog_space(). */ + ptr = iclog->ic_datap + log_offset; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; -- cgit From ad3e3693182bb990484b187b33c7f9735bb549be Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:36:15 +1000 Subject: xfs: introduce xlog_write_partial() Re-implement writing of a log vector that does not fit into the current iclog. The iclog will already be in XLOG_STATE_WANT_SYNC because xlog_get_iclog_space() will have reserved all the remaining iclog space for us, hence we can simply iterate over the iovecs in the log vector getting more iclog space until the entire log vector is written. 
Handling this partial write case separately means we do not need to pass unnecessary state around for the common, fast path case when the log vector fits entirely within the current iclog. It isolates the complexity and allows us to modify and improve the partial write case without impacting the simple fast path. This change includes several improvements incorporated from patches written by Christoph Hellwig. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 424 +++++++++++++++++++++++--------------------------- fs/xfs/xfs_log_priv.h | 8 - 2 files changed, 196 insertions(+), 236 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d1f4d9031df3..29e09a888f38 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2246,7 +2246,8 @@ xlog_write_full( { int index; - ASSERT(*log_offset + *len <= iclog->ic_size); + ASSERT(*log_offset + *len <= iclog->ic_size || + iclog->ic_state == XLOG_STATE_WANT_SYNC); /* * Ordered log vectors have no regions to write so this @@ -2262,111 +2263,177 @@ xlog_write_full( } } -static xlog_op_header_t * -xlog_write_setup_ophdr( - struct xlog_op_header *ophdr, - struct xlog_ticket *ticket) +static int +xlog_write_get_more_iclog_space( + struct xlog_ticket *ticket, + struct xlog_in_core **iclogp, + uint32_t *log_offset, + uint32_t len, + uint32_t *record_cnt, + uint32_t *data_cnt, + int *contwr) { - ophdr->oh_clientid = XFS_TRANSACTION; - ophdr->oh_res2 = 0; - ophdr->oh_flags = 0; - return ophdr; + struct xlog_in_core *iclog = *iclogp; + struct xlog *log = iclog->ic_log; + int error; + + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC); + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + error = xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + if (error) + return error; + + error = xlog_state_get_iclog_space(log, len, &iclog, + ticket, contwr, 
log_offset); + if (error) + return error; + *record_cnt = 0; + *data_cnt = 0; + *iclogp = iclog; + return 0; } /* - * Set up the parameters of the region copy into the log. This has - * to handle region write split across multiple log buffers - this - * state is kept external to this function so that this code can - * be written in an obvious, self documenting manner. + * Write log vectors into a single iclog which is smaller than the current chain + * length. We write until we cannot fit a full record into the remaining space + * and then stop. We return the log vector that is to be written that cannot + * wholly fit in the iclog. */ static int -xlog_write_setup_copy( +xlog_write_partial( + struct xfs_log_vec *lv, struct xlog_ticket *ticket, - struct xlog_op_header *ophdr, - int space_available, - int space_required, - int *copy_off, - int *copy_len, - int *last_was_partial_copy, - int *bytes_consumed) -{ - int still_to_copy; - - still_to_copy = space_required - *bytes_consumed; - *copy_off = *bytes_consumed; - - if (still_to_copy <= space_available) { - /* write of region completes here */ - *copy_len = still_to_copy; - ophdr->oh_len = cpu_to_be32(*copy_len); - if (*last_was_partial_copy) - ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); - *last_was_partial_copy = 0; - *bytes_consumed = 0; - return 0; - } + struct xlog_in_core **iclogp, + uint32_t *log_offset, + uint32_t *len, + uint32_t *record_cnt, + uint32_t *data_cnt, + int *contwr) +{ + struct xlog_in_core *iclog = *iclogp; + struct xlog *log = iclog->ic_log; + struct xlog_op_header *ophdr; + int index = 0; + uint32_t rlen; + int error; - /* partial write of region, needs extra log op header reservation */ - *copy_len = space_available; - ophdr->oh_len = cpu_to_be32(*copy_len); - ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - if (*last_was_partial_copy) - ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; - *bytes_consumed += *copy_len; - (*last_was_partial_copy)++; + /* walk the logvec, copying until we run out of 
space in the iclog */ + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *reg = &lv->lv_iovecp[index]; + uint32_t reg_offset = 0; - /* account for new log op header */ - ticket->t_curr_res -= sizeof(struct xlog_op_header); + /* + * The first region of a continuation must have a non-zero + * length otherwise log recovery will just skip over it and + * start recovering from the next opheader it finds. Because we + * mark the next opheader as a continuation, recovery will then + * incorrectly add the continuation to the previous region and + * that breaks stuff. + * + * Hence if there isn't space for region data after the + * opheader, then we need to start afresh with a new iclog. + */ + if (iclog->ic_size - *log_offset <= + sizeof(struct xlog_op_header)) { + error = xlog_write_get_more_iclog_space(ticket, + &iclog, log_offset, *len, record_cnt, + data_cnt, contwr); + if (error) + return error; + } - return sizeof(struct xlog_op_header); -} + ophdr = reg->i_addr; + rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset); -static int -xlog_write_copy_finish( - struct xlog *log, - struct xlog_in_core *iclog, - uint flags, - int *record_cnt, - int *data_cnt, - int *partial_copy, - int *partial_copy_len, - int log_offset) -{ - int error; + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header)); + if (rlen != reg->i_len) + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + + xlog_verify_dest_ptr(log, iclog->ic_datap + *log_offset); + xlog_write_iovec(iclog, log_offset, reg->i_addr, + rlen, len, record_cnt, data_cnt); + + /* If we wrote the whole region, move to the next. */ + if (rlen == reg->i_len) + continue; - if (*partial_copy) { /* - * This iclog has already been marked WANT_SYNC by - * xlog_state_get_iclog_space. + * We now have a partially written iovec, but it can span + * multiple iclogs so we loop here. 
First we release the iclog + * we currently have, then we get a new iclog and add a new + * opheader. Then we continue copying from where we were until + * we either complete the iovec or fill the iclog. If we + * complete the iovec, then we increment the index and go right + * back to the top of the outer loop. If we fill the iclog, we + * run the inner loop again. + * + * This is complicated by the tail of a region using all the + * space in an iclog and hence requiring us to release the iclog + * and get a new one before returning to the outer loop. We must + * always guarantee that we exit this inner loop with at least + * space for log transaction opheaders left in the current + * iclog, hence we cannot just terminate the loop at the end + * of the continuation. So we loop while there is no + * space left in the current iclog, and check for the end of the + * continuation after getting a new iclog. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; - goto release_iclog; - } + do { + /* + * Ensure we include the continuation opheader in the + * space we need in the new iclog by adding that size + * to the length we require. This continuation opheader + * needs to be accounted to the ticket as the space it + * consumes hasn't been accounted to the lv we are + * writing. 
+ */ + error = xlog_write_get_more_iclog_space(ticket, + &iclog, log_offset, + *len + sizeof(struct xlog_op_header), + record_cnt, data_cnt, contwr); + if (error) + return error; + + ophdr = iclog->ic_datap + *log_offset; + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = XFS_TRANSACTION; + ophdr->oh_res2 = 0; + ophdr->oh_flags = XLOG_WAS_CONT_TRANS; - *partial_copy = 0; - *partial_copy_len = 0; + ticket->t_curr_res -= sizeof(struct xlog_op_header); + *log_offset += sizeof(struct xlog_op_header); + *data_cnt += sizeof(struct xlog_op_header); - if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t)) - return 0; + /* + * If rlen fits in the iclog, then end the region + * continuation. Otherwise we're going around again. + */ + reg_offset += rlen; + rlen = reg->i_len - reg_offset; + if (rlen <= iclog->ic_size - *log_offset) + ophdr->oh_flags |= XLOG_END_TRANS; + else + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - /* no more space in this iclog - push it. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; + rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset); + ophdr->oh_len = cpu_to_be32(rlen); - if (iclog->ic_state == XLOG_STATE_ACTIVE) - xlog_state_switch_iclogs(log, iclog, 0); - else - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - xlog_is_shutdown(log)); -release_iclog: - error = xlog_state_release_iclog(log, iclog); - spin_unlock(&log->l_icloglock); - return error; + xlog_verify_dest_ptr(log, iclog->ic_datap + *log_offset); + xlog_write_iovec(iclog, log_offset, + reg->i_addr + reg_offset, + rlen, len, record_cnt, data_cnt); + + } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS); + } + + /* + * No more iovecs remain in this logvec so return the next log vec to + * the caller so it can go back to fast path copying. 
+ */ + *iclogp = iclog; + return 0; } /* @@ -2421,14 +2488,11 @@ xlog_write( { struct xlog_in_core *iclog = NULL; struct xfs_log_vec *lv = log_vector; - struct xfs_log_iovec *vecp = lv->lv_iovecp; - int index = 0; - int partial_copy = 0; - int partial_copy_len = 0; int contwr = 0; uint32_t record_cnt = 0; uint32_t data_cnt = 0; int error = 0; + int log_offset; if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, @@ -2437,151 +2501,54 @@ xlog_write( xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } - while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { - void *ptr; - int log_offset; - - error = xlog_state_get_iclog_space(log, len, &iclog, ticket, - &contwr, &log_offset); - if (error) - return error; + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset); + if (error) + return error; - ASSERT(log_offset <= iclog->ic_size - 1); + ASSERT(log_offset <= iclog->ic_size - 1); - /* - * If we have a context pointer, pass it the first iclog we are - * writing to so it can record state needed for iclog write - * ordering. - */ - if (ctx) { - xlog_cil_set_ctx_write_state(ctx, iclog); - ctx = NULL; - } - - /* If this is a single iclog write, go fast... */ - if (!contwr && lv == log_vector) { - while (lv) { - xlog_write_full(lv, ticket, iclog, &log_offset, - &len, &record_cnt, &data_cnt); - lv = lv->lv_next; - } - data_cnt = 0; - break; - } + /* + * If we have a context pointer, pass it the first iclog we are + * writing to so it can record state needed for iclog write + * ordering. + */ + if (ctx) + xlog_cil_set_ctx_write_state(ctx, iclog); + while (lv) { /* - * This loop writes out as many regions as can fit in the amount - * of space which was allocated by xlog_state_get_iclog_space(). + * If the entire log vec does not fit in the iclog, punt it to + * the partial copy loop which can handle this case. 
+ */ - ptr = iclog->ic_datap + log_offset; - while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { - struct xfs_log_iovec *reg; - struct xlog_op_header *ophdr; - int copy_len; - int copy_off; - bool ordered = false; - bool added_ophdr = false; - - /* ordered log vectors have no regions to write */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { - ASSERT(lv->lv_niovecs == 0); - ordered = true; - goto next_lv; - } - - reg = &vecp[index]; - ASSERT(reg->i_len % sizeof(int32_t) == 0); - ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - - /* - * Regions always have their ophdr at the start of the - * region, except for: - * - a transaction start which has a start record ophdr - * before the first region ophdr; and - * - the previous region didn't fully fit into an iclog - * so needs a continuation ophdr to prepend the region - * in this new iclog. - */ - ophdr = reg->i_addr; - if (optype && index) { - optype &= ~XLOG_START_TRANS; - } else if (partial_copy) { - ophdr = xlog_write_setup_ophdr(ptr, ticket); - xlog_write_adv_cnt(&ptr, &len, &log_offset, - sizeof(struct xlog_op_header)); - added_ophdr = true; - } - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); - - len += xlog_write_setup_copy(ticket, ophdr, - iclog->ic_size-log_offset, - reg->i_len, - &copy_off, &copy_len, - &partial_copy, - &partial_copy_len); - xlog_verify_dest_ptr(log, ptr); - - - /* - * Wart: need to update length in embedded ophdr not - * to include it's own length. - */ - if (!added_ophdr) { - ophdr->oh_len = cpu_to_be32(copy_len - - sizeof(struct xlog_op_header)); - } - - ASSERT(copy_len > 0); - memcpy(ptr, reg->i_addr + copy_off, copy_len); - xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); - - if (added_ophdr) - copy_len += sizeof(struct xlog_op_header); - record_cnt++; - data_cnt += contwr ? 
copy_len : 0; - - error = xlog_write_copy_finish(log, iclog, optype, - &record_cnt, &data_cnt, - &partial_copy, - &partial_copy_len, - log_offset); - if (error) + if (lv->lv_niovecs && + lv->lv_bytes > iclog->ic_size - log_offset) { + error = xlog_write_partial(lv, ticket, &iclog, + &log_offset, &len, &record_cnt, + &data_cnt, &contwr); + if (error) { + /* + * We have no iclog to release, so just return + * the error immediately. + */ return error; - - /* - * if we had a partial copy, we need to get more iclog - * space but we don't want to increment the region - * index because there is still more is this region to - * write. - * - * If we completed writing this region, and we flushed - * the iclog (indicated by resetting of the record - * count), then we also need to get more log space. If - * this was the last record, though, we are done and - * can just return. - */ - if (partial_copy) - break; - - if (++index == lv->lv_niovecs) { -next_lv: - lv = lv->lv_next; - index = 0; - if (lv) - vecp = lv->lv_iovecp; - } - if (record_cnt == 0 && !ordered) { - if (!lv) - return 0; - break; } + } else { + xlog_write_full(lv, ticket, iclog, &log_offset, + &len, &record_cnt, &data_cnt); } + lv = lv->lv_next; } - ASSERT(len == 0); + /* + * We've already been guaranteed that the last writes will fit inside + * the current iclog, and hence it will already have the space used by + * those writes accounted to it. Hence we do not need to update the + * iclog with the number of bytes written here. 
+ */ spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + xlog_state_finish_copy(log, iclog, record_cnt, 0); error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); @@ -3738,11 +3705,12 @@ xlog_verify_iclog( iclog->ic_header.h_cycle_data[idx]); } } - if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) + if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) { xfs_warn(log->l_mp, - "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx", - __func__, clientid, ophead, + "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx", + __func__, i, clientid, ophead, (unsigned long)field_offset); + } /* check length */ p = &ophead->oh_len; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 20d818c14003..626e293ab2a4 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -491,14 +491,6 @@ extern struct kmem_cache *xfs_log_ticket_cache; struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, int count, bool permanent); -static inline void -xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) -{ - *ptr += bytes; - *len -= bytes; - *off += bytes; -} - void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, -- cgit From 1236bbe86bb83165ff6ba68ce19d81713340f597 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Apr 2022 10:36:27 +1000 Subject: xfs: remove xlog_verify_dest_ptr Just check that the offset in xlog_write_vec is smaller than the iclog size and remove the expensive cycling through all iclogs. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 35 +---------------------------------- fs/xfs/xfs_log_priv.h | 4 ---- 2 files changed, 1 insertion(+), 38 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 29e09a888f38..106331ee454e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -61,10 +61,6 @@ xlog_sync( struct xlog_in_core *iclog); #if defined(DEBUG) STATIC void -xlog_verify_dest_ptr( - struct xlog *log, - void *ptr); -STATIC void xlog_verify_grant_tail( struct xlog *log); STATIC void @@ -77,7 +73,6 @@ xlog_verify_tail_lsn( struct xlog *log, struct xlog_in_core *iclog); #else -#define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c) #define xlog_verify_tail_lsn(a,b) @@ -1626,9 +1621,6 @@ xlog_alloc_log( GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; -#ifdef DEBUG - log->l_iclog_bak[i] = &iclog->ic_header; -#endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); @@ -2220,6 +2212,7 @@ xlog_write_iovec( uint32_t *record_cnt, uint32_t *data_cnt) { + ASSERT(*log_offset < iclog->ic_log->l_iclog_size); ASSERT(*log_offset % sizeof(int32_t) == 0); ASSERT(write_len % sizeof(int32_t) == 0); @@ -2313,7 +2306,6 @@ xlog_write_partial( int *contwr) { struct xlog_in_core *iclog = *iclogp; - struct xlog *log = iclog->ic_log; struct xlog_op_header *ophdr; int index = 0; uint32_t rlen; @@ -2352,7 +2344,6 @@ xlog_write_partial( if (rlen != reg->i_len) ophdr->oh_flags |= XLOG_CONTINUE_TRANS; - xlog_verify_dest_ptr(log, iclog->ic_datap + *log_offset); xlog_write_iovec(iclog, log_offset, reg->i_addr, rlen, len, record_cnt, data_cnt); @@ -2420,7 +2411,6 @@ xlog_write_partial( rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset); ophdr->oh_len = cpu_to_be32(rlen); - xlog_verify_dest_ptr(log, iclog->ic_datap + *log_offset); xlog_write_iovec(iclog, log_offset, reg->i_addr + reg_offset, rlen, len, 
record_cnt, data_cnt); @@ -3544,29 +3534,6 @@ xlog_ticket_alloc( } #if defined(DEBUG) -/* - * Make sure that the destination ptr is within the valid data region of - * one of the iclogs. This uses backup pointers stored in a different - * part of the log in case we trash the log structure. - */ -STATIC void -xlog_verify_dest_ptr( - struct xlog *log, - void *ptr) -{ - int i; - int good_ptr = 0; - - for (i = 0; i < log->l_iclog_bufs; i++) { - if (ptr >= log->l_iclog_bak[i] && - ptr <= log->l_iclog_bak[i] + log->l_iclog_size) - good_ptr++; - } - - if (!good_ptr) - xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); -} - /* * Check to make sure the grant write head didn't just over lap the tail. If * the cycles are the same, we can't be overlapping. Otherwise, make sure that diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 626e293ab2a4..2fbec7a30e59 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -420,10 +420,6 @@ struct xlog { struct xfs_kobj l_kobj; - /* The following field are used for debugging; need to hold icloglock */ -#ifdef DEBUG - void *l_iclog_bak[XLOG_MAX_ICLOGS]; -#endif /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; -- cgit From be8ddda5f7e01229729a3e00e9971cc2b8a9ec10 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:36:37 +1000 Subject: xfs: xlog_write() no longer needs contwr state The rework of xlog_write() no longer requires xlog_state_get_iclog_space() to tell it about internal iclog space reservation state to direct it on what to do. Remove this parameter. $ size fs/xfs/xfs_log.o.* text data bss dec hex filename 26520 560 8 27088 69d0 fs/xfs/xfs_log.o.orig 26384 560 8 26952 6948 fs/xfs/xfs_log.o.patched Signed-off-by: Dave Chinner Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 106331ee454e..13273a906f1b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -49,7 +49,6 @@ xlog_state_get_iclog_space( int len, struct xlog_in_core **iclog, struct xlog_ticket *ticket, - int *continued_write, int *logoffsetp); STATIC void xlog_grant_push_ail( @@ -2263,8 +2262,7 @@ xlog_write_get_more_iclog_space( uint32_t *log_offset, uint32_t len, uint32_t *record_cnt, - uint32_t *data_cnt, - int *contwr) + uint32_t *data_cnt) { struct xlog_in_core *iclog = *iclogp; struct xlog *log = iclog->ic_log; @@ -2278,8 +2276,8 @@ xlog_write_get_more_iclog_space( if (error) return error; - error = xlog_state_get_iclog_space(log, len, &iclog, - ticket, contwr, log_offset); + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + log_offset); if (error) return error; *record_cnt = 0; @@ -2302,8 +2300,7 @@ xlog_write_partial( uint32_t *log_offset, uint32_t *len, uint32_t *record_cnt, - uint32_t *data_cnt, - int *contwr) + uint32_t *data_cnt) { struct xlog_in_core *iclog = *iclogp; struct xlog_op_header *ophdr; @@ -2331,7 +2328,7 @@ xlog_write_partial( sizeof(struct xlog_op_header)) { error = xlog_write_get_more_iclog_space(ticket, &iclog, log_offset, *len, record_cnt, - data_cnt, contwr); + data_cnt); if (error) return error; } @@ -2383,7 +2380,7 @@ xlog_write_partial( error = xlog_write_get_more_iclog_space(ticket, &iclog, log_offset, *len + sizeof(struct xlog_op_header), - record_cnt, data_cnt, contwr); + record_cnt, data_cnt); if (error) return error; @@ -2478,7 +2475,6 @@ xlog_write( { struct xlog_in_core *iclog = NULL; struct xfs_log_vec *lv = log_vector; - int contwr = 0; uint32_t record_cnt = 0; uint32_t data_cnt = 0; int error = 0; @@ -2492,7 +2488,7 @@ xlog_write( } error = xlog_state_get_iclog_space(log, 
len, &iclog, ticket, - &contwr, &log_offset); + &log_offset); if (error) return error; @@ -2515,7 +2511,7 @@ xlog_write( lv->lv_bytes > iclog->ic_size - log_offset) { error = xlog_write_partial(lv, ticket, &iclog, &log_offset, &len, &record_cnt, - &data_cnt, &contwr); + &data_cnt); if (error) { /* * We have no iclog to release, so just return @@ -2895,7 +2891,6 @@ xlog_state_get_iclog_space( int len, struct xlog_in_core **iclogp, struct xlog_ticket *ticket, - int *continued_write, int *logoffsetp) { int log_offset; @@ -2973,13 +2968,10 @@ restart: * iclogs (to mark it taken), this particular iclog will release/sync * to disk in xlog_write(). */ - if (len <= iclog->ic_size - iclog->ic_offset) { - *continued_write = 0; + if (len <= iclog->ic_size - iclog->ic_offset) iclog->ic_offset += len; - } else { - *continued_write = 1; + else xlog_state_switch_iclogs(log, iclog, iclog->ic_size); - } *iclogp = iclog; ASSERT(iclog->ic_offset <= iclog->ic_size); -- cgit From 14b07ecd5cd2545fcee3ff29119405f7cf2f59ad Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:36:48 +1000 Subject: xfs: xlog_write() doesn't need optype anymore So remove it from the interface and callers. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 4 +--- fs/xfs/xfs_log_cil.c | 6 ++---- fs/xfs/xfs_log_priv.h | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 13273a906f1b..c3fb2b0b40d3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -952,8 +952,7 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(unmount_rec); - return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS, - reg.i_len); + return xlog_write(log, NULL, &vec, ticket, reg.i_len); } /* @@ -2469,7 +2468,6 @@ xlog_write( struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, - uint optype, uint32_t len) { diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index dad2b527c3ff..9065250227f9 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -834,8 +834,7 @@ xlog_cil_write_chain( error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD); if (error) return error; - return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS, - chain_len); + return xlog_write(log, ctx, chain, ctx->ticket, chain_len); } /* @@ -874,8 +873,7 @@ xlog_cil_write_commit_record( /* account for space used by record data */ ctx->ticket->t_curr_res -= reg.i_len; - error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS, - reg.i_len); + error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len); if (error) xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2fbec7a30e59..b3d5ee85dd7e 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -491,7 +491,7 @@ void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, - uint 
optype, uint32_t len); + uint32_t len); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); -- cgit From 593e34391faafd72102bd79c43994f32e9dd0c91 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:36:56 +1000 Subject: xfs: CIL context doesn't need to count iovecs Now that we account for log opheaders in the log item formatting code, we don't actually use the aggregated count of log iovecs in the CIL for anything. Remove it and the tracking code that calculates it. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_cil.c | 22 ++++++---------------- fs/xfs/xfs_log_priv.h | 1 - 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9065250227f9..e5ab62f08c19 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -284,22 +284,18 @@ xlog_cil_alloc_shadow_bufs( /* * Prepare the log item for insertion into the CIL. Calculate the difference in - * log space and vectors it will consume, and if it is a new item pin it as - * well. + * log space it will consume, and if it is a new item pin it as well. 
*/ STATIC void xfs_cil_prepare_item( struct xlog *log, struct xfs_log_vec *lv, struct xfs_log_vec *old_lv, - int *diff_len, - int *diff_iovecs) + int *diff_len) { /* Account for the new LV being passed in */ - if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) *diff_len += lv->lv_bytes; - *diff_iovecs += lv->lv_niovecs; - } /* * If there is no old LV, this is the first time we've seen the item in @@ -316,7 +312,6 @@ xfs_cil_prepare_item( ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; - *diff_iovecs -= old_lv->lv_niovecs; lv->lv_item->li_lv_shadow = old_lv; } @@ -365,12 +360,10 @@ static void xlog_cil_insert_format_items( struct xlog *log, struct xfs_trans *tp, - int *diff_len, - int *diff_iovecs) + int *diff_len) { struct xfs_log_item *lip; - /* Bail out if we didn't find a log item. */ if (list_empty(&tp->t_items)) { ASSERT(0); @@ -413,7 +406,6 @@ xlog_cil_insert_format_items( * set the item up as though it is a new insertion so * that the space reservation accounting is correct. */ - *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_bytes; /* Ensure the lv is set up according to ->iop_size */ @@ -438,7 +430,7 @@ xlog_cil_insert_format_items( ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); lip->li_ops->iop_format(lip, lv); insert: - xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); + xfs_cil_prepare_item(log, lv, old_lv, diff_len); } } @@ -458,7 +450,6 @@ xlog_cil_insert_items( struct xfs_cil_ctx *ctx = cil->xc_ctx; struct xfs_log_item *lip; int len = 0; - int diff_iovecs = 0; int iclog_space; int iovhdr_res = 0, split_res = 0, ctx_res = 0; @@ -468,7 +459,7 @@ xlog_cil_insert_items( * We can do this safely because the context can't checkpoint until we * are done so it doesn't matter exactly how we update the CIL. 
*/ - xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); + xlog_cil_insert_format_items(log, tp, &len); spin_lock(&cil->xc_cil_lock); @@ -503,7 +494,6 @@ xlog_cil_insert_items( } tp->t_ticket->t_curr_res -= len; ctx->space_used += len; - ctx->nvecs += diff_iovecs; /* * If we've overrun the reservation, dump the tx details before we move diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b3d5ee85dd7e..4b7303ca1741 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -221,7 +221,6 @@ struct xfs_cil_ctx { xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ - int nvecs; /* number of regions */ int space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ struct xfs_log_vec *lv_chain; /* logvecs being pushed */ -- cgit From a4d98629c93fdb312641dfc336a9bda56358ef72 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:45:41 +1000 Subject: xfs: convert attr type flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. 
Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_da_format.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5a49caa5c9df..4c6561baf9e9 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -688,10 +688,10 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ -#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) -#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) -#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) -#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) +#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) /* -- cgit From 79539c7c761ac6d00abd50d5f1e4390f5dc9af18 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:45:52 +1000 Subject: xfs: convert scrub type flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. This touches xfs_fs.h so affects the user API, but the user API fields are also unsigned so the flags should really be unsigned, too. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_fs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 505533c43a92..52c9d8676fa3 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -699,34 +699,34 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_NR 25 /* i: Repair this metadata. 
*/ -#define XFS_SCRUB_IFLAG_REPAIR (1 << 0) +#define XFS_SCRUB_IFLAG_REPAIR (1u << 0) /* o: Metadata object needs repair. */ -#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1) +#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1) /* * o: Metadata object could be optimized. It's not corrupt, but * we could improve on it somehow. */ -#define XFS_SCRUB_OFLAG_PREEN (1 << 2) +#define XFS_SCRUB_OFLAG_PREEN (1u << 2) /* o: Cross-referencing failed. */ -#define XFS_SCRUB_OFLAG_XFAIL (1 << 3) +#define XFS_SCRUB_OFLAG_XFAIL (1u << 3) /* o: Metadata object disagrees with cross-referenced metadata. */ -#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4) +#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4) /* o: Scan was not complete. */ -#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5) +#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5) /* o: Metadata object looked funny but isn't corrupt. */ -#define XFS_SCRUB_OFLAG_WARNING (1 << 6) +#define XFS_SCRUB_OFLAG_WARNING (1u << 6) /* * o: IFLAG_REPAIR was set but metadata object did not need fixing or * optimization and has therefore not been altered. */ -#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7) +#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7) #define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ -- cgit From 0e5b8e45229bc2680f4b10505da338f1ca15a6d2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:46:01 +1000 Subject: xfs: convert bmap extent type flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. 
Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 14 +++++++------- fs/xfs/libxfs/xfs_bmap.h | 22 +++++++++++----------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 74198dd82b03..d53dfe8db8f2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1399,7 +1399,7 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ @@ -1950,7 +1950,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec old; @@ -2479,7 +2479,7 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t newlen=0; /* new indirect size */ xfs_filblks_t oldlen=0; /* old indirect size */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t temp; /* temp for indirect calculations */ ifp = XFS_IFORK_PTR(ip, whichfork); @@ -2626,7 +2626,7 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; 
ASSERT(!isnullstartblock(new->br_startblock)); @@ -4801,7 +4801,7 @@ xfs_bmap_del_extent_delay( int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; xfs_filblks_t got_indlen, new_indlen, stolen; - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); int error = 0; bool isrt; @@ -4926,7 +4926,7 @@ xfs_bmap_del_extent_cow( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec new; xfs_fileoff_t del_endoff, got_endoff; - int state = BMAP_COWFORK; + uint32_t state = BMAP_COWFORK; XFS_STATS_INC(mp, xs_del_exlist); @@ -5015,7 +5015,7 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t new; /* new record to be inserted */ /* REFERENCED */ uint qfield; /* quota field to update */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; mp = ip->i_mount; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 03d9aaf87413..29d38c3c2607 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -124,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) /* * Flags for xfs_bmap_add_extent*. 
*/ -#define BMAP_LEFT_CONTIG (1 << 0) -#define BMAP_RIGHT_CONTIG (1 << 1) -#define BMAP_LEFT_FILLING (1 << 2) -#define BMAP_RIGHT_FILLING (1 << 3) -#define BMAP_LEFT_DELAY (1 << 4) -#define BMAP_RIGHT_DELAY (1 << 5) -#define BMAP_LEFT_VALID (1 << 6) -#define BMAP_RIGHT_VALID (1 << 7) -#define BMAP_ATTRFORK (1 << 8) -#define BMAP_COWFORK (1 << 9) +#define BMAP_LEFT_CONTIG (1u << 0) +#define BMAP_RIGHT_CONTIG (1u << 1) +#define BMAP_LEFT_FILLING (1u << 2) +#define BMAP_RIGHT_FILLING (1u << 3) +#define BMAP_LEFT_DELAY (1u << 4) +#define BMAP_RIGHT_DELAY (1u << 5) +#define BMAP_LEFT_VALID (1u << 6) +#define BMAP_RIGHT_VALID (1u << 7) +#define BMAP_ATTRFORK (1u << 8) +#define BMAP_COWFORK (1u << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ @@ -243,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); -static inline int xfs_bmap_fork_to_state(int whichfork) +static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { switch (whichfork) { case XFS_ATTR_FORK: -- cgit From e7d410ac336856cdae934e14b9c2c749ca5a32ea Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:46:09 +1000 Subject: xfs: convert bmapi flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. 
Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 22 +++++++++++----------- fs/xfs/libxfs/xfs_bmap.h | 36 ++++++++++++++++++------------------ 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d53dfe8db8f2..ad938e6e23aa 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -485,7 +485,7 @@ STATIC void xfs_bmap_validate_ret( xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_bmbt_irec_t *mval, int nmap, int ret_nmap) @@ -2616,7 +2616,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp, - int flags) + uint32_t flags) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; @@ -3766,7 +3766,7 @@ xfs_bmapi_trim_map( xfs_fileoff_t obno, xfs_fileoff_t end, int n, - int flags) + uint32_t flags) { if ((flags & XFS_BMAPI_ENTIRE) || got->br_startoff + got->br_blockcount <= obno) { @@ -3811,7 +3811,7 @@ xfs_bmapi_update_map( xfs_fileoff_t obno, xfs_fileoff_t end, int *n, - int flags) + uint32_t flags) { xfs_bmbt_irec_t *mval = *map; @@ -3864,7 +3864,7 @@ xfs_bmapi_read( xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); @@ -4184,7 +4184,7 @@ xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, - int flags) + uint32_t flags) { int whichfork = xfs_bmapi_whichfork(flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); @@ -4312,7 +4312,7 @@ xfs_bmapi_write( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting file offs. mapped */ xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ + uint32_t flags, /* XFS_BMAPI_... 
*/ xfs_extlen_t total, /* total blocks needed */ struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap) /* i/o: mval size/count */ @@ -4629,7 +4629,7 @@ xfs_bmapi_remap( xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -4999,7 +4999,7 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ int whichfork, /* data or attr fork */ - int bflags) /* bmapi flags */ + uint32_t bflags) /* bmapi flags */ { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ @@ -5281,7 +5281,7 @@ __xfs_bunmapi( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t start, /* first file offset deleted */ xfs_filblks_t *rlen, /* i/o: amount remaining */ - int flags, /* misc flags */ + uint32_t flags, /* misc flags */ xfs_extnum_t nexts) /* number of extents max */ { struct xfs_btree_cur *cur; /* bmap btree cursor */ @@ -5609,7 +5609,7 @@ xfs_bunmapi( struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_extnum_t nexts, int *done) { diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 29d38c3c2607..16db95b11589 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -39,7 +39,7 @@ struct xfs_bmalloca { bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ int datatype;/* data type being allocated */ - int flags; + uint32_t flags; }; #define XFS_BMAP_MAX_NMAP 4 @@ -47,17 +47,17 @@ struct xfs_bmalloca { /* * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ -#define XFS_BMAPI_CONTIG 0x020 /* 
must allocate only one extent */ +#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */ +#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */ +#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ -#define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_CONVERT (1u << 5) /* * allocate zeroed extents - this requires all newly allocated user data extents @@ -65,7 +65,7 @@ struct xfs_bmalloca { * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found * during the allocation range to zeroed written extents. */ -#define XFS_BMAPI_ZERO 0x080 +#define XFS_BMAPI_ZERO (1u << 6) /* * Map the inode offset to the block given in ap->firstblock. Primarily @@ -75,16 +75,16 @@ struct xfs_bmalloca { * For bunmapi, this flag unmaps the range without adjusting quota, reducing * refcount, or freeing the blocks. */ -#define XFS_BMAPI_REMAP 0x100 +#define XFS_BMAPI_REMAP (1u << 7) /* Map something in the CoW fork. */ -#define XFS_BMAPI_COWFORK 0x200 +#define XFS_BMAPI_COWFORK (1u << 8) /* Skip online discard of freed extents */ -#define XFS_BMAPI_NODISCARD 0x1000 +#define XFS_BMAPI_NODISCARD (1u << 9) /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ -#define XFS_BMAPI_NORMAP 0x2000 +#define XFS_BMAPI_NORMAP (1u << 10) #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -106,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w) (w == XFS_COW_FORK ? 
XFS_BMAPI_COWFORK : 0)); } -static inline int xfs_bmapi_whichfork(int bmapi_flags) +static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { if (bmapi_flags & XFS_BMAPI_COWFORK) return XFS_COW_FORK; @@ -183,15 +183,15 @@ int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, - int *nmap, int flags); + int *nmap, uint32_t flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, + xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, @@ -260,7 +260,7 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags); + uint32_t flags); extern struct kmem_cache *xfs_bmap_intent_cache; -- cgit From f53dde11b405e7c655997513822c90ac9761efdb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:46:16 +1000 Subject: xfs: convert AGF log flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. 
Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 10 +++++----- fs/xfs/libxfs/xfs_alloc.h | 2 +- fs/xfs/libxfs/xfs_format.h | 38 +++++++++++++++++++------------------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index b52ed339727f..1ff3fa67d4c9 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2777,7 +2777,7 @@ xfs_alloc_get_freelist( xfs_agblock_t bno; __be32 *agfl_bno; int error; - int logflags; + uint32_t logflags; struct xfs_mount *mp = tp->t_mountp; struct xfs_perag *pag; @@ -2830,9 +2830,9 @@ xfs_alloc_get_freelist( */ void xfs_alloc_log_agf( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* buffer for a.g. freelist header */ - int fields) /* mask of fields to be logged (XFS_AGF_...) */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte offset */ int last; /* last byte offset */ @@ -2902,7 +2902,7 @@ xfs_alloc_put_freelist( struct xfs_perag *pag; __be32 *blockp; int error; - int logflags; + uint32_t logflags; __be32 *agfl_bno; int startoff; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d4c057b764f9..84ca09b2223f 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -121,7 +121,7 @@ void xfs_alloc_log_agf( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* buffer for a.g. freelist header */ - int fields);/* mask of fields to be logged (XFS_AGF_...) */ + uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */ /* * Interface for inode allocation to force the pag data to be initialized. 
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index d665c04e69dd..65e24847841e 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -525,26 +525,26 @@ typedef struct xfs_agf { #define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) -#define XFS_AGF_MAGICNUM 0x00000001 -#define XFS_AGF_VERSIONNUM 0x00000002 -#define XFS_AGF_SEQNO 0x00000004 -#define XFS_AGF_LENGTH 0x00000008 -#define XFS_AGF_ROOTS 0x00000010 -#define XFS_AGF_LEVELS 0x00000020 -#define XFS_AGF_FLFIRST 0x00000040 -#define XFS_AGF_FLLAST 0x00000080 -#define XFS_AGF_FLCOUNT 0x00000100 -#define XFS_AGF_FREEBLKS 0x00000200 -#define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_UUID 0x00001000 -#define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000 -#define XFS_AGF_REFCOUNT_ROOT 0x00008000 -#define XFS_AGF_REFCOUNT_LEVEL 0x00010000 -#define XFS_AGF_SPARE64 0x00020000 +#define XFS_AGF_MAGICNUM (1u << 0) +#define XFS_AGF_VERSIONNUM (1u << 1) +#define XFS_AGF_SEQNO (1u << 2) +#define XFS_AGF_LENGTH (1u << 3) +#define XFS_AGF_ROOTS (1u << 4) +#define XFS_AGF_LEVELS (1u << 5) +#define XFS_AGF_FLFIRST (1u << 6) +#define XFS_AGF_FLLAST (1u << 7) +#define XFS_AGF_FLCOUNT (1u << 8) +#define XFS_AGF_FREEBLKS (1u << 9) +#define XFS_AGF_LONGEST (1u << 10) +#define XFS_AGF_BTREEBLKS (1u << 11) +#define XFS_AGF_UUID (1u << 12) +#define XFS_AGF_RMAP_BLOCKS (1u << 13) +#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14) +#define XFS_AGF_REFCOUNT_ROOT (1u << 15) +#define XFS_AGF_REFCOUNT_LEVEL (1u << 16) +#define XFS_AGF_SPARE64 (1u << 17) #define XFS_AGF_NUM_BITS 18 -#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) +#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ -- cgit From 0d1b97696696871dc42dfc59d527a0b68b1a1209 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:46:24 +1000 Subject: xfs: convert AGI log flags to unsigned. 
5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_format.h | 30 +++++++++++++++--------------- fs/xfs/libxfs/xfs_ialloc.c | 6 +++--- fs/xfs/libxfs/xfs_ialloc.h | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 65e24847841e..0d6fa199a896 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -619,22 +619,22 @@ typedef struct xfs_agi { #define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) -#define XFS_AGI_MAGICNUM (1 << 0) -#define XFS_AGI_VERSIONNUM (1 << 1) -#define XFS_AGI_SEQNO (1 << 2) -#define XFS_AGI_LENGTH (1 << 3) -#define XFS_AGI_COUNT (1 << 4) -#define XFS_AGI_ROOT (1 << 5) -#define XFS_AGI_LEVEL (1 << 6) -#define XFS_AGI_FREECOUNT (1 << 7) -#define XFS_AGI_NEWINO (1 << 8) -#define XFS_AGI_DIRINO (1 << 9) -#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_MAGICNUM (1u << 0) +#define XFS_AGI_VERSIONNUM (1u << 1) +#define XFS_AGI_SEQNO (1u << 2) +#define XFS_AGI_LENGTH (1u << 3) +#define XFS_AGI_COUNT (1u << 4) +#define XFS_AGI_ROOT (1u << 5) +#define XFS_AGI_LEVEL (1u << 6) +#define XFS_AGI_FREECOUNT (1u << 7) +#define XFS_AGI_NEWINO (1u << 8) +#define XFS_AGI_DIRINO (1u << 9) +#define XFS_AGI_UNLINKED (1u << 10) #define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ -#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) -#define XFS_AGI_FREE_ROOT (1 << 11) -#define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */ +#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1u << 11) +#define XFS_AGI_FREE_LEVEL (1u << 12) +#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */ #define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ diff --git 
a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b418fe0c0679..54c2be6a2972 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2414,9 +2414,9 @@ out_drop: */ void xfs_ialloc_log_agi( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* allocation group header buffer */ - int fields) /* bitmask of fields to log */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte number */ int last; /* last byte number */ diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 8b5c2b709022..a7705b6a1fd3 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -60,7 +60,7 @@ void xfs_ialloc_log_agi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* allocation group header buffer */ - int fields); /* bitmask of fields to log */ + uint32_t fields); /* bitmask of fields to log */ /* * Read in the allocation group header (inode allocation section) -- cgit From 722db70fb2f03ef9ff21cd5194e9f592701e1be6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:46:33 +1000 Subject: xfs: convert btree buffer log flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. We also pass the fields to log to xfs_btree_offsets() as a uint32_t in all cases now. I have no idea why we made that parameter an int64_t in the first place, but while we are fixing this up change it to a uint32_t field, too.
Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 10 +++++----- fs/xfs/libxfs/xfs_btree.h | 26 +++++++++++++------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c1500b238520..a8c79e760d8a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -751,20 +751,20 @@ xfs_btree_lastrec( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets, /* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last) /* output: last byte offset */ { int i; /* current bit number */ - int64_t imask; /* mask for current bit number */ + uint32_t imask; /* mask for current bit number */ ASSERT(fields != 0); /* * Find the lowest bit, so the first byte offset. */ - for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + for (i = 0, imask = 1u; ; i++, imask <<= 1) { if (imask & fields) { *first = offsets[i]; break; @@ -773,7 +773,7 @@ xfs_btree_offsets( /* * Find the highest bit, so the last byte offset. */ - for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { if (imask & fields) { *last = offsets[i + 1] - 1; break; @@ -1456,7 +1456,7 @@ void xfs_btree_log_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_buf *bp, /* buffer containing btree block */ - int fields) /* mask of fields: XFS_BB_... */ + uint32_t fields) /* mask of fields: XFS_BB_... */ { int first; /* first byte offset logged */ int last; /* last byte offset logged */ diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 22d9f411fde6..eef27858a013 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); /* * For logging record fields. 
*/ -#define XFS_BB_MAGIC (1 << 0) -#define XFS_BB_LEVEL (1 << 1) -#define XFS_BB_NUMRECS (1 << 2) -#define XFS_BB_LEFTSIB (1 << 3) -#define XFS_BB_RIGHTSIB (1 << 4) -#define XFS_BB_BLKNO (1 << 5) -#define XFS_BB_LSN (1 << 6) -#define XFS_BB_UUID (1 << 7) -#define XFS_BB_OWNER (1 << 8) +#define XFS_BB_MAGIC (1u << 0) +#define XFS_BB_LEVEL (1u << 1) +#define XFS_BB_NUMRECS (1u << 2) +#define XFS_BB_LEFTSIB (1u << 3) +#define XFS_BB_RIGHTSIB (1u << 4) +#define XFS_BB_BLKNO (1u << 5) +#define XFS_BB_LSN (1u << 6) +#define XFS_BB_UUID (1u << 7) +#define XFS_BB_OWNER (1u << 8) #define XFS_BB_NUM_BITS 5 -#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1) #define XFS_BB_NUM_BITS_CRC 9 -#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) +#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -345,7 +345,7 @@ xfs_btree_dup_cursor( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ @@ -435,7 +435,7 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. */ -void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t); void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* -- cgit From 581b4484475c14cf606cdc9d6cdecc98f7ab1be4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:46:40 +1000 Subject: xfs: convert buffer log item flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. 
Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf_item.h | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index e11e9ef2338f..4d8a6aece995 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -8,15 +8,18 @@ /* kernel only definitions */ +struct xfs_buf; +struct xfs_mount; + /* buf log item flags */ -#define XFS_BLI_HOLD 0x01 -#define XFS_BLI_DIRTY 0x02 -#define XFS_BLI_STALE 0x04 -#define XFS_BLI_LOGGED 0x08 -#define XFS_BLI_INODE_ALLOC_BUF 0x10 -#define XFS_BLI_STALE_INODE 0x20 -#define XFS_BLI_INODE_BUF 0x40 -#define XFS_BLI_ORDERED 0x80 +#define XFS_BLI_HOLD (1u << 0) +#define XFS_BLI_DIRTY (1u << 1) +#define XFS_BLI_STALE (1u << 2) +#define XFS_BLI_LOGGED (1u << 3) +#define XFS_BLI_INODE_ALLOC_BUF (1u << 4) +#define XFS_BLI_STALE_INODE (1u << 5) +#define XFS_BLI_INODE_BUF (1u << 6) +#define XFS_BLI_ORDERED (1u << 7) #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -28,11 +31,6 @@ { XFS_BLI_INODE_BUF, "INODE_BUF" }, \ { XFS_BLI_ORDERED, "ORDERED" } - -struct xfs_buf; -struct xfs_mount; -struct xfs_buf_log_item; - /* * This is the in core log item structure used to track information * needed to log buffers. It tracks how many times the lock has been -- cgit From 3402d931575f1fb0c6863eaad6595f55e6389eda Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:46:47 +1000 Subject: xfs: convert da btree operations flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. 
Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_da_btree.h | 16 ++++++++-------- fs/xfs/xfs_trace.h | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 0faf7d9ac241..7b0f986e5cb5 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -76,19 +76,19 @@ typedef struct xfs_da_args { xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ int rmtvaluelen2; /* remote attr value length in bytes */ - int op_flags; /* operation flags */ + uint32_t op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; /* * Operation flags: */ -#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ -#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ -#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ -#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ -#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ -#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */ +#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */ +#define XFS_DA_OP_RENAME (1u << 1) /* this is an atomic rename op */ +#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ +#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ +#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -197,7 +197,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp, * Utility routines. 
*/ -#define XFS_DABUF_MAP_HOLE_OK (1 << 0) +#define XFS_DABUF_MAP_HOLE_OK (1u << 0) int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b141ef78c755..989ecda904db 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1924,7 +1924,7 @@ DECLARE_EVENT_CLASS(xfs_da_class, __field(int, namelen) __field(xfs_dahash_t, hashval) __field(xfs_ino_t, inumber) - __field(int, op_flags) + __field(uint32_t, op_flags) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; @@ -1990,7 +1990,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(xfs_dahash_t, hashval) __field(unsigned int, attr_filter) __field(unsigned int, attr_flags) - __field(int, op_flags) + __field(uint32_t, op_flags) ), TP_fast_assign( __entry->dev = VFS_I(args->dp)->i_sb->s_dev; @@ -2097,7 +2097,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(int, op_flags) + __field(uint32_t, op_flags) __field(int, idx) ), TP_fast_assign( @@ -2128,7 +2128,7 @@ TRACE_EVENT(xfs_dir2_leafn_moveents, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(int, op_flags) + __field(uint32_t, op_flags) __field(int, src_idx) __field(int, dst_idx) __field(int, count) -- cgit From 1005dd019c88f556f85cb3632df4d2c702ae95cd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:46:55 +1000 Subject: xfs: convert dquot flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. 
Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_format.h | 8 ++++---- fs/xfs/libxfs/xfs_quota_defs.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 0d6fa199a896..f524736d811e 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1085,10 +1085,10 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ -#define XFS_DQTYPE_USER 0x01 /* user dquot record */ -#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ -#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ -#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */ +#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */ +#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */ +#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */ +#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */ /* bitmask to determine if this is a user/group/project dquot */ #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index a02c5062f9b2..fdfe3cc6f15c 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -29,8 +29,8 @@ typedef uint8_t xfs_dqtype_t; /* * flags for q_flags field in the dquot. */ -#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */ -#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */ +#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */ +#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */ #define XFS_DQFLAG_STRINGS \ { XFS_DQFLAG_DIRTY, "DIRTY" }, \ -- cgit From 22d53f480c56e34316d2e5f3757ba1839d47008b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:47:07 +1000 Subject: xfs: convert log item tracepoint flags to unsigned. 
5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_trans.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index de177842b951..569b68fc6912 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -58,10 +58,10 @@ struct xfs_log_item { #define XFS_LI_DIRTY 3 /* log item dirty in transaction */ #define XFS_LI_FLAGS \ - { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ - { (1 << XFS_LI_ABORTED), "ABORTED" }, \ - { (1 << XFS_LI_FAILED), "FAILED" }, \ - { (1 << XFS_LI_DIRTY), "DIRTY" } + { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \ + { (1u << XFS_LI_ABORTED), "ABORTED" }, \ + { (1u << XFS_LI_FAILED), "FAILED" }, \ + { (1u << XFS_LI_DIRTY), "DIRTY" } struct xfs_item_ops { unsigned flags; -- cgit From a103375307ade71f3394889310ba37abb23c1c21 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:47:16 +1000 Subject: xfs: convert inode lock flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. 
Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 12 ++++++------ fs/xfs/xfs_inode.c | 21 ++++++++++++--------- fs/xfs/xfs_inode.h | 24 ++++++++++++------------ 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5bddb1e9e0b3..f3e878408747 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -310,7 +310,7 @@ STATIC ssize_t xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, - int *iolock) + unsigned int *iolock) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -513,7 +513,7 @@ xfs_file_dio_write_aligned( struct kiocb *iocb, struct iov_iter *from) { - int iolock = XFS_IOLOCK_SHARED; + unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; ret = xfs_ilock_iocb(iocb, iolock); @@ -566,7 +566,7 @@ xfs_file_dio_write_unaligned( { size_t isize = i_size_read(VFS_I(ip)); size_t count = iov_iter_count(from); - int iolock = XFS_IOLOCK_SHARED; + unsigned int iolock = XFS_IOLOCK_SHARED; unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY; ssize_t ret; @@ -655,7 +655,7 @@ xfs_file_dax_write( { struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - int iolock = XFS_IOLOCK_EXCL; + unsigned int iolock = XFS_IOLOCK_EXCL; ssize_t ret, error = 0; loff_t pos; @@ -700,7 +700,7 @@ xfs_file_buffered_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; bool cleared_space = false; - int iolock; + unsigned int iolock; if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; @@ -1181,7 +1181,7 @@ xfs_dir_open( struct file *file) { struct xfs_inode *ip = XFS_I(inode); - int mode; + unsigned int mode; int error; error = xfs_file_open(inode, file); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9de6205fe134..5ea460f62201 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -416,10 +416,12 @@ xfs_lockdep_subclass_ok( * parent locking. 
Care must be taken to ensure we don't overrun the subclass * storage fields in the class mask we build. */ -static inline int -xfs_lock_inumorder(int lock_mode, int subclass) +static inline uint +xfs_lock_inumorder( + uint lock_mode, + uint subclass) { - int class = 0; + uint class = 0; ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | XFS_ILOCK_RTSUM))); @@ -464,7 +466,10 @@ xfs_lock_inodes( int inodes, uint lock_mode) { - int attempts = 0, i, j, try_lock; + int attempts = 0; + uint i; + int j; + bool try_lock; struct xfs_log_item *lp; /* @@ -489,9 +494,9 @@ xfs_lock_inodes( } else if (lock_mode & XFS_MMAPLOCK_EXCL) ASSERT(!(lock_mode & XFS_ILOCK_EXCL)); - try_lock = 0; - i = 0; again: + try_lock = false; + i = 0; for (; i < inodes; i++) { ASSERT(ips[i]); @@ -506,7 +511,7 @@ again: for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = &ips[j]->i_itemp->ili_item; if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) - try_lock++; + try_lock = true; } } @@ -546,8 +551,6 @@ again: if ((attempts % 5) == 0) { delay(1); /* Don't just spin the CPU */ } - i = 0; - try_lock = 0; goto again; } } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 740ab13d1aa2..b67ab9f10cf9 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -278,12 +278,12 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) */ -#define XFS_IOLOCK_EXCL (1<<0) -#define XFS_IOLOCK_SHARED (1<<1) -#define XFS_ILOCK_EXCL (1<<2) -#define XFS_ILOCK_SHARED (1<<3) -#define XFS_MMAPLOCK_EXCL (1<<4) -#define XFS_MMAPLOCK_SHARED (1<<5) +#define XFS_IOLOCK_EXCL (1u << 0) +#define XFS_IOLOCK_SHARED (1u << 1) +#define XFS_ILOCK_EXCL (1u << 2) +#define XFS_ILOCK_SHARED (1u << 3) +#define XFS_MMAPLOCK_EXCL (1u << 4) +#define XFS_MMAPLOCK_SHARED (1u << 5) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ @@ -350,19 +350,19 @@ static 
inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) */ #define XFS_IOLOCK_SHIFT 16 #define XFS_IOLOCK_MAX_SUBCLASS 3 -#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_IOLOCK_DEP_MASK 0x000f0000u #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 #define XFS_MMAPLOCK_MAX_SUBCLASS 3 -#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000u #define XFS_ILOCK_SHIFT 24 -#define XFS_ILOCK_PARENT_VAL 5 +#define XFS_ILOCK_PARENT_VAL 5u #define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) -#define XFS_ILOCK_RTBITMAP_VAL 6 -#define XFS_ILOCK_RTSUM_VAL 7 -#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_ILOCK_RTBITMAP_VAL 6u +#define XFS_ILOCK_RTSUM_VAL 7u +#define XFS_ILOCK_DEP_MASK 0xff000000u #define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT) -- cgit From 90215d74987159fdd7a6d800256ba1d2a9b0dca8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:47:25 +1000 Subject: xfs: convert ptag flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_error.h | 20 ++++++++++---------- fs/xfs/xfs_message.c | 2 +- fs/xfs/xfs_message.h | 3 ++- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 5735d5ea87ee..5191e9145e55 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -64,16 +64,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); * XFS panic tags -- allow a call to xfs_alert_tag() be turned into * a panic by setting xfs_panic_mask in a sysctl. 
*/ -#define XFS_NO_PTAG 0 -#define XFS_PTAG_IFLUSH 0x00000001 -#define XFS_PTAG_LOGRES 0x00000002 -#define XFS_PTAG_AILDELETE 0x00000004 -#define XFS_PTAG_ERROR_REPORT 0x00000008 -#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 -#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 -#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 -#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 -#define XFS_PTAG_VERIFIER_ERROR 0x00000100 +#define XFS_NO_PTAG 0u +#define XFS_PTAG_IFLUSH (1u << 0) +#define XFS_PTAG_LOGRES (1u << 1) +#define XFS_PTAG_AILDELETE (1u << 2) +#define XFS_PTAG_ERROR_REPORT (1u << 3) +#define XFS_PTAG_SHUTDOWN_CORRUPT (1u << 4) +#define XFS_PTAG_SHUTDOWN_IOERROR (1u << 5) +#define XFS_PTAG_SHUTDOWN_LOGERROR (1u << 6) +#define XFS_PTAG_FSBLOCK_ZERO (1u << 7) +#define XFS_PTAG_VERIFIER_ERROR (1u << 8) #define XFS_PTAG_STRINGS \ { XFS_NO_PTAG, "none" }, \ diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index bc66d95c8d4c..c5084dce75cd 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -62,7 +62,7 @@ define_xfs_printk_level(xfs_debug, KERN_DEBUG); void xfs_alert_tag( const struct xfs_mount *mp, - int panic_tag, + uint32_t panic_tag, const char *fmt, ...) 
{ struct va_format vaf; diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index bb9860ec9a93..dee98e9ccc3d 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -11,7 +11,8 @@ void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); extern __printf(2, 3) void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); extern __printf(3, 4) -void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); +void xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag, + const char *fmt, ...); extern __printf(2, 3) void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); extern __printf(2, 3) -- cgit From b9f3082eee5a77d5000742859532ba4ff584354f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:47:32 +1000 Subject: xfs: convert quota options flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_quota_defs.h | 45 +++++++++++++++++++++++++++++------------- fs/xfs/xfs_trace.h | 16 --------------- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index fdfe3cc6f15c..3076cd74fcaa 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -73,29 +73,45 @@ typedef uint8_t xfs_dqtype_t; * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. 
their values can and will change between versions) */ -#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ -#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ -#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */ +#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */ +#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be * modified. */ -#define XFS_QMOPT_RES_REGBLKS 0x0010000 -#define XFS_QMOPT_RES_RTBLKS 0x0020000 -#define XFS_QMOPT_BCOUNT 0x0040000 -#define XFS_QMOPT_ICOUNT 0x0080000 -#define XFS_QMOPT_RTBCOUNT 0x0100000 -#define XFS_QMOPT_DELBCOUNT 0x0200000 -#define XFS_QMOPT_DELRTBCOUNT 0x0400000 -#define XFS_QMOPT_RES_INOS 0x0800000 +#define XFS_QMOPT_RES_REGBLKS (1u << 7) +#define XFS_QMOPT_RES_RTBLKS (1u << 8) +#define XFS_QMOPT_BCOUNT (1u << 9) +#define XFS_QMOPT_ICOUNT (1u << 10) +#define XFS_QMOPT_RTBCOUNT (1u << 11) +#define XFS_QMOPT_DELBCOUNT (1u << 12) +#define XFS_QMOPT_DELRTBCOUNT (1u << 13) +#define XFS_QMOPT_RES_INOS (1u << 14) /* * flags for dqalloc. 
*/ -#define XFS_QMOPT_INHERIT 0x1000000 +#define XFS_QMOPT_INHERIT (1u << 31) + +#define XFS_QMOPT_FLAGS \ + { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ + { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ + { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ + { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ + { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ + { XFS_QMOPT_INHERIT, "INHERIT" }, \ + { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ + { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ + { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ + { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ + { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ + { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ + { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ + { XFS_QMOPT_RES_INOS, "RES_INOS" } /* * flags to xfs_trans_mod_dquot. @@ -114,6 +130,7 @@ typedef uint8_t xfs_dqtype_t; (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, struct xfs_disk_dquot *ddq, xfs_dqid_t id); extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 989ecda904db..b88bd45da27a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1096,22 +1096,6 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done); DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before); DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after); -#define XFS_QMOPT_FLAGS \ - { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ - { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ - { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ - { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ - { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ - { XFS_QMOPT_INHERIT, "INHERIT" }, \ - { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ - { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ - { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ - { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ - { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ - { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ - { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ - { XFS_QMOPT_RES_INOS, "RES_INOS" } - TRACE_EVENT(xfs_trans_mod_dquot, 
TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp, unsigned int field, int64_t delta), -- cgit From 2eb7550d2c0dd7c383839018991dfa602790dc77 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:47:38 +1000 Subject: xfs: convert shutdown reasons to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log.h | 2 +- fs/xfs/xfs_mount.h | 11 +++++------ 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 68f74549fa22..e4cc6b7cae0f 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -512,7 +512,7 @@ xfs_fs_goingdown( void xfs_do_force_shutdown( struct xfs_mount *mp, - int flags, + uint32_t flags, char *fname, int lnnum) { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 499e15b24215..3c216140a1c4 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3829,7 +3829,7 @@ xlog_verify_iclog( bool xlog_force_shutdown( struct xlog *log, - int shutdown_flags) + uint32_t shutdown_flags) { bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index dc1b77b92fc1..3ecf891f34c4 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -140,7 +140,7 @@ void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); -bool xlog_force_shutdown(struct xlog *log, int shutdown_flags); +bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags); void xlog_use_incompat_feat(struct xlog *log); void xlog_drop_incompat_feat(struct xlog *log); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f6dc19de8322..e5629e7c5aaf 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -425,16 +425,15 @@ __XFS_IS_OPSTATE(blockgc_enabled, 
BLOCKGC_ENABLED) #define XFS_MAX_IO_LOG 30 /* 1G */ #define XFS_MIN_IO_LOG PAGE_SHIFT -#define xfs_is_shutdown(mp) xfs_is_shutdown(mp) -void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, +void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, int lnnum); #define xfs_force_shutdown(m,f) \ xfs_do_force_shutdown(m, f, __FILE__, __LINE__) -#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ -#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ -#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ -#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */ +#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */ +#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ +#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ -- cgit From c60d13ea657f69a0f90c7ba131c16e0a25598488 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Apr 2022 10:48:01 +1000 Subject: xfs: convert log ticket and iclog flags to unsigned. 5.18 w/ std=gnu11 compiled with gcc-5 wants flags stored in unsigned fields to be unsigned. 
Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_priv.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 401cdc400980..438df48a84c4 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -51,8 +51,8 @@ enum xlog_iclog_state { /* * In core log flags */ -#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ -#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ +#define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */ +#define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */ #define XLOG_ICL_STRINGS \ { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \ @@ -62,7 +62,7 @@ enum xlog_iclog_state { /* * Log ticket flags */ -#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */ +#define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */ #define XLOG_TIC_FLAGS \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } @@ -165,7 +165,7 @@ typedef struct xlog_ticket { char t_ocnt; /* original count : 1 */ char t_cnt; /* current count : 1 */ char t_clientid; /* who does this belong to; : 1 */ - char t_flags; /* properties of reservation : 1 */ + uint8_t t_flags; /* properties of reservation : 1 */ /* reservation array fields */ uint t_res_num; /* num in array : 4 */ -- cgit From 1a3385069745f10c4ca7278a25499d79934c703a Mon Sep 17 00:00:00 2001 From: Yang Xu Date: Tue, 26 Apr 2022 13:34:42 +1000 Subject: xfs: improve __xfs_set_acl Provide a proper stub for the !CONFIG_XFS_POSIX_ACL case. Also use an easy way for the xfs_get_acl stub. Suggested-by: Christian Brauner (Microsoft) Signed-off-by: Yang Xu Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner (Microsoft) Reviewed-by: Darrick J.
Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_acl.h | 8 +++++--- fs/xfs/xfs_iops.c | 2 -- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index bb6abdcb265d..263404d0bfda 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -16,11 +16,13 @@ extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); #else -static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu) +#define xfs_get_acl NULL +#define xfs_set_acl NULL +static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, + int type) { - return NULL; + return 0; } -# define xfs_set_acl NULL static inline void xfs_forget_acl(struct inode *inode, const char *name) { } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b34e8e4344a8..94313b7e9991 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -209,7 +209,6 @@ xfs_generic_create( if (unlikely(error)) goto out_cleanup_inode; -#ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) @@ -220,7 +219,6 @@ xfs_generic_create( if (error) goto out_cleanup_inode; } -#endif xfs_setup_iops(ip); -- cgit From f650df7171b882dca737ddbbeb414100b31f16af Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 26 Apr 2022 13:34:54 +1000 Subject: xfs: fix soft lockup via spinning in filestream ag selection loop The filestream AG selection loop uses pagf data to aid in AG selection, which depends on pagf initialization. If the in-core structure is not initialized, the caller invokes the AGF read path to do so and carries on. If another task enters the loop and finds a pagf init already in progress, the AGF read returns -EAGAIN and the task continues the loop. 
This does not increment the current ag index, however, which means the task spins on the current AGF buffer until unlocked. If the AGF read I/O submitted by the initial task happens to be delayed for whatever reason, this results in soft lockup warnings via the spinning task. This is reproduced by xfs/170. To avoid this problem, fix the AGF trylock failure path to properly iterate to the next AG. If a task iterates all AGs without making progress, the trylock behavior is dropped in favor of blocking locks and thus a soft lockup is no longer possible. Fixes: f48e2df8a877ca1c ("xfs: make xfs_*read_agf return EAGAIN to ALLOC_FLAG_TRYLOCK callers") Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_filestream.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 6a3ce0f6dc9e..be9bcf8a1f99 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -128,11 +128,12 @@ xfs_filestream_pick_ag( if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); if (err) { - xfs_perag_put(pag); - if (err != -EAGAIN) + if (err != -EAGAIN) { + xfs_perag_put(pag); return err; + } /* Couldn't lock the AGF, skip this AG. */ - continue; + goto next_ag; } } -- cgit From bc37e4fb5cac2925b2e286b1f1d4fc2b519f7d92 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 26 Apr 2022 13:35:26 +1000 Subject: xfs: revert "xfs: actually bump warning counts when we send warnings" This reverts commit 4b8628d57b725b32616965e66975fcdebe008fe7. XFS quota has had the concept of a "quota warning limit" since the earliest Irix implementation, but a mechanism for incrementing the warning counter was never implemented, as documented in the xfs_quota(8) man page. We do know from the historical archive that it was never incremented at runtime during quota reservation operations. 
With this commit, the warning counter quickly increments for every allocation attempt after the user has crossed a quota soft limit threshold, and this in turn transitions the user to hard quota failures, rendering soft quota thresholds and timers useless. This was reported as a regression by users. Because the intended behavior of this warning counter has never been understood or documented, and the result of this change is a regression in soft quota functionality, revert this commit to make soft quota limits and timers operable again. Fixes: 4b8628d57b72 ("xfs: actually bump warning counts when we send warnings") Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_trans_dquot.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 9ba7e6b9bed3..ebe2c227eb2f 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -603,7 +603,6 @@ xfs_dqresv_check( return QUOTA_NL_ISOFTLONGWARN; } - res->warnings++; return QUOTA_NL_ISOFTWARN; } -- cgit From c46eef34830e51ae7fe4b8371837c586448c3078 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:37:05 -0700 Subject: xfs: capture buffer ops in the xfs_buf tracepoints Record the buffer ops in the xfs_buf tracepoints so that we can monitor the alleged type of the buffer. Signed-off-by: Darrick J.
Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_trace.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e1197f9ad97e..91b916e82364 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -418,6 +418,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __field(unsigned, lockval) __field(unsigned, flags) __field(unsigned long, caller_ip) + __field(const void *, buf_ops) ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; @@ -428,9 +429,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; + __entry->buf_ops = bp->b_ops; ), TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " - "lock %d flags %s caller %pS", + "lock %d flags %s bufops %pS caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, @@ -438,6 +440,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->pincount, __entry->lockval, __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + __entry->buf_ops, (void *)__entry->caller_ip) ) -- cgit From 5b7ca8b313621907d80460bfcc1fa876d2a38488 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:37:06 -0700 Subject: xfs: simplify xfs_rmap_lookup_le call sites Most callers of xfs_rmap_lookup_le will retrieve the btree record immediately if the lookup succeeds. The overlapped version of this function (xfs_rmap_lookup_le_range) will return the record if the lookup succeeds, so make the regular version do it too. Get rid of the useless len argument, since it's not part of the lookup key. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 59 ++++++++++++++++++------------------------------ fs/xfs/libxfs/xfs_rmap.h | 4 ++-- fs/xfs/scrub/bmap.c | 24 ++++---------------- 3 files changed, 28 insertions(+), 59 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index cd322174dbff..3eea8056e7bc 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -34,18 +34,32 @@ int xfs_rmap_lookup_le( struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat) { + int get_stat = 0; + int error; + cur->bc_rec.r.rm_startblock = bno; - cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_blockcount = 0; cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + if (error || !(*stat) || !irec) + return error; + + error = xfs_rmap_get_rec(cur, irec, &get_stat); + if (error) + return error; + if (!get_stat) + return -EFSCORRUPTED; + + return 0; } /* @@ -510,7 +524,7 @@ xfs_rmap_unmap( * for the AG headers at rm_startblock == 0 created by mkfs/growfs that * will not ever be removed from the tree. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &i); if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -518,13 +532,6 @@ xfs_rmap_unmap( goto out_error; } - error = xfs_rmap_get_rec(cur, &ltrec, &i); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -786,18 +793,11 @@ xfs_rmap_map( * record for our insertion point.
This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &have_lt); if (error) goto out_error; if (have_lt) { - error = xfs_rmap_get_rec(cur, &ltrec, &have_lt); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, have_lt != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -1022,7 +1022,7 @@ xfs_rmap_convert( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -1030,13 +1030,6 @@ xfs_rmap_convert( goto done; } - error = xfs_rmap_get_rec(cur, &PREV, &i); - if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, @@ -1140,7 +1133,7 @@ xfs_rmap_convert( _RET_IP_); /* reset the cursor back to PREV */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -2677,7 +2670,7 @@ xfs_rmap_record_exists( ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || (flags & XFS_RMAP_BMBT_BLOCK)); - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec, &has_record); if (error) return error; @@ -2686,14 +2679,6 @@ xfs_rmap_record_exists( return 0; } - error = xfs_rmap_get_rec(cur, &irec, &has_record); - if (error) - return error; - if (!has_record) { - *has_rmap
= false; - return 0; - } - *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && irec.rm_startblock + irec.rm_blockcount >= bno + len); return 0; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index b718ebeda372..11ec9406a0ea 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -122,8 +122,8 @@ int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, const struct xfs_owner_info *oinfo); int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, - unsigned int flags, int *stat); + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, int *stat); diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index c357593e0a02..285995ba3947 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -133,29 +133,13 @@ xchk_bmap_get_rmap( if (info->is_shared) { error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, owner, offset, rflags, rmap, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) - return false; - goto out; + } else { + error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, + owner, offset, rflags, rmap, &has_rmap); } - - /* - * Otherwise, use the (faster) regular lookup. 
- */ - error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner, - offset, rflags, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) + if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur)) return false; - if (!has_rmap) - goto out; - error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap); - if (!xchk_should_check_xref(info->sc, &error, - &info->sc->sa.rmap_cur)) - return false; - -out: if (!has_rmap) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); -- cgit From 75d893d19c8e1b4bf4a9acd613fe5e7a80b58974 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:37:06 -0700 Subject: xfs: speed up rmap lookups by using non-overlapped lookups when possible Reverse mapping on a reflink-capable filesystem has some pretty high overhead when performing file operations. This is because the rmap records for logically and physically adjacent extents might not be adjacent in the rmap index due to data block sharing. As a result, we use expensive overlapped-interval btree search, which walks every record that overlaps with the supplied key in the hopes of finding the record. However, profiling data shows that when the index contains a record that is an exact match for a query key, the non-overlapped btree search function can find the record much faster than the overlapped version. Try the non-overlapped lookup first, which will make scrub run much faster. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 52 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 3eea8056e7bc..6f74dcda44b5 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -367,7 +367,6 @@ xfs_rmap_lookup_le_range_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -388,6 +387,7 @@ xfs_rmap_lookup_le_range( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; info.high.rm_startblock = bno; @@ -400,20 +400,44 @@ xfs_rmap_lookup_le_range( info.high.rm_blockcount = 0; *stat = 0; info.irec = irec; - info.stat = stat; - trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_lookup_le_range_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno, + bno, 0, owner, offset, flags); + + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. 
This makes + * scrub run much faster on most filesystems because bmbt records are + * usually an exact match for rmap records. If we don't find what we + * want, we fall back to the overlapped query. + */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_lookup_le_range_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_lookup_le_range_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* -- cgit From 1edf8056131aca6fe7f98873da8297e6fa279d8c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:37:06 -0700 Subject: xfs: speed up write operations by using non-overlapped lookups when possible Reverse mapping on a reflink-capable filesystem has some pretty high overhead when performing file operations. This is because the rmap records for logically and physically adjacent extents might not be adjacent in the rmap index due to data block sharing. As a result, we use expensive overlapped-interval btree search, which walks every record that overlaps with the supplied key in the hopes of finding the record. However, profiling data shows that when the index contains a record that is an exact match for a query key, the non-overlapped btree search function can find the record much faster than the overlapped version. Try the non-overlapped lookup first when we're trying to find the left neighbor rmap record for a given file mapping, which makes unwritten extent conversion and remap operations run faster if data block sharing is minimal in this part of the filesystem. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 50 ++++++++++++++++++++++++++++++++++-------------- fs/xfs/libxfs/xfs_rmap.h | 3 --- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 6f74dcda44b5..2845019d31da 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -265,7 +265,6 @@ out_bad_rec: struct xfs_find_left_neighbor_info { struct xfs_rmap_irec high; struct xfs_rmap_irec *irec; - int *stat; }; /* For each rmap given, figure out if it matches the key we want. */ @@ -290,7 +289,6 @@ xfs_rmap_find_left_neighbor_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -299,7 +297,7 @@ xfs_rmap_find_left_neighbor_helper( * return a match with the same owner and adjacent physical and logical * block ranges. */ -int +STATIC int xfs_rmap_find_left_neighbor( struct xfs_btree_cur *cur, xfs_agblock_t bno, @@ -310,6 +308,7 @@ xfs_rmap_find_left_neighbor( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; *stat = 0; @@ -327,21 +326,44 @@ xfs_rmap_find_left_neighbor( info.high.rm_flags = flags; info.high.rm_blockcount = 0; info.irec = irec; - info.stat = stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_find_left_neighbor_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. 
+ * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * extent conversion and remap operations run a bit faster if the + * physical extents aren't being shared. If we don't find what we + * want, we fall back to the overlapped query. + */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_find_left_neighbor_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* For each rmap given, figure out if it matches the key we want. */ diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 11ec9406a0ea..54741a591a17 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -184,9 +184,6 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); -int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, - uint64_t owner, uint64_t offset, unsigned int flags, - struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, struct xfs_rmap_irec *irec, int *stat); -- cgit From c47260d4ea2ac11ce607d6ac1e0ca5528f42f482 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 26 Apr 2022 15:29:54 -0700 Subject: xfs: count EFIs when deciding to ask for a continuation of a refcount update A long time ago, I added to XFS the ability to use deferred reference count operations as part of a transaction chain. This enabled us to avoid blowing out the transaction reservation when the blocks in a physical extent all had different reference counts because we could ask the deferred operation manager for a continuation, which would get us a clean transaction. The refcount code asks for a continuation when the number of refcount record updates reaches the point where we think that the transaction has logged enough full btree blocks due to refcount (and free space) btree shape changes and refcount record updates that we're in danger of overflowing the transaction. We did not previously count the EFIs logged to the refcount update transaction because the clamps on the length of a bunmap operation were sufficient to avoid overflowing the transaction reservation even in the worst case situation where every other block of the unmapped extent is shared. Unfortunately, the restrictions on bunmap length avoid failure in the worst case by imposing a maximum unmap length of ~3000 blocks, even for non-pathological cases. This seriously limits performance when freeing large extents. Therefore, track EFIs with the same counter as refcount record updates, and use that information as input into when we should ask for a continuation. This enables the next patch to drop the clumsy bunmap limitation. Depends: 27dada070d59 ("xfs: change the order in which child and parent defer ops are finished") Depends: 74f4d6a1e065 ("xfs: only relog deferred intent items if free space in the log gets low") Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 5 ++--- fs/xfs/libxfs/xfs_refcount.h | 8 ++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 327ba25e9e17..a07ebaecba73 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -960,6 +960,7 @@ xfs_refcount_adjust_extents( * Either cover the hole (increment) or * delete the range (decrement). */ + cur->bc_ag.refc.nr_ops++; if (tmp.rc_refcount) { error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -970,7 +971,6 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_ag.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, @@ -1001,11 +1001,11 @@ xfs_refcount_adjust_extents( ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &ext); + cur->bc_ag.refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) goto out_error; - cur->bc_ag.refc.nr_ops++; } else if (ext.rc_refcount == 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) @@ -1014,7 +1014,6 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_ag.refc.nr_ops++; goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9eb01edbd89d..37145637d7d8 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -67,6 +67,14 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, * log (plus any key updates) so we'll conservatively assume 32 bytes * per record. We must also leave space for btree splits on both ends * of the range and space for the CUD and a new CUI. + * + * Each EFI that we attach to the transaction is assumed to consume ~32 bytes. 
+ * This is a low estimate for an EFI tracking a single extent (16 bytes for the + * EFI header, 16 for the extent, and 12 for the xlog op header), but the + * estimate is acceptable if there's more than one extent being freed. + * In the worst case of freeing every other block during a refcount decrease + * operation, we amortize the space used for one EFI log item across 16 + * extents. */ #define XFS_REFCOUNT_ITEM_OVERHEAD 32 -- cgit From 4ed6435cc369cce722966983f6e07b872562276f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:37:15 -0700 Subject: xfs: stop artificially limiting the length of bunmap calls In commit e1a4e37cc7b6, we clamped the length of bunmapi calls on the data forks of shared files to avoid two failure scenarios: one where the extent being unmapped is so sparsely shared that we exceed the transaction reservation with the sheer number of refcount btree updates and EFI intent items; and the other where we attach so many deferred updates to the transaction that we pin the log tail and later the log head meets the tail, causing the log to livelock. We avoid triggering the first problem by tracking the number of ops in the refcount btree cursor and forcing a requeue of the refcount intent item any time we think that we might be close to overflowing. This has been baked into XFS since before the original e1a4 patch. A recent patchset fixed the second problem by changing the deferred ops code to finish all the work items created by each round of trying to complete a refcount intent item, which eliminates the long chains of deferred items (27dad); and causing long-running transactions to relog their intent log items when space in the log gets low (74f4d). 
Because this clamp affects /any/ unmapping request regardless of the sharing factors of the component blocks, it degrades the performance of all large unmapping requests -- whereas with an unshared file we can unmap millions of blocks in one go, shared files are limited to unmapping a few thousand blocks at a time, which causes the upper level code to spin in a bunmapi loop even if it wasn't needed. This also eliminates one more place where log recovery behavior can differ from online behavior, because bunmapi operations no longer need to requeue. The fstest generic/447 was created to test the old fix, and it still passes with this applied. Partial-revert-of: e1a4e37cc7b6 ("xfs: try to avoid blowing out the transaction reservation when bunmaping a shared extent") Depends: 27dada070d59 ("xfs: change the order in which child and parent defer ops are finished") Depends: 74f4d6a1e065 ("xfs: only relog deferred intent items if free space in the log gets low") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 22 +--------------------- fs/xfs/libxfs/xfs_refcount.h | 5 ----- 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 24462bdfd8e7..6833110d1bd4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5280,7 +5280,6 @@ __xfs_bunmapi( int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ - xfs_fileoff_t max_len; xfs_fileoff_t end; struct xfs_iext_cursor icur; bool done = false; @@ -5299,16 +5298,6 @@ __xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - /* - * Guesstimate how many blocks we can unmap without running the risk of - * blowing out the transaction with a mix of EFIs and reflink - * adjustments. 
- */ - if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) - max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); - else - max_len = len; - error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; @@ -5347,7 +5336,7 @@ __xfs_bunmapi( extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && - (nexts == 0 || extno < nexts) && max_len > 0) { + (nexts == 0 || extno < nexts)) { /* * Is the found extent after a hole in which end lives? * Just back up to the previous extent, if so. @@ -5381,14 +5370,6 @@ __xfs_bunmapi( if (del.br_startoff + del.br_blockcount > end + 1) del.br_blockcount = end + 1 - del.br_startoff; - /* How much can we safely unmap? */ - if (max_len < del.br_blockcount) { - del.br_startoff += del.br_blockcount - max_len; - if (!wasdel) - del.br_startblock += del.br_blockcount - max_len; - del.br_blockcount = max_len; - } - if (!isrt) goto delete; @@ -5524,7 +5505,6 @@ delete: if (error) goto error0; - max_len -= del.br_blockcount; end = del.br_startoff - 1; nodelete: /* diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 37145637d7d8..e8b322de7f3d 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -78,11 +78,6 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, */ #define XFS_REFCOUNT_ITEM_OVERHEAD 32 -static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) -{ - return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; -} - extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); union xfs_btree_rec; -- cgit From f1e6a8d72806d2d57560b4873d8aa42c420384ee Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:38:12 -0700 Subject: xfs: remove a __xfs_bunmapi call from reflink This raw call isn't necessary since we can always remove a full delalloc extent. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 1ae6d3434ad2..960917628a44 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1138,7 +1138,7 @@ xfs_reflink_remap_extent( xfs_refcount_decrease_extent(tp, &smap); qdelta -= smap.br_blockcount; } else if (smap.br_startblock == DELAYSTARTBLOCK) { - xfs_filblks_t len = smap.br_blockcount; + int done; /* * If the extent we're unmapping is a delalloc reservation, @@ -1146,10 +1146,11 @@ xfs_reflink_remap_extent( * incore state. Dropping the delalloc reservation takes care * of the quota reservation for us. */ - error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1); + error = xfs_bunmapi(NULL, ip, smap.br_startoff, + smap.br_blockcount, 0, 1, &done); if (error) goto out_cancel; - ASSERT(len == 0); + ASSERT(done); } /* -- cgit From 52d8ea4f2406c14d632a0e7f816bbb18d8c3e9ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:38:13 -0700 Subject: xfs: create shadow transaction reservations for computing minimum log size Every time someone changes the transaction reservation sizes, they introduce potential compatibility problems if the changes affect the minimum log size that we validate at mount time. If the minimum log size gets larger (which should be avoided because doing so presents a serious risk of log livelock), filesystems created with old mkfs will not mount on a newer kernel; if the minimum size shrinks, filesystems created with newer mkfs will not mount on older kernels. Therefore, enable the creation of a shadow log reservation structure where we can "undo" the effects of tweaks when computing minimum log sizes. These shadow reservations should never be used in practice, but they insulate us from perturbations in minimum log size. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_log_rlimit.c | 15 +++++++++++---- fs/xfs/xfs_trace.h | 12 ++++++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 67798ff5e14e..4d04568ab07e 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -14,6 +14,7 @@ #include "xfs_trans_space.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" +#include "xfs_trace.h" /* * Calculate the maximum length in bytes that would be required for a local @@ -46,19 +47,25 @@ xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) { + struct xfs_trans_resv resv; struct xfs_trans_res *resp; struct xfs_trans_res *end_resp; + unsigned int i; int log_space = 0; int attr_space; attr_space = xfs_log_calc_max_attrsetm_res(mp); - resp = (struct xfs_trans_res *)M_RES(mp); - end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); - for (; resp < end_resp; resp++) { + memcpy(&resv, M_RES(mp), sizeof(struct xfs_trans_resv)); + + resp = (struct xfs_trans_res *)&resv; + end_resp = (struct xfs_trans_res *)(&resv + 1); + for (i = 0; resp < end_resp; i++, resp++) { int tmp = resp->tr_logcount > 1 ? 
resp->tr_logres * resp->tr_logcount : resp->tr_logres; + + trace_xfs_trans_resv_calc_minlogsize(mp, i, resp); if (log_space < tmp) { log_space = tmp; *max_resp = *resp; /* struct copy */ @@ -66,7 +73,7 @@ xfs_log_get_max_trans_res( } if (attr_space > log_space) { - *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + *max_resp = resv.tr_attrsetm; /* struct copy */ max_resp->tr_logres = attr_space; } } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 91b916e82364..9110bb5dd866 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3500,7 +3500,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping); -TRACE_EVENT(xfs_trans_resv_calc, +DECLARE_EVENT_CLASS(xfs_trans_resv_class, TP_PROTO(struct xfs_mount *mp, unsigned int type, struct xfs_trans_res *res), TP_ARGS(mp, type, res), @@ -3524,7 +3524,15 @@ TRACE_EVENT(xfs_trans_resv_calc, __entry->logres, __entry->logcount, __entry->logflags) -); +) + +#define DEFINE_TRANS_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_trans_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, unsigned int type, \ + struct xfs_trans_res *res), \ + TP_ARGS(mp, type, res)) +DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc); +DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize); DECLARE_EVENT_CLASS(xfs_trans_class, TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), -- cgit From 918247ce541995dba05391cf14d6061cf0844866 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:38:13 -0700 Subject: xfs: report "max_resp" used for min log size computation Move the tracepoint that computes the size of the transaction used to compute the minimum log size into xfs_log_get_max_trans_res so that we only have to compute this stuff once. Leave xfs_log_get_max_trans_res as a non-static function so that xfs_db can call it to report the results of the userspace computation of the same value to diagnose mkfs/kernel misinteractions. 
Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_log_rlimit.c | 1 + fs/xfs/xfs_trace.h | 19 +++++++++++++++++++ fs/xfs/xfs_trans.c | 3 --- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 4d04568ab07e..1db27c3a1d16 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -76,6 +76,7 @@ xfs_log_get_max_trans_res( *max_resp = resv.tr_attrsetm; /* struct copy */ max_resp->tr_logres = attr_space; } + trace_xfs_log_get_max_trans_res(mp, max_resp); } /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 9110bb5dd866..a690987cc5f0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3534,6 +3534,25 @@ DEFINE_EVENT(xfs_trans_resv_class, name, \ DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc); DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize); +TRACE_EVENT(xfs_log_get_max_trans_res, + TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res), + TP_ARGS(mp, res), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint, logres) + __field(int, logcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->logres = res->tr_logres; + __entry->logcount = res->tr_logcount; + ), + TP_printk("dev %d:%d logres %u logcount %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->logres, + __entry->logcount) +); + DECLARE_EVENT_CLASS(xfs_trans_class, TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), TP_ARGS(tp, caller_ip), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 836ce2beac53..82cf0189c0db 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -32,7 +32,6 @@ static void xfs_trans_trace_reservations( struct xfs_mount *mp) { - struct xfs_trans_res resv; struct xfs_trans_res *res; struct xfs_trans_res *end_res; int i; @@ -41,8 +40,6 @@ xfs_trans_trace_reservations( end_res = (struct xfs_trans_res *)(M_RES(mp) + 1); for (i = 0; res < end_res; 
i++, res++) trace_xfs_trans_resv_calc(mp, i, res); - xfs_log_get_max_trans_res(mp, &resv); - trace_xfs_trans_resv_calc(mp, -1, &resv); } #else # define xfs_trans_trace_reservations(mp) -- cgit From 4ecf9e7c69edcb8f5b98df471dd026419b881d2b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:38:14 -0700 Subject: xfs: reduce the absurdly large log operation count Back in the early days of reflink and rmap development I set the transaction reservation sizes to be overly generous for rmap+reflink filesystems, and a little under-generous for rmap-only filesystems. Since we don't need *eight* transaction rolls to handle three new log intent items, decrease the logcounts to what we actually need, and amend the shadow reservation computation function to reflect what we used to do so that the minimum log size doesn't change. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_log_rlimit.c | 51 ++++++++++++++++++++++++++++++++++++++++-- fs/xfs/libxfs/xfs_trans_resv.c | 46 ++++++++++++++++--------------------- fs/xfs/libxfs/xfs_trans_resv.h | 10 +++++++-- 3 files changed, 76 insertions(+), 31 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 1db27c3a1d16..60fff8c6716f 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -37,6 +37,53 @@ xfs_log_calc_max_attrsetm_res( M_RES(mp)->tr_attrsetrt.tr_logres * nblks; } +/* + * Compute an alternate set of log reservation sizes for use exclusively with + * minimum log size calculations. + */ +static void +xfs_log_calc_trans_resv_for_minlogblocks( + struct xfs_mount *mp, + struct xfs_trans_resv *resv) +{ + unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; + + /* + * In the early days of rmap+reflink, we always set the rmap maxlevels + * to 9 even if the AG was small enough that it would never grow to + * that height. 
Transaction reservation sizes influence the minimum + * log size calculation, which influences the size of the log that mkfs + * creates. Use the old value here to ensure that newly formatted + * small filesystems will mount on older kernels. + */ + if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) + mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + + xfs_trans_resv_calc(mp, resv); + + if (xfs_has_reflink(mp)) { + /* + * In the early days of reflink, typical log operation counts + * were greatly overestimated. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + resv->tr_itruncate.tr_logcount = + XFS_ITRUNCATE_LOG_COUNT_REFLINK; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + } else if (xfs_has_rmapbt(mp)) { + /* + * In the early days of non-reflink rmap, the impact of rmapbt + * updates on log counts were not taken into account at all. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + } + + /* Put everything back the way it was. This goes at the end. 
*/ + mp->m_rmap_maxlevels = rmap_maxlevels; +} + /* * Iterate over the log space reservation table to figure out and return * the maximum one in terms of the pre-calculated values which were done @@ -47,7 +94,7 @@ xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) { - struct xfs_trans_resv resv; + struct xfs_trans_resv resv = {}; struct xfs_trans_res *resp; struct xfs_trans_res *end_resp; unsigned int i; @@ -56,7 +103,7 @@ xfs_log_get_max_trans_res( attr_space = xfs_log_calc_max_attrsetm_res(mp); - memcpy(&resv, M_RES(mp), sizeof(struct xfs_trans_resv)); + xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv); resp = (struct xfs_trans_res *)&resv; end_resp = (struct xfs_trans_res *)(&resv + 1); diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 8e1d09e8cc9a..60be82cd491b 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -815,36 +815,18 @@ xfs_trans_resv_calc( struct xfs_mount *mp, struct xfs_trans_resv *resp) { - unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; - - /* - * In the early days of rmap+reflink, we always set the rmap maxlevels - * to 9 even if the AG was small enough that it would never grow to - * that height. Transaction reservation sizes influence the minimum - * log size calculation, which influences the size of the log that mkfs - * creates. Use the old value here to ensure that newly formatted - * small filesystems will mount on older kernels. - */ - if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) - mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + int logcount_adj = 0; /* * The following transactions are logged in physical format and * require a permanent reservation on space. 
*/ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_itruncate.tr_logcount = - XFS_ITRUNCATE_LOG_COUNT_REFLINK; - else - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -901,10 +883,7 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; /* @@ -931,6 +910,19 @@ xfs_trans_resv_calc( resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); - /* Put everything back the way it was. This goes at the end. */ - mp->m_rmap_maxlevels = rmap_maxlevels; + /* + * Add one logcount for BUI items that appear with rmap or reflink, + * one logcount for refcount intent items, and one logcount for rmap + * intent items. 
+ */ + if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp)) + logcount_adj++; + if (xfs_has_reflink(mp)) + logcount_adj++; + if (xfs_has_rmapbt(mp)) + logcount_adj++; + + resp->tr_itruncate.tr_logcount += logcount_adj; + resp->tr_write.tr_logcount += logcount_adj; + resp->tr_qm_dqalloc.tr_logcount += logcount_adj; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index fc4e9b369a3a..fa330e646dc5 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -73,7 +73,6 @@ struct xfs_trans_resv { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 -#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_CREATE_TMPFILE_LOG_COUNT 2 @@ -83,12 +82,19 @@ struct xfs_trans_resv { #define XFS_LINK_LOG_COUNT 2 #define XFS_RENAME_LOG_COUNT 2 #define XFS_WRITE_LOG_COUNT 2 -#define XFS_WRITE_LOG_COUNT_REFLINK 8 #define XFS_ADDAFORK_LOG_COUNT 2 #define XFS_ATTRINVAL_LOG_COUNT 1 #define XFS_ATTRSET_LOG_COUNT 3 #define XFS_ATTRRM_LOG_COUNT 3 +/* + * Original log operation counts were overestimated in the early days of + * reflink. These are retained here purely for minimum log size calculations + * and must not be used for runtime reservations. + */ +#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 +#define XFS_WRITE_LOG_COUNT_REFLINK 8 + void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); -- cgit From b037c4eed2df4568a7702cd512d26625962f95b9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:38:14 -0700 Subject: xfs: reduce transaction reservations with reflink Prior to the introduction of deferred refcount operations, reflink would try to cram refcount btree updates into the same transaction as an allocation or a free event. 
Mainline XFS has never actually done that, but we never refactored the transaction reservations to reflect that we now do all refcount updates in separate transactions. Fix this to reduce the transaction reservation size even farther, so that between this patch and the previous one, we reduce the tr_write and tr_itruncate sizes by 66%. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_log_rlimit.c | 12 ++++ fs/xfs/libxfs/xfs_refcount.c | 9 ++- fs/xfs/libxfs/xfs_trans_resv.c | 130 ++++++++++++++++++++++++++++++++++++----- fs/xfs/libxfs/xfs_trans_resv.h | 4 ++ 4 files changed, 138 insertions(+), 17 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 60fff8c6716f..9975b93a7412 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -80,6 +80,18 @@ xfs_log_calc_trans_resv_for_minlogblocks( resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; } + /* + * In the early days of reflink, we did not use deferred refcount + * update log items, so log reservations must be recomputed using the + * old calculations. + */ + resv->tr_write.tr_logres = + xfs_calc_write_reservation_minlogsize(mp); + resv->tr_itruncate.tr_logres = + xfs_calc_itruncate_reservation_minlogsize(mp); + resv->tr_qm_dqalloc.tr_logres = + xfs_calc_qm_dqalloc_reservation_minlogsize(mp); + /* Put everything back the way it was. This goes at the end. */ mp->m_rmap_maxlevels = rmap_maxlevels; } diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index a07ebaecba73..e53544d52ee2 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -886,8 +886,13 @@ xfs_refcount_still_have_space( { unsigned long overhead; - overhead = cur->bc_ag.refc.shape_changes * - xfs_allocfree_log_count(cur->bc_mp, 1); + /* + * Worst case estimate: full splits of the free space and rmap btrees + * to handle each of the shape changes to the refcount btree. 
+ */ + overhead = xfs_allocfree_log_count(cur->bc_mp, + cur->bc_ag.refc.shape_changes); + overhead += cur->bc_mp->m_refc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 60be82cd491b..ab688929d884 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -56,8 +56,7 @@ xfs_calc_buf_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating an extent. In classic XFS there were two trees that will be * modified (bnobt + cntbt). With rmap enabled, there are three trees - * (rmapbt). With reflink, there are four trees (refcountbt). The number of - * blocks reserved is based on the formula: + * (rmapbt). The number of blocks reserved is based on the formula: * * num trees * ((2 blocks/level * max depth) - 1) * @@ -73,12 +72,23 @@ xfs_allocfree_log_count( blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1); if (xfs_has_rmapbt(mp)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); - if (xfs_has_reflink(mp)) - blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; } +/* + * Per-extent log reservation for refcount btree changes. These are never done + * in the same transaction as an allocation or a free, so we compute them + * separately. + */ +static unsigned int +xfs_refcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_refc_maxlevels - 1); +} + /* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into @@ -233,6 +243,28 @@ xfs_rtalloc_log_count( * register overflow from temporaries in the calculations. */ +/* + * Compute the log reservation required to handle the refcount update + * transaction. Refcount updates are always done via deferred log items. 
+ * + * This is calculated as: + * Data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +static unsigned int +xfs_calc_refcountbt_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); +} /* * In a write transaction we can allocate a maximum of 2 @@ -255,12 +287,14 @@ xfs_rtalloc_log_count( * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). */ STATIC uint xfs_calc_write_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + @@ -282,7 +316,36 @@ xfs_calc_write_reservation( t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * two refcountbt splits for each transaction. The codebase runs + * refcountbt updates in separate transactions now, so to compute the + * minimum log size, add the refcountbtree splits back to t1 and t3 and + * do not account them separately as t4. Reflink did not support + * realtime when the reservations were established, so no adjustment to + * t2 is needed. 
+ */ + if (for_minlogsize) { + unsigned int adj = 0; + + if (xfs_has_reflink(mp)) + adj = xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 2), + blksz); + t1 += adj; + t3 += adj; + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 1); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_write_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_write_reservation(mp, true); } /* @@ -304,12 +367,14 @@ xfs_calc_write_reservation( * the realtime summary: 2 exts * 1 block * worst case split in allocation btrees per extent assuming 2 extents: * 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). */ STATIC uint xfs_calc_itruncate_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + @@ -326,7 +391,33 @@ xfs_calc_itruncate_reservation( t3 = 0; } - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * four refcountbt splits in the same transaction as bnobt/cntbt + * updates. The codebase runs refcountbt updates in separate + * transactions now, so to compute the minimum log size, add the + * refcount btree splits back here and do not compute them separately + * as t4. Reflink did not support realtime when the reservations were + * established, so do not adjust t3. 
+ */ + if (for_minlogsize) { + if (xfs_has_reflink(mp)) + t2 += xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 4), + blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 2); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_itruncate_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_itruncate_reservation(mp, true); } /* @@ -792,13 +883,21 @@ xfs_calc_qm_setqlim_reservation(void) */ STATIC uint xfs_calc_qm_dqalloc_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - return xfs_calc_write_reservation(mp) + + return xfs_calc_write_reservation(mp, for_minlogsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); } +unsigned int +xfs_calc_qm_dqalloc_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_qm_dqalloc_reservation(mp, true); +} + /* * Syncing the incore super block changes to disk. * the super block to reflect the changes: sector size @@ -821,11 +920,11 @@ xfs_trans_resv_calc( * The following transactions are logged in physical format and * require a permanent reservation on space. 
*/ - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; @@ -882,7 +981,8 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp, + false); resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index fa330e646dc5..22b99042127a 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -98,4 +98,8 @@ struct xfs_trans_resv { void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); +unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); + #endif /* __XFS_TRANS_RESV_H__ */ -- cgit From df2fd88f8ac77f75a603d9fa5015225cc6c30edb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:38:15 -0700 Subject: xfs: rewrite xfs_reflink_end_cow to use intents Currently, the code that performs CoW remapping after a write has this odd behavior where it walks /backwards/ through the data fork to remap extents in reverse order. 
Earlier, we rewrote the reflink remap function to use deferred bmap log items instead of trying to cram as much into the first transaction that we could. Now do the same for the CoW remap code. There doesn't seem to be any performance impact; we're just making better use of code that we added for the benefit of reflink. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_reflink.c | 88 +++++++++++++++++++++++++++++++++------------------- fs/xfs/xfs_trace.h | 3 +- 2 files changed, 58 insertions(+), 33 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 960917628a44..e7a7c00d93be 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -586,21 +586,21 @@ out: STATIC int xfs_reflink_end_cow_extent( struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - xfs_fileoff_t *end_fsb) + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) { - struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got, del, data; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - xfs_filblks_t rlen; unsigned int resblks; + int nmaps; int error; /* No COW extents? That's easy! */ if (ifp->if_bytes == 0) { - *end_fsb = offset_fsb; + *offset_fsb = end_fsb; return 0; } @@ -631,42 +631,66 @@ xfs_reflink_end_cow_extent( * left by the time I/O completes for the loser of the race. In that * case we are done. */ - if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) || - got.br_startoff + got.br_blockcount <= offset_fsb) { - *end_fsb = offset_fsb; + if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) || + got.br_startoff >= end_fsb) { + *offset_fsb = end_fsb; goto out_cancel; } - /* - * Structure copy @got into @del, then trim @del to the range that we - * were asked to remap. We preserve @got for the eventual CoW fork - * deletion; from now on @del represents the mapping that we're - * actually remapping. 
- */ - del = got; - xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb); - - ASSERT(del.br_blockcount > 0); - /* * Only remap real extents that contain data. With AIO, speculative * preallocations can leak into the range we are called upon, and we - * need to skip them. + * need to skip them. Preserve @got for the eventual CoW fork + * deletion; from now on @del represents the mapping that we're + * actually remapping. */ - if (!xfs_bmap_is_written_extent(&got)) { - *end_fsb = del.br_startoff; - goto out_cancel; + while (!xfs_bmap_is_written_extent(&got)) { + if (!xfs_iext_next_extent(ifp, &icur, &got) || + got.br_startoff >= end_fsb) { + *offset_fsb = end_fsb; + goto out_cancel; + } } + del = got; - /* Unmap the old blocks in the data fork. */ - rlen = del.br_blockcount; - error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1); + /* Grab the corresponding mapping in the data fork. */ + nmaps = 1; + error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, + &nmaps, 0); if (error) goto out_cancel; - /* Trim the extent to whatever got unmapped. */ - xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen); - trace_xfs_reflink_cow_remap(ip, &del); + /* We can only remap the smaller of the two extent sizes. */ + data.br_blockcount = min(data.br_blockcount, del.br_blockcount); + del.br_blockcount = data.br_blockcount; + + trace_xfs_reflink_cow_remap_from(ip, &del); + trace_xfs_reflink_cow_remap_to(ip, &data); + + if (xfs_bmap_is_real_extent(&data)) { + /* + * If the extent we're remapping is backed by storage (written + * or not), unmap the extent and drop its refcount. 
+ */ + xfs_bmap_unmap_extent(tp, ip, &data); + xfs_refcount_decrease_extent(tp, &data); + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, + -data.br_blockcount); + } else if (data.br_startblock == DELAYSTARTBLOCK) { + int done; + + /* + * If the extent we're remapping is a delalloc reservation, + * we can use the regular bunmapi function to release the + * incore state. Dropping the delalloc reservation takes care + * of the quota reservation for us. + */ + error = xfs_bunmapi(NULL, ip, data.br_startoff, + data.br_blockcount, 0, 1, &done); + if (error) + goto out_cancel; + ASSERT(done); + } /* Free the CoW orphan record. */ xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); @@ -687,7 +711,7 @@ xfs_reflink_end_cow_extent( return error; /* Update the caller about how much progress we made. */ - *end_fsb = del.br_startoff; + *offset_fsb = del.br_startoff + del.br_blockcount; return 0; out_cancel: @@ -715,7 +739,7 @@ xfs_reflink_end_cow( end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); /* - * Walk backwards until we're out of the I/O range. The loop function + * Walk forwards until we've remapped the I/O range. The loop function * repeatedly cycles the ILOCK to allocate one transaction per remapped * extent. * @@ -747,7 +771,7 @@ xfs_reflink_end_cow( * blocks will be remapped. 
*/ while (end_fsb > offset_fsb && !error) - error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb); + error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb); if (error) trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a690987cc5f0..378f9dd1f66f 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3405,7 +3405,8 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); -- cgit From 6ed7e509d2304519f4f6741670f512a55e9e80fe Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Apr 2022 18:38:24 -0700 Subject: xfs: rename xfs_*alloc*_log_count to _block_count These functions return the maximum number of blocks that could be logged in a particular transaction. "log count" is confusing since there's a separate concept of a log (operation) count in the reservation code, so let's change it to "block count" to be less confusing. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 2 +- fs/xfs/libxfs/xfs_trans_resv.c | 38 +++++++++++++++++++------------------- fs/xfs/libxfs/xfs_trans_resv.h | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index e53544d52ee2..97e9e6020596 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -890,7 +890,7 @@ xfs_refcount_still_have_space( * Worst case estimate: full splits of the free space and rmap btrees * to handle each of the shape changes to the refcount btree. 
*/ - overhead = xfs_allocfree_log_count(cur->bc_mp, + overhead = xfs_allocfree_block_count(cur->bc_mp, cur->bc_ag.refc.shape_changes); overhead += cur->bc_mp->m_refc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index ab688929d884..e9913c2c5a24 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -63,7 +63,7 @@ xfs_calc_buf_res( * Keep in mind that max depth is calculated separately for each type of tree. */ uint -xfs_allocfree_log_count( +xfs_allocfree_block_count( struct xfs_mount *mp, uint num_ops) { @@ -146,7 +146,7 @@ xfs_calc_inobt_res( { return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -193,7 +193,7 @@ xfs_calc_inode_chunk_res( { uint res, size = 0; - res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ @@ -213,7 +213,7 @@ xfs_calc_inode_chunk_res( * extents, as well as the realtime summary block. 
*/ static unsigned int -xfs_rtalloc_log_count( +xfs_rtalloc_block_count( struct xfs_mount *mp, unsigned int num_ops) { @@ -300,21 +300,21 @@ xfs_calc_write_reservation( t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); if (xfs_has_realtime(mp)) { t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz); } else { t2 = 0; } t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); /* * In the early days of reflink, we included enough reservation to log @@ -381,12 +381,12 @@ xfs_calc_itruncate_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); if (xfs_has_realtime(mp)) { t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); } else { t3 = 0; } @@ -441,7 +441,7 @@ xfs_calc_rename_reservation( xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), 
XFS_FSB_TO_B(mp, 1)))); } @@ -481,7 +481,7 @@ xfs_calc_link_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)))); } @@ -519,7 +519,7 @@ xfs_calc_remove_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -664,7 +664,7 @@ xfs_calc_growdata_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -686,7 +686,7 @@ xfs_calc_growrtalloc_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -762,7 +762,7 @@ xfs_calc_addafork_reservation( xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -785,7 +785,7 @@ xfs_calc_attrinval_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), XFS_FSB_TO_B(mp, 1)))); } @@ -852,7 +852,7 @@ xfs_calc_attrrm_reservation( XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + 
xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 22b99042127a..0554b9d775d2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -96,7 +96,7 @@ struct xfs_trans_resv { #define XFS_WRITE_LOG_COUNT_REFLINK 8 void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); -uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); +uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); -- cgit From cb512c921639613ce03f87e62c5e93ed9fe8c84d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 11:44:55 +1000 Subject: xfs: zero inode fork buffer at allocation When we first allocate or resize an inline inode fork, we round up the allocation to 4-byte alignment to meet journal alignment constraints. We don't clear the unused bytes, so we can copy up to three uninitialised bytes into the journal. Zero those bytes so we only ever copy zeros into the journal. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_fork.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9aee4a1e2fe9..a15ff38c3d41 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -50,8 +50,13 @@ xfs_init_local_fork( mem_size++; if (size) { + /* + * As we round up the allocation here, we need to ensure the + * bytes we don't copy data into are zeroed because the log + * vectors still copy them into the journal.
+ */ real_size = roundup(mem_size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); + ifp->if_u1.if_data = kmem_zalloc(real_size, KM_NOFS); memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; @@ -500,10 +505,11 @@ xfs_idata_realloc( /* * For inline data, the underlying buffer must be a multiple of 4 bytes * in size so that it can be logged and stay on word boundaries. - * We enforce that here. + * We enforce that here, and use __GFP_ZERO to ensure that size + * extensions always zero the unused roundup area. */ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), - GFP_NOFS | __GFP_NOFAIL); + GFP_NOFS | __GFP_NOFAIL | __GFP_ZERO); ifp->if_bytes = new_size; } -- cgit From c230a4a85bcdbfc1a7415deec6caf04e8fca1301 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 11:45:11 +1000 Subject: xfs: fix potential log item leak Ever since we added shadow format buffers to the log items, log items need to handle the item being released with shadow buffers attached. Due to the fact this requirement was added at the same time we added new rmap/reflink intents, we missed the cleanup of those items. In theory, this means shadow buffers can be leaked in a very small window when a shutdown is initiated. Testing with KASAN shows this leak does not happen in practice - we haven't identified a single leak in several years of shutdown testing since ~v4.8 kernels. However, the intent whiteout cleanup mechanism leaves every cancelled intent in exactly the same state as this tiny race window creates and so if intents don't clean up shadow buffers on final release we will leak the shadow buffer for just about every intent we create. Hence we start with this patch to close this condition off and ensure that when whiteouts start to be used we don't leak lots of memory. Signed-off-by: Dave Chinner Reviewed-by: Darrick J.
Wong Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_item.c | 2 ++ fs/xfs/xfs_icreate_item.c | 1 + fs/xfs/xfs_refcount_item.c | 2 ++ fs/xfs/xfs_rmap_item.c | 2 ++ 4 files changed, 7 insertions(+) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 593ac29cffc7..2c8b686e2a11 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -39,6 +39,7 @@ STATIC void xfs_bui_item_free( struct xfs_bui_log_item *buip) { + kmem_free(buip->bui_item.li_lv_shadow); kmem_cache_free(xfs_bui_cache, buip); } @@ -198,6 +199,7 @@ xfs_bud_item_release( struct xfs_bud_log_item *budp = BUD_ITEM(lip); xfs_bui_release(budp->bud_buip); + kmem_free(budp->bud_item.li_lv_shadow); kmem_cache_free(xfs_bud_cache, budp); } diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 508e184e3b8f..b05314d48176 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -63,6 +63,7 @@ STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { + kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow); kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip)); } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 0d868c93144d..10474fe389e1 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -35,6 +35,7 @@ STATIC void xfs_cui_item_free( struct xfs_cui_log_item *cuip) { + kmem_free(cuip->cui_item.li_lv_shadow); if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) kmem_free(cuip); else @@ -204,6 +205,7 @@ xfs_cud_item_release( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); xfs_cui_release(cudp->cud_cuip); + kmem_free(cudp->cud_item.li_lv_shadow); kmem_cache_free(xfs_cud_cache, cudp); } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index a22b2d19ef91..6c0b56ebdbe1 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -35,6 +35,7 @@ STATIC void xfs_rui_item_free( struct xfs_rui_log_item *ruip) { + kmem_free(ruip->rui_item.li_lv_shadow); if 
(ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) kmem_free(ruip); else @@ -227,6 +228,7 @@ xfs_rud_item_release( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); xfs_rui_release(rudp->rud_ruip); + kmem_free(rudp->rud_item.li_lv_shadow); kmem_cache_free(xfs_rud_cache, rudp); } -- cgit From b2c28035cea290edbcec697504e5b7a4b1e023e7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 11:45:50 +1000 Subject: xfs: hide log iovec alignment constraints Callers currently have to round up the size of buffers to match the alignment constraints of log iovecs and xlog_write(). They should not need to know this detail, so introduce a new function to calculate the iovec length (for use in ->iop_size implementations). Also modify xlog_finish_iovec() to round up the length to the correct alignment so the callers don't need to do this, either. Convert the only user - inode forks - of this alignment rounding to use the new interface. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_fork.c | 20 ++++---------------- fs/xfs/xfs_inode_item.c | 25 +++++++----------------- fs/xfs/xfs_inode_item_recover.c | 4 ++-- fs/xfs/xfs_log.h | 42 ++++++++++++++++++++++++++++++++++++++--- 4 files changed, 52 insertions(+), 39 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index a15ff38c3d41..1a4cdf550f6d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -36,7 +36,7 @@ xfs_init_local_fork( int64_t size) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int mem_size = size, real_size = 0; + int mem_size = size; bool zero_terminate; /* @@ -50,13 +50,7 @@ xfs_init_local_fork( mem_size++; if (size) { - /* - * As we round up the allocation here, we need to ensure the - * bytes we don't copy data into are zeroed because the log - * vectors still copy them into the journal.
- */ - real_size = roundup(mem_size, 4); - ifp->if_u1.if_data = kmem_zalloc(real_size, KM_NOFS); + ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; @@ -502,14 +496,8 @@ xfs_idata_realloc( return; } - /* - * For inline data, the underlying buffer must be a multiple of 4 bytes - * in size so that it can be logged and stay on word boundaries. - * We enforce that here, and use __GFP_ZERO to ensure that size - * extensions always zero the unused roundup area. - */ - ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), - GFP_NOFS | __GFP_NOFAIL | __GFP_ZERO); + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, + GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 00733a18ccdc..721def0639fd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -71,7 +71,7 @@ xfs_inode_item_data_fork_size( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - *nbytes += roundup(ip->i_df.if_bytes, 4); + *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes); *nvecs += 1; } break; @@ -112,7 +112,7 @@ xfs_inode_item_attr_fork_size( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - *nbytes += roundup(ip->i_afp->if_bytes, 4); + *nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes); *nvecs += 1; } break; @@ -204,17 +204,12 @@ xfs_inode_item_format_data_fork( ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - /* - * Round i_bytes up to a word boundary. - * The underlying memory is guaranteed - * to be there by xfs_idata_realloc(). 
- */ - data_bytes = roundup(ip->i_df.if_bytes, 4); ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, - ip->i_df.if_u1.if_data, data_bytes); - ilf->ilf_dsize = (unsigned)data_bytes; + ip->i_df.if_u1.if_data, + ip->i_df.if_bytes); + ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DDATA; @@ -288,17 +283,11 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - /* - * Round i_bytes up to a word boundary. - * The underlying memory is guaranteed - * to be there by xfs_idata_realloc(). - */ - data_bytes = roundup(ip->i_afp->if_bytes, 4); ASSERT(ip->i_afp->if_u1.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, ip->i_afp->if_u1.if_data, - data_bytes); - ilf->ilf_asize = (unsigned)data_bytes; + ip->i_afp->if_bytes); + ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 6d44f5fd6d7e..d28ffaebd067 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -462,7 +462,7 @@ xlog_recover_inode_commit_pass2( ASSERT(in_f->ilf_size <= 4); ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); ASSERT(!(fields & XFS_ILOG_DFORK) || - (len == in_f->ilf_dsize)); + (len == xlog_calc_iovec_len(in_f->ilf_dsize))); switch (fields & XFS_ILOG_DFORK) { case XFS_ILOG_DDATA: @@ -497,7 +497,7 @@ xlog_recover_inode_commit_pass2( } len = item->ri_buf[attr_index].i_len; src = item->ri_buf[attr_index].i_addr; - ASSERT(len == in_f->ilf_asize); + ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize)); switch (in_f->ilf_fields & XFS_ILOG_AFORK) { case XFS_ILOG_ADATA: diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 8dafe8f771c7..3a4f6a4e4eb7 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -21,23 +21,59 @@ struct xfs_log_vec { 
#define XFS_LOG_VEC_ORDERED (-1) +/* + * Calculate the log iovec length for a given user buffer length. Intended to be + * used by ->iop_size implementations when sizing buffers of arbitrary + * alignments. + */ +static inline int +xlog_calc_iovec_len(int len) +{ + return roundup(len, sizeof(uint32_t)); +} + void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, uint type); static inline void -xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, + int data_len) { struct xlog_op_header *oph = vec->i_addr; - - /* opheader tracks payload length, logvec tracks region length */ + int len; + + /* + * Always round up the length to the correct alignment so callers don't + * need to know anything about this log vec layout requirement. This + * means we have to zero the area the data to be written does not cover. + * This is complicated by fact the payload region is offset into the + * logvec region by the opheader that tracks the payload. + */ + len = xlog_calc_iovec_len(data_len); + if (len - data_len != 0) { + char *buf = vec->i_addr + sizeof(struct xlog_op_header); + + memset(buf + data_len, 0, len - data_len); + } + + /* + * The opheader tracks aligned payload length, whilst the logvec tracks + * the overall region length. + */ oph->oh_len = cpu_to_be32(len); len += sizeof(struct xlog_op_header); lv->lv_buf_len += len; lv->lv_bytes += len; vec->i_len = len; + + /* Catch buffer overruns */ + ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size); } +/* + * Copy the amount of data requested by the caller into a new log iovec. 
+ */ static inline void * xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, uint type, void *data, int len) -- cgit From 5ddd658ea878f8dbae5ec33dba6cfdabb5056916 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 11:46:00 +1000 Subject: xfs: don't commit the first deferred transaction without intents If the first operation in a string of defer ops has no intents, then there is no reason to commit it before running the first call to xfs_defer_finish_one(). This allows the defer ops to be used effectively for non-intent based operations without requiring an unnecessary extra transaction commit when first called. This fixes a regression in per-attribute modification transaction count when delayed attributes are not being used. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_defer.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 0805ade2d300..1aa32bfdf0cc 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -186,7 +186,7 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, }; -static void +static bool xfs_defer_create_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp, @@ -197,6 +197,7 @@ xfs_defer_create_intent( if (!dfp->dfp_intent) dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + return dfp->dfp_intent != NULL; } /* @@ -204,16 +205,18 @@ xfs_defer_create_intent( * associated extents, then add the entire intake list to the end of * the pending list. 
*/ -STATIC void +static bool xfs_defer_create_intents( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; + bool ret = false; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { trace_xfs_defer_create_intent(tp->t_mountp, dfp); - xfs_defer_create_intent(tp, dfp, true); + ret |= xfs_defer_create_intent(tp, dfp, true); } + return ret; } /* Abort all the intents that were committed. */ @@ -487,7 +490,7 @@ int xfs_defer_finish_noroll( struct xfs_trans **tp) { - struct xfs_defer_pending *dfp; + struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); @@ -506,17 +509,20 @@ xfs_defer_finish_noroll( * of time that any one intent item can stick around in memory, * pinning the log tail. */ - xfs_defer_create_intents(*tp); + bool has_intents = xfs_defer_create_intents(*tp); + list_splice_init(&(*tp)->t_dfops, &dop_pending); - error = xfs_defer_trans_roll(tp); - if (error) - goto out_shutdown; + if (has_intents || dfp) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; - /* Possibly relog intent items to keep the log moving. */ - error = xfs_defer_relog(tp, &dop_pending); - if (error) - goto out_shutdown; + /* Relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; + } dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); -- cgit From f5b81200b6c166f78b73b3e2ca3e8f0c34c9daaf Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 11:46:09 +1000 Subject: xfs: add log item flags to indicate intents We currently have a couple of helper functions that try to infer whether the log item is an intent or intent done item from the combinations of operations it supports. This is incredibly fragile and not very efficient as it requires checking specific combinations of ops. We need to be able to identify intent and intent done items quickly and easily in upcoming patches, so simply add intent and intent done type flags to the log item ops flags. 
These are static flags to begin with, so intent items should have been typed like this from the start. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_item.c | 4 +++- fs/xfs/xfs_extfree_item.c | 4 +++- fs/xfs/xfs_refcount_item.c | 4 +++- fs/xfs/xfs_rmap_item.c | 4 +++- fs/xfs/xfs_trans.h | 25 +++++++++++++------------ 5 files changed, 25 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 2c8b686e2a11..0e0aae83308c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -204,7 +204,8 @@ xfs_bud_item_release( } static const struct xfs_item_ops xfs_bud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_bud_item_size, .iop_format = xfs_bud_item_format, .iop_release = xfs_bud_item_release, @@ -588,6 +589,7 @@ xfs_bui_item_relog( } static const struct xfs_item_ops xfs_bui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, .iop_unpin = xfs_bui_item_unpin, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 0e50f2c9348e..21a159f9d8c5 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -307,7 +307,8 @@ xfs_efd_item_release( } static const struct xfs_item_ops xfs_efd_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_release = xfs_efd_item_release, @@ -688,6 +689,7 @@ xfs_efi_item_relog( } static const struct xfs_item_ops xfs_efi_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_unpin = xfs_efi_item_unpin, diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 10474fe389e1..71225d094e03 100644 --- 
a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -210,7 +210,8 @@ xfs_cud_item_release( } static const struct xfs_item_ops xfs_cud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_cud_item_size, .iop_format = xfs_cud_item_format, .iop_release = xfs_cud_item_release, @@ -602,6 +603,7 @@ xfs_cui_item_relog( } static const struct xfs_item_ops xfs_cui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, .iop_unpin = xfs_cui_item_unpin, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 6c0b56ebdbe1..6ecbc37e4b8d 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -233,7 +233,8 @@ xfs_rud_item_release( } static const struct xfs_item_ops xfs_rud_item_ops = { - .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, .iop_size = xfs_rud_item_size, .iop_format = xfs_rud_item_format, .iop_release = xfs_rud_item_release, @@ -632,6 +633,7 @@ xfs_rui_item_relog( } static const struct xfs_item_ops xfs_rui_item_ops = { + .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 87e940b5366e..f68e74e46026 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -80,28 +80,29 @@ struct xfs_item_ops { struct xfs_trans *tp); }; -/* Is this log item a deferred action intent? */ +/* + * Log item ops flags + */ +/* + * Release the log item when the journal commits instead of inserting into the + * AIL for writeback tracking and/or log tail pinning. 
+ */ +#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0) +#define XFS_ITEM_INTENT (1 << 1) +#define XFS_ITEM_INTENT_DONE (1 << 2) + static inline bool xlog_item_is_intent(struct xfs_log_item *lip) { - return lip->li_ops->iop_recover != NULL && - lip->li_ops->iop_match != NULL; + return lip->li_ops->flags & XFS_ITEM_INTENT; } -/* Is this a log intent-done item? */ static inline bool xlog_item_is_intent_done(struct xfs_log_item *lip) { - return lip->li_ops->iop_unpin == NULL && - lip->li_ops->iop_push == NULL; + return lip->li_ops->flags & XFS_ITEM_INTENT_DONE; } -/* - * Release the log item as soon as committed. This is for items just logging - * intents that never need to be written back in place. - */ -#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0) - void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, const struct xfs_item_ops *ops); -- cgit From bb7b1c9c5dd3d24db3f296e365570fd50c8ca80c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 11:46:21 +1000 Subject: xfs: tag transactions that contain intent done items Intent whiteouts will require extra work to be done during transaction commit if the transaction contains an intent done item. To determine if a transaction contains an intent done item, we want to avoid having to walk all the items in the transaction to check if they are intent done items. Hence when we add an intent done item to a transaction, tag the transaction to indicate that it contains such an item. We don't tag the transaction when the defer ops is relogging an intent to move it forward in the log. Whiteouts will never apply to these cases, so we don't need to bother looking for them. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_shared.h | 24 +++++++++++++++++------- fs/xfs/xfs_bmap_item.c | 2 +- fs/xfs/xfs_extfree_item.c | 2 +- fs/xfs/xfs_refcount_item.c | 2 +- fs/xfs/xfs_rmap_item.c | 2 +- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 25c4cab58851..c4381388c0c1 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -54,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, /* * Values for t_flags. */ -#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ -#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ -#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ -#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ -#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ -#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ +/* Transaction needs to be logged */ +#define XFS_TRANS_DIRTY (1u << 0) +/* Superblock is dirty and needs to be logged */ +#define XFS_TRANS_SB_DIRTY (1u << 1) +/* Transaction took a permanent log reservation */ +#define XFS_TRANS_PERM_LOG_RES (1u << 2) +/* Synchronous transaction commit needed */ +#define XFS_TRANS_SYNC (1u << 3) +/* Transaction can use reserve block pool */ +#define XFS_TRANS_RESERVE (1u << 4) +/* Transaction should avoid VFS level superblock write accounting */ +#define XFS_TRANS_NO_WRITECOUNT (1u << 5) +/* Transaction has freed blocks returned to it's reservation */ +#define XFS_TRANS_RES_FDBLKS (1u << 6) +/* Transaction contains an intent done log item */ +#define XFS_TRANS_HAS_INTENT_DONE (1u << 7) + /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c 
index 0e0aae83308c..3d1fa8edf28f 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -257,7 +257,7 @@ xfs_trans_log_finish_bmap_update( * 1.) releases the BUI and frees the BUD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); return error; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 21a159f9d8c5..96735f23d12d 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -381,7 +381,7 @@ xfs_trans_free_extent( * 1.) releases the EFI and frees the EFD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); next_extent = efdp->efd_next_extent; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 71225d094e03..b37a9d2ce652 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -262,7 +262,7 @@ xfs_trans_log_finish_refcount_update( * 1.) releases the CUI and frees the CUD * 2.) shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); return error; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 6ecbc37e4b8d..5221fd1e6f6f 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -330,7 +330,7 @@ xfs_trans_log_finish_rmap_update( * 1.) releases the RUI and frees the RUD * 2.) 
shuts down the filesystem */ - tp->t_flags |= XFS_TRANS_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); return error; -- cgit From 22b1afc57e42da13f840d630ad484d4d99504839 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 11:46:30 +1000 Subject: xfs: factor and move some code in xfs_log_cil.c In preparation for adding support for intent item whiteouts. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_cil.c | 119 +++++++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 52 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index e5ab62f08c19..0d8d092447ad 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -47,6 +47,38 @@ xlog_cil_ticket_alloc( return tic; } +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +static bool +xlog_item_in_current_chkpt( + struct xfs_cil *cil, + struct xfs_log_item *lip) +{ + if (list_empty(&lip->li_cil)) + return false; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. 
+ */ + return lip->li_seq == READ_ONCE(cil->xc_current_sequence); +} + +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip); +} + /* * Unavoidable forward declaration - xlog_cil_push_work() calls * xlog_cil_ctx_alloc() itself. @@ -934,6 +966,40 @@ xlog_cil_build_trans_hdr( tic->t_curr_res -= lvhdr->lv_bytes; } +/* + * Pull all the log vectors off the items in the CIL, and remove the items from + * the CIL. We don't need the CIL lock here because it's only needed on the + * transaction commit side which is currently locked out by the flush lock. + */ +static void +xlog_cil_build_lv_chain( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx, + uint32_t *num_iovecs, + uint32_t *num_bytes) +{ + struct xfs_log_vec *lv = NULL; + + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + *num_iovecs += lv->lv_niovecs; + + /* we don't write ordered log vectors */ + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) + *num_bytes += lv->lv_bytes; + } +} + /* * Push the Committed Item List to the log. * @@ -956,7 +1022,6 @@ xlog_cil_push_work( container_of(work, struct xfs_cil_ctx, push_work); struct xfs_cil *cil = ctx->cil; struct xlog *log = cil->xc_log; - struct xfs_log_vec *lv; struct xfs_cil_ctx *new_ctx; int num_iovecs = 0; int num_bytes = 0; @@ -1033,31 +1098,7 @@ xlog_cil_push_work( list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); - /* - * Pull all the log vectors off the items in the CIL, and remove the - * items from the CIL. We don't need the CIL lock here because it's only - * needed on the transaction commit side which is currently locked out - * by the flush lock. 
- */ - lv = NULL; - while (!list_empty(&cil->xc_cil)) { - struct xfs_log_item *item; - - item = list_first_entry(&cil->xc_cil, - struct xfs_log_item, li_cil); - list_del_init(&item->li_cil); - if (!ctx->lv_chain) - ctx->lv_chain = item->li_lv; - else - lv->lv_next = item->li_lv; - lv = item->li_lv; - item->li_lv = NULL; - num_iovecs += lv->lv_niovecs; - - /* we don't write ordered log vectors */ - if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) - num_bytes += lv->lv_bytes; - } + xlog_cil_build_lv_chain(cil, ctx, &num_iovecs, &num_bytes); /* * Switch the contexts so we can drop the context lock and move out @@ -1508,32 +1549,6 @@ out_shutdown: return 0; } -/* - * Check if the current log item was first committed in this sequence. - * We can't rely on just the log item being in the CIL, we have to check - * the recorded commit sequence number. - * - * Note: for this to be used in a non-racy manner, it has to be called with - * CIL flushing locked out. As a result, it should only be used during the - * transaction commit process when deciding what to format into the item. - */ -bool -xfs_log_item_in_current_chkpt( - struct xfs_log_item *lip) -{ - struct xfs_cil *cil = lip->li_log->l_cilp; - - if (list_empty(&lip->li_cil)) - return false; - - /* - * li_seq is written on the first commit of a log item to record the - * first checkpoint it is written to. Hence if it is different to the - * current sequence, we're in a new checkpoint. - */ - return lip->li_seq == READ_ONCE(cil->xc_current_sequence); -} - /* * Perform initial CIL structure initialisation. */ -- cgit From c23ab603e3d6557bd15e672fdbcbba4b28d08921 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 11:46:39 +1000 Subject: xfs: add log item method to return related intents To apply a whiteout to an intent item when an intent done item is committed, we need to be able to retrieve the intent item from the the intent done item. 
Add a log item op method for doing this, and wire all the intent done items up to it. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_item.c | 8 ++++++++ fs/xfs/xfs_extfree_item.c | 8 ++++++++ fs/xfs/xfs_refcount_item.c | 8 ++++++++ fs/xfs/xfs_rmap_item.c | 8 ++++++++ fs/xfs/xfs_trans.h | 1 + 5 files changed, 33 insertions(+) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 3d1fa8edf28f..f05663fdb6ff 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -203,12 +203,20 @@ xfs_bud_item_release( kmem_cache_free(xfs_bud_cache, budp); } +static struct xfs_log_item * +xfs_bud_item_intent( + struct xfs_log_item *lip) +{ + return &BUD_ITEM(lip)->bud_buip->bui_item; +} + static const struct xfs_item_ops xfs_bud_item_ops = { .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | XFS_ITEM_INTENT_DONE, .iop_size = xfs_bud_item_size, .iop_format = xfs_bud_item_format, .iop_release = xfs_bud_item_release, + .iop_intent = xfs_bud_item_intent, }; static struct xfs_bud_log_item * diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 96735f23d12d..032db5269e97 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -306,12 +306,20 @@ xfs_efd_item_release( xfs_efd_item_free(efdp); } +static struct xfs_log_item * +xfs_efd_item_intent( + struct xfs_log_item *lip) +{ + return &EFD_ITEM(lip)->efd_efip->efi_item; +} + static const struct xfs_item_ops xfs_efd_item_ops = { .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | XFS_ITEM_INTENT_DONE, .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_release = xfs_efd_item_release, + .iop_intent = xfs_efd_item_intent, }; /* diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index b37a9d2ce652..57a025f5fd4b 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -209,12 +209,20 @@ xfs_cud_item_release( 
kmem_cache_free(xfs_cud_cache, cudp); } +static struct xfs_log_item * +xfs_cud_item_intent( + struct xfs_log_item *lip) +{ + return &CUD_ITEM(lip)->cud_cuip->cui_item; +} + static const struct xfs_item_ops xfs_cud_item_ops = { .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | XFS_ITEM_INTENT_DONE, .iop_size = xfs_cud_item_size, .iop_format = xfs_cud_item_format, .iop_release = xfs_cud_item_release, + .iop_intent = xfs_cud_item_intent, }; static struct xfs_cud_log_item * diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 5221fd1e6f6f..1c7d8518cb48 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -232,12 +232,20 @@ xfs_rud_item_release( kmem_cache_free(xfs_rud_cache, rudp); } +static struct xfs_log_item * +xfs_rud_item_intent( + struct xfs_log_item *lip) +{ + return &RUD_ITEM(lip)->rud_ruip->rui_item; +} + static const struct xfs_item_ops xfs_rud_item_ops = { .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | XFS_ITEM_INTENT_DONE, .iop_size = xfs_rud_item_size, .iop_format = xfs_rud_item_format, .iop_release = xfs_rud_item_release, + .iop_intent = xfs_rud_item_intent, }; static struct xfs_rud_log_item * diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f68e74e46026..d72a5995d33e 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -78,6 +78,7 @@ struct xfs_item_ops { bool (*iop_match)(struct xfs_log_item *item, uint64_t id); struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, struct xfs_trans *tp); + struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; /* -- cgit From 3512fc1e84c3ab58fa08466c2f75ae973fc472d1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 11:46:47 +1000 Subject: xfs: whiteouts release intents that are not in the AIL When we release an intent that a whiteout applies to, it will not have been committed to the journal and so won't be in the AIL. 
Hence when we drop the last reference to the intent, we do not want to try to remove it from the AIL as that will trigger a filesystem shutdown. Hence make the removal of intents from the AIL conditional on them actually being in the AIL so we do the correct thing. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_item.c | 9 +++++---- fs/xfs/xfs_extfree_item.c | 9 +++++---- fs/xfs/xfs_refcount_item.c | 9 +++++---- fs/xfs/xfs_rmap_item.c | 9 +++++---- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index f05663fdb6ff..51f66e982484 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -55,10 +55,11 @@ xfs_bui_release( struct xfs_bui_log_item *buip) { ASSERT(atomic_read(&buip->bui_refcount) > 0); - if (atomic_dec_and_test(&buip->bui_refcount)) { - xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_bui_item_free(buip); - } + if (!atomic_dec_and_test(&buip->bui_refcount)) + return; + + xfs_trans_ail_delete(&buip->bui_item, 0); + xfs_bui_item_free(buip); } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 032db5269e97..765be054dffe 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -58,10 +58,11 @@ xfs_efi_release( struct xfs_efi_log_item *efip) { ASSERT(atomic_read(&efip->efi_refcount) > 0); - if (atomic_dec_and_test(&efip->efi_refcount)) { - xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); - xfs_efi_item_free(efip); - } + if (!atomic_dec_and_test(&efip->efi_refcount)) + return; + + xfs_trans_ail_delete(&efip->efi_item, 0); + xfs_efi_item_free(efip); } /* diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 57a025f5fd4b..7e97bf19793d 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -54,10 +54,11 @@ xfs_cui_release( struct xfs_cui_log_item *cuip) { ASSERT(atomic_read(&cuip->cui_refcount) 
> 0); - if (atomic_dec_and_test(&cuip->cui_refcount)) { - xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_cui_item_free(cuip); - } + if (!atomic_dec_and_test(&cuip->cui_refcount)) + return; + + xfs_trans_ail_delete(&cuip->cui_item, 0); + xfs_cui_item_free(cuip); } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 1c7d8518cb48..fef92e02f3bb 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -54,10 +54,11 @@ xfs_rui_release( struct xfs_rui_log_item *ruip) { ASSERT(atomic_read(&ruip->rui_refcount) > 0); - if (atomic_dec_and_test(&ruip->rui_refcount)) { - xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_rui_item_free(ruip); - } + if (!atomic_dec_and_test(&ruip->rui_refcount)) + return; + + xfs_trans_ail_delete(&ruip->rui_item, 0); + xfs_rui_item_free(ruip); } STATIC void -- cgit From 0d227466be84332d1888724e1e74dac34bff6d71 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 11:50:29 +1000 Subject: xfs: intent item whiteouts When we log modifications based on intents, we add both intent and intent done items to the modification being made. These get written to the log to ensure that the operation is re-run if the intent done is not found in the log. However, for operations that complete wholly within a single checkpoint, the change in the checkpoint is atomic and will never need replay. In this case, we don't need to actually write the intent and intent done items to the journal because log recovery will never need to manually restart this modification. Log recovery currently handles intent/intent done matching by inserting the intent into the AIL, then removing it when a matching intent done item is found. Hence for all the intent-based operations that complete within a checkpoint, we spend all that time parsing the intent/intent done items just to cancel them and do nothing with them. 
Hence it follows that the only time we actually need intents in the log is when the modification crosses checkpoint boundaries in the log and so may only be partially complete in the journal. Hence if we commit an intent done item to the CIL and the intent item is in the same checkpoint, we don't actually have to write them to the journal because log recovery will always cancel the intents. We've never really worried about the overhead of logging intents unnecessarily like this because the intents we log are generally very much smaller than the change being made. e.g. freeing an extent involves modifying at least two freespace btree blocks and the AGF, so the EFI/EFD overhead is only a small increase in space and processing time compared to the overall cost of freeing an extent. However, delayed attributes change this cost equation dramatically, especially for inline attributes. In the case of adding an inline attribute, we only log the inode core and attribute fork at present. With delayed attributes, we now log the attr intent which includes the name and value, the inode core and attr fork, and finally the attr intent done item. We increase the number of items we log from 1 to 3, and the number of log vectors (regions) goes up from 3 to 7. Hence we triple the number of objects that the CIL has to process, and more than double the number of log vectors that need to be written to the journal. At scale, this means delayed attributes cause a non-pipelined CIL to become CPU bound processing all the extra items, resulting in a > 40% performance degradation on 16-way file+xattr create workloads. Pipelining the CIL (as per 5.15) reduces the performance degradation to 20%, but now the limitation is the rate at which the log items can be written to the iclogs and iclogs be dispatched for IO and completed. Even log IO completion is slowed down by these intents, because it now has to process 3x the number of items in the checkpoint.
Processing completed intents is especially inefficient here, because we first insert the intent into the AIL, then remove it from the AIL when the intent done is processed. IOWs, we are also doing expensive operations in log IO completion we could completely avoid if we didn't log completed intent/intent done pairs. Enter log item whiteouts. When an intent done is committed, we can check to see if the associated intent is in the same checkpoint as we are currently committing the intent done to. If so, we can mark the intent log item with a whiteout and immediately free the intent done item rather than committing it to the CIL. We can basically skip the entire formatting and CIL insertion steps for the intent done item. However, we cannot remove the intent item from the CIL at this point because the unlocked per-cpu CIL item lists do not permit removal without holding the CIL context lock exclusively. Transaction commit only holds the context lock shared, hence the best we can do is mark the intent item with a whiteout so that the CIL push can release it rather than writing it to the log. This means we never write the intent to the log if the intent done has also been committed to the same checkpoint, but we'll always write the intent if the intent done has not been committed or has been committed to a different checkpoint. This will result in correct log recovery behaviour in all cases, without the overhead of logging unnecessary intents. This intent whiteout concept is generic - we can apply it to all intent/intent done pairs that have a direct 1:1 relationship. The way deferred ops iterate and relog intents means that all intents currently have a 1:1 relationship with their intent done item, and hence we can apply this cancellation to all existing intent/intent done implementations.
For delayed attributes with a 16-way 64kB xattr create workload, whiteouts reduce the amount of journalled metadata from ~2.5GB/s down to ~600MB/s and improve the creation rate from 9000/s to 14000/s. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_cil.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/xfs_trace.h | 3 ++ fs/xfs/xfs_trans.h | 6 ++-- 3 files changed, 82 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 0d8d092447ad..70f718d76ceb 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -476,7 +476,8 @@ insert: static void xlog_cil_insert_items( struct xlog *log, - struct xfs_trans *tp) + struct xfs_trans *tp, + uint32_t released_space) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx = cil->xc_ctx; @@ -525,7 +526,9 @@ xlog_cil_insert_items( ASSERT(tp->t_ticket->t_curr_res >= len); } tp->t_ticket->t_curr_res -= len; + tp->t_ticket->t_curr_res += released_space; ctx->space_used += len; + ctx->space_used -= released_space; /* * If we've overrun the reservation, dump the tx details before we move @@ -970,11 +973,16 @@ xlog_cil_build_trans_hdr( * Pull all the log vectors off the items in the CIL, and remove the items from * the CIL. We don't need the CIL lock here because it's only needed on the * transaction commit side which is currently locked out by the flush lock. + * + * If a log item is marked with a whiteout, we do not need to write it to the + * journal and so we just move them to the whiteout list for the caller to + * dispose of appropriately. 
*/ static void xlog_cil_build_lv_chain( struct xfs_cil *cil, struct xfs_cil_ctx *ctx, + struct list_head *whiteouts, uint32_t *num_iovecs, uint32_t *num_bytes) { @@ -985,6 +993,13 @@ xlog_cil_build_lv_chain( item = list_first_entry(&cil->xc_cil, struct xfs_log_item, li_cil); + + if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) { + list_move(&item->li_cil, whiteouts); + trace_xfs_cil_whiteout_skip(item); + continue; + } + list_del_init(&item->li_cil); if (!ctx->lv_chain) ctx->lv_chain = item->li_lv; @@ -1000,6 +1015,19 @@ xlog_cil_build_lv_chain( } } +static void +xlog_cil_cleanup_whiteouts( + struct list_head *whiteouts) +{ + while (!list_empty(whiteouts)) { + struct xfs_log_item *item = list_first_entry(whiteouts, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + trace_xfs_cil_whiteout_unpin(item); + item->li_ops->iop_unpin(item, 1); + } +} + /* * Push the Committed Item List to the log. * @@ -1030,6 +1058,7 @@ xlog_cil_push_work( struct xfs_log_vec lvhdr = { NULL }; xfs_csn_t push_seq; bool push_commit_stable; + LIST_HEAD (whiteouts); new_ctx = xlog_cil_ctx_alloc(); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -1098,7 +1127,7 @@ xlog_cil_push_work( list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); - xlog_cil_build_lv_chain(cil, ctx, &num_iovecs, &num_bytes); + xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes); /* * Switch the contexts so we can drop the context lock and move out @@ -1201,6 +1230,7 @@ xlog_cil_push_work( /* Not safe to reference ctx now! 
*/ spin_unlock(&log->l_icloglock); + xlog_cil_cleanup_whiteouts(&whiteouts); return; out_skip: @@ -1212,6 +1242,7 @@ out_skip: out_abort_free_ticket: xfs_log_ticket_ungrant(log, ctx->ticket); ASSERT(xlog_is_shutdown(log)); + xlog_cil_cleanup_whiteouts(&whiteouts); if (!ctx->commit_iclog) { xlog_cil_committed(ctx); return; @@ -1360,6 +1391,43 @@ xlog_cil_empty( return empty; } +/* + * If there are intent done items in this transaction and the related intent was + * committed in the current (same) CIL checkpoint, we don't need to write either + * the intent or intent done item to the journal as the change will be + * journalled atomically within this checkpoint. As we cannot remove items from + * the CIL here, mark the related intent with a whiteout so that the CIL push + * can remove it rather than writing it to the journal. Then remove the intent + * done item from the current transaction and release it so it doesn't get put + * into the CIL at all. + */ +static uint32_t +xlog_cil_process_intents( + struct xfs_cil *cil, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *ilip, *next; + uint32_t len = 0; + + list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { + if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE)) + continue; + + ilip = lip->li_ops->iop_intent(lip); + if (!ilip || !xlog_item_in_current_chkpt(cil, ilip)) + continue; + set_bit(XFS_LI_WHITEOUT, &ilip->li_flags); + trace_xfs_cil_whiteout_mark(ilip); + len += ilip->li_lv->lv_bytes; + kmem_free(ilip->li_lv); + ilip->li_lv = NULL; + + xfs_trans_del_item(lip); + lip->li_ops->iop_release(lip); + } + return len; +} + /* * Commit a transaction with the given vector to the Committed Item List. * @@ -1382,6 +1450,7 @@ xlog_cil_commit( { struct xfs_cil *cil = log->l_cilp; struct xfs_log_item *lip, *next; + uint32_t released_space = 0; /* * Do all necessary memory allocation before we lock the CIL. 
@@ -1393,7 +1462,10 @@ xlog_cil_commit( /* lock out background commit */ down_read(&cil->xc_ctx_lock); - xlog_cil_insert_items(log, tp); + if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE) + released_space = xlog_cil_process_intents(cil, tp); + + xlog_cil_insert_items(log, tp, released_space); if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e1197f9ad97e..75934e3c3f55 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1332,6 +1332,9 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_push); DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip); +DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin); DECLARE_EVENT_CLASS(xfs_ail_class, TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index d72a5995d33e..9561f193e7e1 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -55,13 +55,15 @@ struct xfs_log_item { #define XFS_LI_IN_AIL 0 #define XFS_LI_ABORTED 1 #define XFS_LI_FAILED 2 -#define XFS_LI_DIRTY 3 /* log item dirty in transaction */ +#define XFS_LI_DIRTY 3 +#define XFS_LI_WHITEOUT 4 #define XFS_LI_FLAGS \ { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \ { (1u << XFS_LI_ABORTED), "ABORTED" }, \ { (1u << XFS_LI_FAILED), "FAILED" }, \ - { (1u << XFS_LI_DIRTY), "DIRTY" } + { (1u << XFS_LI_DIRTY), "DIRTY" }, \ + { (1u << XFS_LI_WHITEOUT), "WHITEOUT" } struct xfs_item_ops { unsigned flags; -- cgit From dc04db2aa7c9307e740d6d0e173085301c173b1a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 12:13:35 +1000 Subject: xfs: detect self referencing btree sibling pointers To catch the obvious graph cycle problem and hence potential endless looping. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 140 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 105 insertions(+), 35 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a8c79e760d8a..2aa300f7461f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -51,6 +51,52 @@ xfs_btree_magic( return magic; } +static xfs_failaddr_t +xfs_btree_check_lblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_fsblock_t fsb, + xfs_fsblock_t sibling) +{ + if (sibling == NULLFSBLOCK) + return NULL; + if (sibling == fsb) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_lptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_fsbno(mp, sibling)) + return __this_address; + } + + return NULL; +} + +static xfs_failaddr_t +xfs_btree_check_sblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_agblock_t sibling) +{ + if (sibling == NULLAGBLOCK) + return NULL; + if (sibling == agbno) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_sptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_agbno(mp, agno, sibling)) + return __this_address; + } + return NULL; +} + /* * Check a long btree block header. Return the address of the failing check, * or NULL if everything is ok. 
@@ -65,6 +111,8 @@ __xfs_btree_check_lblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_fsblock_t fsb = NULLFSBLOCK; if (crc) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -83,16 +131,16 @@ __xfs_btree_check_lblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + be64_to_cpu(block->bb_u.l.bb_leftsib)); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + be64_to_cpu(block->bb_u.l.bb_rightsib)); + return fa; } /* Check a long btree block header. 
*/ @@ -130,6 +178,9 @@ __xfs_btree_check_sblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_agblock_t agbno = NULLAGBLOCK; + xfs_agnumber_t agno = NULLAGNUMBER; if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -146,16 +197,18 @@ __xfs_btree_check_sblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) { + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); + } + + fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, + be32_to_cpu(block->bb_u.s.bb_leftsib)); + if (!fa) + fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, + agbno, be32_to_cpu(block->bb_u.s.bb_rightsib)); + return fa; } /* Check a short btree block header. */ @@ -4271,6 +4324,21 @@ xfs_btree_visit_block( if (xfs_btree_ptr_is_null(cur, &rptr)) return -ENOENT; + /* + * We only visit blocks once in this walk, so we have to avoid the + * internal xfs_btree_lookup_get_block() optimisation where it will + * return the same block without checking if the right sibling points + * back to us and creates a cyclic reference in the btree. 
+ */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } else { + if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4445,20 +4513,21 @@ xfs_btree_lblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_fsblock_t fsb; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) - return __this_address; - - return NULL; + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + be64_to_cpu(block->bb_u.l.bb_leftsib)); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + be64_to_cpu(block->bb_u.l.bb_rightsib)); + return fa; } /** @@ -4499,7 +4568,9 @@ xfs_btree_sblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - xfs_agblock_t agno; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) @@ -4507,14 +4578,13 @@ xfs_btree_sblock_verify( /* sibling pointer verification */ agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) - return 
__this_address; - - return NULL; + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + be32_to_cpu(block->bb_u.s.bb_leftsib)); + if (!fa) + fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + be32_to_cpu(block->bb_u.s.bb_rightsib)); + return fa; } /* -- cgit From 1eb70f54c445fcbb25817841e774adb3d912f3e8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 12:13:53 +1000 Subject: xfs: validate inode fork size against fork format xfs_repair catches fork size/format mismatches, but the in-kernel verifier doesn't, leading to null pointer failures when attempting to perform operations on the fork. This can occur in the xfs_dir_is_empty() where the in-memory fork format does not match the size and so the fork data pointer is accessed incorrectly. Note: this causes new failures in xfs/348 which is testing mode vs ftype mismatches. We now detect a regular file that has been changed to a directory or symlink mode as being corrupt because the data fork for a symlink or directory should be in local form when there are only 3 bytes of data in the data fork. Hence the inode verify for the regular file now fires w/ -EFSCORRUPTED because the inode fork format does not match the format the corrupted mode says it should be in. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 74b82ec80f8e..3b1b63f9d886 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -357,21 +357,38 @@ xfs_dinode_verify_fork( { xfs_extnum_t di_nextents; xfs_extnum_t max_extents; + mode_t mode = be16_to_cpu(dip->di_mode); + uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork); + uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork); di_nextents = xfs_dfork_nextents(dip, whichfork); - switch (XFS_DFORK_FORMAT(dip, whichfork)) { + /* + * For fork types that can contain local data, check that the fork + * format matches the size of local data contained within the fork. + * + * For all types, check that when the size says the should be in extent + * or btree format, the inode isn't claiming it is in local format. + */ + if (whichfork == XFS_DATA_FORK) { + if (S_ISDIR(mode) || S_ISLNK(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + if (be64_to_cpu(dip->di_size) > fork_size && + fork_format == XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + switch (fork_format) { case XFS_DINODE_FMT_LOCAL: /* - * no local regular files yet + * No local regular files yet. 
*/ - if (whichfork == XFS_DATA_FORK) { - if (S_ISREG(be16_to_cpu(dip->di_mode))) - return __this_address; - if (be64_to_cpu(dip->di_size) > - XFS_DFORK_SIZE(dip, mp, whichfork)) - return __this_address; - } + if (S_ISREG(mode) && whichfork == XFS_DATA_FORK) + return __this_address; if (di_nextents) return __this_address; break; -- cgit From dd0d2f9755191690541b09e6385d0f8cd8bc9d8f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 12:14:13 +1000 Subject: xfs: set XFS_FEAT_NLINK correctly While xfs_has_nlink() is not used in kernel, it is used in userspace (e.g. by xfs_db) so we need to set the XFS_FEAT_NLINK flag correctly in xfs_sb_version_to_features(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index cf9e5b9374c1..ec6eec5c0e02 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -70,6 +70,8 @@ xfs_sb_version_to_features( /* optional V4 features */ if (sbp->sb_rblocks > 0) features |= XFS_FEAT_REALTIME; + if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT) + features |= XFS_FEAT_NLINK; if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) features |= XFS_FEAT_ATTR; if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) -- cgit From f0f5f658065a5af09126ec892e4c383540a1c77f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 May 2022 12:17:18 +1000 Subject: xfs: validate v5 feature fields We don't check that the v4 feature flags that v5 requires to be set are actually set anywhere. Do this check when we see that the filesystem is a v5 filesystem. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.c | 68 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ec6eec5c0e02..a20cade590e9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -30,6 +30,47 @@ * Physical superblock buffer manipulations. Shared with libxfs in userspace. */ +/* + * Check that all the V4 feature bits that the V5 filesystem format requires are + * correctly set. + */ +static bool +xfs_sb_validate_v5_features( + struct xfs_sb *sbp) +{ + /* We must not have any unknown V4 feature bits set */ + if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) + return false; + + /* + * The CRC bit is considered an invalid V4 flag, so we have to add it + * manually to the OKBITS mask. + */ + if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS | + XFS_SB_VERSION2_CRCBIT)) + return false; + + /* Now check all the required V4 feature flags are set. */ + +#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \ + XFS_SB_VERSION_ALIGNBIT | \ + XFS_SB_VERSION_LOGV2BIT | \ + XFS_SB_VERSION_EXTFLGBIT | \ + XFS_SB_VERSION_DIRV2BIT | \ + XFS_SB_VERSION_MOREBITSBIT) + +#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_CRCBIT) + + if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS) + return false; + if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS) + return false; + return true; +} + /* * We support all XFS versions newer than a v4 superblock with V2 directories. */ @@ -37,9 +78,19 @@ bool xfs_sb_good_version( struct xfs_sb *sbp) { - /* all v5 filesystems are supported */ + /* + * All v5 filesystems are supported, but we must check that all the + * required v4 feature flags are enabled correctly as the code checks + * those flags and not for v5 support. 
+ */ if (xfs_sb_is_v5(sbp)) - return true; + return xfs_sb_validate_v5_features(sbp); + + /* We must not have any unknown v4 feature bits set */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; /* versions prior to v4 are not supported */ if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) @@ -51,12 +102,6 @@ xfs_sb_good_version( if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) return false; - /* And must not have any unknown v4 feature bits set */ - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) - return false; - /* It's a supported v4 filesystem */ return true; } @@ -267,12 +312,15 @@ xfs_validate_sb_common( bool has_dalign; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, +"Superblock has bad magic number 0x%x. Not an XFS filesystem?", + be32_to_cpu(dsb->sb_magicnum)); return -EWRONGFS; } if (!xfs_sb_good_version(sbp)) { - xfs_warn(mp, "bad version"); + xfs_warn(mp, +"Superblock has unknown features enabled or corrupted feature masks."); return -EWRONGFS; } -- cgit From 7b3ec2b20e44f579c022ad62243aa18c04c6addc Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 4 May 2022 12:39:02 +1000 Subject: xfs: Fix double unlock in defer capture code The new deferred attr patch set uncovered a double unlock in the recent port of the defer ops capture and continue code. During log recovery, we're allowed to hold buffers to a transaction that's being used to replay an intent item. When we capture the resources as part of scheduling a continuation of an intent chain, we call xfs_buf_hold to retain our reference to the buffer beyond the transaction commit, but we do /not/ call xfs_trans_bhold to maintain the buffer lock. 
This means that xfs_defer_ops_continue needs to relock the buffers before xfs_defer_restore_resources joins them to the new transaction. Additionally, the buffers should not be passed back via the dres structure since they need to remain locked unlike the inodes. So simply set dr_bufs to zero after populating the dres structure. Signed-off-by: Darrick J. Wong Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_defer.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 1aa32bfdf0cc..7e9a27f9d967 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -22,6 +22,7 @@ #include "xfs_refcount.h" #include "xfs_bmap.h" #include "xfs_alloc.h" +#include "xfs_buf.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -780,17 +781,25 @@ xfs_defer_ops_continue( struct xfs_trans *tp, struct xfs_defer_resources *dres) { + unsigned int i; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); - /* Lock and join the captured inode to the new transaction. */ + /* Lock the captured resources to the new transaction. */ if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); + + for (i = 0; i < dfc->dfc_held.dr_bufs; i++) + xfs_buf_lock(dfc->dfc_held.dr_bp[i]); + + /* Join the captured resources to the new transaction. */ xfs_defer_restore_resources(tp, &dfc->dfc_held); memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); + dres->dr_bufs = 0; /* Move captured dfops chain and state to the transaction. 
*/ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); -- cgit From 9a39cdabc172ef2de3f21a34e73cdc1d02338d79 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 4 May 2022 12:40:02 +1000 Subject: xfs: Return from xfs_attr_set_iter if there are no more rmtblks to process During an attr rename operation, blocks are saved for later removal as rmtblkno2. The rmtblkno is used in the case of needing to alloc more blocks if not enough were available. However, in the case that no further blocks need to be added or removed, we can return as soon as xfs_attr_node_addname completes, rather than rolling the transaction with an -EAGAIN return. This extra loop does not hurt anything right now, but it will be a problem later when we get into log items because we end up with an empty log transaction. So, add a simple check to cut out the unneeded iteration. Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2815cfbbae70..e629bf51dc06 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -412,6 +412,14 @@ xfs_attr_set_iter( if (error) return error; + /* + * If addname was successful, and we dont need to alloc + * or remove anymore blks, we're done. + */ + if (!args->rmtblkno && + !(args->op_flags & XFS_DA_OP_RENAME)) + return 0; + dac->dela_state = XFS_DAS_FOUND_NBLK; } trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); -- cgit From fd920008784ead369e79c2be2f8d9cc736e306ca Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 4 May 2022 12:41:02 +1000 Subject: xfs: Set up infrastructure for log attribute replay Currently attributes are modified directly across one or more transactions. But they are not logged or replayed in the event of an error. 
The goal of log attr replay is to enable logging and replaying of attribute operations using the existing delayed operations infrastructure. This will later enable the attributes to become part of larger multi part operations that also must first be recorded to the log. This is mostly of interest in the scheme of parent pointers which would need to maintain an attribute containing parent inode information any time an inode is moved, created, or removed. Parent pointers would then be of interest to any feature that would need to quickly derive an inode path from the mount point. Online scrub, nfs lookups and fs grow or shrink operations are all features that could take advantage of this. This patch adds two new log item types for setting or removing attributes as deferred operations. The xfs_attri_log_item will log an intent to set or remove an attribute. The corresponding xfs_attrd_log_item holds a reference to the xfs_attri_log_item and is freed once the transaction is done. Both log items use a generic xfs_attr_log_format structure that contains the attribute name, value, flags, inode, and an op_flag that indicates if the operation is a set or remove. [dchinner: added extra little bits needed for intent whiteouts] Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_attr.c | 42 +++- fs/xfs/libxfs/xfs_attr.h | 38 ++++ fs/xfs/libxfs/xfs_defer.c | 10 +- fs/xfs/libxfs/xfs_defer.h | 2 + fs/xfs/libxfs/xfs_log_format.h | 44 +++- fs/xfs/libxfs/xfs_log_recover.h | 2 + fs/xfs/scrub/common.c | 2 + fs/xfs/xfs_attr_item.c | 458 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_attr_item.h | 46 ++++ fs/xfs/xfs_attr_list.c | 1 + fs/xfs/xfs_ioctl32.c | 2 + fs/xfs/xfs_iops.c | 2 + fs/xfs/xfs_log_recover.c | 2 + fs/xfs/xfs_ondisk.h | 2 + 15 files changed, 648 insertions(+), 6 deletions(-) create mode 100644 fs/xfs/xfs_attr_item.c create mode 100644 fs/xfs/xfs_attr_item.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 04611a1068b4..b056cfc6398e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -102,6 +102,7 @@ xfs-y += xfs_log.o \ xfs_buf_item_recover.o \ xfs_dquot_item_recover.o \ xfs_extfree_item.o \ + xfs_attr_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ xfs_inode_item_recover.o \ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e629bf51dc06..e22884fb6f00 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -24,6 +24,10 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" +#include "xfs_attr_item.h" + +struct kmem_cache *xfs_attri_cache; +struct kmem_cache *xfs_attrd_cache; /* * xfs_attr.c @@ -61,8 +65,6 @@ STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, struct xfs_da_state **state); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp); STATIC int xfs_attr_node_removename(struct xfs_da_args *args, struct xfs_da_state *state); @@ -166,7 +168,7 @@ xfs_attr_get( /* * Calculate how many blocks we need for the new attribute, */ -STATIC int +int xfs_attr_calc_size( struct xfs_da_args *args, int *local) @@ -840,6 +842,40 @@ out_trans_cancel: 
goto out_unlock; } +int __init +xfs_attri_init_cache(void) +{ + xfs_attri_cache = kmem_cache_create("xfs_attri", + sizeof(struct xfs_attri_log_item), + 0, 0, NULL); + + return xfs_attri_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attri_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attri_cache); + xfs_attri_cache = NULL; +} + +int __init +xfs_attrd_init_cache(void) +{ + xfs_attrd_cache = kmem_cache_create("xfs_attrd", + sizeof(struct xfs_attrd_log_item), + 0, 0, NULL); + + return xfs_attrd_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attrd_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attrd_cache); + xfs_attrd_cache = NULL; +} + /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 5e71f719bdd5..80b6f28b0d1a 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -28,6 +28,11 @@ struct xfs_attr_list_context; */ #define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ +static inline bool xfs_has_larp(struct xfs_mount *mp) +{ + return false; +} + /* * Kernel-internal version of the attrlist cursor. */ @@ -461,6 +466,11 @@ enum xfs_delattr_state { struct xfs_delattr_context { struct xfs_da_args *da_args; + /* + * Used by xfs_attr_set to hold a leaf buffer across a transaction roll + */ + struct xfs_buf *leaf_bp; + /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ struct xfs_bmbt_irec map; xfs_dablk_t lblkno; @@ -474,6 +484,23 @@ struct xfs_delattr_context { enum xfs_delattr_state dela_state; }; +/* + * List of attrs to commit later. 
+ */ +struct xfs_attr_item { + struct xfs_delattr_context xattri_dac; + + /* + * Indicates if the attr operation is a set or a remove + * XFS_ATTR_OP_FLAGS_{SET,REMOVE} + */ + unsigned int xattri_op_flags; + + /* used to log this item to an intent */ + struct list_head xattri_list; +}; + + /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -490,10 +517,21 @@ int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_args(struct xfs_da_args *args); +int xfs_attr_set_iter(struct xfs_delattr_context *dac, + struct xfs_buf **leaf_bp); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_remove_iter(struct xfs_delattr_context *dac); bool xfs_attr_namecheck(const void *name, size_t length); void xfs_delattr_context_init(struct xfs_delattr_context *dac, struct xfs_da_args *args); +int xfs_attr_calc_size(struct xfs_da_args *args, int *local); + +extern struct kmem_cache *xfs_attri_cache; +extern struct kmem_cache *xfs_attrd_cache; + +int __init xfs_attri_init_cache(void); +void xfs_attri_destroy_cache(void); +int __init xfs_attrd_init_cache(void); +void xfs_attrd_destroy_cache(void); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 7e9a27f9d967..466f333ea508 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -23,6 +23,7 @@ #include "xfs_bmap.h" #include "xfs_alloc.h" #include "xfs_buf.h" +#include "xfs_attr.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -869,7 +870,12 @@ xfs_defer_init_item_caches(void) error = xfs_extfree_intent_init_cache(); if (error) goto err; - + error = xfs_attri_init_cache(); + if (error) + goto err; + error = xfs_attrd_init_cache(); + if (error) + goto err; return 0; err: xfs_defer_destroy_item_caches(); @@ -880,6 
+886,8 @@ err: void xfs_defer_destroy_item_caches(void) { + xfs_attri_destroy_cache(); + xfs_attrd_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); xfs_refcount_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7bb8a31ad65b..fcd23e5cf1ee 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -63,6 +63,8 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +extern const struct xfs_defer_op_type xfs_attr_defer_type; + /* * Deferred operation item relogging limits. diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index afce51633f03..a27492e99673 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -113,7 +113,12 @@ struct xfs_unmount_log_format { #define XLOG_REG_TYPE_CUD_FORMAT 24 #define XLOG_REG_TYPE_BUI_FORMAT 25 #define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_MAX 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_MAX 30 + /* * Flags to log operation header @@ -236,6 +241,8 @@ typedef struct xfs_trans_header { #define XFS_LI_CUD 0x1243 #define XFS_LI_BUI 0x1244 /* bmbt update intent */ #define XFS_LI_BUD 0x1245 +#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/ +#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -251,7 +258,9 @@ typedef struct xfs_trans_header { { XFS_LI_CUI, "XFS_LI_CUI" }, \ { XFS_LI_CUD, "XFS_LI_CUD" }, \ { XFS_LI_BUI, "XFS_LI_BUI" }, \ - { XFS_LI_BUD, "XFS_LI_BUD" } + { XFS_LI_BUD, "XFS_LI_BUD" }, \ + { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ + { XFS_LI_ATTRD, 
"XFS_LI_ATTRD" } /* * Inode Log Item Format definitions. @@ -893,4 +902,35 @@ struct xfs_icreate_log { __be32 icl_gen; /* inode generation number to use */ }; +/* + * Flags for deferred attribute operations. + * Upper bits are flags, lower byte is type code + */ +#define XFS_ATTR_OP_FLAGS_SET 1 /* Set the attribute */ +#define XFS_ATTR_OP_FLAGS_REMOVE 2 /* Remove the attribute */ +#define XFS_ATTR_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ + +/* + * This is the structure used to lay out an attr log item in the + * log. + */ +struct xfs_attri_log_format { + uint16_t alfi_type; /* attri log item type */ + uint16_t alfi_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfi_id; /* attri identifier */ + uint64_t alfi_ino; /* the inode for this attr operation */ + uint32_t alfi_op_flags; /* marks the op as a set or remove */ + uint32_t alfi_name_len; /* attr name length */ + uint32_t alfi_value_len; /* attr value length */ + uint32_t alfi_attr_flags;/* attr flags */ +}; + +struct xfs_attrd_log_format { + uint16_t alfd_type; /* attrd log item type */ + uint16_t alfd_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfd_alf_id; /* id of corresponding attri */ +}; + #endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index ff69a0000817..32e216255cb0 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -72,6 +72,8 @@ extern const struct xlog_recover_item_ops xlog_rui_item_ops; extern const struct xlog_recover_item_ops xlog_rud_item_ops; extern const struct xlog_recover_item_ops xlog_cui_item_ops; extern const struct xlog_recover_item_ops xlog_cud_item_ops; +extern const struct xlog_recover_item_ops xlog_attri_item_ops; +extern const struct xlog_recover_item_ops xlog_attrd_item_ops; /* * Macros, structures, prototypes for internal log manager use. 
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index bf1f3607d0b6..97b54ac3075f 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -23,6 +23,8 @@ #include "xfs_rmap_btree.h" #include "xfs_log.h" #include "xfs_trans_priv.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c new file mode 100644 index 000000000000..b1141ecaa1ab --- /dev/null +++ b/fs/xfs/xfs_attr_item.c @@ -0,0 +1,458 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022 Oracle. All Rights Reserved. + * Author: Allison Henderson + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_shared.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_item.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_trans_space.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" + +static const struct xfs_item_ops xfs_attri_item_ops; +static const struct xfs_item_ops xfs_attrd_item_ops; + +static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_attri_log_item, attri_item); +} + +STATIC void +xfs_attri_item_free( + struct xfs_attri_log_item *attrip) +{ + kmem_free(attrip->attri_item.li_lv_shadow); + kmem_free(attrip); +} + +/* + * Freeing the attrip requires that we remove it from the AIL if it has already + * been placed there. However, the ATTRI may not yet have been placed in the + * AIL when called by xfs_attri_release() from ATTRD processing due to the + * ordering of committed vs unpin operations in bulk insert operations. 
Hence + * the reference count to ensure only the last caller frees the ATTRI. + */ +STATIC void +xfs_attri_release( + struct xfs_attri_log_item *attrip) +{ + ASSERT(atomic_read(&attrip->attri_refcount) > 0); + if (!atomic_dec_and_test(&attrip->attri_refcount)) + return; + + xfs_trans_ail_delete(&attrip->attri_item, 0); + xfs_attri_item_free(attrip); +} + +STATIC void +xfs_attri_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + + *nvecs += 2; + *nbytes += sizeof(struct xfs_attri_log_format) + + xlog_calc_iovec_len(attrip->attri_name_len); + + if (!attrip->attri_value_len) + return; + + *nvecs += 1; + *nbytes += xlog_calc_iovec_len(attrip->attri_value_len); +} + +/* + * This is called to fill in the log iovecs for the given attri log + * item. We use 1 iovec for the attri_format_item, 1 for the name, and + * another for the value if it is present + */ +STATIC void +xfs_attri_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + attrip->attri_format.alfi_type = XFS_LI_ATTRI; + attrip->attri_format.alfi_size = 1; + + /* + * This size accounting must be done before copying the attrip into the + * iovec. If we do it after, the wrong size will be recorded to the log + * and we trip across assertion checks for bad region sizes later during + * the log recovery. 
+ */ + + ASSERT(attrip->attri_name_len > 0); + attrip->attri_format.alfi_size++; + + if (attrip->attri_value_len > 0) + attrip->attri_format.alfi_size++; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, + &attrip->attri_format, + sizeof(struct xfs_attri_log_format)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, + attrip->attri_name, + xlog_calc_iovec_len(attrip->attri_name_len)); + if (attrip->attri_value_len > 0) + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE, + attrip->attri_value, + xlog_calc_iovec_len(attrip->attri_value_len)); +} + +/* + * The unpin operation is the last place an ATTRI is manipulated in the log. It + * is either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the ATTRI transaction has been successfully committed to make + * it this far. Therefore, we expect whoever committed the ATTRI to either + * construct and commit the ATTRD or drop the ATTRD's reference in the event of + * error. Simply drop the log's ATTRI reference now that the log is done with + * it. + */ +STATIC void +xfs_attri_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + xfs_attri_release(ATTRI_ITEM(lip)); +} + + +STATIC void +xfs_attri_item_release( + struct xfs_log_item *lip) +{ + xfs_attri_release(ATTRI_ITEM(lip)); +} + +/* + * Allocate and initialize an attri item. 
Caller may allocate an additional + * trailing buffer for name and value + */ +STATIC struct xfs_attri_log_item * +xfs_attri_init( + struct xfs_mount *mp, + uint32_t name_len, + uint32_t value_len) + +{ + struct xfs_attri_log_item *attrip; + uint32_t name_vec_len = 0; + uint32_t value_vec_len = 0; + uint32_t buffer_size; + + if (name_len) + name_vec_len = xlog_calc_iovec_len(name_len); + if (value_len) + value_vec_len = xlog_calc_iovec_len(value_len); + + buffer_size = name_vec_len + value_vec_len; + + if (buffer_size) { + attrip = kmem_zalloc(sizeof(struct xfs_attri_log_item) + + buffer_size, KM_NOFS); + if (attrip == NULL) + return NULL; + } else { + attrip = kmem_cache_zalloc(xfs_attri_cache, + GFP_NOFS | __GFP_NOFAIL); + } + + attrip->attri_name_len = name_len; + if (name_len) + attrip->attri_name = ((char *)attrip) + + sizeof(struct xfs_attri_log_item); + else + attrip->attri_name = NULL; + + attrip->attri_value_len = value_len; + if (value_len) + attrip->attri_value = ((char *)attrip) + + sizeof(struct xfs_attri_log_item) + + name_vec_len; + else + attrip->attri_value = NULL; + + xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI, + &xfs_attri_item_ops); + attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip; + atomic_set(&attrip->attri_refcount, 2); + + return attrip; +} + +/* + * Copy an attr format buffer from the given buf, and into the destination attr + * format structure. 
+ */ +STATIC int +xfs_attri_copy_format( + struct xfs_log_iovec *buf, + struct xfs_attri_log_format *dst_attr_fmt) +{ + struct xfs_attri_log_format *src_attr_fmt = buf->i_addr; + size_t len; + + len = sizeof(struct xfs_attri_log_format); + if (buf->i_len != len) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; + } + + memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len); + return 0; +} + +static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_attrd_log_item, attrd_item); +} + +STATIC void +xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp) +{ + kmem_free(attrdp->attrd_item.li_lv_shadow); + kmem_free(attrdp); +} + +STATIC void +xfs_attrd_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_attrd_log_format); +} + +/* + * This is called to fill in the log iovecs for the given attrd log item. We use + * only 1 iovec for the attrd_format, and we point that at the attr_log_format + * structure embedded in the attrd item. + */ +STATIC void +xfs_attrd_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + attrdp->attrd_format.alfd_type = XFS_LI_ATTRD; + attrdp->attrd_format.alfd_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT, + &attrdp->attrd_format, + sizeof(struct xfs_attrd_log_format)); +} + +/* + * The ATTRD is either committed or aborted if the transaction is canceled. If + * the transaction is canceled, drop our reference to the ATTRI and free the + * ATTRD. 
+ */ +STATIC void +xfs_attrd_item_release( + struct xfs_log_item *lip) +{ + struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip); + + xfs_attri_release(attrdp->attrd_attrip); + xfs_attrd_item_free(attrdp); +} + +STATIC xfs_lsn_t +xfs_attri_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + + /* + * The attrip refers to xfs_attr_item memory to log the name and value + * with the intent item. This already occurred when the intent was + * committed so these fields are no longer accessed. Clear them out of + * caution since we're about to free the xfs_attr_item. + */ + attrip->attri_name = NULL; + attrip->attri_value = NULL; + + /* + * The ATTRI is logged only once and cannot be moved in the log, so + * simply return the lsn at which it's been logged. + */ + return lsn; +} + +STATIC bool +xfs_attri_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id; +} + +/* Is this recovered ATTRI format ok? 
*/ +static inline bool +xfs_attri_validate( + struct xfs_mount *mp, + struct xfs_attri_log_format *attrp) +{ + unsigned int op = attrp->alfi_op_flags & + XFS_ATTR_OP_FLAGS_TYPE_MASK; + + if (attrp->__pad != 0) + return false; + + /* alfi_op_flags should be either a set or remove */ + if (op != XFS_ATTR_OP_FLAGS_SET && op != XFS_ATTR_OP_FLAGS_REMOVE) + return false; + + if (attrp->alfi_value_len > XATTR_SIZE_MAX) + return false; + + if ((attrp->alfi_name_len > XATTR_NAME_MAX) || + (attrp->alfi_name_len == 0)) + return false; + + return xfs_verify_ino(mp, attrp->alfi_ino); +} + +STATIC int +xlog_recover_attri_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_attri_log_item *attrip; + struct xfs_attri_log_format *attri_formatp; + int region = 0; + + attri_formatp = item->ri_buf[region].i_addr; + + /* Validate xfs_attri_log_format */ + if (!xfs_attri_validate(mp, attri_formatp)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + /* memory alloc failure will cause replay to abort */ + attrip = xfs_attri_init(mp, attri_formatp->alfi_name_len, + attri_formatp->alfi_value_len); + if (attrip == NULL) + return -ENOMEM; + + error = xfs_attri_copy_format(&item->ri_buf[region], + &attrip->attri_format); + if (error) + goto out; + + region++; + memcpy(attrip->attri_name, item->ri_buf[region].i_addr, + attrip->attri_name_len); + + if (!xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out; + } + + if (attrip->attri_value_len > 0) { + region++; + memcpy(attrip->attri_value, item->ri_buf[region].i_addr, + attrip->attri_value_len); + } + + /* + * The ATTRI has two references. One for the ATTRD and one for ATTRI to + * ensure it makes it into the AIL. 
Insert the ATTRI into the AIL + * directly and drop the ATTRI reference. Note that + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); + xfs_attri_release(attrip); + return 0; +out: + xfs_attri_item_free(attrip); + return error; +} + +/* + * This routine is called when an ATTRD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding ATTRI if + * it was still in the log. To do this it searches the AIL for the ATTRI with + * an id equal to that in the ATTRD format structure. If we find it we drop + * the ATTRD reference, which removes the ATTRI from the AIL and frees it. + */ +STATIC int +xlog_recover_attrd_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_attrd_log_format *attrd_formatp; + + attrd_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_ATTRI, + attrd_formatp->alfd_alf_id); + return 0; +} + +static const struct xfs_item_ops xfs_attri_item_ops = { + .flags = XFS_ITEM_INTENT, + .iop_size = xfs_attri_item_size, + .iop_format = xfs_attri_item_format, + .iop_unpin = xfs_attri_item_unpin, + .iop_committed = xfs_attri_item_committed, + .iop_release = xfs_attri_item_release, + .iop_match = xfs_attri_item_match, +}; + +const struct xlog_recover_item_ops xlog_attri_item_ops = { + .item_type = XFS_LI_ATTRI, + .commit_pass2 = xlog_recover_attri_commit_pass2, +}; + +static const struct xfs_item_ops xfs_attrd_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | + XFS_ITEM_INTENT_DONE, + .iop_size = xfs_attrd_item_size, + .iop_format = xfs_attrd_item_format, + .iop_release = xfs_attrd_item_release, +}; + +const struct xlog_recover_item_ops xlog_attrd_item_ops = { + .item_type = 
XFS_LI_ATTRD, + .commit_pass2 = xlog_recover_attrd_commit_pass2, +}; diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h new file mode 100644 index 000000000000..c3b779f82adb --- /dev/null +++ b/fs/xfs/xfs_attr_item.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (C) 2022 Oracle. All Rights Reserved. + * Author: Allison Henderson + */ +#ifndef __XFS_ATTR_ITEM_H__ +#define __XFS_ATTR_ITEM_H__ + +/* kernel only ATTRI/ATTRD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * This is the "attr intention" log item. It is used to log the fact that some + * extended attribute operations need to be processed. An operation is + * currently either a set or remove. Set or remove operations are described by + * the xfs_attr_item which may be logged to this intent. + * + * During a normal attr operation, name and value point to the name and value + * fields of the caller's xfs_da_args structure. During a recovery, the name + * and value buffers are copied from the log, and stored in a trailing buffer + * attached to the xfs_attr_item until they are committed. They are freed when + * the xfs_attr_item itself is freed when the work is done. + */ +struct xfs_attri_log_item { + struct xfs_log_item attri_item; + atomic_t attri_refcount; + int attri_name_len; + int attri_value_len; + void *attri_name; + void *attri_value; + struct xfs_attri_log_format attri_format; +}; + +/* + * This is the "attr done" log item. It is used to log the fact that some attrs + * earlier mentioned in an attri item have been freed. 
+ */ +struct xfs_attrd_log_item { + struct xfs_log_item attrd_item; + struct xfs_attri_log_item *attrd_attrip; + struct xfs_attrd_log_format attrd_format; +}; + +#endif /* __XFS_ATTR_ITEM_H__ */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 2d1e5134cebe..90a14e85e76d 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -15,6 +15,7 @@ #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_sf.h" #include "xfs_attr_leaf.h" diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index ca25ed89b706..2f54b701eead 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -17,6 +17,8 @@ #include "xfs_itable.h" #include "xfs_fsops.h" #include "xfs_rtalloc.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_ioctl.h" #include "xfs_ioctl32.h" diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 94313b7e9991..e912b7fee714 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -13,6 +13,8 @@ #include "xfs_inode.h" #include "xfs_acl.h" #include "xfs_quota.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trans.h" #include "xfs_trace.h" diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c4ad4296c540..97b941c07957 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1800,6 +1800,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_cud_item_ops, &xlog_bui_item_ops, &xlog_bud_item_ops, + &xlog_attri_item_ops, + &xlog_attrd_item_ops, }; static const struct xlog_recover_item_ops * diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 25991923c1a8..758702b9495f 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -132,6 +132,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); 
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40); + XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16); /* * The v5 superblock format extended several v4 header structures with -- cgit From 1d08e11d04d293cb7006d1c8641be1fdd8a8e397 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Mon, 9 May 2022 19:09:07 +1000 Subject: xfs: Implement attr logging and replay This patch adds the needed routines to create, log and recover logged extended attribute intents. Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_defer.c | 1 + fs/xfs/libxfs/xfs_defer.h | 1 + fs/xfs/libxfs/xfs_format.h | 9 +- fs/xfs/xfs_attr_item.c | 368 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 378 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 466f333ea508..b2ecc272f9e4 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -186,6 +186,7 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, + [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, }; static bool diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index fcd23e5cf1ee..114a3a4930a3 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -19,6 +19,7 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_AGFL_FREE, + XFS_DEFER_OPS_TYPE_ATTR, XFS_DEFER_OPS_TYPE_MAX, }; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 96fd49fbc9fa..afdfc8108c5f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -390,7 +390,9 @@ xfs_sb_has_incompat_feature( return (sbp->sb_features_incompat & feature) != 0; } -#define 
XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */ +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \ + (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS) #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( @@ -415,6 +417,11 @@ xfs_sb_add_incompat_log_features( sbp->sb_features_log_incompat |= features; } +static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp) +{ + return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); +} static inline bool xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index b1141ecaa1ab..bdcbf32690f4 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -13,6 +13,7 @@ #include "xfs_defer.h" #include "xfs_log_format.h" #include "xfs_trans.h" +#include "xfs_bmap_btree.h" #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_inode.h" @@ -29,6 +30,8 @@ static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; +static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, + struct xfs_attri_log_item *attrip); static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) { @@ -283,6 +286,179 @@ xfs_attrd_item_release( xfs_attrd_item_free(attrdp); } +static struct xfs_log_item * +xfs_attrd_item_intent( + struct xfs_log_item *lip) +{ + return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; +} + +/* + * Performs one step of an attribute update intent and marks the attrd item + * dirty. An attr operation may be a set or a remove. Note that the + * transaction is marked dirty regardless of whether the operation succeeds or + * fails, to support the ATTRI/ATTRD lifecycle rules.
+ */ +STATIC int +xfs_xattri_finish_update( + struct xfs_delattr_context *dac, + struct xfs_attrd_log_item *attrdp, + struct xfs_buf **leaf_bp, + uint32_t op_flags) +{ + struct xfs_da_args *args = dac->da_args; + unsigned int op = op_flags & + XFS_ATTR_OP_FLAGS_TYPE_MASK; + int error; + + switch (op) { + case XFS_ATTR_OP_FLAGS_SET: + error = xfs_attr_set_iter(dac, leaf_bp); + break; + case XFS_ATTR_OP_FLAGS_REMOVE: + ASSERT(XFS_IFORK_Q(args->dp)); + error = xfs_attr_remove_iter(dac); + break; + default: + error = -EFSCORRUPTED; + break; + } + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the ATTRI and frees the ATTRD + * 2.) shuts down the filesystem + */ + args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; + + /* + * attr intent/done items are null when logged attributes are disabled + */ + if (attrdp) + set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); + + return error; +} + +/* Log an attr to the intent item. */ +STATIC void +xfs_attr_log_item( + struct xfs_trans *tp, + struct xfs_attri_log_item *attrip, + struct xfs_attr_item *attr) +{ + struct xfs_attri_log_format *attrp; + + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); + + /* + * At this point the xfs_attr_item has been constructed, and we've + * created the log intent. 
Fill in the attri log item and log format + * structure with fields from this xfs_attr_item + */ + attrp = &attrip->attri_format; + attrp->alfi_ino = attr->xattri_dac.da_args->dp->i_ino; + attrp->alfi_op_flags = attr->xattri_op_flags; + attrp->alfi_value_len = attr->xattri_dac.da_args->valuelen; + attrp->alfi_name_len = attr->xattri_dac.da_args->namelen; + attrp->alfi_attr_flags = attr->xattri_dac.da_args->attr_filter; + + memcpy(attrip->attri_name, attr->xattri_dac.da_args->name, + attr->xattri_dac.da_args->namelen); + memcpy(attrip->attri_value, attr->xattri_dac.da_args->value, + attr->xattri_dac.da_args->valuelen); + attrip->attri_name_len = attr->xattri_dac.da_args->namelen; + attrip->attri_value_len = attr->xattri_dac.da_args->valuelen; +} + +/* Get an ATTRI. */ +static struct xfs_log_item * +xfs_attr_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_attri_log_item *attrip; + struct xfs_attr_item *attr; + + ASSERT(count == 1); + + if (!xfs_sb_version_haslogxattrs(&mp->m_sb)) + return NULL; + + /* + * Each attr item only performs one attribute operation at a time, so + * this is a list of one + */ + list_for_each_entry(attr, items, xattri_list) { + attrip = xfs_attri_init(mp, attr->xattri_dac.da_args->namelen, + attr->xattri_dac.da_args->valuelen); + if (attrip == NULL) + return NULL; + + xfs_trans_add_item(tp, &attrip->attri_item); + xfs_attr_log_item(tp, attrip, attr); + } + + return &attrip->attri_item; +} + +/* Process an attr. 
*/ +STATIC int +xfs_attr_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + struct xfs_attr_item *attr; + struct xfs_attrd_log_item *done_item = NULL; + int error; + struct xfs_delattr_context *dac; + + attr = container_of(item, struct xfs_attr_item, xattri_list); + dac = &attr->xattri_dac; + if (done) + done_item = ATTRD_ITEM(done); + + /* + * Always reset trans after EAGAIN cycle + * since the transaction is new + */ + dac->da_args->trans = tp; + + error = xfs_xattri_finish_update(dac, done_item, &dac->leaf_bp, + attr->xattri_op_flags); + if (error != -EAGAIN) + kmem_free(attr); + + return error; +} + +/* Abort all pending ATTRs. */ +STATIC void +xfs_attr_abort_intent( + struct xfs_log_item *intent) +{ + xfs_attri_release(ATTRI_ITEM(intent)); +} + +/* Cancel an attr */ +STATIC void +xfs_attr_cancel_item( + struct list_head *item) +{ + struct xfs_attr_item *attr; + + attr = container_of(item, struct xfs_attr_item, xattri_list); + kmem_free(attr); +} + STATIC xfs_lsn_t xfs_attri_item_committed( struct xfs_log_item *lip, @@ -340,6 +516,151 @@ xfs_attri_validate( return xfs_verify_ino(mp, attrp->alfi_ino); } +/* + * Process an attr intent item that was recovered from the log. We need to + * set or remove the attr that it describes. + */ +STATIC int +xfs_attri_item_recover( + struct xfs_log_item *lip, + struct list_head *capture_list) +{ + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attr_item *attr; + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_inode *ip; + struct xfs_da_args *args; + struct xfs_trans *tp; + struct xfs_trans_res tres; + struct xfs_attri_log_format *attrp; + int error, ret = 0; + int total; + int local; + struct xfs_attrd_log_item *done_item = NULL; + + /* + * First check the validity of the attr described by the ATTRI. If any + * are bad, then assume that all are bad and just toss the ATTRI.
+ */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || + !xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) + return -EFSCORRUPTED; + + error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + if (error) + return error; + + attr = kmem_zalloc(sizeof(struct xfs_attr_item) + + sizeof(struct xfs_da_args), KM_NOFS); + args = (struct xfs_da_args *)(attr + 1); + + attr->xattri_dac.da_args = args; + attr->xattri_op_flags = attrp->alfi_op_flags; + + args->dp = ip; + args->geo = mp->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->name = attrip->attri_name; + args->namelen = attrp->alfi_name_len; + args->hashval = xfs_da_hashname(args->name, args->namelen); + args->attr_filter = attrp->alfi_attr_flags; + + if (attrp->alfi_op_flags == XFS_ATTR_OP_FLAGS_SET) { + args->value = attrip->attri_value; + args->valuelen = attrp->alfi_value_len; + args->total = xfs_attr_calc_size(args, &local); + + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * + args->total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + total = args->total; + } else { + tres = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); + } + error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp); + if (error) + goto out; + + args->trans = tp; + done_item = xfs_trans_get_attrd(tp, attrip); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + ret = xfs_xattri_finish_update(&attr->xattri_dac, done_item, + &attr->xattri_dac.leaf_bp, + attrp->alfi_op_flags); + if (ret == -EAGAIN) { + /* There's more work to do, so add it to this transaction */ + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); + } else + error = ret; + + if (error) { + xfs_trans_cancel(tp); + goto out_unlock; + } + + error = xfs_defer_ops_capture_and_commit(tp, capture_list); + +out_unlock: + if (attr->xattri_dac.leaf_bp) + xfs_buf_relse(attr->xattri_dac.leaf_bp); + + xfs_iunlock(ip, 
XFS_ILOCK_EXCL); + xfs_irele(ip); +out: + if (ret != -EAGAIN) + kmem_free(attr); + return error; +} + +/* Re-log an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_attri_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_attrd_log_item *attrdp; + struct xfs_attri_log_item *old_attrip; + struct xfs_attri_log_item *new_attrip; + struct xfs_attri_log_format *new_attrp; + struct xfs_attri_log_format *old_attrp; + + old_attrip = ATTRI_ITEM(intent); + old_attrp = &old_attrip->attri_format; + + tp->t_flags |= XFS_TRANS_DIRTY; + attrdp = xfs_trans_get_attrd(tp, old_attrip); + set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); + + new_attrip = xfs_attri_init(tp->t_mountp, old_attrp->alfi_name_len, + old_attrp->alfi_value_len); + new_attrp = &new_attrip->attri_format; + + new_attrp->alfi_ino = old_attrp->alfi_ino; + new_attrp->alfi_op_flags = old_attrp->alfi_op_flags; + new_attrp->alfi_value_len = old_attrp->alfi_value_len; + new_attrp->alfi_name_len = old_attrp->alfi_name_len; + new_attrp->alfi_attr_flags = old_attrp->alfi_attr_flags; + + memcpy(new_attrip->attri_name, old_attrip->attri_name, + new_attrip->attri_name_len); + + if (new_attrip->attri_value_len > 0) + memcpy(new_attrip->attri_value, old_attrip->attri_value, + new_attrip->attri_value_len); + + xfs_trans_add_item(tp, &new_attrip->attri_item); + set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); + + return &new_attrip->attri_item; +} + STATIC int xlog_recover_attri_commit_pass2( struct xlog *log, @@ -402,6 +723,50 @@ out: return error; } +/* + * This routine is called to allocate an "attr free done" log item. 
+ */ +static struct xfs_attrd_log_item * +xfs_trans_get_attrd(struct xfs_trans *tp, + struct xfs_attri_log_item *attrip) +{ + struct xfs_attrd_log_item *attrdp; + + ASSERT(tp != NULL); + + attrdp = kmem_cache_alloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + + xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, + &xfs_attrd_item_ops); + attrdp->attrd_attrip = attrip; + attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; + + xfs_trans_add_item(tp, &attrdp->attrd_item); + return attrdp; +} + +/* Get an ATTRD so we can process all the attrs. */ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + if (!intent) + return NULL; + + return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item; +} + +const struct xfs_defer_op_type xfs_attr_defer_type = { + .max_items = 1, + .create_intent = xfs_attr_create_intent, + .abort_intent = xfs_attr_abort_intent, + .create_done = xfs_attr_create_done, + .finish_item = xfs_attr_finish_item, + .cancel_item = xfs_attr_cancel_item, +}; + /* * This routine is called when an ATTRD format structure is found in a committed * transaction in the log. 
Its purpose is to cancel the corresponding ATTRI if @@ -436,7 +801,9 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_unpin = xfs_attri_item_unpin, .iop_committed = xfs_attri_item_committed, .iop_release = xfs_attri_item_release, + .iop_recover = xfs_attri_item_recover, .iop_match = xfs_attri_item_match, + .iop_relog = xfs_attri_item_relog, }; const struct xlog_recover_item_ops xlog_attri_item_ops = { @@ -450,6 +817,7 @@ static const struct xfs_item_ops xfs_attrd_item_ops = { .iop_size = xfs_attrd_item_size, .iop_format = xfs_attrd_item_format, .iop_release = xfs_attrd_item_release, + .iop_intent = xfs_attrd_item_intent, }; const struct xlog_recover_item_ops xlog_attrd_item_ops = { -- cgit From f38dc503d366b589d98d5676a5b279d10b47bcb9 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Mon, 9 May 2022 19:09:10 +1000 Subject: xfs: Skip flip flags for delayed attrs This is a clean up patch that skips the flip flag logic for delayed attr renames. Since the log replay keeps the inode locked, we do not need to worry about race windows with attr lookups. So we can skip over flipping the flag and the extra transaction roll for it Signed-off-by: Allison Henderson Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 54 ++++++++++++++++++++++++++----------------- fs/xfs/libxfs/xfs_attr_leaf.c | 3 ++- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e22884fb6f00..f5e74837b3de 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -358,6 +358,7 @@ xfs_attr_set_iter( struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; int forkoff, error = 0; + struct xfs_mount *mp = args->dp->i_mount; /* State machine switch */ switch (dac->dela_state) { @@ -480,16 +481,21 @@ xfs_attr_set_iter( * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. 
*/ - error = xfs_attr3_leaf_flipflags(args); - if (error) - return error; - /* - * Commit the flag value change and start the next trans in - * series. - */ - dac->dela_state = XFS_DAS_FLIP_LFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; + if (!xfs_has_larp(mp)) { + error = xfs_attr3_leaf_flipflags(args); + if (error) + return error; + /* + * Commit the flag value change and start the next trans + * in series. + */ + dac->dela_state = XFS_DAS_FLIP_LFLAG; + trace_xfs_attr_set_iter_return(dac->dela_state, + args->dp); + return -EAGAIN; + } + + fallthrough; case XFS_DAS_FLIP_LFLAG: /* * Dismantle the "old" attribute/value pair by removing a @@ -592,17 +598,21 @@ xfs_attr_set_iter( * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - goto out; - /* - * Commit the flag value change and start the next trans in - * series - */ - dac->dela_state = XFS_DAS_FLIP_NFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; + if (!xfs_has_larp(mp)) { + error = xfs_attr3_leaf_flipflags(args); + if (error) + goto out; + /* + * Commit the flag value change and start the next trans + * in series + */ + dac->dela_state = XFS_DAS_FLIP_NFLAG; + trace_xfs_attr_set_iter_return(dac->dela_state, + args->dp); + return -EAGAIN; + } + fallthrough; case XFS_DAS_FLIP_NFLAG: /* * Dismantle the "old" attribute/value pair by removing a @@ -1273,6 +1283,7 @@ xfs_attr_node_addname_clear_incomplete( { struct xfs_da_args *args = dac->da_args; struct xfs_da_state *state = NULL; + struct xfs_mount *mp = args->dp->i_mount; int retval = 0; int error = 0; @@ -1280,7 +1291,8 @@ xfs_attr_node_addname_clear_incomplete( * Re-find the "old" attribute entry after any split ops. The INCOMPLETE * flag means that we will find the "old" attr, not the "new" one. 
*/ - args->attr_filter |= XFS_ATTR_INCOMPLETE; + if (!xfs_has_larp(mp)) + args->attr_filter |= XFS_ATTR_INCOMPLETE; state = xfs_da_state_alloc(args); state->inleaf = 0; error = xfs_da3_node_lookup_int(state, &retval); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 014daa8c542d..74b76b09509f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1487,7 +1487,8 @@ xfs_attr3_leaf_add_work( if (tmp) entry->flags |= XFS_ATTR_LOCAL; if (args->op_flags & XFS_DA_OP_RENAME) { - entry->flags |= XFS_ATTR_INCOMPLETE; + if (!xfs_has_larp(mp)) + entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { args->index2++; -- cgit From f3f36c893f260275eb9229cdc3dabb4c79650591 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 11 May 2022 17:01:13 +1000 Subject: xfs: Add xfs_attr_set_deferred and xfs_attr_remove_deferred These routines set up and queue new deferred attribute operations. These functions are meant to be called by any routine needing to initiate a deferred attribute operation as opposed to the existing inline operations. A new helper function, xfs_attr_item_init, is also added. Finally, enable delayed attributes in xfs_attr_set and xfs_attr_remove. Signed-off-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 70 +++++++++++++++++++++++++++++++++++++++++++++--- fs/xfs/libxfs/xfs_attr.h | 2 ++ fs/xfs/xfs_log.c | 41 ++++++++++++++++++++++++++++ fs/xfs/xfs_log.h | 1 + 4 files changed, 111 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index f5e74837b3de..5bfe3ff9f3e0 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -25,6 +25,7 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_attr_item.h" +#include "xfs_log.h" struct kmem_cache *xfs_attri_cache; struct kmem_cache *xfs_attrd_cache; @@ -729,6 +730,7 @@ xfs_attr_set( int error, local; int rmt_blks = 0; unsigned int total; + int delayed = xfs_has_larp(mp); if (xfs_is_shutdown(dp->i_mount)) return -EIO; @@ -785,13 +787,19 @@ xfs_attr_set( rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } + if (delayed) { + error = xfs_attr_use_log_assist(mp); + if (error) + return error; + } + /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) - return error; + goto drop_incompat; if (args->value || xfs_inode_hasattr(dp)) { error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, @@ -812,9 +820,10 @@ xfs_attr_set( if (error != -ENOATTR && error != -EEXIST) goto out_trans_cancel; - error = xfs_attr_set_args(args); + error = xfs_attr_set_deferred(args); if (error) goto out_trans_cancel; + /* shortform attribute has already been committed */ if (!args->trans) goto out_unlock; @@ -822,7 +831,7 @@ xfs_attr_set( if (error != -EEXIST) goto out_trans_cancel; - error = xfs_attr_remove_args(args); + error = xfs_attr_remove_deferred(args); if (error) goto out_trans_cancel; } @@ -844,6 +853,9 @@ xfs_attr_set( error = xfs_trans_commit(args->trans); out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); +drop_incompat: + if (delayed) + 
xlog_drop_incompat_feat(mp->m_log); return error; out_trans_cancel: @@ -886,6 +898,58 @@ xfs_attrd_destroy_cache(void) xfs_attrd_cache = NULL; } +STATIC int +xfs_attr_item_init( + struct xfs_da_args *args, + unsigned int op_flags, /* op flag (set or remove) */ + struct xfs_attr_item **attr) /* new xfs_attr_item */ +{ + + struct xfs_attr_item *new; + + new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS); + new->xattri_op_flags = op_flags; + new->xattri_dac.da_args = args; + + *attr = new; + return 0; +} + +/* Sets an attribute for an inode as a deferred operation */ +int +xfs_attr_set_deferred( + struct xfs_da_args *args) +{ + struct xfs_attr_item *new; + int error = 0; + + error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_SET, &new); + if (error) + return error; + + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + + return 0; +} + +/* Removes an attribute for an inode as a deferred operation */ +int +xfs_attr_remove_deferred( + struct xfs_da_args *args) +{ + + struct xfs_attr_item *new; + int error; + + error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REMOVE, &new); + if (error) + return error; + + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + + return 0; +} + /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 80b6f28b0d1a..b52156ad8e6e 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -525,6 +525,8 @@ bool xfs_attr_namecheck(const void *name, size_t length); void xfs_delattr_context_init(struct xfs_delattr_context *dac, struct xfs_da_args *args); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); +int xfs_attr_set_deferred(struct xfs_da_args *args); +int xfs_attr_remove_deferred(struct xfs_da_args *args); extern struct kmem_cache 
*xfs_attri_cache; extern struct kmem_cache *xfs_attrd_cache; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1e972f884a81..9dc748abdf33 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3877,3 +3877,44 @@ xlog_drop_incompat_feat( { up_read(&log->l_incompat_users); } + +/* + * Get permission to use log-assisted extended attributes (logged xattrs). + * + * Callers must not be running any transactions or hold any inode locks, and + * they must release the permission by calling xlog_drop_incompat_feat + * when they're done. + */ +int +xfs_attr_use_log_assist( + struct xfs_mount *mp) +{ + int error = 0; + + /* + * Protect ourselves from an idle log clearing the logged xattrs log + * incompat feature bit. + */ + xlog_use_incompat_feat(mp->m_log); + + /* + * If log-assisted xattrs are already enabled, the caller can use the + * log-assisted xattr functions with the log-incompat reference we got. + */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + return 0; + + /* Enable log-assisted xattrs. */ + error = xfs_add_incompat_log_feature(mp, + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); + if (error) + goto drop_incompat; + + xfs_warn_once(mp, +"EXPERIMENTAL logged extended attributes feature added. Use at your own risk!"); + + return 0; +drop_incompat: + xlog_drop_incompat_feat(mp->m_log); + return error; +} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 3a4f6a4e4eb7..252b098cde1f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -153,5 +153,6 @@ bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags); void xlog_use_incompat_feat(struct xlog *log); void xlog_drop_incompat_feat(struct xlog *log); +int xfs_attr_use_log_assist(struct xfs_mount *mp); #endif /* __XFS_LOG_H__ */ -- cgit From 73159fc27c6944ebe55e6652d6a1981d7cb3eb4a Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 11 May 2022 17:01:22 +1000 Subject: xfs: Remove unused xfs_attr_*_args Remove xfs_attr_set_args, xfs_attr_remove_args, and xfs_attr_trans_roll. 
These high level loops are now driven by the delayed operations code, and can be removed. Additionally collapse in the leaf_bp parameter of xfs_attr_set_iter since we only have one caller that passes dac->leaf_bp Signed-off-by: Allison Henderson Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 106 ++++------------------------------------ fs/xfs/libxfs/xfs_attr.h | 8 +-- fs/xfs/libxfs/xfs_attr_remote.c | 1 - fs/xfs/xfs_attr_item.c | 9 ++-- 4 files changed, 14 insertions(+), 110 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 5bfe3ff9f3e0..b00bff3270ce 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -246,64 +246,9 @@ xfs_attr_is_shortform( ip->i_afp->if_nextents == 0); } -/* - * Checks to see if a delayed attribute transaction should be rolled. If so, - * transaction is finished or rolled as needed. - */ -STATIC int -xfs_attr_trans_roll( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - int error; - - if (dac->flags & XFS_DAC_DEFER_FINISH) { - /* - * The caller wants us to finish all the deferred ops so that we - * avoid pinning the log tail with a large number of deferred - * ops. - */ - dac->flags &= ~XFS_DAC_DEFER_FINISH; - error = xfs_defer_finish(&args->trans); - } else - error = xfs_trans_roll_inode(&args->trans, args->dp); - - return error; -} - -/* - * Set the attribute specified in @args. 
- */ -int -xfs_attr_set_args( - struct xfs_da_args *args) -{ - struct xfs_buf *leaf_bp = NULL; - int error = 0; - struct xfs_delattr_context dac = { - .da_args = args, - }; - - do { - error = xfs_attr_set_iter(&dac, &leaf_bp); - if (error != -EAGAIN) - break; - - error = xfs_attr_trans_roll(&dac); - if (error) { - if (leaf_bp) - xfs_trans_brelse(args->trans, leaf_bp); - return error; - } - } while (true); - - return error; -} - STATIC int xfs_attr_sf_addname( - struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp) + struct xfs_delattr_context *dac) { struct xfs_da_args *args = dac->da_args; struct xfs_inode *dp = args->dp; @@ -322,7 +267,7 @@ xfs_attr_sf_addname( * It won't fit in the shortform, transform to a leaf block. GROT: * another possible req'mt for a double-split btree op. */ - error = xfs_attr_shortform_to_leaf(args, leaf_bp); + error = xfs_attr_shortform_to_leaf(args, &dac->leaf_bp); if (error) return error; @@ -331,7 +276,7 @@ xfs_attr_sf_addname( * push cannot grab the half-baked leaf buffer and run into problems * with the write verifier. */ - xfs_trans_bhold(args->trans, *leaf_bp); + xfs_trans_bhold(args->trans, dac->leaf_bp); /* * We're still in XFS_DAS_UNINIT state here. We've converted @@ -339,7 +284,6 @@ xfs_attr_sf_addname( * add. */ trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp); - dac->flags |= XFS_DAC_DEFER_FINISH; return -EAGAIN; } @@ -352,8 +296,7 @@ xfs_attr_sf_addname( */ int xfs_attr_set_iter( - struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp) + struct xfs_delattr_context *dac) { struct xfs_da_args *args = dac->da_args; struct xfs_inode *dp = args->dp; @@ -372,14 +315,14 @@ xfs_attr_set_iter( * release the hold once we return with a clean transaction. 
*/ if (xfs_attr_is_shortform(dp)) - return xfs_attr_sf_addname(dac, leaf_bp); - if (*leaf_bp != NULL) { - xfs_trans_bhold_release(args->trans, *leaf_bp); - *leaf_bp = NULL; + return xfs_attr_sf_addname(dac); + if (dac->leaf_bp != NULL) { + xfs_trans_bhold_release(args->trans, dac->leaf_bp); + dac->leaf_bp = NULL; } if (xfs_attr_is_leaf(dp)) { - error = xfs_attr_leaf_try_add(args, *leaf_bp); + error = xfs_attr_leaf_try_add(args, dac->leaf_bp); if (error == -ENOSPC) { error = xfs_attr3_leaf_to_node(args); if (error) @@ -398,7 +341,6 @@ xfs_attr_set_iter( * be a node, so we'll fall down into the node * handling code below */ - dac->flags |= XFS_DAC_DEFER_FINISH; trace_xfs_attr_set_iter_return( dac->dela_state, args->dp); return -EAGAIN; @@ -689,32 +631,6 @@ xfs_attr_lookup( return xfs_attr_node_hasname(args, NULL); } -/* - * Remove the attribute specified in @args. - */ -int -xfs_attr_remove_args( - struct xfs_da_args *args) -{ - int error; - struct xfs_delattr_context dac = { - .da_args = args, - }; - - do { - error = xfs_attr_remove_iter(&dac); - if (error != -EAGAIN) - break; - - error = xfs_attr_trans_roll(&dac); - if (error) - return error; - - } while (true); - - return error; -} - /* * Note: If args->value is NULL the attribute will be removed, just like the * Linux ->setattr API. @@ -1311,7 +1227,6 @@ xfs_attr_node_addname( * this. dela_state is still unset by this function at * this point. */ - dac->flags |= XFS_DAC_DEFER_FINISH; trace_xfs_attr_node_addname_return( dac->dela_state, args->dp); return -EAGAIN; @@ -1326,7 +1241,6 @@ xfs_attr_node_addname( error = xfs_da3_split(state); if (error) goto out; - dac->flags |= XFS_DAC_DEFER_FINISH; } else { /* * Addition succeeded, update Btree hashvals. 
@@ -1580,7 +1494,6 @@ xfs_attr_remove_iter( if (error) goto out; dac->dela_state = XFS_DAS_RM_NAME; - dac->flags |= XFS_DAC_DEFER_FINISH; trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp); return -EAGAIN; } @@ -1608,7 +1521,6 @@ xfs_attr_remove_iter( if (error) goto out; - dac->flags |= XFS_DAC_DEFER_FINISH; dac->dela_state = XFS_DAS_RM_SHRINK; trace_xfs_attr_remove_iter_return( dac->dela_state, args->dp); diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index b52156ad8e6e..5331551d5939 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -457,8 +457,7 @@ enum xfs_delattr_state { /* * Defines for xfs_delattr_context.flags */ -#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */ -#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/ +#define XFS_DAC_LEAF_ADDNAME_INIT 0x01 /* xfs_attr_leaf_addname init*/ /* * Context used for keeping track of delayed attribute operations @@ -516,10 +515,7 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); -int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_set_iter(struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp); -int xfs_attr_remove_args(struct xfs_da_args *args); +int xfs_attr_set_iter(struct xfs_delattr_context *dac); int xfs_attr_remove_iter(struct xfs_delattr_context *dac); bool xfs_attr_namecheck(const void *name, size_t length); void xfs_delattr_context_init(struct xfs_delattr_context *dac, diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 83b95be9ded8..c806319134fb 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -695,7 +695,6 @@ xfs_attr_rmtval_remove( * the parent */ if (!done) { - dac->flags |= XFS_DAC_DEFER_FINISH; trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp); return -EAGAIN; } diff --git 
a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index bdcbf32690f4..56bc231822b3 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -303,7 +303,6 @@ STATIC int xfs_xattri_finish_update( struct xfs_delattr_context *dac, struct xfs_attrd_log_item *attrdp, - struct xfs_buf **leaf_bp, uint32_t op_flags) { struct xfs_da_args *args = dac->da_args; @@ -313,7 +312,7 @@ xfs_xattri_finish_update( switch (op) { case XFS_ATTR_OP_FLAGS_SET: - error = xfs_attr_set_iter(dac, leaf_bp); + error = xfs_attr_set_iter(dac); break; case XFS_ATTR_OP_FLAGS_REMOVE: ASSERT(XFS_IFORK_Q(args->dp)); @@ -432,8 +431,7 @@ xfs_attr_finish_item( */ dac->da_args->trans = tp; - error = xfs_xattri_finish_update(dac, done_item, &dac->leaf_bp, - attr->xattri_op_flags); + error = xfs_xattri_finish_update(dac, done_item, attr->xattri_op_flags); if (error != -EAGAIN) kmem_free(attr); @@ -592,8 +590,7 @@ xfs_attri_item_recover( xfs_trans_ijoin(tp, ip, 0); ret = xfs_xattri_finish_update(&attr->xattri_dac, done_item, - &attr->xattri_dac.leaf_bp, - attrp->alfi_op_flags); + attrp->alfi_op_flags); if (ret == -EAGAIN) { /* There's more work to do, so add it to this transaction */ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); -- cgit From abd61ca3c333506ffa4ee73b78659ab57e7efcf7 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 11 May 2022 17:01:22 +1000 Subject: xfs: Add log attribute error tag This patch adds an error tag that we can use to test log attribute recovery and replay Signed-off-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_errortag.h | 4 +++- fs/xfs/xfs_attr_item.c | 7 +++++++ fs/xfs/xfs_error.c | 3 +++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a23a52e643ad..c15d2340220c 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -59,7 +59,8 @@ #define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36 #define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 #define XFS_ERRTAG_AG_RESV_FAIL 38 -#define XFS_ERRTAG_MAX 39 +#define XFS_ERRTAG_LARP 39 +#define XFS_ERRTAG_MAX 40 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -103,5 +104,6 @@ #define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 #define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 #define XFS_RANDOM_AG_RESV_FAIL 1 +#define XFS_RANDOM_LARP 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 56bc231822b3..6d1dcc88abfe 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -24,6 +24,7 @@ #include "xfs_trace.h" #include "xfs_inode.h" #include "xfs_trans_space.h" +#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" @@ -310,6 +311,11 @@ xfs_xattri_finish_update( XFS_ATTR_OP_FLAGS_TYPE_MASK; int error; + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + error = -EIO; + goto out; + } + switch (op) { case XFS_ATTR_OP_FLAGS_SET: error = xfs_attr_set_iter(dac); @@ -323,6 +329,7 @@ xfs_xattri_finish_update( break; } +out: /* * Mark the transaction dirty, even on error. 
This ensures the * transaction is aborted, which: diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 749fd18c4f32..666f4837b1e1 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -57,6 +57,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_REDUCE_MAX_IEXTENTS, XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, XFS_RANDOM_AG_RESV_FAIL, + XFS_RANDOM_LARP, }; struct xfs_errortag_attr { @@ -170,6 +171,7 @@ XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); +XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -211,6 +213,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), + XFS_ERRORTAG_ATTR_LIST(larp), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); -- cgit From 535e2f75c4e377e6ccc9d4396695b516d118f8f0 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 11 May 2022 17:01:22 +1000 Subject: xfs: Add larp debug option This patch adds a debug option to enable log attribute replay. Eventually this can be removed when delayed attrs becomes permanent. Signed-off-by: Allison Henderson Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.h | 4 ++++ fs/xfs/xfs_globals.c | 1 + fs/xfs/xfs_sysctl.h | 1 + fs/xfs/xfs_sysfs.c | 24 ++++++++++++++++++++++++ 4 files changed, 30 insertions(+) diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 5331551d5939..78884e826ca4 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -30,7 +30,11 @@ struct xfs_attr_list_context; static inline bool xfs_has_larp(struct xfs_mount *mp) { +#ifdef DEBUG + return xfs_globals.larp; +#else return false; +#endif } /* diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f62fa652c2fd..4d0a98f920ca 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -41,5 +41,6 @@ struct xfs_globals xfs_globals = { #endif #ifdef DEBUG .pwork_threads = -1, /* automatic thread detection */ + .larp = false, /* log attribute replay */ #endif }; diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 7692e76ead33..f78ad6b10ea5 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -83,6 +83,7 @@ extern xfs_param_t xfs_params; struct xfs_globals { #ifdef DEBUG int pwork_threads; /* parallel workqueue threads */ + bool larp; /* log attribute replay */ #endif int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 574b80c29fe1..f7faf6e70d7f 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -228,6 +228,29 @@ pwork_threads_show( return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads); } XFS_SYSFS_ATTR_RW(pwork_threads); + +static ssize_t +larp_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &xfs_globals.larp); + if (ret < 0) + return ret; + return count; +} + +STATIC ssize_t +larp_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp); +} +XFS_SYSFS_ATTR_RW(larp); #endif /* DEBUG */ static struct 
attribute *xfs_dbg_attrs[] = { @@ -237,6 +260,7 @@ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(always_cow), #ifdef DEBUG ATTR_LIST(pwork_threads), + ATTR_LIST(larp), #endif NULL, }; -- cgit From d68c51e9a4095b57f06bf5dd15ab8fae6dab5d8b Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 11 May 2022 17:01:22 +1000 Subject: xfs: Merge xfs_delattr_context into xfs_attr_item This is a clean up patch that merges xfs_delattr_context into xfs_attr_item. Now that the refactoring is complete and the delayed operation infrastructure is in place, we can combine these to eliminate the extra struct Signed-off-by: Allison Henderson Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 162 +++++++++++++++++++++------------------- fs/xfs/libxfs/xfs_attr.h | 40 +++++----- fs/xfs/libxfs/xfs_attr_remote.c | 36 ++++----- fs/xfs/libxfs/xfs_attr_remote.h | 6 +- fs/xfs/xfs_attr_item.c | 50 ++++++------- 5 files changed, 147 insertions(+), 147 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index b00bff3270ce..e53d726480fb 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -58,10 +58,9 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); -STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_addname_clear_incomplete( - struct xfs_delattr_context *dac); +STATIC int xfs_attr_node_addname(struct xfs_attr_item *attr); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr); +STATIC int xfs_attr_node_addname_clear_incomplete(struct xfs_attr_item *attr); STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, struct xfs_da_state **state); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); 
@@ -248,9 +247,9 @@ xfs_attr_is_shortform( STATIC int xfs_attr_sf_addname( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; int error = 0; @@ -267,7 +266,7 @@ xfs_attr_sf_addname( * It won't fit in the shortform, transform to a leaf block. GROT: * another possible req'mt for a double-split btree op. */ - error = xfs_attr_shortform_to_leaf(args, &dac->leaf_bp); + error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp); if (error) return error; @@ -276,7 +275,7 @@ xfs_attr_sf_addname( * push cannot grab the half-baked leaf buffer and run into problems * with the write verifier. */ - xfs_trans_bhold(args->trans, dac->leaf_bp); + xfs_trans_bhold(args->trans, attr->xattri_leaf_bp); /* * We're still in XFS_DAS_UNINIT state here. We've converted @@ -296,16 +295,16 @@ xfs_attr_sf_addname( */ int xfs_attr_set_iter( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; int forkoff, error = 0; struct xfs_mount *mp = args->dp->i_mount; /* State machine switch */ - switch (dac->dela_state) { + switch (attr->xattri_dela_state) { case XFS_DAS_UNINIT: /* * If the fork is shortform, attempt to add the attr. If there @@ -315,14 +314,16 @@ xfs_attr_set_iter( * release the hold once we return with a clean transaction. 
*/ if (xfs_attr_is_shortform(dp)) - return xfs_attr_sf_addname(dac); - if (dac->leaf_bp != NULL) { - xfs_trans_bhold_release(args->trans, dac->leaf_bp); - dac->leaf_bp = NULL; + return xfs_attr_sf_addname(attr); + if (attr->xattri_leaf_bp != NULL) { + xfs_trans_bhold_release(args->trans, + attr->xattri_leaf_bp); + attr->xattri_leaf_bp = NULL; } if (xfs_attr_is_leaf(dp)) { - error = xfs_attr_leaf_try_add(args, dac->leaf_bp); + error = xfs_attr_leaf_try_add(args, + attr->xattri_leaf_bp); if (error == -ENOSPC) { error = xfs_attr3_leaf_to_node(args); if (error) @@ -342,19 +343,19 @@ xfs_attr_set_iter( * handling code below */ trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); + attr->xattri_dela_state, args->dp); return -EAGAIN; } else if (error) { return error; } - dac->dela_state = XFS_DAS_FOUND_LBLK; + attr->xattri_dela_state = XFS_DAS_FOUND_LBLK; } else { - error = xfs_attr_node_addname_find_attr(dac); + error = xfs_attr_node_addname_find_attr(attr); if (error) return error; - error = xfs_attr_node_addname(dac); + error = xfs_attr_node_addname(attr); if (error) return error; @@ -366,9 +367,10 @@ xfs_attr_set_iter( !(args->op_flags & XFS_DA_OP_RENAME)) return 0; - dac->dela_state = XFS_DAS_FOUND_NBLK; + attr->xattri_dela_state = XFS_DAS_FOUND_NBLK; } - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, + args->dp); return -EAGAIN; case XFS_DAS_FOUND_LBLK: /* @@ -379,10 +381,10 @@ xfs_attr_set_iter( */ /* Open coded xfs_attr_rmtval_set without trans handling */ - if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) { - dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT; + if ((attr->xattri_flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) { + attr->xattri_flags |= XFS_DAC_LEAF_ADDNAME_INIT; if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(dac); + error = xfs_attr_rmtval_find_space(attr); if (error) return error; } @@ -392,11 +394,11 @@ xfs_attr_set_iter( * Repeat allocating remote blocks for the 
attr value until * blkcnt drops to zero. */ - if (dac->blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(dac); + if (attr->xattri_blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(attr); if (error) return error; - trace_xfs_attr_set_iter_return(dac->dela_state, + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp); return -EAGAIN; } @@ -432,8 +434,8 @@ xfs_attr_set_iter( * Commit the flag value change and start the next trans * in series. */ - dac->dela_state = XFS_DAS_FLIP_LFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, + attr->xattri_dela_state = XFS_DAS_FLIP_LFLAG; + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp); return -EAGAIN; } @@ -452,17 +454,18 @@ xfs_attr_set_iter( fallthrough; case XFS_DAS_RM_LBLK: /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - dac->dela_state = XFS_DAS_RM_LBLK; + attr->xattri_dela_state = XFS_DAS_RM_LBLK; if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(dac); + error = xfs_attr_rmtval_remove(attr); if (error == -EAGAIN) trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); + attr->xattri_dela_state, args->dp); if (error) return error; - dac->dela_state = XFS_DAS_RD_LEAF; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); + attr->xattri_dela_state = XFS_DAS_RD_LEAF; + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, + args->dp); return -EAGAIN; } @@ -493,7 +496,7 @@ xfs_attr_set_iter( * state. */ if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(dac); + error = xfs_attr_rmtval_find_space(attr); if (error) return error; } @@ -506,14 +509,14 @@ xfs_attr_set_iter( * after we create the attribute so that we don't overflow the * maximum size of a transaction and/or hit a deadlock. 
*/ - dac->dela_state = XFS_DAS_ALLOC_NODE; + attr->xattri_dela_state = XFS_DAS_ALLOC_NODE; if (args->rmtblkno > 0) { - if (dac->blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(dac); + if (attr->xattri_blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(attr); if (error) return error; trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); + attr->xattri_dela_state, args->dp); return -EAGAIN; } @@ -549,8 +552,8 @@ xfs_attr_set_iter( * Commit the flag value change and start the next trans * in series */ - dac->dela_state = XFS_DAS_FLIP_NFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, + attr->xattri_dela_state = XFS_DAS_FLIP_NFLAG; + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp); return -EAGAIN; } @@ -570,18 +573,19 @@ xfs_attr_set_iter( fallthrough; case XFS_DAS_RM_NBLK: /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - dac->dela_state = XFS_DAS_RM_NBLK; + attr->xattri_dela_state = XFS_DAS_RM_NBLK; if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(dac); + error = xfs_attr_rmtval_remove(attr); if (error == -EAGAIN) trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); + attr->xattri_dela_state, args->dp); if (error) return error; - dac->dela_state = XFS_DAS_CLR_FLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); + attr->xattri_dela_state = XFS_DAS_CLR_FLAG; + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, + args->dp); return -EAGAIN; } @@ -591,7 +595,7 @@ xfs_attr_set_iter( * The last state for node format. Look up the old attr and * remove it. 
*/ - error = xfs_attr_node_addname_clear_incomplete(dac); + error = xfs_attr_node_addname_clear_incomplete(attr); break; default: ASSERT(0); @@ -825,7 +829,7 @@ xfs_attr_item_init( new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS); new->xattri_op_flags = op_flags; - new->xattri_dac.da_args = args; + new->xattri_da_args = args; *attr = new; return 0; @@ -1135,16 +1139,16 @@ xfs_attr_node_hasname( STATIC int xfs_attr_node_addname_find_attr( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; int retval; /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - retval = xfs_attr_node_hasname(args, &dac->da_state); + retval = xfs_attr_node_hasname(args, &attr->xattri_da_state); if (retval != -ENOATTR && retval != -EEXIST) goto error; @@ -1172,8 +1176,8 @@ xfs_attr_node_addname_find_attr( return 0; error: - if (dac->da_state) - xfs_da_state_free(dac->da_state); + if (attr->xattri_da_state) + xfs_da_state_free(attr->xattri_da_state); return retval; } @@ -1194,10 +1198,10 @@ error: */ STATIC int xfs_attr_node_addname( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = dac->da_state; + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state = attr->xattri_da_state; struct xfs_da_state_blk *blk; int error; @@ -1228,7 +1232,7 @@ xfs_attr_node_addname( * this point. 
*/ trace_xfs_attr_node_addname_return( - dac->dela_state, args->dp); + attr->xattri_dela_state, args->dp); return -EAGAIN; } @@ -1257,9 +1261,9 @@ out: STATIC int xfs_attr_node_addname_clear_incomplete( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; struct xfs_da_state *state = NULL; struct xfs_mount *mp = args->dp->i_mount; int retval = 0; @@ -1363,10 +1367,10 @@ xfs_attr_leaf_mark_incomplete( */ STATIC int xfs_attr_node_removename_setup( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state **state = &dac->da_state; + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state **state = &attr->xattri_da_state; int error; error = xfs_attr_node_hasname(args, state); @@ -1425,16 +1429,16 @@ xfs_attr_node_removename( */ int xfs_attr_remove_iter( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = dac->da_state; + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state = attr->xattri_da_state; int retval, error = 0; struct xfs_inode *dp = args->dp; trace_xfs_attr_node_removename(args); - switch (dac->dela_state) { + switch (attr->xattri_dela_state) { case XFS_DAS_UNINIT: if (!xfs_inode_hasattr(dp)) return -ENOATTR; @@ -1453,16 +1457,16 @@ xfs_attr_remove_iter( * Node format may require transaction rolls. Set up the * state context and fall into the state machine. 
*/ - if (!dac->da_state) { - error = xfs_attr_node_removename_setup(dac); + if (!attr->xattri_da_state) { + error = xfs_attr_node_removename_setup(attr); if (error) return error; - state = dac->da_state; + state = attr->xattri_da_state; } fallthrough; case XFS_DAS_RMTBLK: - dac->dela_state = XFS_DAS_RMTBLK; + attr->xattri_dela_state = XFS_DAS_RMTBLK; /* * If there is an out-of-line value, de-allocate the blocks. @@ -1475,10 +1479,10 @@ xfs_attr_remove_iter( * May return -EAGAIN. Roll and repeat until all remote * blocks are removed. */ - error = xfs_attr_rmtval_remove(dac); + error = xfs_attr_rmtval_remove(attr); if (error == -EAGAIN) { trace_xfs_attr_remove_iter_return( - dac->dela_state, args->dp); + attr->xattri_dela_state, args->dp); return error; } else if (error) { goto out; @@ -1493,8 +1497,10 @@ xfs_attr_remove_iter( error = xfs_attr_refillstate(state); if (error) goto out; - dac->dela_state = XFS_DAS_RM_NAME; - trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp); + + attr->xattri_dela_state = XFS_DAS_RM_NAME; + trace_xfs_attr_remove_iter_return( + attr->xattri_dela_state, args->dp); return -EAGAIN; } @@ -1504,7 +1510,7 @@ xfs_attr_remove_iter( * If we came here fresh from a transaction roll, reattach all * the buffers to the current transaction. 
*/ - if (dac->dela_state == XFS_DAS_RM_NAME) { + if (attr->xattri_dela_state == XFS_DAS_RM_NAME) { error = xfs_attr_refillstate(state); if (error) goto out; @@ -1521,9 +1527,9 @@ xfs_attr_remove_iter( if (error) goto out; - dac->dela_state = XFS_DAS_RM_SHRINK; + attr->xattri_dela_state = XFS_DAS_RM_SHRINK; trace_xfs_attr_remove_iter_return( - dac->dela_state, args->dp); + attr->xattri_dela_state, args->dp); return -EAGAIN; } diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 78884e826ca4..1ef58d34eb59 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -434,7 +434,7 @@ struct xfs_attr_list_context { */ /* - * Enum values for xfs_delattr_context.da_state + * Enum values for xfs_attr_item.xattri_da_state * * These values are used by delayed attribute operations to keep track of where * they were before they returned -EAGAIN. A return code of -EAGAIN signals the @@ -459,39 +459,32 @@ enum xfs_delattr_state { }; /* - * Defines for xfs_delattr_context.flags + * Defines for xfs_attr_item.xattri_flags */ #define XFS_DAC_LEAF_ADDNAME_INIT 0x01 /* xfs_attr_leaf_addname init*/ /* * Context used for keeping track of delayed attribute operations */ -struct xfs_delattr_context { - struct xfs_da_args *da_args; +struct xfs_attr_item { + struct xfs_da_args *xattri_da_args; /* * Used by xfs_attr_set to hold a leaf buffer across a transaction roll */ - struct xfs_buf *leaf_bp; + struct xfs_buf *xattri_leaf_bp; /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - int blkcnt; + struct xfs_bmbt_irec xattri_map; + xfs_dablk_t xattri_lblkno; + int xattri_blkcnt; /* Used in xfs_attr_node_removename to roll through removing blocks */ - struct xfs_da_state *da_state; + struct xfs_da_state *xattri_da_state; /* Used to keep track of current state of delayed operation */ - unsigned int flags; - enum xfs_delattr_state dela_state; -}; - -/* - * List of attrs to commit later. 
- */ -struct xfs_attr_item { - struct xfs_delattr_context xattri_dac; + unsigned int xattri_flags; + enum xfs_delattr_state xattri_dela_state; /* * Indicates if the attr operation is a set or a remove @@ -499,7 +492,10 @@ struct xfs_attr_item { */ unsigned int xattri_op_flags; - /* used to log this item to an intent */ + /* + * used to log this item to an intent containing a list of attrs to + * commit later + */ struct list_head xattri_list; }; @@ -519,11 +515,9 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); -int xfs_attr_set_iter(struct xfs_delattr_context *dac); -int xfs_attr_remove_iter(struct xfs_delattr_context *dac); +int xfs_attr_set_iter(struct xfs_attr_item *attr); +int xfs_attr_remove_iter(struct xfs_attr_item *attr); bool xfs_attr_namecheck(const void *name, size_t length); -void xfs_delattr_context_init(struct xfs_delattr_context *dac, - struct xfs_da_args *args); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); int xfs_attr_set_deferred(struct xfs_da_args *args); int xfs_attr_remove_deferred(struct xfs_da_args *args); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index c806319134fb..4250159ecced 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -568,14 +568,14 @@ xfs_attr_rmtval_stale( */ int xfs_attr_rmtval_find_space( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_bmbt_irec *map = &dac->map; + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_bmbt_irec *map = &attr->xattri_map; int error; - dac->lblkno = 0; - dac->blkcnt = 0; + attr->xattri_lblkno = 0; + attr->xattri_blkcnt = 0; args->rmtblkcnt = 0; args->rmtblkno = 0; memset(map, 0, sizeof(struct xfs_bmbt_irec)); @@ -584,8 +584,8 @@ xfs_attr_rmtval_find_space( if (error) return error; - 
dac->blkcnt = args->rmtblkcnt; - dac->lblkno = args->rmtblkno; + attr->xattri_blkcnt = args->rmtblkcnt; + attr->xattri_lblkno = args->rmtblkno; return 0; } @@ -598,17 +598,18 @@ xfs_attr_rmtval_find_space( */ int xfs_attr_rmtval_set_blk( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; - struct xfs_bmbt_irec *map = &dac->map; + struct xfs_bmbt_irec *map = &attr->xattri_map; int nmap; int error; nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno, - dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total, + error = xfs_bmapi_write(args->trans, dp, + (xfs_fileoff_t)attr->xattri_lblkno, + attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total, map, &nmap); if (error) return error; @@ -618,8 +619,8 @@ xfs_attr_rmtval_set_blk( (map->br_startblock != HOLESTARTBLOCK)); /* roll attribute extent map forwards */ - dac->lblkno += map->br_blockcount; - dac->blkcnt -= map->br_blockcount; + attr->xattri_lblkno += map->br_blockcount; + attr->xattri_blkcnt -= map->br_blockcount; return 0; } @@ -673,9 +674,9 @@ xfs_attr_rmtval_invalidate( */ int xfs_attr_rmtval_remove( - struct xfs_delattr_context *dac) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; int error, done; /* @@ -695,7 +696,8 @@ xfs_attr_rmtval_remove( * the parent */ if (!done) { - trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp); + trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state, + args->dp); return -EAGAIN; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index d72eff30ca18..62b398edec3f 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t 
incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_remove(struct xfs_attr_item *attr); int xfs_attr_rmt_find_hole(struct xfs_da_args *args); int xfs_attr_rmtval_set_value(struct xfs_da_args *args); -int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac); -int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_set_blk(struct xfs_attr_item *attr); +int xfs_attr_rmtval_find_space(struct xfs_attr_item *attr); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 6d1dcc88abfe..1a72fdf76a5f 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -302,11 +302,11 @@ xfs_attrd_item_intent( */ STATIC int xfs_xattri_finish_update( - struct xfs_delattr_context *dac, + struct xfs_attr_item *attr, struct xfs_attrd_log_item *attrdp, uint32_t op_flags) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; unsigned int op = op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK; int error; @@ -318,11 +318,11 @@ xfs_xattri_finish_update( switch (op) { case XFS_ATTR_OP_FLAGS_SET: - error = xfs_attr_set_iter(dac); + error = xfs_attr_set_iter(attr); break; case XFS_ATTR_OP_FLAGS_REMOVE: ASSERT(XFS_IFORK_Q(args->dp)); - error = xfs_attr_remove_iter(dac); + error = xfs_attr_remove_iter(attr); break; default: error = -EFSCORRUPTED; @@ -366,18 +366,18 @@ xfs_attr_log_item( * structure with fields from this xfs_attr_item */ attrp = &attrip->attri_format; - attrp->alfi_ino = attr->xattri_dac.da_args->dp->i_ino; + attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; attrp->alfi_op_flags = attr->xattri_op_flags; - attrp->alfi_value_len = attr->xattri_dac.da_args->valuelen; - attrp->alfi_name_len = attr->xattri_dac.da_args->namelen; - attrp->alfi_attr_flags = attr->xattri_dac.da_args->attr_filter; - - memcpy(attrip->attri_name, attr->xattri_dac.da_args->name, - 
attr->xattri_dac.da_args->namelen); - memcpy(attrip->attri_value, attr->xattri_dac.da_args->value, - attr->xattri_dac.da_args->valuelen); - attrip->attri_name_len = attr->xattri_dac.da_args->namelen; - attrip->attri_value_len = attr->xattri_dac.da_args->valuelen; + attrp->alfi_value_len = attr->xattri_da_args->valuelen; + attrp->alfi_name_len = attr->xattri_da_args->namelen; + attrp->alfi_attr_flags = attr->xattri_da_args->attr_filter; + + memcpy(attrip->attri_name, attr->xattri_da_args->name, + attr->xattri_da_args->namelen); + memcpy(attrip->attri_value, attr->xattri_da_args->value, + attr->xattri_da_args->valuelen); + attrip->attri_name_len = attr->xattri_da_args->namelen; + attrip->attri_value_len = attr->xattri_da_args->valuelen; } /* Get an ATTRI. */ @@ -402,8 +402,8 @@ xfs_attr_create_intent( * this is a list of one */ list_for_each_entry(attr, items, xattri_list) { - attrip = xfs_attri_init(mp, attr->xattri_dac.da_args->namelen, - attr->xattri_dac.da_args->valuelen); + attrip = xfs_attri_init(mp, attr->xattri_da_args->namelen, + attr->xattri_da_args->valuelen); if (attrip == NULL) return NULL; @@ -425,10 +425,8 @@ xfs_attr_finish_item( struct xfs_attr_item *attr; struct xfs_attrd_log_item *done_item = NULL; int error; - struct xfs_delattr_context *dac; attr = container_of(item, struct xfs_attr_item, xattri_list); - dac = &attr->xattri_dac; if (done) done_item = ATTRD_ITEM(done); @@ -436,9 +434,10 @@ xfs_attr_finish_item( * Always reset trans after EAGAIN cycle * since the transaction is new */ - dac->da_args->trans = tp; + attr->xattri_da_args->trans = tp; - error = xfs_xattri_finish_update(dac, done_item, attr->xattri_op_flags); + error = xfs_xattri_finish_update(attr, done_item, + attr->xattri_op_flags); if (error != -EAGAIN) kmem_free(attr); @@ -560,7 +559,7 @@ xfs_attri_item_recover( sizeof(struct xfs_da_args), KM_NOFS); args = (struct xfs_da_args *)(attr + 1); - attr->xattri_dac.da_args = args; + attr->xattri_da_args = args; attr->xattri_op_flags = 
attrp->alfi_op_flags; args->dp = ip; @@ -596,8 +595,7 @@ xfs_attri_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - ret = xfs_xattri_finish_update(&attr->xattri_dac, done_item, - attrp->alfi_op_flags); + ret = xfs_xattri_finish_update(attr, done_item, attrp->alfi_op_flags); if (ret == -EAGAIN) { /* There's more work to do, so add it to this transaction */ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); @@ -612,8 +610,8 @@ xfs_attri_item_recover( error = xfs_defer_ops_capture_and_commit(tp, capture_list); out_unlock: - if (attr->xattri_dac.leaf_bp) - xfs_buf_relse(attr->xattri_dac.leaf_bp); + if (attr->xattri_leaf_bp) + xfs_buf_relse(attr->xattri_leaf_bp); xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); -- cgit From cd1549d6df22e4f72903dbb169202203d429bcff Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 11 May 2022 17:01:22 +1000 Subject: xfs: Add helper function xfs_attr_leaf_addname This patch adds a helper function xfs_attr_leaf_addname. While this does help to break down xfs_attr_set_iter, it does also hoist out some of the state management. This patch has been moved to the end of the clean up series for further discussion. Suggested-by: Darrick J. Wong Signed-off-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 110 ++++++++++++++++++++++++++--------------------- fs/xfs/xfs_trace.h | 1 + 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e53d726480fb..af578ce7b8bb 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -286,6 +286,65 @@ xfs_attr_sf_addname( return -EAGAIN; } +STATIC int +xfs_attr_leaf_addname( + struct xfs_attr_item *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + int error; + + if (xfs_attr_is_leaf(dp)) { + error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp); + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; + + /* + * Finish any deferred work items and roll the + * transaction once more. The goal here is to call + * node_addname with the inode and transaction in the + * same state (inode locked and joined, transaction + * clean) no matter how we got to this step. + * + * At this point, we are still in XFS_DAS_UNINIT, but + * when we come back, we'll be a node, so we'll fall + * down into the node handling code below + */ + trace_xfs_attr_set_iter_return( + attr->xattri_dela_state, args->dp); + return -EAGAIN; + } + + if (error) + return error; + + attr->xattri_dela_state = XFS_DAS_FOUND_LBLK; + } else { + error = xfs_attr_node_addname_find_attr(attr); + if (error) + return error; + + error = xfs_attr_node_addname(attr); + if (error) + return error; + + /* + * If addname was successful, and we dont need to alloc or + * remove anymore blks, we're done. + */ + if (!args->rmtblkno && + !(args->op_flags & XFS_DA_OP_RENAME)) + return 0; + + attr->xattri_dela_state = XFS_DAS_FOUND_NBLK; + } + + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); + return -EAGAIN; +} + /* * Set the attribute specified in @args. 
* This routine is meant to function as a delayed operation, and may return @@ -321,57 +380,8 @@ xfs_attr_set_iter( attr->xattri_leaf_bp = NULL; } - if (xfs_attr_is_leaf(dp)) { - error = xfs_attr_leaf_try_add(args, - attr->xattri_leaf_bp); - if (error == -ENOSPC) { - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; - - /* - * Finish any deferred work items and roll the - * transaction once more. The goal here is to - * call node_addname with the inode and - * transaction in the same state (inode locked - * and joined, transaction clean) no matter how - * we got to this step. - * - * At this point, we are still in - * XFS_DAS_UNINIT, but when we come back, we'll - * be a node, so we'll fall down into the node - * handling code below - */ - trace_xfs_attr_set_iter_return( - attr->xattri_dela_state, args->dp); - return -EAGAIN; - } else if (error) { - return error; - } - - attr->xattri_dela_state = XFS_DAS_FOUND_LBLK; - } else { - error = xfs_attr_node_addname_find_attr(attr); - if (error) - return error; + return xfs_attr_leaf_addname(attr); - error = xfs_attr_node_addname(attr); - if (error) - return error; - - /* - * If addname was successful, and we dont need to alloc - * or remove anymore blks, we're done. 
- */ - if (!args->rmtblkno && - !(args->op_flags & XFS_DA_OP_RENAME)) - return 0; - - attr->xattri_dela_state = XFS_DAS_FOUND_NBLK; - } - trace_xfs_attr_set_iter_return(attr->xattri_dela_state, - args->dp); - return -EAGAIN; case XFS_DAS_FOUND_LBLK: /* * If there was an out-of-line value, allocate the blocks we diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e19a3f7351be..fec4198b738b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4150,6 +4150,7 @@ DEFINE_EVENT(xfs_das_state_class, name, \ TP_ARGS(das, ip)) DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); -- cgit From c3546cf5d1e50389a789290f8c21a555e3408aa8 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 11 May 2022 17:01:23 +1000 Subject: xfs: Add helper function xfs_init_attr_trans Quick helper function to collapse duplicate code to initialize transactions for attributes Signed-off-by: Allison Henderson Suggested-by: Darrick J. Wong Reviewed-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 33 +++++++++++++++++++++++---------- fs/xfs/libxfs/xfs_attr.h | 2 ++ fs/xfs/xfs_attr_item.c | 12 ++---------- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index af578ce7b8bb..3a5f22eae607 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -201,6 +201,28 @@ xfs_attr_calc_size( return nblks; } +/* Initialize transaction reservation for attr operations */ +void +xfs_init_attr_trans( + struct xfs_da_args *args, + struct xfs_trans_res *tres, + unsigned int *total) +{ + struct xfs_mount *mp = args->dp->i_mount; + + if (args->value) { + tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * + args->total; + tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; + *total = args->total; + } else { + *tres = M_RES(mp)->tr_attrrm; + *total = XFS_ATTRRM_SPACE_RES(mp); + } +} + STATIC int xfs_attr_try_sf_addname( struct xfs_inode *dp, @@ -700,20 +722,10 @@ xfs_attr_set( return error; } - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - total = args->total; - if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); } else { XFS_STATS_INC(mp, xs_attr_remove); - - tres = M_RES(mp)->tr_attrrm; - total = XFS_ATTRRM_SPACE_RES(mp); rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } @@ -727,6 +739,7 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ + xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) goto drop_incompat; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 1ef58d34eb59..f6c13d2bfbcd 100644 --- 
a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -519,6 +519,8 @@ int xfs_attr_set_iter(struct xfs_attr_item *attr); int xfs_attr_remove_iter(struct xfs_attr_item *attr); bool xfs_attr_namecheck(const void *name, size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); +void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, + unsigned int *total); int xfs_attr_set_deferred(struct xfs_da_args *args); int xfs_attr_remove_deferred(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 1a72fdf76a5f..676b1e03cf40 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -574,17 +574,9 @@ xfs_attri_item_recover( args->value = attrip->attri_value; args->valuelen = attrp->alfi_value_len; args->total = xfs_attr_calc_size(args, &local); - - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - total = args->total; - } else { - tres = M_RES(mp)->tr_attrrm; - total = XFS_ATTRRM_SPACE_RES(mp); } + + xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp); if (error) goto out; -- cgit From c3b948be34702a0a81f10662c4040e500a90eb54 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 11 May 2022 17:01:23 +1000 Subject: xfs: add leaf split error tag Add an error tag on xfs_da3_split to test log attribute recovery and replay. Signed-off-by: Catherine Hoang Reviewed-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R Signed-off-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_da_btree.c | 4 ++++ fs/xfs/libxfs/xfs_errortag.h | 4 +++- fs/xfs/xfs_error.c | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 9dc1ecb9713d..aa74f3fdb571 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -22,6 +22,7 @@ #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_errortag.h" /* * xfs_da_btree.c @@ -482,6 +483,9 @@ xfs_da3_split( trace_xfs_da_split(state->args); + if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + return -EIO; + /* * Walk back up the tree splitting/inserting/adjusting as necessary. * If we need to insert and there isn't room, split the node, then diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index c15d2340220c..6d06a502bbdf 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -60,7 +60,8 @@ #define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 #define XFS_ERRTAG_AG_RESV_FAIL 38 #define XFS_ERRTAG_LARP 39 -#define XFS_ERRTAG_MAX 40 +#define XFS_ERRTAG_DA_LEAF_SPLIT 40 +#define XFS_ERRTAG_MAX 41 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 
@@ -105,5 +106,6 @@ #define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 #define XFS_RANDOM_AG_RESV_FAIL 1 #define XFS_RANDOM_LARP 1 +#define XFS_RANDOM_DA_LEAF_SPLIT 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 666f4837b1e1..2aa5d4d2b30a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -58,6 +58,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, XFS_RANDOM_AG_RESV_FAIL, XFS_RANDOM_LARP, + XFS_RANDOM_DA_LEAF_SPLIT, }; struct xfs_errortag_attr { @@ -172,6 +173,7 @@ XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); +XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -214,6 +216,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), XFS_ERRORTAG_ATTR_LIST(larp), + XFS_ERRORTAG_ATTR_LIST(da_leaf_split), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); -- cgit From c5218a7cd97349c53bc64e447778a07e49364d40 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Wed, 11 May 2022 17:01:23 +1000 Subject: xfs: add leaf to node error tag Add an error tag on xfs_attr3_leaf_to_node to test log attribute recovery and replay. Signed-off-by: Catherine Hoang Reviewed-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Reviewed-by: Chandan Babu R Signed-off-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.c | 6 ++++++ fs/xfs/libxfs/xfs_errortag.h | 4 +++- fs/xfs/xfs_error.c | 3 +++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 74b76b09509f..e90bfd9d7551 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -28,6 +28,7 @@ #include "xfs_dir2.h" #include "xfs_log.h" #include "xfs_ag.h" +#include "xfs_errortag.h" /* @@ -1189,6 +1190,11 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + error = -EIO; + goto out; + } + error = xfs_da_grow_inode(args, &blkno); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 6d06a502bbdf..5362908164b0 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -61,7 +61,8 @@ #define XFS_ERRTAG_AG_RESV_FAIL 38 #define XFS_ERRTAG_LARP 39 #define XFS_ERRTAG_DA_LEAF_SPLIT 40 -#define XFS_ERRTAG_MAX 41 +#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 +#define XFS_ERRTAG_MAX 42 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 
@@ -107,5 +108,6 @@ #define XFS_RANDOM_AG_RESV_FAIL 1 #define XFS_RANDOM_LARP 1 #define XFS_RANDOM_DA_LEAF_SPLIT 1 +#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 2aa5d4d2b30a..296faa41d81d 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -59,6 +59,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_AG_RESV_FAIL, XFS_RANDOM_LARP, XFS_RANDOM_DA_LEAF_SPLIT, + XFS_RANDOM_ATTR_LEAF_TO_NODE, }; struct xfs_errortag_attr { @@ -174,6 +175,7 @@ XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTE XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); +XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -217,6 +219,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), XFS_ERRORTAG_ATTR_LIST(larp), XFS_ERRORTAG_ATTR_LIST(da_leaf_split), + XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); -- cgit From a4b8917b06c71a4ea61ac45b6e979eb7676417f8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 11 May 2022 17:02:23 +1000 Subject: xfs: avoid empty xattr transaction when attrs are inline generic/642 triggered a reproducible assert failure in xlog_cil_commit() that resulted from a xfs_attr_set() committing an empty but dirty transaction. When the CIL is empty and this occurs, xlog_cil_commit() tries a background push and this triggers a "pushing an empty CIL" assert.
XFS: Assertion failed: !list_empty(&cil->xc_cil), file: fs/xfs/xfs_log_cil.c, line: 1274 Call Trace: xlog_cil_commit+0xa5a/0xad0 __xfs_trans_commit+0xb8/0x330 xfs_trans_commit+0x10/0x20 xfs_attr_set+0x3e2/0x4c0 xfs_xattr_set+0x8d/0xe0 __vfs_setxattr+0x6b/0x90 __vfs_setxattr_noperm+0x76/0x220 __vfs_setxattr_locked+0xdf/0x100 vfs_setxattr+0x94/0x170 setxattr+0x110/0x200 path_setxattr+0xbf/0xe0 __x64_sys_setxattr+0x2b/0x30 do_syscall_64+0x35/0x80 The problem is related to the breakdown of attribute addition in xfs_attr_set_iter() and how it is called from deferred operations. When we have a pure leaf xattr insert, we add the xattr to the leaf and set the next state to XFS_DAS_FOUND_LBLK and return -EAGAIN. This requeues the xattr deferred work, rolls the transaction and runs xfs_attr_set_iter() again. This then checks the xattr for being remote (it's not) and whether a replace op is being done (this is a create op) and if neither is true it returns without having done anything. xfs_xattri_finish_update() then unconditionally sets the transaction dirty, and the deferops finishes and returns to __xfs_trans_commit() which sees the transaction dirty and tries to commit it by calling xlog_cil_commit(). The transaction is empty, and then the assert fires if this happens when the CIL is empty. This patch addresses the structure of xfs_attr_set_iter() that requires re-entry on leaf add even when nothing will be done. This gets rid of the trailing empty transaction and so doesn't trigger the XFS_TRANS_DIRTY assignment in xfs_xattri_finish_update() incorrectly. Addressing that is for a different patch. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J.
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 3a5f22eae607..98a2a2f89664 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -314,6 +314,7 @@ xfs_attr_leaf_addname( { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; + enum xfs_delattr_state next_state = XFS_DAS_UNINIT; int error; if (xfs_attr_is_leaf(dp)) { @@ -334,37 +335,35 @@ xfs_attr_leaf_addname( * when we come back, we'll be a node, so we'll fall * down into the node handling code below */ - trace_xfs_attr_set_iter_return( - attr->xattri_dela_state, args->dp); - return -EAGAIN; + error = -EAGAIN; + goto out; } - - if (error) - return error; - - attr->xattri_dela_state = XFS_DAS_FOUND_LBLK; + next_state = XFS_DAS_FOUND_LBLK; } else { error = xfs_attr_node_addname_find_attr(attr); if (error) return error; + next_state = XFS_DAS_FOUND_NBLK; error = xfs_attr_node_addname(attr); - if (error) - return error; - - /* - * If addname was successful, and we dont need to alloc or - * remove anymore blks, we're done. - */ - if (!args->rmtblkno && - !(args->op_flags & XFS_DA_OP_RENAME)) - return 0; + } + if (error) + return error; - attr->xattri_dela_state = XFS_DAS_FOUND_NBLK; + /* + * We need to commit and roll if we need to allocate remote xattr blocks + * or perform more xattr manipulations. Otherwise there is nothing more + * to do and we can return success. 
+ */ + if (args->rmtblkno || + (args->op_flags & XFS_DA_OP_RENAME)) { + attr->xattri_dela_state = next_state; + error = -EAGAIN; } +out: trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); - return -EAGAIN; + return error; } /* -- cgit From f3d430ff8cda80ccb9b73d9efa0e186fa532b74e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 11 May 2022 17:03:23 +1000 Subject: xfs: initialise attrd item to zero On the first allocation of an attrd item, xfs_trans_add_item() fires an assert like so: XFS (pmem0): EXPERIMENTAL logged extended attributes feature added. Use at your own risk! XFS: Assertion failed: !test_bit(XFS_LI_DIRTY, &lip->li_flags), file: fs/xfs/xfs_trans.c, line: 683 ------------[ cut here ]------------ kernel BUG at fs/xfs/xfs_message.c:102! Call Trace: xfs_trans_add_item+0x17e/0x190 xfs_trans_get_attrd+0x67/0x90 xfs_attr_create_done+0x13/0x20 xfs_defer_finish_noroll+0x100/0x690 __xfs_trans_commit+0x144/0x330 xfs_trans_commit+0x10/0x20 xfs_attr_set+0x3e2/0x4c0 xfs_initxattrs+0xaa/0xe0 security_inode_init_security+0xb0/0x130 xfs_init_security+0x18/0x20 xfs_generic_create+0x13a/0x340 xfs_vn_create+0x17/0x20 path_openat+0xff3/0x12f0 do_filp_open+0xb2/0x150 The attrd log item is allocated via kmem_cache_alloc, and xfs_log_item_init() does not zero the entire log item structure - it assumes that the structure is already all zeros as it only initialises non-zero fields. Fix the attr items to be allocated via the *zalloc methods. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J.
Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_attr_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 676b1e03cf40..96933a3c0dcd 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -728,7 +728,7 @@ xfs_trans_get_attrd(struct xfs_trans *tp, ASSERT(tp != NULL); - attrdp = kmem_cache_alloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, &xfs_attrd_item_ops); -- cgit From e22b88de5bacdd60ffa70e911e5fbae9ad36441a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 11 May 2022 17:04:23 +1000 Subject: xfs: make xattri_leaf_bp more useful We currently set it and hold it when converting from short to leaf form, then release it only to immediately look it back up again to do the leaf insert. Do a bit of refactoring to xfs_attr_leaf_try_add() to avoid this messy handling of the newly allocated leaf buffer. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 50 +++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 98a2a2f89664..60878ebc07ba 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -318,7 +318,15 @@ xfs_attr_leaf_addname( int error; if (xfs_attr_is_leaf(dp)) { + + /* + * Use the leaf buffer we may already hold locked as a result of + * a sf-to-leaf conversion. The held buffer is no longer valid + * after this call, regardless of the result. 
+ */ error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp); + attr->xattri_leaf_bp = NULL; + if (error == -ENOSPC) { error = xfs_attr3_leaf_to_node(args); if (error) @@ -340,6 +348,8 @@ xfs_attr_leaf_addname( } next_state = XFS_DAS_FOUND_LBLK; } else { + ASSERT(!attr->xattri_leaf_bp); + error = xfs_attr_node_addname_find_attr(attr); if (error) return error; @@ -395,12 +405,6 @@ xfs_attr_set_iter( */ if (xfs_attr_is_shortform(dp)) return xfs_attr_sf_addname(attr); - if (attr->xattri_leaf_bp != NULL) { - xfs_trans_bhold_release(args->trans, - attr->xattri_leaf_bp); - attr->xattri_leaf_bp = NULL; - } - return xfs_attr_leaf_addname(attr); case XFS_DAS_FOUND_LBLK: @@ -991,18 +995,31 @@ xfs_attr_leaf_try_add( struct xfs_da_args *args, struct xfs_buf *bp) { - int retval; + int error; /* - * Look up the given attribute in the leaf block. Figure out if - * the given flags produce an error or call for an atomic rename. + * If the caller provided a buffer to us, it is locked and held in + * the transaction because it just did a shortform to leaf conversion. + * Hence we don't need to read it again. Otherwise read in the leaf + * buffer. */ - retval = xfs_attr_leaf_hasname(args, &bp); - if (retval != -ENOATTR && retval != -EEXIST) - return retval; - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) + if (bp) { + xfs_trans_bhold_release(args->trans, bp); + } else { + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; + } + + /* + * Look up the xattr name to set the insertion point for the new xattr. 
+ */ + error = xfs_attr3_leaf_lookup_int(bp, args); + if (error != -ENOATTR && error != -EEXIST) goto out_brelse; - if (retval == -EEXIST) { + if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) + goto out_brelse; + if (error == -EEXIST) { if (args->attr_flags & XATTR_CREATE) goto out_brelse; @@ -1022,14 +1039,11 @@ xfs_attr_leaf_try_add( args->rmtvaluelen = 0; } - /* - * Add the attribute to the leaf block - */ return xfs_attr3_leaf_add(bp, args); out_brelse: xfs_trans_brelse(args->trans, bp); - return retval; + return error; } /* -- cgit From 709c8632597c3276cd21324b0256628f1a7fd4df Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 11 May 2022 17:05:23 +1000 Subject: xfs: rework deferred attribute operation setup Logged attribute intents only have set and remove types - there is no separate intent type for a replace operation. We should have a separate type for a replace operation, as it needs to perform operations that neither SET or REMOVE can perform. Add this type to the intent items and rearrange the deferred operation setup to reflect the different operations we are performing. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 165 ++++++++++++++++++++++++----------------- fs/xfs/libxfs/xfs_attr.h | 2 - fs/xfs/libxfs/xfs_log_format.h | 1 + fs/xfs/xfs_attr_item.c | 9 ++- fs/xfs/xfs_trace.h | 4 + 5 files changed, 110 insertions(+), 71 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 60878ebc07ba..54f90d66b206 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -670,6 +670,81 @@ xfs_attr_lookup( return xfs_attr_node_hasname(args, NULL); } +static int +xfs_attr_item_init( + struct xfs_da_args *args, + unsigned int op_flags, /* op flag (set or remove) */ + struct xfs_attr_item **attr) /* new xfs_attr_item */ +{ + + struct xfs_attr_item *new; + + new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS); + new->xattri_op_flags = op_flags; + new->xattri_da_args = args; + + *attr = new; + return 0; +} + +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_add( + struct xfs_da_args *args) +{ + struct xfs_attr_item *new; + int error = 0; + + error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_SET, &new); + if (error) + return error; + + new->xattri_dela_state = XFS_DAS_UNINIT; + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); + + return 0; +} + +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_replace( + struct xfs_da_args *args) +{ + struct xfs_attr_item *new; + int error = 0; + + error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REPLACE, &new); + if (error) + return error; + + new->xattri_dela_state = XFS_DAS_UNINIT; + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); + + return 0; +} + +/* Removes an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_remove( + struct xfs_da_args *args) +{ + + struct 
xfs_attr_item *new; + int error; + + error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REMOVE, &new); + if (error) + return error; + + new->xattri_dela_state = XFS_DAS_UNINIT; + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); + + return 0; +} + /* * Note: If args->value is NULL the attribute will be removed, just like the * Linux ->setattr API. @@ -758,29 +833,35 @@ xfs_attr_set( } error = xfs_attr_lookup(args); - if (args->value) { - if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) - goto out_trans_cancel; - if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out_trans_cancel; - if (error != -ENOATTR && error != -EEXIST) + switch (error) { + case -EEXIST: + /* if no value, we are performing a remove operation */ + if (!args->value) { + error = xfs_attr_defer_remove(args); + break; + } + /* Pure create fails if the attr already exists */ + if (args->attr_flags & XATTR_CREATE) goto out_trans_cancel; - error = xfs_attr_set_deferred(args); - if (error) + error = xfs_attr_defer_replace(args); + break; + case -ENOATTR: + /* Can't remove what isn't there. */ + if (!args->value) goto out_trans_cancel; - /* shortform attribute has already been committed */ - if (!args->trans) - goto out_unlock; - } else { - if (error != -EEXIST) + /* Pure replace fails if no existing attr to replace. 
*/ + if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; - error = xfs_attr_remove_deferred(args); - if (error) - goto out_trans_cancel; + error = xfs_attr_defer_add(args); + break; + default: + goto out_trans_cancel; } + if (error) + goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the @@ -844,58 +925,6 @@ xfs_attrd_destroy_cache(void) xfs_attrd_cache = NULL; } -STATIC int -xfs_attr_item_init( - struct xfs_da_args *args, - unsigned int op_flags, /* op flag (set or remove) */ - struct xfs_attr_item **attr) /* new xfs_attr_item */ -{ - - struct xfs_attr_item *new; - - new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS); - new->xattri_op_flags = op_flags; - new->xattri_da_args = args; - - *attr = new; - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -int -xfs_attr_set_deferred( - struct xfs_da_args *args) -{ - struct xfs_attr_item *new; - int error = 0; - - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_SET, &new); - if (error) - return error; - - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - - return 0; -} - -/* Removes an attribute for an inode as a deferred operation */ -int -xfs_attr_remove_deferred( - struct xfs_da_args *args) -{ - - struct xfs_attr_item *new; - int error; - - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REMOVE, &new); - if (error) - return error; - - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - - return 0; -} - /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index f6c13d2bfbcd..c9c867e3406c 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -521,8 +521,6 @@ bool xfs_attr_namecheck(const void *name, size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int 
*local); void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, unsigned int *total); -int xfs_attr_set_deferred(struct xfs_da_args *args); -int xfs_attr_remove_deferred(struct xfs_da_args *args); extern struct kmem_cache *xfs_attri_cache; extern struct kmem_cache *xfs_attrd_cache; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a27492e99673..f7edd1ecf6d9 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -908,6 +908,7 @@ struct xfs_icreate_log { */ #define XFS_ATTR_OP_FLAGS_SET 1 /* Set the attribute */ #define XFS_ATTR_OP_FLAGS_REMOVE 2 /* Remove the attribute */ +#define XFS_ATTR_OP_FLAGS_REPLACE 3 /* Replace the attribute */ #define XFS_ATTR_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ /* diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 96933a3c0dcd..ee8b140a2801 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -318,6 +318,7 @@ xfs_xattri_finish_update( switch (op) { case XFS_ATTR_OP_FLAGS_SET: + case XFS_ATTR_OP_FLAGS_REPLACE: error = xfs_attr_set_iter(attr); break; case XFS_ATTR_OP_FLAGS_REMOVE: @@ -507,8 +508,14 @@ xfs_attri_validate( return false; /* alfi_op_flags should be either a set or remove */ - if (op != XFS_ATTR_OP_FLAGS_SET && op != XFS_ATTR_OP_FLAGS_REMOVE) + switch (op) { + case XFS_ATTR_OP_FLAGS_SET: + case XFS_ATTR_OP_FLAGS_REPLACE: + case XFS_ATTR_OP_FLAGS_REMOVE: + break; + default: return false; + } if (attrp->alfi_value_len > XATTR_SIZE_MAX) return false; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index fec4198b738b..01ce0401aa32 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4154,6 +4154,10 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); 
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); +DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove); + TRACE_EVENT(xfs_force_shutdown, TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname, -- cgit From 5cc21e522d02d9a10bf856d71032d4dcc10185a8 Mon Sep 17 00:00:00 2001 From: Catherine Hoang Date: Tue, 10 May 2022 13:27:58 -0700 Subject: xfs: remove quota warning limit from struct xfs_quota_limits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Warning limits in xfs quota are an unused feature that is currently documented as unimplemented, and it is unclear what the intended behavior of these limits is. Remove the ‘warn’ field from struct xfs_quota_limits and any other related code. Signed-off-by: Catherine Hoang Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_qm.c | 9 --------- fs/xfs/xfs_qm.h | 5 ----- fs/xfs/xfs_qm_syscalls.c | 17 +++-------------- fs/xfs/xfs_quotaops.c | 6 +++--- fs/xfs/xfs_trans_dquot.c | 3 +-- 5 files changed, 7 insertions(+), 33 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f165d1a3de1d..8fc813cb6011 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -582,9 +582,6 @@ xfs_qm_init_timelimits( defq->blk.time = XFS_QM_BTIMELIMIT; defq->ino.time = XFS_QM_ITIMELIMIT; defq->rtb.time = XFS_QM_RTBTIMELIMIT; - defq->blk.warn = XFS_QM_BWARNLIMIT; - defq->ino.warn = XFS_QM_IWARNLIMIT; - defq->rtb.warn = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. 
@@ -608,12 +605,6 @@ xfs_qm_init_timelimits( defq->ino.time = dqp->q_ino.timer; if (dqp->q_rtb.timer) defq->rtb.time = dqp->q_rtb.timer; - if (dqp->q_blk.warnings) - defq->blk.warn = dqp->q_blk.warnings; - if (dqp->q_ino.warnings) - defq->ino.warn = dqp->q_ino.warnings; - if (dqp->q_rtb.warnings) - defq->rtb.warn = dqp->q_rtb.warnings; xfs_qm_dqdestroy(dqp); } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 5bb12717ea28..9683f0457d19 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -34,7 +34,6 @@ struct xfs_quota_limits { xfs_qcnt_t hard; /* default hard limit */ xfs_qcnt_t soft; /* default soft limit */ time64_t time; /* limit for timers */ - xfs_qwarncnt_t warn; /* limit for warnings */ }; /* Defaults for each quota type: time limits, warn limits, usage limits */ @@ -134,10 +133,6 @@ struct xfs_dquot_acct { #define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ #define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ -#define XFS_QM_BWARNLIMIT 5 -#define XFS_QM_IWARNLIMIT 5 -#define XFS_QM_RTBWARNLIMIT 5 - extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); /* quota ops */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 7d5a31827681..e7f3ac60ebd9 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -250,17 +250,6 @@ xfs_setqlim_limits( return true; } -static inline void -xfs_setqlim_warns( - struct xfs_dquot_res *res, - struct xfs_quota_limits *qlim, - int warns) -{ - res->warnings = warns; - if (qlim) - qlim->warn = warns; -} - static inline void xfs_setqlim_timer( struct xfs_mount *mp, @@ -355,7 +344,7 @@ xfs_qm_scall_setqlim( if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk")) xfs_dquot_set_prealloc_limits(dqp); if (newlim->d_fieldmask & QC_SPC_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_spc_warns); + res->warnings = newlim->d_spc_warns; if (newlim->d_fieldmask & QC_SPC_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer); @@ -371,7 +360,7 @@ xfs_qm_scall_setqlim( xfs_setqlim_limits(mp, 
res, qlim, hard, soft, "rtb"); if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns); + res->warnings = newlim->d_rt_spc_warns; if (newlim->d_fieldmask & QC_RT_SPC_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer); @@ -387,7 +376,7 @@ xfs_qm_scall_setqlim( xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino"); if (newlim->d_fieldmask & QC_INO_WARNS) - xfs_setqlim_warns(res, qlim, newlim->d_ino_warns); + res->warnings = newlim->d_ino_warns; if (newlim->d_fieldmask & QC_INO_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer); diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 07989bd67728..50391730241f 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -40,9 +40,9 @@ xfs_qm_fill_state( tstate->spc_timelimit = (u32)defq->blk.time; tstate->ino_timelimit = (u32)defq->ino.time; tstate->rt_spc_timelimit = (u32)defq->rtb.time; - tstate->spc_warnlimit = defq->blk.warn; - tstate->ino_warnlimit = defq->ino.warn; - tstate->rt_spc_warnlimit = defq->rtb.warn; + tstate->spc_warnlimit = 0; + tstate->ino_warnlimit = 0; + tstate->rt_spc_warnlimit = 0; if (tempqip) xfs_irele(ip); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index ebe2c227eb2f..aa00cf67ad72 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -597,8 +597,7 @@ xfs_dqresv_check( if (softlimit && total_count > softlimit) { time64_t now = ktime_get_real_seconds(); - if ((res->timer != 0 && now > res->timer) || - (res->warnings != 0 && res->warnings >= qlim->warn)) { + if (res->timer != 0 && now > res->timer) { *fatal = true; return QUOTA_NL_ISOFTLONGWARN; } -- cgit From 2e06df552a7cba13eb0046b9116a9aa26001ee2c Mon Sep 17 00:00:00 2001 From: Catherine Hoang Date: Tue, 10 May 2022 13:27:59 -0700 Subject: xfs: remove warning counters from struct xfs_dquot_res Warning counts are not used anywhere in the kernel. 
In addition, there are no use cases, test coverage, or documentation for this functionality. Remove the 'warnings' field from struct xfs_dquot_res and any other related code. Signed-off-by: Catherine Hoang Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_quota_defs.h | 1 - fs/xfs/xfs_dquot.c | 15 ++++----------- fs/xfs/xfs_dquot.h | 8 -------- fs/xfs/xfs_qm_syscalls.c | 12 +++--------- 4 files changed, 7 insertions(+), 29 deletions(-) diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 3076cd74fcaa..cb035da3f990 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -16,7 +16,6 @@ * and quota-limits. This is a waste in the common case, but hey ... */ typedef uint64_t xfs_qcnt_t; -typedef uint16_t xfs_qwarncnt_t; typedef uint8_t xfs_dqtype_t; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index eb211e0ede5d..5a6c3c3c4de2 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -136,10 +136,7 @@ xfs_qm_adjust_res_timer( res->timer = xfs_dquot_set_timeout(mp, ktime_get_real_seconds() + qlim->time); } else { - if (res->timer == 0) - res->warnings = 0; - else - res->timer = 0; + res->timer = 0; } } @@ -592,10 +589,6 @@ xfs_dquot_from_disk( dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); - dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns); - dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns); - dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns); - dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer); dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer); dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer); @@ -637,9 +630,9 @@ xfs_dquot_to_disk( ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); - ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings); - ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings); - 
ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings); + ddqp->d_bwarns = 0; + ddqp->d_iwarns = 0; + ddqp->d_rtbwarns = 0; ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer); ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer); diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 6b5e3cf40c8b..80c8f851a2f3 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -44,14 +44,6 @@ struct xfs_dquot_res { * in seconds since the Unix epoch. */ time64_t timer; - - /* - * For root dquots, this is the maximum number of warnings that will - * be issued for this quota type. Otherwise, this is the number of - * warnings issued against this quota. Note that none of this is - * implemented. - */ - xfs_qwarncnt_t warnings; }; static inline bool diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index e7f3ac60ebd9..2149c203b1d0 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -343,8 +343,6 @@ xfs_qm_scall_setqlim( if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk")) xfs_dquot_set_prealloc_limits(dqp); - if (newlim->d_fieldmask & QC_SPC_WARNS) - res->warnings = newlim->d_spc_warns; if (newlim->d_fieldmask & QC_SPC_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer); @@ -359,8 +357,6 @@ xfs_qm_scall_setqlim( qlim = id == 0 ? &defq->rtb : NULL; xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb"); - if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - res->warnings = newlim->d_rt_spc_warns; if (newlim->d_fieldmask & QC_RT_SPC_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer); @@ -375,8 +371,6 @@ xfs_qm_scall_setqlim( qlim = id == 0 ? 
&defq->ino : NULL; xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino"); - if (newlim->d_fieldmask & QC_INO_WARNS) - res->warnings = newlim->d_ino_warns; if (newlim->d_fieldmask & QC_INO_TIMER) xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer); @@ -417,13 +411,13 @@ xfs_qm_scall_getquota_fill_qc( dst->d_ino_count = dqp->q_ino.reserved; dst->d_spc_timer = dqp->q_blk.timer; dst->d_ino_timer = dqp->q_ino.timer; - dst->d_ino_warns = dqp->q_ino.warnings; - dst->d_spc_warns = dqp->q_blk.warnings; + dst->d_ino_warns = 0; + dst->d_spc_warns = 0; dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit); dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit); dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved); dst->d_rt_spc_timer = dqp->q_rtb.timer; - dst->d_rt_spc_warns = dqp->q_rtb.warnings; + dst->d_rt_spc_warns = 0; /* * Internally, we don't reset all the timers when quota enforcement -- cgit From 5349b2afc117d87d35502f2fe1930692d6bfc68b Mon Sep 17 00:00:00 2001 From: Catherine Hoang Date: Tue, 10 May 2022 13:28:00 -0700 Subject: xfs: don't set quota warning values Having just dropped support for quota warning limits and warning counters, the warning fields no longer have any meaning. Prevent these fields from being set by removing QC_WARNS_MASK from XFS_QC_SETINFO_MASK and XFS_QC_MASK. Signed-off-by: Catherine Hoang Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_qm_syscalls.c | 3 +-- fs/xfs/xfs_quotaops.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 2149c203b1d0..74ac9ca9e119 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -217,8 +217,7 @@ xfs_qm_scall_quotaon( return 0; } -#define XFS_QC_MASK \ - (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) +#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK) /* * Adjust limits of this quota, and the defaults if passed in. 
Returns true diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 50391730241f..9c162e69976b 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -98,7 +98,7 @@ xfs_quota_type(int type) } } -#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK) +#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK) /* * Adjust quota timers & warnings -- cgit From e0c41089b998f5a54dabd7a34ab24108e192d2ee Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:52 +1000 Subject: xfs: separate out initial attr_set states We currently use XFS_DAS_UNINIT for several steps in the attr_set state machine. We use it for setting shortform xattrs, converting from shortform to leaf, leaf add, leaf-to-node and node add. All of these things are essentially known before we start the state machine iterating, so we really should separate them out: XFS_DAS_SF_ADD: - tries to do a shortform add - on success -> done - on ENOSPC converts to leaf, -> XFS_DAS_LEAF_ADD - on error, dies. XFS_DAS_LEAF_ADD: - tries to do leaf add - on success: - inline attr -> done - remote xattr || REPLACE -> XFS_DAS_FOUND_LBLK - on ENOSPC converts to node, -> XFS_DAS_NODE_ADD - on error, dies XFS_DAS_NODE_ADD: - tries to do node add - on success: - inline attr -> done - remote xattr || REPLACE -> XFS_DAS_FOUND_NBLK - on error, dies This makes it easier to understand how the state machine starts up and sets us up on the path to further state machine simplifications. This also converts the DAS state tracepoints to use strings rather than numbers, as converting between enums and numbers requires manual counting rather than just reading the name. This also introduces an XFS_DAS_DONE state so that we can trace successful operation completions easily. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 161 +++++++++++++++++++++++----------------------- fs/xfs/libxfs/xfs_attr.h | 89 +++++++++++++++++++++---- fs/xfs/libxfs/xfs_defer.c | 2 + fs/xfs/xfs_acl.c | 4 +- fs/xfs/xfs_attr_item.c | 13 +++- fs/xfs/xfs_ioctl.c | 4 +- fs/xfs/xfs_trace.h | 22 ++++++- fs/xfs/xfs_xattr.c | 2 +- 8 files changed, 194 insertions(+), 103 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 54f90d66b206..7c11fe8b7b26 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -58,7 +58,7 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); -STATIC int xfs_attr_node_addname(struct xfs_attr_item *attr); +static int xfs_attr_node_try_addname(struct xfs_attr_item *attr); STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr); STATIC int xfs_attr_node_addname_clear_incomplete(struct xfs_attr_item *attr); STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, @@ -223,6 +223,11 @@ xfs_init_attr_trans( } } +/* + * Add an attr to a shortform fork. If there is no space, + * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC. + * to use. + */ STATIC int xfs_attr_try_sf_addname( struct xfs_inode *dp, @@ -254,20 +259,7 @@ xfs_attr_try_sf_addname( return error; } -/* - * Check to see if the attr should be upgraded from non-existent or shortform to - * single-leaf-block attribute list. 
- */ -static inline bool -xfs_attr_is_shortform( - struct xfs_inode *ip) -{ - return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0); -} - -STATIC int +static int xfs_attr_sf_addname( struct xfs_attr_item *attr) { @@ -275,14 +267,12 @@ xfs_attr_sf_addname( struct xfs_inode *dp = args->dp; int error = 0; - /* - * Try to add the attr to the attribute list in the inode. - */ error = xfs_attr_try_sf_addname(dp, args); - - /* Should only be 0, -EEXIST or -ENOSPC */ - if (error != -ENOSPC) - return error; + if (error != -ENOSPC) { + ASSERT(!error || error == -EEXIST); + attr->xattri_dela_state = XFS_DAS_DONE; + goto out; + } /* * It won't fit in the shortform, transform to a leaf block. GROT: @@ -298,64 +288,42 @@ xfs_attr_sf_addname( * with the write verifier. */ xfs_trans_bhold(args->trans, attr->xattri_leaf_bp); - - /* - * We're still in XFS_DAS_UNINIT state here. We've converted - * the attr fork to leaf format and will restart with the leaf - * add. - */ - trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp); - return -EAGAIN; + attr->xattri_dela_state = XFS_DAS_LEAF_ADD; + error = -EAGAIN; +out: + trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp); + return error; } -STATIC int +static int xfs_attr_leaf_addname( struct xfs_attr_item *attr) { struct xfs_da_args *args = attr->xattri_da_args; - struct xfs_inode *dp = args->dp; - enum xfs_delattr_state next_state = XFS_DAS_UNINIT; int error; - if (xfs_attr_is_leaf(dp)) { - - /* - * Use the leaf buffer we may already hold locked as a result of - * a sf-to-leaf conversion. The held buffer is no longer valid - * after this call, regardless of the result. 
- */ - error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp); - attr->xattri_leaf_bp = NULL; + ASSERT(xfs_attr_is_leaf(args->dp)); - if (error == -ENOSPC) { - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; - - /* - * Finish any deferred work items and roll the - * transaction once more. The goal here is to call - * node_addname with the inode and transaction in the - * same state (inode locked and joined, transaction - * clean) no matter how we got to this step. - * - * At this point, we are still in XFS_DAS_UNINIT, but - * when we come back, we'll be a node, so we'll fall - * down into the node handling code below - */ - error = -EAGAIN; - goto out; - } - next_state = XFS_DAS_FOUND_LBLK; - } else { - ASSERT(!attr->xattri_leaf_bp); + /* + * Use the leaf buffer we may already hold locked as a result of + * a sf-to-leaf conversion. The held buffer is no longer valid + * after this call, regardless of the result. + */ + error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp); + attr->xattri_leaf_bp = NULL; - error = xfs_attr_node_addname_find_attr(attr); + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); if (error) return error; - next_state = XFS_DAS_FOUND_NBLK; - error = xfs_attr_node_addname(attr); + /* + * We're not in leaf format anymore, so roll the transaction and + * retry the add to the newly allocated node block. 
+ */ + attr->xattri_dela_state = XFS_DAS_NODE_ADD; + error = -EAGAIN; + goto out; } if (error) return error; @@ -367,15 +335,46 @@ xfs_attr_leaf_addname( */ if (args->rmtblkno || (args->op_flags & XFS_DA_OP_RENAME)) { - attr->xattri_dela_state = next_state; + attr->xattri_dela_state = XFS_DAS_FOUND_LBLK; error = -EAGAIN; + } else { + attr->xattri_dela_state = XFS_DAS_DONE; } - out: trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); return error; } +static int +xfs_attr_node_addname( + struct xfs_attr_item *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error; + + ASSERT(!attr->xattri_leaf_bp); + + error = xfs_attr_node_addname_find_attr(attr); + if (error) + return error; + + error = xfs_attr_node_try_addname(attr); + if (error) + return error; + + if (args->rmtblkno || + (args->op_flags & XFS_DA_OP_RENAME)) { + attr->xattri_dela_state = XFS_DAS_FOUND_NBLK; + error = -EAGAIN; + } else { + attr->xattri_dela_state = XFS_DAS_DONE; + } + + trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp); + return error; +} + + /* * Set the attribute specified in @args. * This routine is meant to function as a delayed operation, and may return @@ -396,16 +395,14 @@ xfs_attr_set_iter( /* State machine switch */ switch (attr->xattri_dela_state) { case XFS_DAS_UNINIT: - /* - * If the fork is shortform, attempt to add the attr. If there - * is no space, this converts to leaf format and returns - * -EAGAIN with the leaf buffer held across the roll. The caller - * will deal with a transaction roll error, but otherwise - * release the hold once we return with a clean transaction. 
- */ - if (xfs_attr_is_shortform(dp)) - return xfs_attr_sf_addname(attr); + ASSERT(0); + return -EFSCORRUPTED; + case XFS_DAS_SF_ADD: + return xfs_attr_sf_addname(attr); + case XFS_DAS_LEAF_ADD: return xfs_attr_leaf_addname(attr); + case XFS_DAS_NODE_ADD: + return xfs_attr_node_addname(attr); case XFS_DAS_FOUND_LBLK: /* @@ -699,7 +696,7 @@ xfs_attr_defer_add( if (error) return error; - new->xattri_dela_state = XFS_DAS_UNINIT; + new->xattri_dela_state = xfs_attr_init_add_state(args); xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); @@ -718,7 +715,7 @@ xfs_attr_defer_replace( if (error) return error; - new->xattri_dela_state = XFS_DAS_UNINIT; + new->xattri_dela_state = xfs_attr_init_replace_state(args); xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); @@ -1261,8 +1258,8 @@ error: * to handle this, and recall the function until a successful error code is *returned. */ -STATIC int -xfs_attr_node_addname( +static int +xfs_attr_node_try_addname( struct xfs_attr_item *attr) { struct xfs_da_args *args = attr->xattri_da_args; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index c9c867e3406c..bbbc964f4e3c 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -443,21 +443,44 @@ struct xfs_attr_list_context { * to where it was and resume executing where it left off. 
*/ enum xfs_delattr_state { - XFS_DAS_UNINIT = 0, /* No state has been set yet */ - XFS_DAS_RMTBLK, /* Removing remote blks */ - XFS_DAS_RM_NAME, /* Remove attr name */ - XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ - XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ - XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ - XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ - XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ - XFS_DAS_RD_LEAF, /* Read in the new leaf */ - XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */ - XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ - XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ - XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ + XFS_DAS_UNINIT = 0, /* No state has been set yet */ + XFS_DAS_SF_ADD, /* Initial shortform set iter state */ + XFS_DAS_LEAF_ADD, /* Initial leaf form set iter state */ + XFS_DAS_NODE_ADD, /* Initial node form set iter state */ + XFS_DAS_RMTBLK, /* Removing remote blks */ + XFS_DAS_RM_NAME, /* Remove attr name */ + XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ + XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ + XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ + XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ + XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ + XFS_DAS_RD_LEAF, /* Read in the new leaf */ + XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */ + XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ + XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ + XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ + XFS_DAS_DONE, /* finished operation */ }; +#define XFS_DAS_STRINGS \ + { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \ + { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \ + { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \ + { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \ + { XFS_DAS_RMTBLK, "XFS_DAS_RMTBLK" }, \ + { XFS_DAS_RM_NAME, "XFS_DAS_RM_NAME" }, \ + { XFS_DAS_RM_SHRINK, "XFS_DAS_RM_SHRINK" }, \ + { XFS_DAS_FOUND_LBLK, 
"XFS_DAS_FOUND_LBLK" }, \ + { XFS_DAS_FOUND_NBLK, "XFS_DAS_FOUND_NBLK" }, \ + { XFS_DAS_FLIP_LFLAG, "XFS_DAS_FLIP_LFLAG" }, \ + { XFS_DAS_RM_LBLK, "XFS_DAS_RM_LBLK" }, \ + { XFS_DAS_RD_LEAF, "XFS_DAS_RD_LEAF" }, \ + { XFS_DAS_ALLOC_NODE, "XFS_DAS_ALLOC_NODE" }, \ + { XFS_DAS_FLIP_NFLAG, "XFS_DAS_FLIP_NFLAG" }, \ + { XFS_DAS_RM_NBLK, "XFS_DAS_RM_NBLK" }, \ + { XFS_DAS_CLR_FLAG, "XFS_DAS_CLR_FLAG" }, \ + { XFS_DAS_DONE, "XFS_DAS_DONE" } + /* * Defines for xfs_attr_item.xattri_flags */ @@ -530,4 +553,44 @@ void xfs_attri_destroy_cache(void); int __init xfs_attrd_init_cache(void); void xfs_attrd_destroy_cache(void); +/* + * Check to see if the attr should be upgraded from non-existent or shortform to + * single-leaf-block attribute list. + */ +static inline bool +xfs_attr_is_shortform( + struct xfs_inode *ip) +{ + return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || + (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0); +} + +static inline enum xfs_delattr_state +xfs_attr_init_add_state(struct xfs_da_args *args) +{ + + /* + * When called from the completion of a attr remove to determine the + * next state, the attribute fork may be null. This can occur only occur + * on a pure remove, but we grab the next state before we check if a + * replace operation is being performed. If we are called from any other + * context, i_afp is guaranteed to exist. Hence if the attr fork is + * null, we were called from a pure remove operation and so we are done. 
+ */ + if (!args->dp->i_afp) + return XFS_DAS_DONE; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_ADD; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_ADD; + return XFS_DAS_NODE_ADD; +} + +static inline enum xfs_delattr_state +xfs_attr_init_replace_state(struct xfs_da_args *args) +{ + return xfs_attr_init_add_state(args); +} + #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index b2ecc272f9e4..ceb222b4f261 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -23,6 +23,8 @@ #include "xfs_bmap.h" #include "xfs_alloc.h" #include "xfs_buf.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" static struct kmem_cache *xfs_defer_pending_cache; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 5c52ee869272..3df9c1782ead 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -10,12 +10,12 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_acl.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_trans.h" #include diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index ee8b140a2801..f058b034ee34 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -577,10 +577,21 @@ xfs_attri_item_recover( args->hashval = xfs_da_hashname(args->name, args->namelen); args->attr_filter = attrp->alfi_attr_flags; - if (attrp->alfi_op_flags == XFS_ATTR_OP_FLAGS_SET) { + switch (attrp->alfi_op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK) { + case XFS_ATTR_OP_FLAGS_SET: + case XFS_ATTR_OP_FLAGS_REPLACE: args->value = attrip->attri_value; args->valuelen = attrp->alfi_value_len; args->total = xfs_attr_calc_size(args, &local); + attr->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTR_OP_FLAGS_REMOVE: + attr->xattri_dela_state = XFS_DAS_UNINIT; + break; + 
default: + ASSERT(0); + error = -EFSCORRUPTED; + goto out; } xfs_init_attr_trans(args, &tres, &total); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index e9eadc7337ce..0e5cb7936206 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -15,6 +15,8 @@ #include "xfs_iwalk.h" #include "xfs_itable.h" #include "xfs_error.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -35,8 +37,6 @@ #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include #include diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 01ce0401aa32..8f722be25c29 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4129,6 +4129,23 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); DEFINE_ICLOG_EVENT(xlog_iclog_write); +TRACE_DEFINE_ENUM(XFS_DAS_UNINIT); +TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD); +TRACE_DEFINE_ENUM(XFS_DAS_RMTBLK); +TRACE_DEFINE_ENUM(XFS_DAS_RM_NAME); +TRACE_DEFINE_ENUM(XFS_DAS_RM_SHRINK); +TRACE_DEFINE_ENUM(XFS_DAS_FOUND_LBLK); +TRACE_DEFINE_ENUM(XFS_DAS_FOUND_NBLK); +TRACE_DEFINE_ENUM(XFS_DAS_FLIP_LFLAG); +TRACE_DEFINE_ENUM(XFS_DAS_RM_LBLK); +TRACE_DEFINE_ENUM(XFS_DAS_RD_LEAF); +TRACE_DEFINE_ENUM(XFS_DAS_ALLOC_NODE); +TRACE_DEFINE_ENUM(XFS_DAS_FLIP_NFLAG); +TRACE_DEFINE_ENUM(XFS_DAS_RM_NBLK); +TRACE_DEFINE_ENUM(XFS_DAS_CLR_FLAG); + DECLARE_EVENT_CLASS(xfs_das_state_class, TP_PROTO(int das, struct xfs_inode *ip), TP_ARGS(das, ip), @@ -4140,8 +4157,9 @@ DECLARE_EVENT_CLASS(xfs_das_state_class, __entry->das = das; __entry->ino = ip->i_ino; ), - TP_printk("state change %d ino 0x%llx", - __entry->das, __entry->ino) + TP_printk("state change %s ino 0x%llx", + __print_symbolic(__entry->das, XFS_DAS_STRINGS), + __entry->ino) ) #define DEFINE_DAS_STATE_EVENT(name) \ diff --git a/fs/xfs/xfs_xattr.c 
b/fs/xfs/xfs_xattr.c index 0d050f8829ef..7a044afd4c46 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -12,9 +12,9 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" +#include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_acl.h" -#include "xfs_da_btree.h" #include -- cgit From 2157d1699e59819c8a31ba3e47008e4145d854a9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:54 +1000 Subject: xfs: kill XFS_DAC_LEAF_ADDNAME_INIT We re-enter the XFS_DAS_FOUND_LBLK state when we have to allocate multiple extents for a remote xattr. We currently have a flag called XFS_DAC_LEAF_ADDNAME_INIT to avoid running the remote attr hole finding code more than once. However, for the node format tree, we have a separate state for this so we never reenter the state machine at XFS_DAS_FOUND_NBLK and so it does not need a special flag to skip over the remote attr hole finding code. Convert the leaf block code to use the same state machine as the node blocks and kill the XFS_DAC_LEAF_ADDNAME_INIT flag. This further points out that this "ALLOC" state is only traversed if we have remote xattrs or we are doing a rename operation. Rename both the leaf and node alloc states to _ALLOC_RMT to indicate they are iterating to do allocation of remote xattr blocks. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 45 +++++++++++++++++++++++---------------------- fs/xfs/libxfs/xfs_attr.h | 6 ++++-- fs/xfs/xfs_trace.h | 3 ++- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 7c11fe8b7b26..1ae210dc8a2b 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -405,40 +405,41 @@ xfs_attr_set_iter( return xfs_attr_node_addname(attr); case XFS_DAS_FOUND_LBLK: + /* + * Find space for remote blocks and fall into the allocation + * state.
+ */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_find_space(attr); + if (error) + return error; + } + attr->xattri_dela_state = XFS_DAS_LEAF_ALLOC_RMT; + fallthrough; + case XFS_DAS_LEAF_ALLOC_RMT: + /* * If there was an out-of-line value, allocate the blocks we * identified for its storage and copy the value. This is done * after we create the attribute so that we don't overflow the * maximum size of a transaction and/or hit a deadlock. */ - - /* Open coded xfs_attr_rmtval_set without trans handling */ - if ((attr->xattri_flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) { - attr->xattri_flags |= XFS_DAC_LEAF_ADDNAME_INIT; - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(attr); + if (args->rmtblkno > 0) { + if (attr->xattri_blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(attr); if (error) return error; + trace_xfs_attr_set_iter_return( + attr->xattri_dela_state, + args->dp); + return -EAGAIN; } - } - /* - * Repeat allocating remote blocks for the attr value until - * blkcnt drops to zero. - */ - if (attr->xattri_blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(attr); + error = xfs_attr_rmtval_set_value(args); if (error) return error; - trace_xfs_attr_set_iter_return(attr->xattri_dela_state, - args->dp); - return -EAGAIN; } - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; - /* * If this is not a rename, clear the incomplete flag and we're * done. @@ -533,15 +534,15 @@ xfs_attr_set_iter( return error; } + attr->xattri_dela_state = XFS_DAS_NODE_ALLOC_RMT; fallthrough; - case XFS_DAS_ALLOC_NODE: + case XFS_DAS_NODE_ALLOC_RMT: /* * If there was an out-of-line value, allocate the blocks we * identified for its storage and copy the value. This is done * after we create the attribute so that we don't overflow the * maximum size of a transaction and/or hit a deadlock. 
*/ - attr->xattri_dela_state = XFS_DAS_ALLOC_NODE; if (args->rmtblkno > 0) { if (attr->xattri_blkcnt > 0) { error = xfs_attr_rmtval_set_blk(attr); diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index bbbc964f4e3c..cdfc5a9b4495 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -451,11 +451,12 @@ enum xfs_delattr_state { XFS_DAS_RM_NAME, /* Remove attr name */ XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ + XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ + XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ XFS_DAS_RD_LEAF, /* Read in the new leaf */ - XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */ XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ @@ -471,11 +472,12 @@ enum xfs_delattr_state { { XFS_DAS_RM_NAME, "XFS_DAS_RM_NAME" }, \ { XFS_DAS_RM_SHRINK, "XFS_DAS_RM_SHRINK" }, \ { XFS_DAS_FOUND_LBLK, "XFS_DAS_FOUND_LBLK" }, \ + { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ { XFS_DAS_FOUND_NBLK, "XFS_DAS_FOUND_NBLK" }, \ + { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ { XFS_DAS_FLIP_LFLAG, "XFS_DAS_FLIP_LFLAG" }, \ { XFS_DAS_RM_LBLK, "XFS_DAS_RM_LBLK" }, \ { XFS_DAS_RD_LEAF, "XFS_DAS_RD_LEAF" }, \ - { XFS_DAS_ALLOC_NODE, "XFS_DAS_ALLOC_NODE" }, \ { XFS_DAS_FLIP_NFLAG, "XFS_DAS_FLIP_NFLAG" }, \ { XFS_DAS_RM_NBLK, "XFS_DAS_RM_NBLK" }, \ { XFS_DAS_CLR_FLAG, "XFS_DAS_CLR_FLAG" }, \ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8f722be25c29..067ab31d7a20 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4137,11 +4137,12 @@ TRACE_DEFINE_ENUM(XFS_DAS_RMTBLK); TRACE_DEFINE_ENUM(XFS_DAS_RM_NAME); 
TRACE_DEFINE_ENUM(XFS_DAS_RM_SHRINK); TRACE_DEFINE_ENUM(XFS_DAS_FOUND_LBLK); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_FOUND_NBLK); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_FLIP_LFLAG); TRACE_DEFINE_ENUM(XFS_DAS_RM_LBLK); TRACE_DEFINE_ENUM(XFS_DAS_RD_LEAF); -TRACE_DEFINE_ENUM(XFS_DAS_ALLOC_NODE); TRACE_DEFINE_ENUM(XFS_DAS_FLIP_NFLAG); TRACE_DEFINE_ENUM(XFS_DAS_RM_NBLK); TRACE_DEFINE_ENUM(XFS_DAS_CLR_FLAG); -- cgit From 251b29c88eb84922e916ed4685f50db741aeb0af Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:54 +1000 Subject: xfs: consolidate leaf/node states in xfs_attr_set_iter The operations performed from XFS_DAS_FOUND_LBLK through to XFS_DAS_RM_LBLK are now identical to XFS_DAS_FOUND_NBLK through to XFS_DAS_RM_NBLK. We can collapse these down into a single set of code. To do this, define the states that leaf and node run through as separate sets of sequential states. Then as we move to the next state, we can use increments rather than specific state assignments to move through the states. This means the state progression is set by the initial state that enters the series and we don't need to duplicate the code anymore. At the exit point of the series we need to select the correct leaf or node state, but that can also be done by state increment rather than assignment. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 127 ++++++++--------------------------------------- fs/xfs/libxfs/xfs_attr.h | 9 +++- 2 files changed, 27 insertions(+), 109 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 1ae210dc8a2b..d06998d8cbdb 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -393,6 +393,7 @@ xfs_attr_set_iter( struct xfs_mount *mp = args->dp->i_mount; /* State machine switch */ +next_state: switch (attr->xattri_dela_state) { case XFS_DAS_UNINIT: ASSERT(0); @@ -405,6 +406,7 @@ xfs_attr_set_iter( return xfs_attr_node_addname(attr); case XFS_DAS_FOUND_LBLK: + case XFS_DAS_FOUND_NBLK: /* * Find space for remote blocks and fall into the allocation * state. @@ -414,9 +416,10 @@ xfs_attr_set_iter( if (error) return error; } - attr->xattri_dela_state = XFS_DAS_LEAF_ALLOC_RMT; + attr->xattri_dela_state++; fallthrough; case XFS_DAS_LEAF_ALLOC_RMT: + case XFS_DAS_NODE_ALLOC_RMT: /* * If there was an out-of-line value, allocate the blocks we @@ -465,16 +468,18 @@ xfs_attr_set_iter( return error; /* * Commit the flag value change and start the next trans - * in series. + * in series at FLIP_FLAG. */ - attr->xattri_dela_state = XFS_DAS_FLIP_LFLAG; + attr->xattri_dela_state++; trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp); return -EAGAIN; } + attr->xattri_dela_state++; fallthrough; case XFS_DAS_FLIP_LFLAG: + case XFS_DAS_FLIP_NFLAG: /* * Dismantle the "old" attribute/value pair by removing a * "remote" value (if it exists). 
@@ -484,10 +489,10 @@ xfs_attr_set_iter( if (error) return error; + attr->xattri_dela_state++; fallthrough; case XFS_DAS_RM_LBLK: - /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - attr->xattri_dela_state = XFS_DAS_RM_LBLK; + case XFS_DAS_RM_NBLK: if (args->rmtblkno) { error = xfs_attr_rmtval_remove(attr); if (error == -EAGAIN) @@ -502,7 +507,16 @@ xfs_attr_set_iter( return -EAGAIN; } - fallthrough; + /* + * This is the end of the shared leaf/node sequence. We need + * to continue at the next state in the sequence, but we can't + * easily just fall through. So we increment to the next state + * and then jump back to switch statement to evaluate the next + * state correctly. + */ + attr->xattri_dela_state++; + goto next_state; + case XFS_DAS_RD_LEAF: /* * This is the last step for leaf format. Read the block with @@ -523,106 +537,6 @@ xfs_attr_set_iter( return error; - case XFS_DAS_FOUND_NBLK: - /* - * Find space for remote blocks and fall into the allocation - * state. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(attr); - if (error) - return error; - } - - attr->xattri_dela_state = XFS_DAS_NODE_ALLOC_RMT; - fallthrough; - case XFS_DAS_NODE_ALLOC_RMT: - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - if (args->rmtblkno > 0) { - if (attr->xattri_blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(attr); - if (error) - return error; - trace_xfs_attr_set_iter_return( - attr->xattri_dela_state, args->dp); - return -EAGAIN; - } - - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; - } - - /* - * If this was not a rename, clear the incomplete flag and we're - * done. 
- */ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - goto out; - } - - /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. - */ - if (!xfs_has_larp(mp)) { - error = xfs_attr3_leaf_flipflags(args); - if (error) - goto out; - /* - * Commit the flag value change and start the next trans - * in series - */ - attr->xattri_dela_state = XFS_DAS_FLIP_NFLAG; - trace_xfs_attr_set_iter_return(attr->xattri_dela_state, - args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_FLIP_NFLAG: - /* - * Dismantle the "old" attribute/value pair by removing a - * "remote" value (if it exists). - */ - xfs_attr_restore_rmt_blk(args); - - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - - fallthrough; - case XFS_DAS_RM_NBLK: - /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - attr->xattri_dela_state = XFS_DAS_RM_NBLK; - if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(attr); - if (error == -EAGAIN) - trace_xfs_attr_set_iter_return( - attr->xattri_dela_state, args->dp); - - if (error) - return error; - - attr->xattri_dela_state = XFS_DAS_CLR_FLAG; - trace_xfs_attr_set_iter_return(attr->xattri_dela_state, - args->dp); - return -EAGAIN; - } - - fallthrough; case XFS_DAS_CLR_FLAG: /* * The last state for node format. 
Look up the old attr and @@ -634,7 +548,6 @@ xfs_attr_set_iter( ASSERT(0); break; } -out: return error; } diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index cdfc5a9b4495..908a13d61716 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -450,16 +450,21 @@ enum xfs_delattr_state { XFS_DAS_RMTBLK, /* Removing remote blks */ XFS_DAS_RM_NAME, /* Remove attr name */ XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ + + /* Leaf state set sequence */ XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ - XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ - XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ XFS_DAS_RD_LEAF, /* Read in the new leaf */ + + /* Node state set sequence, must match leaf state above */ + XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ + XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ + XFS_DAS_DONE, /* finished operation */ }; -- cgit From 7d03533629d1c3fca395e6fd0935ca1de676f2bc Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:55 +1000 Subject: xfs: split remote attr setting out from replace path When we set a new xattr, we have three exit paths: 1. nothing else to do 2. allocate and set the remote xattr value 3. perform the rest of a replace operation Currently we push both 2 and 3 into the same state, regardless of whether we just set a remote attribute or not. Once we've set the remote xattr, we have two exit states: 1. nothing else to do 2. 
perform the rest of a replace operation Hence we can split the remote xattr allocation and setting into their own states and factor it out of xfs_attr_set_iter() to further clean up the state machine and the implementation of the state machine. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 113 ++++++++++++++++++++++++++--------------------- fs/xfs/libxfs/xfs_attr.h | 14 +++--- fs/xfs/xfs_trace.h | 9 ++-- 3 files changed, 77 insertions(+), 59 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index d06998d8cbdb..513f0b1a6a4c 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -333,9 +333,11 @@ xfs_attr_leaf_addname( * or perform more xattr manipulations. Otherwise there is nothing more * to do and we can return success. */ - if (args->rmtblkno || - (args->op_flags & XFS_DA_OP_RENAME)) { - attr->xattri_dela_state = XFS_DAS_FOUND_LBLK; + if (args->rmtblkno) { + attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; + error = -EAGAIN; + } else if (args->op_flags & XFS_DA_OP_RENAME) { + attr->xattri_dela_state = XFS_DAS_LEAF_REPLACE; error = -EAGAIN; } else { attr->xattri_dela_state = XFS_DAS_DONE; @@ -362,9 +364,11 @@ xfs_attr_node_addname( if (error) return error; - if (args->rmtblkno || - (args->op_flags & XFS_DA_OP_RENAME)) { - attr->xattri_dela_state = XFS_DAS_FOUND_NBLK; + if (args->rmtblkno) { + attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT; + error = -EAGAIN; + } else if (args->op_flags & XFS_DA_OP_RENAME) { + attr->xattri_dela_state = XFS_DAS_NODE_REPLACE; error = -EAGAIN; } else { attr->xattri_dela_state = XFS_DAS_DONE; @@ -374,6 +378,40 @@ xfs_attr_node_addname( return error; } +static int +xfs_attr_rmtval_alloc( + struct xfs_attr_item *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error = 0; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the 
value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (attr->xattri_blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(attr); + if (error) + return error; + error = -EAGAIN; + goto out; + } + + error = xfs_attr_rmtval_set_value(args); + if (error) + return error; + + /* If this is not a rename, clear the incomplete flag and we're done. */ + if (!(args->op_flags & XFS_DA_OP_RENAME)) { + error = xfs_attr3_leaf_clearflag(args); + attr->xattri_dela_state = XFS_DAS_DONE; + } +out: + trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp); + return error; +} /* * Set the attribute specified in @args. @@ -405,54 +443,26 @@ next_state: case XFS_DAS_NODE_ADD: return xfs_attr_node_addname(attr); - case XFS_DAS_FOUND_LBLK: - case XFS_DAS_FOUND_NBLK: - /* - * Find space for remote blocks and fall into the allocation - * state. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(attr); - if (error) - return error; - } + case XFS_DAS_LEAF_SET_RMT: + case XFS_DAS_NODE_SET_RMT: + error = xfs_attr_rmtval_find_space(attr); + if (error) + return error; attr->xattri_dela_state++; fallthrough; + case XFS_DAS_LEAF_ALLOC_RMT: case XFS_DAS_NODE_ALLOC_RMT: - - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - if (args->rmtblkno > 0) { - if (attr->xattri_blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(attr); - if (error) - return error; - trace_xfs_attr_set_iter_return( - attr->xattri_dela_state, - args->dp); - return -EAGAIN; - } - - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; - } - - /* - * If this is not a rename, clear the incomplete flag and we're - * done. 
- */ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); + error = xfs_attr_rmtval_alloc(attr); + if (error) return error; - } + if (attr->xattri_dela_state == XFS_DAS_DONE) + break; + attr->xattri_dela_state++; + fallthrough; + case XFS_DAS_LEAF_REPLACE: + case XFS_DAS_NODE_REPLACE: /* * If this is an atomic rename operation, we must "flip" the * incomplete flags on the "new" and "old" attribute/value pairs @@ -470,10 +480,9 @@ next_state: * Commit the flag value change and start the next trans * in series at FLIP_FLAG. */ + error = -EAGAIN; attr->xattri_dela_state++; - trace_xfs_attr_set_iter_return(attr->xattri_dela_state, - args->dp); - return -EAGAIN; + break; } attr->xattri_dela_state++; @@ -548,6 +557,8 @@ next_state: ASSERT(0); break; } + + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp); return error; } diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 908a13d61716..a0e631df1e24 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -452,15 +452,17 @@ enum xfs_delattr_state { XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ /* Leaf state set sequence */ - XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ + XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */ XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ XFS_DAS_RD_LEAF, /* Read in the new leaf */ /* Node state set sequence, must match leaf state above */ - XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ + XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */ XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ XFS_DAS_RM_NBLK, /* A rename is removing node 
blocks */ XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ @@ -476,13 +478,15 @@ enum xfs_delattr_state { { XFS_DAS_RMTBLK, "XFS_DAS_RMTBLK" }, \ { XFS_DAS_RM_NAME, "XFS_DAS_RM_NAME" }, \ { XFS_DAS_RM_SHRINK, "XFS_DAS_RM_SHRINK" }, \ - { XFS_DAS_FOUND_LBLK, "XFS_DAS_FOUND_LBLK" }, \ + { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \ { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ - { XFS_DAS_FOUND_NBLK, "XFS_DAS_FOUND_NBLK" }, \ - { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ + { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ { XFS_DAS_FLIP_LFLAG, "XFS_DAS_FLIP_LFLAG" }, \ { XFS_DAS_RM_LBLK, "XFS_DAS_RM_LBLK" }, \ { XFS_DAS_RD_LEAF, "XFS_DAS_RD_LEAF" }, \ + { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \ + { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ + { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \ { XFS_DAS_FLIP_NFLAG, "XFS_DAS_FLIP_NFLAG" }, \ { XFS_DAS_RM_NBLK, "XFS_DAS_RM_NBLK" }, \ { XFS_DAS_CLR_FLAG, "XFS_DAS_CLR_FLAG" }, \ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 067ab31d7a20..cb9122327114 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4136,13 +4136,15 @@ TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD); TRACE_DEFINE_ENUM(XFS_DAS_RMTBLK); TRACE_DEFINE_ENUM(XFS_DAS_RM_NAME); TRACE_DEFINE_ENUM(XFS_DAS_RM_SHRINK); -TRACE_DEFINE_ENUM(XFS_DAS_FOUND_LBLK); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); -TRACE_DEFINE_ENUM(XFS_DAS_FOUND_NBLK); -TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE); TRACE_DEFINE_ENUM(XFS_DAS_FLIP_LFLAG); TRACE_DEFINE_ENUM(XFS_DAS_RM_LBLK); TRACE_DEFINE_ENUM(XFS_DAS_RD_LEAF); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE); TRACE_DEFINE_ENUM(XFS_DAS_FLIP_NFLAG); TRACE_DEFINE_ENUM(XFS_DAS_RM_NBLK); TRACE_DEFINE_ENUM(XFS_DAS_CLR_FLAG); @@ -4172,6 +4174,7 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return); 
DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); -- cgit From 411b434a63248ecff58aaf498b09eaf3b3f52f90 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:55 +1000 Subject: xfs: XFS_DAS_LEAF_REPLACE state only needed if !LARP We can skip the REPLACE state when LARP is enabled, but that means the XFS_DAS_FLIP_LFLAG state is now poorly named - it indicates something that has been done rather than what the state is going to do. Rename it to "REMOVE_OLD" to indicate that we are now going to perform removal of the old attr. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 81 ++++++++++++++++++++++++++++++------------------ fs/xfs/libxfs/xfs_attr.h | 44 +++++++++++++------------- fs/xfs/xfs_trace.h | 4 +-- 3 files changed, 75 insertions(+), 54 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 513f0b1a6a4c..9b5ef38b09b2 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -295,6 +295,26 @@ out: return error; } +/* + * When we bump the state to REPLACE, we may actually need to skip over the + * state. When LARP mode is enabled, we don't need to run the atomic flags flip, + * so we skip straight over the REPLACE state and go on to REMOVE_OLD. 
+ */ +static void +xfs_attr_dela_state_set_replace( + struct xfs_attr_item *attr, + enum xfs_delattr_state replace) +{ + struct xfs_da_args *args = attr->xattri_da_args; + + ASSERT(replace == XFS_DAS_LEAF_REPLACE || + replace == XFS_DAS_NODE_REPLACE); + + attr->xattri_dela_state = replace; + if (xfs_has_larp(args->dp->i_mount)) + attr->xattri_dela_state++; +} + static int xfs_attr_leaf_addname( struct xfs_attr_item *attr) @@ -337,7 +357,7 @@ xfs_attr_leaf_addname( attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; error = -EAGAIN; } else if (args->op_flags & XFS_DA_OP_RENAME) { - attr->xattri_dela_state = XFS_DAS_LEAF_REPLACE; + xfs_attr_dela_state_set_replace(attr, XFS_DAS_LEAF_REPLACE); error = -EAGAIN; } else { attr->xattri_dela_state = XFS_DAS_DONE; @@ -368,7 +388,7 @@ xfs_attr_node_addname( attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT; error = -EAGAIN; } else if (args->op_flags & XFS_DA_OP_RENAME) { - attr->xattri_dela_state = XFS_DAS_NODE_REPLACE; + xfs_attr_dela_state_set_replace(attr, XFS_DAS_NODE_REPLACE); error = -EAGAIN; } else { attr->xattri_dela_state = XFS_DAS_DONE; @@ -395,8 +415,11 @@ xfs_attr_rmtval_alloc( error = xfs_attr_rmtval_set_blk(attr); if (error) return error; - error = -EAGAIN; - goto out; + /* Roll the transaction only if there is more to allocate. */ + if (attr->xattri_blkcnt > 0) { + error = -EAGAIN; + goto out; + } } error = xfs_attr_rmtval_set_value(args); @@ -407,6 +430,13 @@ xfs_attr_rmtval_alloc( if (!(args->op_flags & XFS_DA_OP_RENAME)) { error = xfs_attr3_leaf_clearflag(args); attr->xattri_dela_state = XFS_DAS_DONE; + } else { + /* + * We are running a REPLACE operation, so we need to bump the + * state to the step in that operation. 
+ */ + attr->xattri_dela_state++; + xfs_attr_dela_state_set_replace(attr, attr->xattri_dela_state); } out: trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp); @@ -428,7 +458,6 @@ xfs_attr_set_iter( struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; int forkoff, error = 0; - struct xfs_mount *mp = args->dp->i_mount; /* State machine switch */ next_state: @@ -458,37 +487,29 @@ next_state: return error; if (attr->xattri_dela_state == XFS_DAS_DONE) break; - attr->xattri_dela_state++; - fallthrough; + goto next_state; case XFS_DAS_LEAF_REPLACE: case XFS_DAS_NODE_REPLACE: /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. + * We must "flip" the incomplete flags on the "new" and "old" + * attribute/value pairs so that one disappears and one appears + * atomically. Then we must remove the "old" attribute/value + * pair. */ - if (!xfs_has_larp(mp)) { - error = xfs_attr3_leaf_flipflags(args); - if (error) - return error; - /* - * Commit the flag value change and start the next trans - * in series at FLIP_FLAG. - */ - error = -EAGAIN; - attr->xattri_dela_state++; - break; - } - + error = xfs_attr3_leaf_flipflags(args); + if (error) + return error; + /* + * Commit the flag value change and start the next trans + * in series at REMOVE_OLD. + */ + error = -EAGAIN; attr->xattri_dela_state++; - fallthrough; - case XFS_DAS_FLIP_LFLAG: - case XFS_DAS_FLIP_NFLAG: + break; + + case XFS_DAS_LEAF_REMOVE_OLD: + case XFS_DAS_NODE_REMOVE_OLD: /* * Dismantle the "old" attribute/value pair by removing a * "remote" value (if it exists). 
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index a0e631df1e24..01a50613726f 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -455,7 +455,7 @@ enum xfs_delattr_state { XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */ - XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ + XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */ XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ XFS_DAS_RD_LEAF, /* Read in the new leaf */ @@ -463,7 +463,7 @@ enum xfs_delattr_state { XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */ - XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ + XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */ XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ @@ -471,26 +471,26 @@ enum xfs_delattr_state { }; #define XFS_DAS_STRINGS \ - { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \ - { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \ - { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \ - { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \ - { XFS_DAS_RMTBLK, "XFS_DAS_RMTBLK" }, \ - { XFS_DAS_RM_NAME, "XFS_DAS_RM_NAME" }, \ - { XFS_DAS_RM_SHRINK, "XFS_DAS_RM_SHRINK" }, \ - { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \ - { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ - { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ - { XFS_DAS_FLIP_LFLAG, "XFS_DAS_FLIP_LFLAG" }, \ - { XFS_DAS_RM_LBLK, "XFS_DAS_RM_LBLK" }, \ - { XFS_DAS_RD_LEAF, "XFS_DAS_RD_LEAF" }, \ - { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \ - { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ - { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \ - { XFS_DAS_FLIP_NFLAG, "XFS_DAS_FLIP_NFLAG" }, \ - { 
XFS_DAS_RM_NBLK, "XFS_DAS_RM_NBLK" }, \ - { XFS_DAS_CLR_FLAG, "XFS_DAS_CLR_FLAG" }, \ - { XFS_DAS_DONE, "XFS_DAS_DONE" } + { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \ + { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \ + { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \ + { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \ + { XFS_DAS_RMTBLK, "XFS_DAS_RMTBLK" }, \ + { XFS_DAS_RM_NAME, "XFS_DAS_RM_NAME" }, \ + { XFS_DAS_RM_SHRINK, "XFS_DAS_RM_SHRINK" }, \ + { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \ + { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ + { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ + { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \ + { XFS_DAS_RM_LBLK, "XFS_DAS_RM_LBLK" }, \ + { XFS_DAS_RD_LEAF, "XFS_DAS_RD_LEAF" }, \ + { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \ + { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ + { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \ + { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \ + { XFS_DAS_RM_NBLK, "XFS_DAS_RM_NBLK" }, \ + { XFS_DAS_CLR_FLAG, "XFS_DAS_CLR_FLAG" }, \ + { XFS_DAS_DONE, "XFS_DAS_DONE" } /* * Defines for xfs_attr_item.xattri_flags diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index cb9122327114..b528c0f375c2 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4139,13 +4139,13 @@ TRACE_DEFINE_ENUM(XFS_DAS_RM_SHRINK); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE); -TRACE_DEFINE_ENUM(XFS_DAS_FLIP_LFLAG); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD); TRACE_DEFINE_ENUM(XFS_DAS_RM_LBLK); TRACE_DEFINE_ENUM(XFS_DAS_RD_LEAF); TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT); TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE); -TRACE_DEFINE_ENUM(XFS_DAS_FLIP_NFLAG); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD); TRACE_DEFINE_ENUM(XFS_DAS_RM_NBLK); TRACE_DEFINE_ENUM(XFS_DAS_CLR_FLAG); -- cgit From 2e7ef218e489f5b3f5156a305b55a08c41839c1b Mon Sep 17 00:00:00 2001 From: Dave 
Chinner Date: Thu, 12 May 2022 15:12:55 +1000 Subject: xfs: remote xattr removal in xfs_attr_set_iter() is conditional We may not have a remote value for the old xattr we have to remove, so skip over the remote value removal states and go straight to the xattr name removal in the leaf/node block. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 59 ++++++++++++++++++++++++------------------------ fs/xfs/libxfs/xfs_attr.h | 8 +++---- fs/xfs/xfs_trace.h | 4 ++-- 3 files changed, 36 insertions(+), 35 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 9b5ef38b09b2..c0e72e9c4f53 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -494,15 +494,14 @@ next_state: /* * We must "flip" the incomplete flags on the "new" and "old" * attribute/value pairs so that one disappears and one appears - * atomically. Then we must remove the "old" attribute/value - * pair. + * atomically. */ error = xfs_attr3_leaf_flipflags(args); if (error) return error; /* - * Commit the flag value change and start the next trans - * in series at REMOVE_OLD. + * We must commit the flag value change now to make it atomic + * and then we can start the next trans in series at REMOVE_OLD. */ error = -EAGAIN; attr->xattri_dela_state++; @@ -511,41 +510,43 @@ next_state: case XFS_DAS_LEAF_REMOVE_OLD: case XFS_DAS_NODE_REMOVE_OLD: /* - * Dismantle the "old" attribute/value pair by removing a - * "remote" value (if it exists). + * If we have a remote attr, start the process of removing it + * by invalidating any cached buffers. + * + * If we don't have a remote attr, we skip the remote block + * removal state altogether with a second state increment. 
*/ xfs_attr_restore_rmt_blk(args); - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - - attr->xattri_dela_state++; - fallthrough; - case XFS_DAS_RM_LBLK: - case XFS_DAS_RM_NBLK: if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(attr); - if (error == -EAGAIN) - trace_xfs_attr_set_iter_return( - attr->xattri_dela_state, args->dp); + error = xfs_attr_rmtval_invalidate(args); if (error) return error; - - attr->xattri_dela_state = XFS_DAS_RD_LEAF; - trace_xfs_attr_set_iter_return(attr->xattri_dela_state, - args->dp); - return -EAGAIN; + } else { + attr->xattri_dela_state++; } + attr->xattri_dela_state++; + goto next_state; + + case XFS_DAS_LEAF_REMOVE_RMT: + case XFS_DAS_NODE_REMOVE_RMT: + error = xfs_attr_rmtval_remove(attr); + if (error == -EAGAIN) + break; + if (error) + return error; + /* - * This is the end of the shared leaf/node sequence. We need - * to continue at the next state in the sequence, but we can't - * easily just fall through. So we increment to the next state - * and then jump back to switch statement to evaluate the next - * state correctly. + * We've finished removing the remote attr blocks, so commit the + * transaction and move on to removing the attr name from the + * leaf/node block. Removing the attr might require a full + * transaction reservation for btree block freeing, so we + * can't do that in the same transaction where we removed the + * remote attr blocks. 
*/ + error = -EAGAIN; attr->xattri_dela_state++; - goto next_state; + break; case XFS_DAS_RD_LEAF: /* diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 01a50613726f..1e038c23029a 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -456,7 +456,7 @@ enum xfs_delattr_state { XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */ XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */ - XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ + XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */ XFS_DAS_RD_LEAF, /* Read in the new leaf */ /* Node state set sequence, must match leaf state above */ @@ -464,7 +464,7 @@ enum xfs_delattr_state { XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */ XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */ - XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ + XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */ XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ XFS_DAS_DONE, /* finished operation */ @@ -482,13 +482,13 @@ enum xfs_delattr_state { { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \ - { XFS_DAS_RM_LBLK, "XFS_DAS_RM_LBLK" }, \ + { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \ { XFS_DAS_RD_LEAF, "XFS_DAS_RD_LEAF" }, \ { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \ { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \ { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \ - { XFS_DAS_RM_NBLK, "XFS_DAS_RM_NBLK" }, \ + { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \ { XFS_DAS_CLR_FLAG, "XFS_DAS_CLR_FLAG" }, \ { XFS_DAS_DONE, "XFS_DAS_DONE" } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 
b528c0f375c2..793d2a86ab2c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4140,13 +4140,13 @@ TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD); -TRACE_DEFINE_ENUM(XFS_DAS_RM_LBLK); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT); TRACE_DEFINE_ENUM(XFS_DAS_RD_LEAF); TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT); TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD); -TRACE_DEFINE_ENUM(XFS_DAS_RM_NBLK); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT); TRACE_DEFINE_ENUM(XFS_DAS_CLR_FLAG); DECLARE_EVENT_CLASS(xfs_das_state_class, -- cgit From b11fa61bc4c679172a35e48d149f797ee37db3fc Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:55 +1000 Subject: xfs: clean up final attr removal in xfs_attr_set_iter Clean up the final leaf/node states in xfs_attr_set_iter() to further simplify the high level state machine and to set the completion state correctly. As we are adding a separate state for node format removal, we need to ensure that node formats are collapsed back to shortform or empty correctly. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 149 +++++++++++++++++++++++++++-------------------- fs/xfs/libxfs/xfs_attr.h | 12 ++-- fs/xfs/xfs_trace.h | 5 +- 3 files changed, 94 insertions(+), 72 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index c0e72e9c4f53..467e23602005 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -60,7 +60,7 @@ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); static int xfs_attr_node_try_addname(struct xfs_attr_item *attr); STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr); -STATIC int xfs_attr_node_addname_clear_incomplete(struct xfs_attr_item *attr); +STATIC int xfs_attr_node_remove_attr(struct xfs_attr_item *attr); STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, struct xfs_da_state **state); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); @@ -443,6 +443,77 @@ out: return error; } +/* + * Remove the original attr we have just replaced. This is dependent on the + * original lookup and insert placing the old attr in args->blkno/args->index + * and the new attr in args->blkno2/args->index2. + */ +static int +xfs_attr_leaf_remove_attr( + struct xfs_attr_item *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; + int forkoff; + int error; + + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + &bp); + if (error) + return error; + + xfs_attr3_leaf_remove(bp, args); + + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + + return error; +} + +/* + * Shrink an attribute from leaf to shortform. Used by the node format remove + * path when the node format collapses to a single block and so we have to check + * if it can be collapsed further. 
+ */ +static int +xfs_attr_leaf_shrink( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + struct xfs_inode *dp = args->dp; + int error, forkoff; + struct xfs_buf *bp; + + if (!xfs_attr_is_leaf(dp)) + return 0; + + /* + * Have to get rid of the copy of this dabuf in the state. + */ + if (state) { + ASSERT(state->path.active == 1); + ASSERT(state->path.blk[0].bp); + state->path.blk[0].bp = NULL; + } + + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; + + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) { + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + } else { + xfs_trans_brelse(args->trans, bp); + } + + return error; +} + /* * Set the attribute specified in @args. * This routine is meant to function as a delayed operation, and may return @@ -455,9 +526,7 @@ xfs_attr_set_iter( struct xfs_attr_item *attr) { struct xfs_da_args *args = attr->xattri_da_args; - struct xfs_inode *dp = args->dp; - struct xfs_buf *bp = NULL; - int forkoff, error = 0; + int error = 0; /* State machine switch */ next_state: @@ -548,32 +617,16 @@ next_state: attr->xattri_dela_state++; break; - case XFS_DAS_RD_LEAF: - /* - * This is the last step for leaf format. Read the block with - * the old attr, remove the old attr, check for shortform - * conversion and return. - */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); - if (error) - return error; - - xfs_attr3_leaf_remove(bp, args); - - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - - return error; + case XFS_DAS_LEAF_REMOVE_ATTR: + error = xfs_attr_leaf_remove_attr(attr); + attr->xattri_dela_state = XFS_DAS_DONE; + break; - case XFS_DAS_CLR_FLAG: - /* - * The last state for node format. Look up the old attr and - * remove it. 
- */ - error = xfs_attr_node_addname_clear_incomplete(attr); + case XFS_DAS_NODE_REMOVE_ATTR: + error = xfs_attr_node_remove_attr(attr); + if (!error) + error = xfs_attr_leaf_shrink(args, NULL); + attr->xattri_dela_state = XFS_DAS_DONE; break; default: ASSERT(0); @@ -1268,8 +1321,8 @@ out: } -STATIC int -xfs_attr_node_addname_clear_incomplete( +static int +xfs_attr_node_remove_attr( struct xfs_attr_item *attr) { struct xfs_da_args *args = attr->xattri_da_args; @@ -1310,38 +1363,6 @@ out: return retval; } -/* - * Shrink an attribute from leaf to shortform - */ -STATIC int -xfs_attr_node_shrink( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - struct xfs_inode *dp = args->dp; - int error, forkoff; - struct xfs_buf *bp; - - /* - * Have to get rid of the copy of this dabuf in the state. - */ - ASSERT(state->path.active == 1); - ASSERT(state->path.blk[0].bp); - state->path.blk[0].bp = NULL; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - return error; - - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - } else - xfs_trans_brelse(args->trans, bp); - - return error; -} /* * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers @@ -1550,7 +1571,7 @@ xfs_attr_remove_iter( * transaction. 
*/ if (xfs_attr_is_leaf(dp)) - error = xfs_attr_node_shrink(args, state); + error = xfs_attr_leaf_shrink(args, state); ASSERT(error != -EAGAIN); break; default: diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 1e038c23029a..7b0a5a165725 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -451,21 +451,21 @@ enum xfs_delattr_state { XFS_DAS_RM_NAME, /* Remove attr name */ XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ - /* Leaf state set sequence */ + /* Leaf state set/replace sequence */ XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */ XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */ XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */ - XFS_DAS_RD_LEAF, /* Read in the new leaf */ + XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */ - /* Node state set sequence, must match leaf state above */ + /* Node state set/replace sequence, must match leaf state above */ XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */ XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */ XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */ - XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ + XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */ XFS_DAS_DONE, /* finished operation */ }; @@ -483,13 +483,13 @@ enum xfs_delattr_state { { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \ { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \ - { XFS_DAS_RD_LEAF, "XFS_DAS_RD_LEAF" }, \ + { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \ { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \ { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ { 
XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \ { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \ { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \ - { XFS_DAS_CLR_FLAG, "XFS_DAS_CLR_FLAG" }, \ + { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \ { XFS_DAS_DONE, "XFS_DAS_DONE" } /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 793d2a86ab2c..260760ce2d05 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4141,13 +4141,14 @@ TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT); -TRACE_DEFINE_ENUM(XFS_DAS_RD_LEAF); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR); TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT); TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT); -TRACE_DEFINE_ENUM(XFS_DAS_CLR_FLAG); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR); +TRACE_DEFINE_ENUM(XFS_DAS_DONE); DECLARE_EVENT_CLASS(xfs_das_state_class, TP_PROTO(int das, struct xfs_inode *ip), -- cgit From 4e3d96a57a06f20f4ce04a92422cc100251f346d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:55 +1000 Subject: xfs: xfs_attr_set_iter() does not need to return EAGAIN Now that the full xfs_attr_set_iter() state machine always terminates with either the state being XFS_DAS_DONE on success or an error on failure, we can get rid of the need for it to return -EAGAIN whenever it needs to roll the transaction before running the next state. That is, we don't need to spray -EAGAIN return states everywhere, the caller just checks the state machine state for completion to determine what action should be taken next. This greatly simplifies the code within the state machine implementation as it now only has to handle 0 for success or -errno for error and it doesn't need to tell the caller to retry.
Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 90 ++++++++++++++++++++---------------------------- fs/xfs/xfs_attr_item.c | 2 ++ 2 files changed, 39 insertions(+), 53 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 467e23602005..70d052a06644 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -289,7 +289,6 @@ xfs_attr_sf_addname( */ xfs_trans_bhold(args->trans, attr->xattri_leaf_bp); attr->xattri_dela_state = XFS_DAS_LEAF_ADD; - error = -EAGAIN; out: trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp); return error; @@ -342,7 +341,6 @@ xfs_attr_leaf_addname( * retry the add to the newly allocated node block. */ attr->xattri_dela_state = XFS_DAS_NODE_ADD; - error = -EAGAIN; goto out; } if (error) @@ -353,20 +351,24 @@ xfs_attr_leaf_addname( * or perform more xattr manipulations. Otherwise there is nothing more * to do and we can return success. */ - if (args->rmtblkno) { + if (args->rmtblkno) attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; - error = -EAGAIN; - } else if (args->op_flags & XFS_DA_OP_RENAME) { + else if (args->op_flags & XFS_DA_OP_RENAME) xfs_attr_dela_state_set_replace(attr, XFS_DAS_LEAF_REPLACE); - error = -EAGAIN; - } else { + else attr->xattri_dela_state = XFS_DAS_DONE; - } out: trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); return error; } +/* + * Add an entry to a node format attr tree. + * + * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell + * the difference between leaf + remote attr blocks and a node format tree, + * so we may still end up having to convert from leaf to node format here. 
+ */ static int xfs_attr_node_addname( struct xfs_attr_item *attr) @@ -381,19 +383,26 @@ xfs_attr_node_addname( return error; error = xfs_attr_node_try_addname(attr); + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; + /* + * No state change, we really are in node form now + * but we need the transaction rolled to continue. + */ + goto out; + } if (error) return error; - if (args->rmtblkno) { + if (args->rmtblkno) attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT; - error = -EAGAIN; - } else if (args->op_flags & XFS_DA_OP_RENAME) { + else if (args->op_flags & XFS_DA_OP_RENAME) xfs_attr_dela_state_set_replace(attr, XFS_DAS_NODE_REPLACE); - error = -EAGAIN; - } else { + else attr->xattri_dela_state = XFS_DAS_DONE; - } - +out: trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp); return error; } @@ -416,10 +425,8 @@ xfs_attr_rmtval_alloc( if (error) return error; /* Roll the transaction only if there is more to allocate. */ - if (attr->xattri_blkcnt > 0) { - error = -EAGAIN; + if (attr->xattri_blkcnt > 0) goto out; - } } error = xfs_attr_rmtval_set_value(args); @@ -515,11 +522,12 @@ xfs_attr_leaf_shrink( } /* - * Set the attribute specified in @args. - * This routine is meant to function as a delayed operation, and may return - * -EAGAIN when the transaction needs to be rolled. Calling functions will need - * to handle this, and recall the function until a successful error code is - * returned. + * Run the attribute operation specified in @attr. + * + * This routine is meant to function as a delayed operation and will set the + * state to XFS_DAS_DONE when the operation is complete. Calling functions will + * need to handle this, and recall the function until either an error or + * XFS_DAS_DONE is detected. */ int xfs_attr_set_iter( @@ -572,7 +580,6 @@ next_state: * We must commit the flag value change now to make it atomic * and then we can start the next trans in series at REMOVE_OLD. 
*/ - error = -EAGAIN; attr->xattri_dela_state++; break; @@ -600,8 +607,10 @@ next_state: case XFS_DAS_LEAF_REMOVE_RMT: case XFS_DAS_NODE_REMOVE_RMT: error = xfs_attr_rmtval_remove(attr); - if (error == -EAGAIN) + if (error == -EAGAIN) { + error = 0; break; + } if (error) return error; @@ -613,7 +622,6 @@ next_state: * can't do that in the same transaction where we removed the * remote attr blocks. */ - error = -EAGAIN; attr->xattri_dela_state++; break; @@ -1249,14 +1257,6 @@ error: * This will involve walking down the Btree, and may involve splitting * leaf nodes and even splitting intermediate nodes up to and including * the root node (a special case of an intermediate node). - * - * "Remote" attribute values confuse the issue and atomic rename operations - * add a whole extra layer of confusion on top of that. - * - * This routine is meant to function as a delayed operation, and may return - * -EAGAIN when the transaction needs to be rolled. Calling functions will need - * to handle this, and recall the function until a successful error code is - *returned. */ static int xfs_attr_node_try_addname( @@ -1278,24 +1278,9 @@ xfs_attr_node_try_addname( /* * Its really a single leaf node, but it had * out-of-line values so it looked like it *might* - * have been a b-tree. - */ - xfs_da_state_free(state); - state = NULL; - error = xfs_attr3_leaf_to_node(args); - if (error) - goto out; - - /* - * Now that we have converted the leaf to a node, we can - * roll the transaction, and try xfs_attr3_leaf_add - * again on re-entry. No need to set dela_state to do - * this. dela_state is still unset by this function at - * this point. + * have been a b-tree. Let the caller deal with this. 
*/ - trace_xfs_attr_node_addname_return( - attr->xattri_dela_state, args->dp); - return -EAGAIN; + goto out; } /* @@ -1315,8 +1300,7 @@ xfs_attr_node_try_addname( } out: - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); return error; } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index f058b034ee34..1bf812830a5d 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -320,6 +320,8 @@ xfs_xattri_finish_update( case XFS_ATTR_OP_FLAGS_SET: case XFS_ATTR_OP_FLAGS_REPLACE: error = xfs_attr_set_iter(attr); + if (!error && attr->xattri_dela_state != XFS_DAS_DONE) + error = -EAGAIN; break; case XFS_ATTR_OP_FLAGS_REMOVE: ASSERT(XFS_IFORK_Q(args->dp)); -- cgit From e5d5596a2a1790d8c57938f820aa33e58f90ad0d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:56 +1000 Subject: xfs: introduce attr remove initial states into xfs_attr_set_iter We need to merge the add and remove code paths to enable safe recovery of replace operations. Hoist the initial remove states from xfs_attr_remove_iter into xfs_attr_set_iter. We will make use of them in the next patches. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 139 ++++++++++++++++++++++++++--------------------- fs/xfs/libxfs/xfs_attr.h | 4 ++ fs/xfs/xfs_trace.h | 3 + 3 files changed, 84 insertions(+), 62 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 70d052a06644..cb46dca2d6fa 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -450,6 +450,68 @@ out: return error; } +/* + * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers + * for later deletion of the entry. 
+ */ +static int +xfs_attr_leaf_mark_incomplete( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error; + + /* + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. + */ + error = xfs_attr_fillstate(state); + if (error) + return error; + + /* + * Mark the attribute as INCOMPLETE + */ + return xfs_attr3_leaf_setflag(args); +} + +/* + * Initial setup for xfs_attr_node_removename. Make sure the attr is there and + * the blocks are valid. Attr keys with remote blocks will be marked + * incomplete. + */ +static +int xfs_attr_node_removename_setup( + struct xfs_attr_item *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state **state = &attr->xattri_da_state; + int error; + + error = xfs_attr_node_hasname(args, state); + if (error != -EEXIST) + goto out; + error = 0; + + ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); + ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == + XFS_ATTR_LEAF_MAGIC); + + if (args->rmtblkno > 0) { + error = xfs_attr_leaf_mark_incomplete(args, *state); + if (error) + goto out; + + error = xfs_attr_rmtval_invalidate(args); + } +out: + if (error) + xfs_da_state_free(*state); + + return error; +} + /* * Remove the original attr we have just replaced. 
This is dependent on the * original lookup and insert placing the old attr in args->blkno/args->index @@ -549,6 +611,21 @@ next_state: case XFS_DAS_NODE_ADD: return xfs_attr_node_addname(attr); + case XFS_DAS_SF_REMOVE: + attr->xattri_dela_state = XFS_DAS_DONE; + return xfs_attr_sf_removename(args); + case XFS_DAS_LEAF_REMOVE: + attr->xattri_dela_state = XFS_DAS_DONE; + return xfs_attr_leaf_removename(args); + case XFS_DAS_NODE_REMOVE: + error = xfs_attr_node_removename_setup(attr); + if (error) + return error; + attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT; + if (args->rmtblkno == 0) + attr->xattri_dela_state++; + break; + case XFS_DAS_LEAF_SET_RMT: case XFS_DAS_NODE_SET_RMT: error = xfs_attr_rmtval_find_space(attr); @@ -1348,68 +1425,6 @@ out: } -/* - * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers - * for later deletion of the entry. - */ -STATIC int -xfs_attr_leaf_mark_incomplete( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - int error; - - /* - * Fill in disk block numbers in the state structure - * so that we can get the buffers back after we commit - * several transactions in the following calls. - */ - error = xfs_attr_fillstate(state); - if (error) - return error; - - /* - * Mark the attribute as INCOMPLETE - */ - return xfs_attr3_leaf_setflag(args); -} - -/* - * Initial setup for xfs_attr_node_removename. Make sure the attr is there and - * the blocks are valid. Attr keys with remote blocks will be marked - * incomplete. 
- */ -STATIC -int xfs_attr_node_removename_setup( - struct xfs_attr_item *attr) -{ - struct xfs_da_args *args = attr->xattri_da_args; - struct xfs_da_state **state = &attr->xattri_da_state; - int error; - - error = xfs_attr_node_hasname(args, state); - if (error != -EEXIST) - goto out; - error = 0; - - ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); - ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == - XFS_ATTR_LEAF_MAGIC); - - if (args->rmtblkno > 0) { - error = xfs_attr_leaf_mark_incomplete(args, *state); - if (error) - goto out; - - error = xfs_attr_rmtval_invalidate(args); - } -out: - if (error) - xfs_da_state_free(*state); - - return error; -} - STATIC int xfs_attr_node_removename( struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 7b0a5a165725..988c2451683a 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -451,6 +451,10 @@ enum xfs_delattr_state { XFS_DAS_RM_NAME, /* Remove attr name */ XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ + XFS_DAS_SF_REMOVE, /* Initial shortform set iter state */ + XFS_DAS_LEAF_REMOVE, /* Initial leaf form set iter state */ + XFS_DAS_NODE_REMOVE, /* Initial node form set iter state */ + /* Leaf state set/replace sequence */ XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 260760ce2d05..01b047d86cd1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4136,6 +4136,9 @@ TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD); TRACE_DEFINE_ENUM(XFS_DAS_RMTBLK); TRACE_DEFINE_ENUM(XFS_DAS_RM_NAME); TRACE_DEFINE_ENUM(XFS_DAS_RM_SHRINK); +TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE); -- cgit From 
4b9879b19cafa63ae02fef30f678b2179a648d45 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:56 +1000 Subject: xfs: switch attr remove to xfs_attri_set_iter Now that xfs_attri_set_iter() has initial states for removing attributes, switch the pure attribute removal code over to using it. This requires attrs being removed to always be marked as INCOMPLETE before we start the removal due to the fact we look up the attr to remove again in xfs_attr_node_remove_attr(). Note: this drops the fillstate/refillstate optimisations from the remove path that avoid having to look up the path again after setting the incomplete flag and removing remote attrs. Restoring that optimisation to this path is future Dave's problem. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 21 +++++++++------------ fs/xfs/libxfs/xfs_attr.h | 10 ++++++++++ fs/xfs/xfs_attr_item.c | 31 +++++++------------------------ 3 files changed, 26 insertions(+), 36 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index cb46dca2d6fa..83b20d050ff0 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -498,13 +498,11 @@ int xfs_attr_node_removename_setup( ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == XFS_ATTR_LEAF_MAGIC); - if (args->rmtblkno > 0) { - error = xfs_attr_leaf_mark_incomplete(args, *state); - if (error) - goto out; - + error = xfs_attr_leaf_mark_incomplete(args, *state); + if (error) + goto out; + if (args->rmtblkno > 0) error = xfs_attr_rmtval_invalidate(args); - } out: if (error) xfs_da_state_free(*state); @@ -820,7 +818,7 @@ xfs_attr_defer_remove( if (error) return error; - new->xattri_dela_state = XFS_DAS_UNINIT; + new->xattri_dela_state = xfs_attr_init_remove_state(args); xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); @@ -1388,16 
+1386,15 @@ xfs_attr_node_remove_attr( { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_da_state *state = NULL; - struct xfs_mount *mp = args->dp->i_mount; int retval = 0; int error = 0; /* - * Re-find the "old" attribute entry after any split ops. The INCOMPLETE - * flag means that we will find the "old" attr, not the "new" one. + * The attr we are removing has already been marked incomplete, so + * we need to set the filter appropriately to re-find the "old" + * attribute entry after any split ops. */ - if (!xfs_has_larp(mp)) - args->attr_filter |= XFS_ATTR_INCOMPLETE; + args->attr_filter |= XFS_ATTR_INCOMPLETE; state = xfs_da_state_alloc(args); state->inleaf = 0; error = xfs_da3_node_lookup_int(state, &retval); diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 988c2451683a..41d70ad62cbf 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -602,6 +602,16 @@ xfs_attr_init_add_state(struct xfs_da_args *args) return XFS_DAS_NODE_ADD; } +static inline enum xfs_delattr_state +xfs_attr_init_remove_state(struct xfs_da_args *args) +{ + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_REMOVE; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_REMOVE; + return XFS_DAS_NODE_REMOVE; +} + static inline enum xfs_delattr_state xfs_attr_init_replace_state(struct xfs_da_args *args) { diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 1bf812830a5d..19ceb2d257b7 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -303,12 +303,9 @@ xfs_attrd_item_intent( STATIC int xfs_xattri_finish_update( struct xfs_attr_item *attr, - struct xfs_attrd_log_item *attrdp, - uint32_t op_flags) + struct xfs_attrd_log_item *attrdp) { struct xfs_da_args *args = attr->xattri_da_args; - unsigned int op = op_flags & - XFS_ATTR_OP_FLAGS_TYPE_MASK; int error; if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { @@ -316,22 +313,9 @@ xfs_xattri_finish_update( goto out; } - switch (op) { - case 
XFS_ATTR_OP_FLAGS_SET: - case XFS_ATTR_OP_FLAGS_REPLACE: - error = xfs_attr_set_iter(attr); - if (!error && attr->xattri_dela_state != XFS_DAS_DONE) - error = -EAGAIN; - break; - case XFS_ATTR_OP_FLAGS_REMOVE: - ASSERT(XFS_IFORK_Q(args->dp)); - error = xfs_attr_remove_iter(attr); - break; - default: - error = -EFSCORRUPTED; - break; - } - + error = xfs_attr_set_iter(attr); + if (!error && attr->xattri_dela_state != XFS_DAS_DONE) + error = -EAGAIN; out: /* * Mark the transaction dirty, even on error. This ensures the @@ -439,8 +423,7 @@ xfs_attr_finish_item( */ attr->xattri_da_args->trans = tp; - error = xfs_xattri_finish_update(attr, done_item, - attr->xattri_op_flags); + error = xfs_xattri_finish_update(attr, done_item); if (error != -EAGAIN) kmem_free(attr); @@ -588,7 +571,7 @@ xfs_attri_item_recover( attr->xattri_dela_state = xfs_attr_init_add_state(args); break; case XFS_ATTR_OP_FLAGS_REMOVE: - attr->xattri_dela_state = XFS_DAS_UNINIT; + attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; default: ASSERT(0); @@ -607,7 +590,7 @@ xfs_attri_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - ret = xfs_xattri_finish_update(attr, done_item, attrp->alfi_op_flags); + ret = xfs_xattri_finish_update(attr, done_item); if (ret == -EAGAIN) { /* There's more work to do, so add it to this transaction */ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); -- cgit From 59782a236b622a983ff101b2cb1333f714e4ed4e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:56 +1000 Subject: xfs: remove xfs_attri_remove_iter xfs_attri_remove_iter is not used anymore, so remove it and all the infrastructure it uses and is needed to drive it. The xfs_attr_refillstate() function now throws an unused warning, so isolate the xfs_attr_fillstate()/xfs_attr_refillstate() code pair with an #if 0 and a comment explaining why we want to keep this code and restore the optimisation it provides in the near future.
Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 418 ++++++++++++++++------------------------------- 1 file changed, 139 insertions(+), 279 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 83b20d050ff0..dc3e3de66ab4 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -63,10 +63,6 @@ STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr); STATIC int xfs_attr_node_remove_attr(struct xfs_attr_item *attr); STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, struct xfs_da_state **state); -STATIC int xfs_attr_fillstate(xfs_da_state_t *state); -STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -STATIC int xfs_attr_node_removename(struct xfs_da_args *args, - struct xfs_da_state *state); int xfs_inode_hasattr( @@ -99,6 +95,123 @@ xfs_attr_is_leaf( return imap.br_startoff == 0 && imap.br_blockcount == 1; } +/* + * XXX (dchinner): name path state saving and refilling is an optimisation to + * avoid needing to look up name entries after rolling transactions removing + * remote xattr blocks between the name entry lookup and name entry removal. + * This optimisation got sidelined when combining the set and remove state + * machines, but the code has been left in place because it is worthwhile to + * restore the optimisation once the combined state machine paths have settled. + * + * This comment is a public service announcement to remind Future Dave that he + * still needs to restore this code to working order. + */ +#if 0 +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. + * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commits have released these buffers. 
+ */ +static int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + trace_xfs_attr_fillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + return 0; +} + +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commits have released those + * buffers from our grip. + */ +static int +xfs_attr_refillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + trace_xfs_attr_refillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". 
+ */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + return 0; +} +#else +static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; } +#endif + /*======================================================================== * Overall external interface routines. *========================================================================*/ @@ -547,25 +660,16 @@ xfs_attr_leaf_remove_attr( */ static int xfs_attr_leaf_shrink( - struct xfs_da_args *args, - struct xfs_da_state *state) + struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; - int error, forkoff; struct xfs_buf *bp; + int forkoff; + int error; if (!xfs_attr_is_leaf(dp)) return 0; - /* - * Have to get rid of the copy of this dabuf in the state. 
- */ - if (state) { - ASSERT(state->path.active == 1); - ASSERT(state->path.blk[0].bp); - state->path.blk[0].bp = NULL; - } - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); if (error) return error; @@ -708,7 +812,7 @@ next_state: case XFS_DAS_NODE_REMOVE_ATTR: error = xfs_attr_node_remove_attr(attr); if (!error) - error = xfs_attr_leaf_shrink(args, NULL); + error = xfs_attr_leaf_shrink(args); attr->xattri_dela_state = XFS_DAS_DONE; break; default: @@ -1379,6 +1483,24 @@ out: return error; } +static int +xfs_attr_node_removename( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + struct xfs_da_state_blk *blk; + int retval; + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); + + return retval; +} static int xfs_attr_node_remove_attr( @@ -1421,268 +1543,6 @@ out: return retval; } - -STATIC int -xfs_attr_node_removename( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - struct xfs_da_state_blk *blk; - int retval; - - /* - * Remove the name and update the hashvals in the tree. - */ - blk = &state->path.blk[state->path.active-1]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr3_leaf_remove(blk->bp, args); - xfs_da3_fixhashpath(state, &state->path); - - return retval; -} - -/* - * Remove the attribute specified in @args. - * - * This will involve walking down the Btree, and may involve joining - * leaf nodes and even joining intermediate nodes up to and including - * the root node (a special case of an intermediate node). - * - * This routine is meant to function as either an in-line or delayed operation, - * and may return -EAGAIN when the transaction needs to be rolled. Calling - * functions will need to handle this, and call the function until a - * successful error code is returned. 
- */ -int -xfs_attr_remove_iter( - struct xfs_attr_item *attr) -{ - struct xfs_da_args *args = attr->xattri_da_args; - struct xfs_da_state *state = attr->xattri_da_state; - int retval, error = 0; - struct xfs_inode *dp = args->dp; - - trace_xfs_attr_node_removename(args); - - switch (attr->xattri_dela_state) { - case XFS_DAS_UNINIT: - if (!xfs_inode_hasattr(dp)) - return -ENOATTR; - - /* - * Shortform or leaf formats don't require transaction rolls and - * thus state transitions. Call the right helper and return. - */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_sf_removename(args); - - if (xfs_attr_is_leaf(dp)) - return xfs_attr_leaf_removename(args); - - /* - * Node format may require transaction rolls. Set up the - * state context and fall into the state machine. - */ - if (!attr->xattri_da_state) { - error = xfs_attr_node_removename_setup(attr); - if (error) - return error; - state = attr->xattri_da_state; - } - - fallthrough; - case XFS_DAS_RMTBLK: - attr->xattri_dela_state = XFS_DAS_RMTBLK; - - /* - * If there is an out-of-line value, de-allocate the blocks. - * This is done before we remove the attribute so that we don't - * overflow the maximum size of a transaction and/or hit a - * deadlock. - */ - if (args->rmtblkno > 0) { - /* - * May return -EAGAIN. Roll and repeat until all remote - * blocks are removed. - */ - error = xfs_attr_rmtval_remove(attr); - if (error == -EAGAIN) { - trace_xfs_attr_remove_iter_return( - attr->xattri_dela_state, args->dp); - return error; - } else if (error) { - goto out; - } - - /* - * Refill the state structure with buffers (the prior - * calls released our buffers) and close out this - * transaction before proceeding. 
- */ - ASSERT(args->rmtblkno == 0); - error = xfs_attr_refillstate(state); - if (error) - goto out; - - attr->xattri_dela_state = XFS_DAS_RM_NAME; - trace_xfs_attr_remove_iter_return( - attr->xattri_dela_state, args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_RM_NAME: - /* - * If we came here fresh from a transaction roll, reattach all - * the buffers to the current transaction. - */ - if (attr->xattri_dela_state == XFS_DAS_RM_NAME) { - error = xfs_attr_refillstate(state); - if (error) - goto out; - } - - retval = xfs_attr_node_removename(args, state); - - /* - * Check to see if the tree needs to be collapsed. If so, roll - * the transacton and fall into the shrink state. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - - attr->xattri_dela_state = XFS_DAS_RM_SHRINK; - trace_xfs_attr_remove_iter_return( - attr->xattri_dela_state, args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_RM_SHRINK: - /* - * If the result is small enough, push it all into the inode. - * This is our final state so it's safe to return a dirty - * transaction. - */ - if (xfs_attr_is_leaf(dp)) - error = xfs_attr_leaf_shrink(args, state); - ASSERT(error != -EAGAIN); - break; - default: - ASSERT(0); - error = -EINVAL; - goto out; - } -out: - if (state) - xfs_da_state_free(state); - return error; -} - -/* - * Fill in the disk block numbers in the state structure for the buffers - * that are attached to the state structure. - * This is done so that we can quickly reattach ourselves to those buffers - * after some set of transaction commits have released these buffers. - */ -STATIC int -xfs_attr_fillstate(xfs_da_state_t *state) -{ - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level; - - trace_xfs_attr_fillstate(state->args); - - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". 
- */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = xfs_buf_daddr(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } - - /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". - */ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = xfs_buf_daddr(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } - - return 0; -} - -/* - * Reattach the buffers to the state structure based on the disk block - * numbers stored in the state structure. - * This is done after some set of transaction commits have released those - * buffers from our grip. - */ -STATIC int -xfs_attr_refillstate(xfs_da_state_t *state) -{ - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level, error; - - trace_xfs_attr_refillstate(state->args); - - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } - } - - /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". 
- */ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } - } - - return 0; -} - /* * Retrieve the attribute data from a node attribute list. * -- cgit From e7f358dee4e5cf1ce8b11ff2e65d5ccb1ced24db Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:56 +1000 Subject: xfs: use XFS_DA_OP flags in deferred attr ops We currently store the high level attr operation in args->attr_flags. This field contains what the VFS is telling us to do, but don't necessarily match what we are doing in the low level modification state machine. e.g. XATTR_REPLACE implies both XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME because it is doing both a remove and adding a new attr. However, deep in the individual state machine operations, we check errors against this high level VFS op flags, not the low level XFS_DA_OP flags. Indeed, we don't even have a low level flag for a REMOVE operation, so the only way we know we are doing a remove is the complete absence of XATTR_REPLACE, XATTR_CREATE, XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME. And because there are other flags in these fields, this is a pain to check if we need to. As the XFS_DA_OP flags are only needed once the deferred operations are set up, set these flags appropriately when we set the initial operation state. We also introduce a XFS_DA_OP_REMOVE flag to make it easy to know that we are doing a remove operation. With these, we can remove the use of XATTR_REPLACE and XATTR_CREATE in low level lookup operations, and manipulate the low level flags according to the low level context that is operating. e.g. 
log recovery does not have a VFS xattr operation state to copy into args->attr_flags, and the low level state machine ops we do for recovery do not match the high level VFS operations that were in progress when the system failed... Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 136 +++++++++++++++++++++++------------------- fs/xfs/libxfs/xfs_attr.h | 5 +- fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- fs/xfs/libxfs/xfs_da_btree.h | 8 ++- 4 files changed, 84 insertions(+), 67 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index dc3e3de66ab4..5072c156833b 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -466,7 +466,7 @@ xfs_attr_leaf_addname( */ if (args->rmtblkno) attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; - else if (args->op_flags & XFS_DA_OP_RENAME) + else if (args->op_flags & XFS_DA_OP_REPLACE) xfs_attr_dela_state_set_replace(attr, XFS_DAS_LEAF_REPLACE); else attr->xattri_dela_state = XFS_DAS_DONE; @@ -511,7 +511,7 @@ xfs_attr_node_addname( if (args->rmtblkno) attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT; - else if (args->op_flags & XFS_DA_OP_RENAME) + else if (args->op_flags & XFS_DA_OP_REPLACE) xfs_attr_dela_state_set_replace(attr, XFS_DAS_NODE_REPLACE); else attr->xattri_dela_state = XFS_DAS_DONE; @@ -547,7 +547,7 @@ xfs_attr_rmtval_alloc( return error; /* If this is not a rename, clear the incomplete flag and we're done. 
*/ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { + if (!(args->op_flags & XFS_DA_OP_REPLACE)) { error = xfs_attr3_leaf_clearflag(args); attr->xattri_dela_state = XFS_DAS_DONE; } else { @@ -966,8 +966,6 @@ xfs_attr_set( if (args->value) { XFS_STATS_INC(mp, xs_attr_set); - - args->op_flags |= XFS_DA_OP_ADDNAME; args->total = xfs_attr_calc_size(args, &local); /* @@ -1125,28 +1123,41 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) * Add a name to the shortform attribute list structure * This is the external routine. */ -STATIC int -xfs_attr_shortform_addname(xfs_da_args_t *args) +static int +xfs_attr_shortform_addname( + struct xfs_da_args *args) { - int newsize, forkoff, retval; + int newsize, forkoff; + int error; trace_xfs_attr_sf_addname(args); - retval = xfs_attr_shortform_lookup(args); - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - return retval; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) - return retval; - retval = xfs_attr_sf_removename(args); - if (retval) - return retval; + error = xfs_attr_shortform_lookup(args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + return error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + return error; + + error = xfs_attr_sf_removename(args); + if (error) + return error; + /* - * Since we have removed the old attr, clear ATTR_REPLACE so - * that the leaf format add routine won't trip over the attr - * not being around. + * Since we have removed the old attr, clear XFS_DA_OP_REPLACE + * so that the new attr doesn't fit in shortform format, the + * leaf format add routine won't trip over the attr not being + * around. 
*/ - args->attr_flags &= ~XATTR_REPLACE; + args->op_flags &= ~XFS_DA_OP_REPLACE; + break; + case 0: + break; + default: + return error; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || @@ -1169,8 +1180,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * External routines when attribute list is one block *========================================================================*/ -/* Store info about a remote block */ -STATIC void +/* Save the current remote block info and clear the current pointers. */ +static void xfs_attr_save_rmt_blk( struct xfs_da_args *args) { @@ -1179,10 +1190,13 @@ xfs_attr_save_rmt_blk( args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; args->rmtvaluelen2 = args->rmtvaluelen; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* Set stored info about a remote block */ -STATIC void +static void xfs_attr_restore_rmt_blk( struct xfs_da_args *args) { @@ -1228,28 +1242,27 @@ xfs_attr_leaf_try_add( * Look up the xattr name to set the insertion point for the new xattr. */ error = xfs_attr3_leaf_lookup_int(bp, args); - if (error != -ENOATTR && error != -EEXIST) - goto out_brelse; - if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out_brelse; - if (error == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto out_brelse; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) goto out_brelse; trace_xfs_attr_leaf_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ - xfs_attr_save_rmt_blk(args); - /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. 
*/ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto out_brelse; } return xfs_attr3_leaf_add(bp, args); @@ -1388,46 +1401,45 @@ xfs_attr_node_hasname( STATIC int xfs_attr_node_addname_find_attr( - struct xfs_attr_item *attr) + struct xfs_attr_item *attr) { - struct xfs_da_args *args = attr->xattri_da_args; - int retval; + struct xfs_da_args *args = attr->xattri_da_args; + int error; /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - retval = xfs_attr_node_hasname(args, &attr->xattri_da_state); - if (retval != -ENOATTR && retval != -EEXIST) - goto error; - - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto error; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) + error = xfs_attr_node_hasname(args, &attr->xattri_da_state); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) goto error; - trace_xfs_attr_node_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ - xfs_attr_save_rmt_blk(args); + trace_xfs_attr_node_replace(args); /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. 
*/ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto error; } return 0; error: if (attr->xattri_da_state) xfs_da_state_free(attr->xattri_da_state); - return retval; + return error; } /* diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 41d70ad62cbf..689a96689f1a 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -584,7 +584,6 @@ xfs_attr_is_shortform( static inline enum xfs_delattr_state xfs_attr_init_add_state(struct xfs_da_args *args) { - /* * When called from the completion of a attr remove to determine the * next state, the attribute fork may be null. This can occur only occur @@ -595,6 +594,8 @@ xfs_attr_init_add_state(struct xfs_da_args *args) */ if (!args->dp->i_afp) return XFS_DAS_DONE; + + args->op_flags |= XFS_DA_OP_ADDNAME; if (xfs_attr_is_shortform(args->dp)) return XFS_DAS_SF_ADD; if (xfs_attr_is_leaf(args->dp)) @@ -605,6 +606,7 @@ xfs_attr_init_add_state(struct xfs_da_args *args) static inline enum xfs_delattr_state xfs_attr_init_remove_state(struct xfs_da_args *args) { + args->op_flags |= XFS_DA_OP_REMOVE; if (xfs_attr_is_shortform(args->dp)) return XFS_DAS_SF_REMOVE; if (xfs_attr_is_leaf(args->dp)) @@ -615,6 +617,7 @@ xfs_attr_init_remove_state(struct xfs_da_args *args) static inline enum xfs_delattr_state xfs_attr_init_replace_state(struct xfs_da_args *args) { + args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE; return xfs_attr_init_add_state(args); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index e90bfd9d7551..53d02ce9ed78 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1492,7 +1492,7 @@ xfs_attr3_leaf_add_work( entry->flags = args->attr_filter; if (tmp) entry->flags |= XFS_ATTR_LOCAL; - if (args->op_flags & XFS_DA_OP_RENAME) { + if (args->op_flags & XFS_DA_OP_REPLACE) { if (!xfs_has_larp(mp)) entry->flags |= XFS_ATTR_INCOMPLETE; if 
((args->blkno2 == args->blkno) && diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index deb368d041e3..468ca70cd35d 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -85,19 +85,21 @@ typedef struct xfs_da_args { * Operation flags: */ #define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */ -#define XFS_DA_OP_RENAME (1u << 1) /* this is an atomic rename op */ +#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */ #define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ #define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ #define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ +#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ - { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_REPLACE, "REPLACE" }, \ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_NOTIME, "NOTIME" } + { XFS_DA_OP_NOTIME, "NOTIME" }, \ + { XFS_DA_OP_REMOVE, "REMOVE" } /* * Storage for holding state during Btree searches and split/join ops. -- cgit From fdaf1bb3cafcfee9ef05c4eaf6ee1193fd90cbd2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:56 +1000 Subject: xfs: ATTR_REPLACE algorithm with LARP enabled needs rework We can't use the same algorithm for replacing an existing attribute when logging attributes. The existing algorithm is essentially: 1. create new attr w/ INCOMPLETE 2. atomically flip INCOMPLETE flags between old + new attribute 3. remove old attr which is marked w/ INCOMPLETE This algorithm guarantees that we see either the old or new attribute, and if we fail after the atomic flag flip, we don't have to recover the removal of the old attr because we never see INCOMPLETE attributes in lookups. 
For logged attributes, however, this does not work. The logged attribute intents do not track the work that has been done as the transaction rolls, and hence the only recovery mechanism we have is "run the replace operation from scratch". This is further exacerbated by the attempt to avoid needing the INCOMPLETE flag to create an atomic swap. This means we can create a second active attribute of the same name before we remove the original. If we fail at any point after the create but before the removal has completed, we end up with duplicate attributes in the attr btree and recovery only tries to replace one of them. There are several other failure modes where we can leave partially allocated remote attributes that expose stale data, partially free remote attributes that enable UAF based stale data exposure, etc. To fix this, we need a different algorithm for replace operations when LARP is enabled. Luckily, it's not that complex if we take the right first step. That is, the first thing we log is the attri intent with the new name/value pair and mark the old attr as INCOMPLETE in the same transaction. From there, we then remove the old attr and keep relogging the new name/value in the intent, such that we always know that we have to create the new attr in recovery. Once the old attr is removed, we then run a normal ATTR_CREATE operation relogging the intent as we go. If the new attr is local, then it gets created in a single atomic transaction that also logs the final intent done. If the new attr is remote, then we set INCOMPLETE on the new attr while we allocate and set the remote value, and then we clear the INCOMPLETE flag in the last transaction that logs the final intent done. If we fail at any point in this algorithm, log recovery will always see the same state on disk: the new name/value in the intent, and either an INCOMPLETE attr or no attr in the attr btree.
If we find an INCOMPLETE attr, we run the full replace starting with removing the INCOMPLETE attr. If we don't find it, then we simply create the new attr. Notably, recovery of a failed create that has an INCOMPLETE flag set is now the same - we start with the lookup of the INCOMPLETE attr, and if that exists then we do the full replace recovery process, otherwise we just create the new attr. Hence changing the way we do the replace operation when LARP is enabled allows us to use the same log recovery algorithm for both the ATTR_CREATE and ATTR_REPLACE operations. This is also the same algorithm we use for runtime ATTR_REPLACE operations (except for the step setting up the initial conditions). The result is that: - ATTR_CREATE uses the same algorithm regardless of whether LARP is enabled or not - ATTR_REPLACE with larp=0 is identical to the old algorithm - ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm from the larp=0 code and then runs the unmodified ATTR_CREATE code. - log recovery when larp=1 runs the same ATTR_REPLACE algorithm as it uses at runtime. Because the state machine is now quite clean, changing the algorithm is really just a case of changing the initial state and how the states link together for the ATTR_REPLACE case. Hence it's not a huge amount of code for what is a fairly substantial rework of the attr logging and recovery algorithm.... Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 97 ++++++++++++++++++++++++++----------------- fs/xfs/libxfs/xfs_attr.h | 49 ++++++++++++++-------- fs/xfs/libxfs/xfs_attr_leaf.c | 44 ++++++++++++++++---- fs/xfs/libxfs/xfs_da_btree.h | 4 +- fs/xfs/xfs_attr_item.c | 8 +++- fs/xfs/xfs_trace.h | 7 +--- 6 files changed, 137 insertions(+), 72 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 5072c156833b..14ae0826bc15 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -68,9 +68,12 @@ int xfs_inode_hasattr( struct xfs_inode *ip) { - if (!XFS_IFORK_Q(ip) || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0)) + if (!XFS_IFORK_Q(ip)) + return 0; + if (!ip->i_afp) + return 0; + if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0) return 0; return 1; } @@ -408,23 +411,30 @@ out: } /* - * When we bump the state to REPLACE, we may actually need to skip over the - * state. When LARP mode is enabled, we don't need to run the atomic flags flip, - * so we skip straight over the REPLACE state and go on to REMOVE_OLD. + * Handle the state change on completion of a multi-state attr operation. + * + * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first + * modification in a attr replace operation and we still have to do the second + * state, indicated by @replace_state. + * + * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on + * completion of the second half of the attr replace operation we correctly + * signal that it is done. 
*/ -static void -xfs_attr_dela_state_set_replace( +static enum xfs_delattr_state +xfs_attr_complete_op( struct xfs_attr_item *attr, - enum xfs_delattr_state replace) + enum xfs_delattr_state replace_state) { struct xfs_da_args *args = attr->xattri_da_args; + bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; - ASSERT(replace == XFS_DAS_LEAF_REPLACE || - replace == XFS_DAS_NODE_REPLACE); - - attr->xattri_dela_state = replace; - if (xfs_has_larp(args->dp->i_mount)) - attr->xattri_dela_state++; + args->op_flags &= ~XFS_DA_OP_REPLACE; + if (do_replace) { + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + return replace_state; + } + return XFS_DAS_DONE; } static int @@ -466,10 +476,9 @@ xfs_attr_leaf_addname( */ if (args->rmtblkno) attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; - else if (args->op_flags & XFS_DA_OP_REPLACE) - xfs_attr_dela_state_set_replace(attr, XFS_DAS_LEAF_REPLACE); else - attr->xattri_dela_state = XFS_DAS_DONE; + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_LEAF_REPLACE); out: trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); return error; @@ -511,10 +520,9 @@ xfs_attr_node_addname( if (args->rmtblkno) attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT; - else if (args->op_flags & XFS_DA_OP_REPLACE) - xfs_attr_dela_state_set_replace(attr, XFS_DAS_NODE_REPLACE); else - attr->xattri_dela_state = XFS_DAS_DONE; + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_NODE_REPLACE); out: trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp); return error; @@ -546,18 +554,15 @@ xfs_attr_rmtval_alloc( if (error) return error; - /* If this is not a rename, clear the incomplete flag and we're done. 
*/ - if (!(args->op_flags & XFS_DA_OP_REPLACE)) { + attr->xattri_dela_state = xfs_attr_complete_op(attr, + ++attr->xattri_dela_state); + /* + * If we are not doing a rename, we've finished the operation but still + * have to clear the incomplete flag protecting the new attr from + * exposing partially initialised state if we crash during creation. + */ + if (attr->xattri_dela_state == XFS_DAS_DONE) error = xfs_attr3_leaf_clearflag(args); - attr->xattri_dela_state = XFS_DAS_DONE; - } else { - /* - * We are running a REPLACE operation, so we need to bump the - * state to the step in that operation. - */ - attr->xattri_dela_state++; - xfs_attr_dela_state_set_replace(attr, attr->xattri_dela_state); - } out: trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp); return error; @@ -714,13 +719,24 @@ next_state: return xfs_attr_node_addname(attr); case XFS_DAS_SF_REMOVE: - attr->xattri_dela_state = XFS_DAS_DONE; - return xfs_attr_sf_removename(args); + error = xfs_attr_sf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; case XFS_DAS_LEAF_REMOVE: - attr->xattri_dela_state = XFS_DAS_DONE; - return xfs_attr_leaf_removename(args); + error = xfs_attr_leaf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; case XFS_DAS_NODE_REMOVE: error = xfs_attr_node_removename_setup(attr); + if (error == -ENOATTR && + (args->op_flags & XFS_DA_OP_RECOVERY)) { + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + error = 0; + break; + } if (error) return error; attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT; @@ -806,14 +822,16 @@ next_state: case XFS_DAS_LEAF_REMOVE_ATTR: error = xfs_attr_leaf_remove_attr(attr); - attr->xattri_dela_state = XFS_DAS_DONE; + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); break; case XFS_DAS_NODE_REMOVE_ATTR: error = 
xfs_attr_node_remove_attr(attr); if (!error) error = xfs_attr_leaf_shrink(args); - attr->xattri_dela_state = XFS_DAS_DONE; + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); break; default: ASSERT(0); @@ -1315,9 +1333,10 @@ xfs_attr_leaf_removename( dp = args->dp; error = xfs_attr_leaf_hasname(args, &bp); - if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; return error; } else if (error != -EEXIST) return error; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 689a96689f1a..1af7abe29eef 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -444,18 +444,23 @@ struct xfs_attr_list_context { */ enum xfs_delattr_state { XFS_DAS_UNINIT = 0, /* No state has been set yet */ - XFS_DAS_SF_ADD, /* Initial shortform set iter state */ - XFS_DAS_LEAF_ADD, /* Initial leaf form set iter state */ - XFS_DAS_NODE_ADD, /* Initial node form set iter state */ - XFS_DAS_RMTBLK, /* Removing remote blks */ - XFS_DAS_RM_NAME, /* Remove attr name */ - XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ - - XFS_DAS_SF_REMOVE, /* Initial shortform set iter state */ - XFS_DAS_LEAF_REMOVE, /* Initial leaf form set iter state */ - XFS_DAS_NODE_REMOVE, /* Initial node form set iter state */ - - /* Leaf state set/replace sequence */ + + /* + * Initial sequence states. The replace setup code relies on the + * ADD and REMOVE states for a specific format to be sequential so + * that we can transform the initial operation to be performed + * according to the xfs_has_larp() state easily. 
+ */ + XFS_DAS_SF_ADD, /* Initial sf add state */ + XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */ + + XFS_DAS_LEAF_ADD, /* Initial leaf add state */ + XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */ + + XFS_DAS_NODE_ADD, /* Initial node add state */ + XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */ + + /* Leaf state set/replace/remove sequence */ XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */ @@ -463,7 +468,7 @@ enum xfs_delattr_state { XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */ XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */ - /* Node state set/replace sequence, must match leaf state above */ + /* Node state sequence, must match leaf state above */ XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */ @@ -477,11 +482,11 @@ enum xfs_delattr_state { #define XFS_DAS_STRINGS \ { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \ { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \ + { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \ { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \ + { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \ { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \ - { XFS_DAS_RMTBLK, "XFS_DAS_RMTBLK" }, \ - { XFS_DAS_RM_NAME, "XFS_DAS_RM_NAME" }, \ - { XFS_DAS_RM_SHRINK, "XFS_DAS_RM_SHRINK" }, \ + { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \ { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \ { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ @@ -525,8 +530,7 @@ struct xfs_attr_item { enum xfs_delattr_state xattri_dela_state; /* - * Indicates if the attr operation is a set or a remove - * XFS_ATTR_OP_FLAGS_{SET,REMOVE} + * Attr operation being performed - XFS_ATTR_OP_FLAGS_* */ unsigned int 
xattri_op_flags; @@ -614,10 +618,19 @@ xfs_attr_init_remove_state(struct xfs_da_args *args) return XFS_DAS_NODE_REMOVE; } +/* + * If we are logging the attributes, then we have to start with removal of the + * old attribute so that there is always consistent state that we can recover + * from if the system goes down part way through. We always log the new attr + * value, so even when we remove the attr first we still have the information in + * the log to finish the replace operation atomically. + */ static inline enum xfs_delattr_state xfs_attr_init_replace_state(struct xfs_da_args *args) { args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE; + if (xfs_has_larp(args->dp->i_mount)) + return xfs_attr_init_remove_state(args); return xfs_attr_init_add_state(args); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 53d02ce9ed78..d15e92858bf0 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -446,6 +446,14 @@ xfs_attr3_leaf_read( * Namespace helper routines *========================================================================*/ +/* + * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE + * flag on disk - if there's an incomplete attr then recovery needs to tear it + * down. If there's no incomplete attr, then recovery needs to tear that attr + * down to replace it with the attr that has been logged. In this case, the + * INCOMPLETE flag will not be set in attr->attr_filter, but rather + * XFS_DA_OP_RECOVERY will be set in args->op_flags. + */ static bool xfs_attr_match( struct xfs_da_args *args, @@ -453,14 +461,18 @@ xfs_attr_match( unsigned char *name, int flags) { + if (args->namelen != namelen) return false; if (memcmp(args->name, name, namelen) != 0) return false; - /* - * If we are looking for incomplete entries, show only those, else only - * show complete entries. - */ + + /* Recovery ignores the INCOMPLETE flag. 
*/ + if ((args->op_flags & XFS_DA_OP_RECOVERY) && + args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return true; + + /* All remaining matches need to be filtered by INCOMPLETE state. */ if (args->attr_filter != (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) return false; @@ -799,6 +811,14 @@ xfs_attr_sf_removename( sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); + + /* + * If we are recovering an operation, finding nothing to + * remove is not an error - it just means there was nothing + * to clean up. + */ + if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) + return 0; if (error != -EEXIST) return error; size = xfs_attr_sf_entsize(sfe); @@ -819,7 +839,7 @@ xfs_attr_sf_removename( totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && - !(args->op_flags & XFS_DA_OP_ADDNAME)) { + !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); @@ -1128,9 +1148,17 @@ xfs_attr3_leaf_to_shortform( goto out; if (forkoff == -1) { - ASSERT(xfs_has_attr2(dp->i_mount)); - ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); - xfs_attr_fork_remove(dp, args->trans); + /* + * Don't remove the attr fork if this operation is the first + * part of a attr replace operations. We're going to add a new + * attr immediately, so we need to keep the attr fork around in + * this case. 
+ */ + if (!(args->op_flags & XFS_DA_OP_REPLACE)) { + ASSERT(xfs_has_attr2(dp->i_mount)); + ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); + xfs_attr_fork_remove(dp, args->trans); + } goto out; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 468ca70cd35d..ed2303e4d46a 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -91,6 +91,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ #define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ #define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ +#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -99,7 +100,8 @@ typedef struct xfs_da_args { { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ { XFS_DA_OP_NOTIME, "NOTIME" }, \ - { XFS_DA_OP_REMOVE, "REMOVE" } + { XFS_DA_OP_REMOVE, "REMOVE" }, \ + { XFS_DA_OP_RECOVERY, "RECOVERY" } /* * Storage for holding state during Btree searches and split/join ops. 
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 19ceb2d257b7..56f678c965b7 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -561,6 +561,7 @@ xfs_attri_item_recover( args->namelen = attrp->alfi_name_len; args->hashval = xfs_da_hashname(args->name, args->namelen); args->attr_filter = attrp->alfi_attr_flags; + args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; switch (attrp->alfi_op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK) { case XFS_ATTR_OP_FLAGS_SET: @@ -568,9 +569,14 @@ xfs_attri_item_recover( args->value = attrip->attri_value; args->valuelen = attrp->alfi_value_len; args->total = xfs_attr_calc_size(args, &local); - attr->xattri_dela_state = xfs_attr_init_add_state(args); + if (xfs_inode_hasattr(args->dp)) + attr->xattri_dela_state = xfs_attr_init_replace_state(args); + else + attr->xattri_dela_state = xfs_attr_init_add_state(args); break; case XFS_ATTR_OP_FLAGS_REMOVE: + if (!xfs_inode_hasattr(args->dp)) + goto out; attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; default: diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 01b047d86cd1..d32026585c1b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4131,13 +4131,10 @@ DEFINE_ICLOG_EVENT(xlog_iclog_write); TRACE_DEFINE_ENUM(XFS_DAS_UNINIT); TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD); -TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD); -TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD); -TRACE_DEFINE_ENUM(XFS_DAS_RMTBLK); -TRACE_DEFINE_ENUM(XFS_DAS_RM_NAME); -TRACE_DEFINE_ENUM(XFS_DAS_RM_SHRINK); TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE); +TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD); TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT); TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT); -- cgit From 51e6104fdb95f377c8741794778319bd413f4fff Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:57 +1000 Subject: xfs: detect empty attr leaf blocks in xfs_attr3_leaf_verify 
xfs_repair flags these as a corruption error, so the verifier should catch software bugs that result in empty leaf blocks being written to disk, too. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index d15e92858bf0..15a990409463 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -310,6 +310,15 @@ xfs_attr3_leaf_verify( if (fa) return fa; + /* + * Empty leaf blocks should never occur; they imply the existence of a + * software bug that needs fixing. xfs_repair also flags them as a + * corruption that needs fixing, so we should never let these go to + * disk. + */ + if (ichdr.count == 0) + return __this_address; + /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. -- cgit From 45ff8b471cdc58701a7ba5c5dcd8dfc57ae06829 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 May 2022 15:12:57 +1000 Subject: xfs: can't use kmem_zalloc() for attribute buffers Because heap allocation of 64kB buffers will fail: .... 
XFS: fs_mark(8414) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) XFS: fs_mark(8417) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) XFS: fs_mark(8409) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) XFS: fs_mark(8428) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) XFS: fs_mark(8430) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) XFS: fs_mark(8437) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) XFS: fs_mark(8433) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) XFS: fs_mark(8406) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) XFS: fs_mark(8412) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) XFS: fs_mark(8432) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) XFS: fs_mark(8424) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40) .... I'd use kvmalloc() instead, but.... - 48.19% xfs_attr_create_intent - 46.89% xfs_attri_init - kvmalloc_node - 46.04% __kmalloc_node - kmalloc_large_node - 45.99% __alloc_pages - 39.39% __alloc_pages_slowpath.constprop.0 - 38.89% __alloc_pages_direct_compact - 38.71% try_to_compact_pages - compact_zone_order - compact_zone - 21.09% isolate_migratepages_block 10.31% PageHuge 5.82% set_pfnblock_flags_mask 0.86% get_pfnblock_flags_mask - 4.48% __reset_isolation_suitable 4.44% __reset_isolation_pfn - 3.56% __pageblock_pfn_to_page 1.33% pfn_to_online_page 2.83% get_pfnblock_flags_mask - 0.87% migrate_pages 0.86% compaction_alloc 0.84% find_suitable_fallback - 6.60% get_page_from_freelist 4.99% clear_page_erms - 1.19% _raw_spin_lock_irqsave - do_raw_spin_lock __pv_queued_spin_lock_slowpath - 0.86% __vmalloc_node_range 0.65% __alloc_pages_bulk .... this is just yet another reminder of how much kvmalloc() sucks. 
So lift xlog_cil_kvmalloc(), rename it to xlog_kvmalloc() and use that instead.... We also clean up the attribute name and value lengths as they no longer need to be rounded out to sizes compatible with log vectors. Signed-off-by: Dave Chinner Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_attr_item.c | 35 +++++++++++++++-------------------- fs/xfs/xfs_log_cil.c | 35 +---------------------------------- fs/xfs/xfs_log_priv.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 54 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 56f678c965b7..e8ac88d9fd14 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -44,7 +44,7 @@ xfs_attri_item_free( struct xfs_attri_log_item *attrip) { kmem_free(attrip->attri_item.li_lv_shadow); - kmem_free(attrip); + kvfree(attrip); } /* @@ -119,11 +119,11 @@ xfs_attri_item_format( sizeof(struct xfs_attri_log_format)); xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, attrip->attri_name, - xlog_calc_iovec_len(attrip->attri_name_len)); + attrip->attri_name_len); if (attrip->attri_value_len > 0) xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE, attrip->attri_value, - xlog_calc_iovec_len(attrip->attri_value_len)); + attrip->attri_value_len); } /* @@ -163,26 +163,21 @@ xfs_attri_init( { struct xfs_attri_log_item *attrip; - uint32_t name_vec_len = 0; - uint32_t value_vec_len = 0; - uint32_t buffer_size; - - if (name_len) - name_vec_len = xlog_calc_iovec_len(name_len); - if (value_len) - value_vec_len = xlog_calc_iovec_len(value_len); - - buffer_size = name_vec_len + value_vec_len; + uint32_t buffer_size = name_len + value_len; if (buffer_size) { - attrip = kmem_zalloc(sizeof(struct xfs_attri_log_item) + - buffer_size, KM_NOFS); - if (attrip == NULL) - return NULL; + /* + * This could be over 64kB in length, so we have to use + * kvmalloc() for this. But kvmalloc() utterly sucks, so we + * use own version. 
+ */ + attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) + + buffer_size); } else { - attrip = kmem_cache_zalloc(xfs_attri_cache, - GFP_NOFS | __GFP_NOFAIL); + attrip = kmem_cache_alloc(xfs_attri_cache, + GFP_NOFS | __GFP_NOFAIL); } + memset(attrip, 0, sizeof(struct xfs_attri_log_item)); attrip->attri_name_len = name_len; if (name_len) @@ -195,7 +190,7 @@ xfs_attri_init( if (value_len) attrip->attri_value = ((char *)attrip) + sizeof(struct xfs_attri_log_item) + - name_vec_len; + name_len; else attrip->attri_value = NULL; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 70f718d76ceb..6ca6fe8f2747 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -134,39 +134,6 @@ xlog_cil_iovec_space( sizeof(uint64_t)); } -/* - * shadow buffers can be large, so we need to use kvmalloc() here to ensure - * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall - * back to vmalloc, so we can't actually do anything useful with gfp flags to - * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do - * direct reclaim and compaction in the slow path, both of which are - * horrendously expensive. We just want kmalloc to fail fast and fall back to - * vmalloc if it can't get somethign straight away from the free lists or buddy - * allocator. Hence we have to open code kvmalloc outselves here. - * - * Also, we are in memalloc_nofs_save task context here, so despite the use of - * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This - * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets - * just all pretend this is a GFP_KERNEL context operation.... 
- */ -static inline void * -xlog_cil_kvmalloc( - size_t buf_size) -{ - gfp_t flags = GFP_KERNEL; - void *p; - - flags &= ~__GFP_DIRECT_RECLAIM; - flags |= __GFP_NOWARN | __GFP_NORETRY; - do { - p = kmalloc(buf_size, flags); - if (!p) - p = vmalloc(buf_size); - } while (!p); - - return p; -} - /* * Allocate or pin log vector buffers for CIL insertion. * @@ -283,7 +250,7 @@ xlog_cil_alloc_shadow_bufs( * storage. */ kmem_free(lip->li_lv_shadow); - lv = xlog_cil_kvmalloc(buf_size); + lv = xlog_kvmalloc(buf_size); memset(lv, 0, xlog_cil_iovec_space(niovecs)); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 4f7e844d28ad..67fd9789e69a 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -651,4 +651,38 @@ xlog_valid_lsn( return valid; } +/* + * Log vector and shadow buffers can be large, so we need to use kvmalloc() here + * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts + * to fall back to vmalloc, so we can't actually do anything useful with gfp + * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() + * will do direct reclaim and compaction in the slow path, both of which are + * horrendously expensive. We just want kmalloc to fail fast and fall back to + * vmalloc if it can't get somethign straight away from the free lists or + * buddy allocator. Hence we have to open code kvmalloc outselves here. + * + * This assumes that the caller uses memalloc_nofs_save task context here, so + * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS + * allocations. This is actually the only way to make vmalloc() do GFP_NOFS + * allocations, so lets just all pretend this is a GFP_KERNEL context + * operation.... 
+ */ +static inline void * +xlog_kvmalloc( + size_t buf_size) +{ + gfp_t flags = GFP_KERNEL; + void *p; + + flags &= ~__GFP_DIRECT_RECLAIM; + flags |= __GFP_NOWARN | __GFP_NORETRY; + do { + p = kmalloc(buf_size, flags); + if (!p) + p = vmalloc(buf_size); + } while (!p); + + return p; +} + #endif /* __XFS_LOG_PRIV_H__ */ -- cgit