 Documentation/filesystems/xfs/xfs-online-fsck-design.rst |   5
 fs/xfs/Kconfig                                            |   8
 fs/xfs/Makefile                                           |   2
 fs/xfs/libxfs/xfs_ag.c                                    |   6
 fs/xfs/libxfs/xfs_ag.h                                    |   4
 fs/xfs/libxfs/xfs_btree.c                                 | 286
 fs/xfs/libxfs/xfs_btree.h                                 |   7
 fs/xfs/libxfs/xfs_btree_mem.c                             | 347
 fs/xfs/libxfs/xfs_btree_mem.h                             |  75
 fs/xfs/scrub/scrub.c                                      |   5
 fs/xfs/scrub/scrub.h                                      |   3
 fs/xfs/xfs_buf.c                                          | 214
 fs/xfs/xfs_buf.h                                          |  17
 fs/xfs/xfs_buf_mem.c                                      | 270
 fs/xfs/xfs_buf_mem.h                                      |  34
 fs/xfs/xfs_health.c                                       |   3
 fs/xfs/xfs_mount.h                                        |   3
 fs/xfs/xfs_trace.c                                        |   2
 fs/xfs/xfs_trace.h                                        | 167
 fs/xfs/xfs_trans.h                                        |   1
 fs/xfs/xfs_trans_buf.c                                    |  42
21 files changed, 1363 insertions, 138 deletions
diff --git a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst index d03480266e9b..6333697ba3e8 100644 --- a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -2270,13 +2270,12 @@ follows: pointing to the xfile. 3. Pass the buffer cache target, buffer ops, and other information to - ``xfbtree_create`` to write an initial tree header and root block to the - xfile. + ``xfbtree_init`` to initialize the passed in ``struct xfbtree`` and write an + initial root block to the xfile. Each btree type should define a wrapper that passes necessary arguments to the creation function. For example, rmap btrees define ``xfs_rmapbt_mem_create`` to take care of all the necessary details for callers. - A ``struct xfbtree`` object will be returned. 4. Pass the xfbtree object to the btree cursor creation function for the btree type. diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index fa7eb3e2a248..d41edd30388b 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -128,6 +128,12 @@ config XFS_LIVE_HOOKS bool select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL +config XFS_MEMORY_BUFS + bool + +config XFS_BTREE_IN_MEM + bool + config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n @@ -135,6 +141,7 @@ config XFS_ONLINE_SCRUB depends on TMPFS && SHMEM select XFS_LIVE_HOOKS select XFS_DRAIN_INTENTS + select XFS_MEMORY_BUFS help If you say Y here you will be able to check metadata on a mounted XFS filesystem. This feature is intended to reduce @@ -169,6 +176,7 @@ config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" default n depends on XFS_FS && XFS_ONLINE_SCRUB + select XFS_BTREE_IN_MEM help If you say Y here you will be able to repair metadata on a mounted XFS filesystem. 
This feature is intended to reduce diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index ba8608f469ac..9a6574da8de5 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -137,6 +137,8 @@ endif xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o +xfs-$(CONFIG_XFS_MEMORY_BUFS) += xfs_buf_mem.o +xfs-$(CONFIG_XFS_BTREE_IN_MEM) += libxfs/xfs_btree_mem.o # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index e598e14320be..32d80a76440c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -264,7 +264,7 @@ xfs_free_perag( xfs_defer_drain_free(&pag->pag_intents_drain); cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_buf_hash_destroy(pag); + xfs_buf_cache_destroy(&pag->pag_bcache); /* drop the mount's active reference */ xfs_perag_rele(pag); @@ -352,7 +352,7 @@ xfs_free_unused_perag_range( spin_unlock(&mp->m_perag_lock); if (!pag) break; - xfs_buf_hash_destroy(pag); + xfs_buf_cache_destroy(&pag->pag_bcache); xfs_defer_drain_free(&pag->pag_intents_drain); kfree(pag); } @@ -419,7 +419,7 @@ xfs_initialize_perag( pag->pagb_tree = RB_ROOT; #endif /* __KERNEL__ */ - error = xfs_buf_hash_init(pag); + error = xfs_buf_cache_init(&pag->pag_bcache); if (error) goto out_remove_pag; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 19eddba09894..29bfa6273dec 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -106,9 +106,7 @@ struct xfs_perag { int pag_ici_reclaimable; /* reclaimable inodes */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ - /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ - struct rhashtable pag_buf_hash; + struct xfs_buf_cache pag_bcache; /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index ecdd5ea27a03..d29547572a68 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -28,6 +28,8 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_health.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" /* * Btree magic numbers. @@ -76,6 +78,25 @@ xfs_btree_check_fsblock_siblings( } static inline xfs_failaddr_t +xfs_btree_check_memblock_siblings( + struct xfs_buftarg *btp, + xfbno_t bno, + __be64 dsibling) +{ + xfbno_t sibling; + + if (dsibling == cpu_to_be64(NULLFSBLOCK)) + return NULL; + + sibling = be64_to_cpu(dsibling); + if (sibling == bno) + return __this_address; + if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling))) + return __this_address; + return NULL; +} + +static inline xfs_failaddr_t xfs_btree_check_agblock_siblings( struct xfs_perag *pag, xfs_agblock_t agbno, @@ -165,6 +186,34 @@ __xfs_btree_check_fsblock( } /* + * Check an in-memory btree block header. Return the address of the failing + * check, or NULL if everything is ok. + */ +static xfs_failaddr_t +__xfs_btree_check_memblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; + xfs_failaddr_t fa; + xfbno_t bno; + + fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); + if (fa) + return fa; + + bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_rightsib); + return fa; +} + +/* * Check a short btree block header. 
Return the address of the failing check, * or NULL if everything is ok. */ @@ -216,9 +265,17 @@ __xfs_btree_check_block( int level, struct xfs_buf *bp) { - if (cur->bc_ops->type == XFS_BTREE_TYPE_AG) + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + return __xfs_btree_check_memblock(cur, block, level, bp); + case XFS_BTREE_TYPE_AG: return __xfs_btree_check_agblock(cur, block, level, bp); - return __xfs_btree_check_fsblock(cur, block, level, bp); + case XFS_BTREE_TYPE_INODE: + return __xfs_btree_check_fsblock(cur, block, level, bp); + default: + ASSERT(0); + return __this_address; + } } static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur) @@ -262,14 +319,22 @@ __xfs_btree_check_ptr( if (level <= 0) return -EFSCORRUPTED; - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + if (!xfbtree_verify_bno(cur->bc_mem.xfbtree, + be64_to_cpu((&ptr->l)[index]))) + return -EFSCORRUPTED; + break; + case XFS_BTREE_TYPE_INODE: if (!xfs_verify_fsbno(cur->bc_mp, be64_to_cpu((&ptr->l)[index]))) return -EFSCORRUPTED; - } else { + break; + case XFS_BTREE_TYPE_AG: if (!xfs_verify_agbno(cur->bc_ag.pag, be32_to_cpu((&ptr->s)[index]))) return -EFSCORRUPTED; + break; } return 0; @@ -290,17 +355,26 @@ xfs_btree_check_ptr( error = __xfs_btree_check_ptr(cur, ptr, index, level); if (error) { - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + xfs_err(cur->bc_mp, +"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.", + cur->bc_ops->name, cur->bc_flags, level, index, + __this_address); + break; + case XFS_BTREE_TYPE_INODE: xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", cur->bc_ino.ip->i_ino, cur->bc_ino.whichfork, cur->bc_ops->name, level, index); - } else { + break; + case XFS_BTREE_TYPE_AG: xfs_err(cur->bc_mp, "AG %u: Corrupt %sbt pointer at level %d index %d.", cur->bc_ag.pag->pag_agno, cur->bc_ops->name, level, index); + break; } xfs_btree_mark_sick(cur); } @@ -457,11 +531,35 @@ xfs_btree_del_cursor( case XFS_BTREE_TYPE_INODE: /* nothing to do */ break; + case XFS_BTREE_TYPE_MEM: + if (cur->bc_mem.pag) + xfs_perag_put(cur->bc_mem.pag); + break; } kmem_cache_free(cur->bc_cache, cur); } +/* Return the buffer target for this btree's buffer. */ +static inline struct xfs_buftarg * +xfs_btree_buftarg( + struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) + return cur->bc_mem.xfbtree->target; + return cur->bc_mp->m_ddev_targp; +} + +/* Return the block size (in units of 512b sectors) for this btree. */ +static inline unsigned int +xfs_btree_bbsize( + struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) + return XFBNO_BBSIZE; + return cur->bc_mp->m_bsize; +} + /* * Duplicate the btree cursor. * Allocate a new one, copy the record, re-get the buffers. 
@@ -505,10 +603,11 @@ xfs_btree_dup_cursor( new->bc_levels[i].ra = cur->bc_levels[i].ra; bp = cur->bc_levels[i].bp; if (bp) { - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - xfs_buf_daddr(bp), mp->m_bsize, - 0, &bp, - cur->bc_ops->buf_ops); + error = xfs_trans_read_buf(mp, tp, + xfs_btree_buftarg(cur), + xfs_buf_daddr(bp), + xfs_btree_bbsize(cur), 0, &bp, + cur->bc_ops->buf_ops); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(new); if (error) { @@ -886,6 +985,32 @@ xfs_btree_readahead_fsblock( } STATIC int +xfs_btree_readahead_memblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; + xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + int rval = 0; + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { + xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE, + cur->bc_ops->buf_ops); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { + xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE, + cur->bc_ops->buf_ops); + rval++; + } + + return rval; +} + +STATIC int xfs_btree_readahead_agblock( struct xfs_btree_cur *cur, int lr, @@ -939,9 +1064,17 @@ xfs_btree_readahead( cur->bc_levels[lev].ra |= lr; block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: + return xfs_btree_readahead_agblock(cur, lr, block); + case XFS_BTREE_TYPE_INODE: return xfs_btree_readahead_fsblock(cur, lr, block); - return xfs_btree_readahead_agblock(cur, lr, block); + case XFS_BTREE_TYPE_MEM: + return xfs_btree_readahead_memblock(cur, lr, block); + default: + ASSERT(0); + return 0; + } } STATIC int @@ -950,23 +1083,24 @@ xfs_btree_ptr_to_daddr( const union xfs_btree_ptr *ptr, xfs_daddr_t *daddr) { - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; int error; error = xfs_btree_check_ptr(cur, ptr, 0, 1); if (error) return error; - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { - fsbno = be64_to_cpu(ptr->l); - *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); - } else { - agbno = be32_to_cpu(ptr->s); + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno); + be32_to_cpu(ptr->s)); + break; + case XFS_BTREE_TYPE_INODE: + *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + break; + case XFS_BTREE_TYPE_MEM: + *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l)); + break; } - return 0; } @@ -986,8 +1120,9 @@ xfs_btree_readahead_ptr( if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) return; - xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr, - cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); + xfs_buf_readahead(xfs_btree_buftarg(cur), daddr, + xfs_btree_bbsize(cur) * count, + cur->bc_ops->buf_ops); } /* @@ -1043,6 +1178,17 @@ xfs_btree_set_ptr_null( ptr->s = cpu_to_be32(NULLAGBLOCK); } +static inline bool +xfs_btree_ptrs_equal( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr1, + union xfs_btree_ptr *ptr2) +{ + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) + return ptr1->l == ptr2->l; + return ptr1->s == ptr2->s; +} + /* * Get/set/init sibling pointers */ @@ -1161,9 +1307,17 @@ static inline __u64 xfs_btree_owner( struct xfs_btree_cur *cur) { - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + return cur->bc_mem.xfbtree->owner; + case XFS_BTREE_TYPE_INODE: return cur->bc_ino.ip->i_ino; - 
return cur->bc_ag.pag->pag_agno; + case XFS_BTREE_TYPE_AG: + return cur->bc_ag.pag->pag_agno; + default: + ASSERT(0); + return 0; + } } void @@ -1207,12 +1361,18 @@ xfs_btree_buf_to_ptr( struct xfs_buf *bp, union xfs_btree_ptr *ptr) { - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, - xfs_buf_daddr(bp))); - else { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp))); + break; + case XFS_BTREE_TYPE_INODE: + ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))); + break; + case XFS_BTREE_TYPE_MEM: + ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp))); + break; } } @@ -1231,15 +1391,14 @@ xfs_btree_get_buf_block( struct xfs_btree_block **block, struct xfs_buf **bpp) { - struct xfs_mount *mp = cur->bc_mp; - xfs_daddr_t d; - int error; + xfs_daddr_t d; + int error; error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, - 0, bpp); + error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d, + xfs_btree_bbsize(cur), 0, bpp); if (error) return error; @@ -1270,9 +1429,9 @@ xfs_btree_read_buf_block( error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp, - cur->bc_ops->buf_ops); + error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d, + xfs_btree_bbsize(cur), flags, bpp, + cur->bc_ops->buf_ops); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(cur); if (error) @@ -4365,7 +4524,7 @@ xfs_btree_visit_block( { struct xfs_btree_block *block; struct xfs_buf *bp; - union xfs_btree_ptr rptr; + union xfs_btree_ptr rptr, bufptr; int error; /* do right sibling readahead */ @@ -4388,19 +4547,12 @@ xfs_btree_visit_block( * return the same block without checking if the right sibling points * back to us and creates a cyclic reference in the btree. */ - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { - if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, - xfs_buf_daddr(bp))) { - xfs_btree_mark_sick(cur); - return -EFSCORRUPTED; - } - } else { - if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, - xfs_buf_daddr(bp))) { - xfs_btree_mark_sick(cur); - return -EFSCORRUPTED; - } + xfs_btree_buf_to_ptr(cur, bp, &bufptr); + if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; } + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4578,6 +4730,8 @@ xfs_btree_fsblock_verify( xfs_fsblock_t fsb; xfs_failaddr_t fa; + ASSERT(!xfs_buftarg_is_mem(bp->b_target)); + /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; @@ -4592,6 +4746,36 @@ xfs_btree_fsblock_verify( return fa; } +/* Verify an in-memory btree block. 
*/ +xfs_failaddr_t +xfs_btree_memblock_verify( + struct xfs_buf *bp, + unsigned int max_recs) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buftarg *btp = bp->b_target; + xfs_failaddr_t fa; + xfbno_t bno; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return __this_address; + + /* sibling pointer verification */ + bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_leftsib); + if (fa) + return fa; + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_rightsib); + if (fa) + return fa; + + return NULL; +} /** * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format * btree block @@ -4633,6 +4817,8 @@ xfs_btree_agblock_verify( xfs_agblock_t agbno; xfs_failaddr_t fa; + ASSERT(!xfs_buftarg_is_mem(bp->b_target)); + /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index bacd67cc8ced..f93374278aa1 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -112,6 +112,7 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) enum xfs_btree_type { XFS_BTREE_TYPE_AG, XFS_BTREE_TYPE_INODE, + XFS_BTREE_TYPE_MEM, }; struct xfs_btree_ops { @@ -281,6 +282,10 @@ struct xfs_btree_cur struct xfs_buf *agbp; struct xbtree_afakeroot *afake; /* for staging cursor */ } bc_ag; + struct { + struct xfbtree *xfbtree; + struct xfs_perag *pag; + } bc_mem; }; /* per-format private data */ @@ -455,6 +460,8 @@ xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp, uint64_t owner); xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp, unsigned int max_recs); +xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp, + unsigned int max_recs); unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits, unsigned long long records); diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c new file mode 100644 index 000000000000..036061fe32cc --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_mem.c @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <[email protected]> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_error.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_ag.h" +#include "xfs_buf_item.h" +#include "xfs_trace.h" + +/* Set the root of an in-memory btree. */ +void +xfbtree_set_root( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) +{ + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + cur->bc_mem.xfbtree->root = *ptr; + cur->bc_mem.xfbtree->nlevels += inc; +} + +/* Initialize a pointer from the in-memory btree header. */ +void +xfbtree_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + *ptr = cur->bc_mem.xfbtree->root; +} + +/* Duplicate an in-memory btree cursor. 
*/ +struct xfs_btree_cur * +xfbtree_dup_cursor( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_cur *ncur; + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ops, + cur->bc_maxlevels, cur->bc_cache); + ncur->bc_flags = cur->bc_flags; + ncur->bc_nlevels = cur->bc_nlevels; + ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree; + + if (cur->bc_mem.pag) + ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag); + + return ncur; +} + +/* Close the btree xfile and release all resources. */ +void +xfbtree_destroy( + struct xfbtree *xfbt) +{ + xfs_buftarg_drain(xfbt->target); +} + +/* Compute the number of bytes available for records. */ +static inline unsigned int +xfbtree_rec_bytes( + struct xfs_mount *mp, + const struct xfs_btree_ops *ops) +{ + return XMBUF_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; +} + +/* Initialize an empty leaf block as the btree root. */ +STATIC int +xfbtree_init_leaf_block( + struct xfs_mount *mp, + struct xfbtree *xfbt, + const struct xfs_btree_ops *ops) +{ + struct xfs_buf *bp; + xfbno_t bno = xfbt->highest_bno++; + int error; + + error = xfs_buf_get(xfbt->target, xfbno_to_daddr(bno), XFBNO_BBSIZE, + &bp); + if (error) + return error; + + trace_xfbtree_create_root_buf(xfbt, bp); + + bp->b_ops = ops->buf_ops; + xfs_btree_init_buf(mp, bp, ops, 0, 0, xfbt->owner); + xfs_buf_relse(bp); + + xfbt->root.l = cpu_to_be64(bno); + return 0; +} + +/* + * Create an in-memory btree root that can be used with the given xmbuf. + * Callers must set xfbt->owner. + */ +int +xfbtree_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + const struct xfs_btree_ops *ops) +{ + unsigned int blocklen = xfbtree_rec_bytes(mp, ops); + unsigned int keyptr_len; + int error; + + /* Requires a long-format CRC-format btree */ + if (!xfs_has_crc(mp)) { + ASSERT(xfs_has_crc(mp)); + return -EINVAL; + } + if (ops->ptr_len != XFS_BTREE_LONG_PTR_LEN) { + ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN); + return -EINVAL; + } + + memset(xfbt, 0, sizeof(*xfbt)); + xfbt->target = btp; + + /* Set up min/maxrecs for this btree. */ + keyptr_len = ops->key_len + sizeof(__be64); + xfbt->maxrecs[0] = blocklen / ops->rec_len; + xfbt->maxrecs[1] = blocklen / keyptr_len; + xfbt->minrecs[0] = xfbt->maxrecs[0] / 2; + xfbt->minrecs[1] = xfbt->maxrecs[1] / 2; + xfbt->highest_bno = 0; + xfbt->nlevels = 1; + + /* Initialize the empty btree. */ + error = xfbtree_init_leaf_block(mp, xfbt, ops); + if (error) + goto err_freesp; + + trace_xfbtree_init(mp, xfbt, ops); + + return 0; + +err_freesp: + xfs_buftarg_drain(xfbt->target); + return error; +} + +/* Allocate a block to our in-memory btree. */ +int +xfbtree_alloc_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + xfbno_t bno = xfbt->highest_bno++; + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + trace_xfbtree_alloc_block(xfbt, cur, bno); + + /* Fail if the block address exceeds the maximum for the buftarg. */ + if (!xfbtree_verify_bno(xfbt, bno)) { + ASSERT(xfbtree_verify_bno(xfbt, bno)); + *stat = 0; + return 0; + } + + new->l = cpu_to_be64(bno); + *stat = 1; + return 0; +} + +/* Free a block from our in-memory btree. 
*/ +int +xfbtree_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + xfs_daddr_t daddr = xfs_buf_daddr(bp); + xfbno_t bno = xfs_daddr_to_xfbno(daddr); + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + trace_xfbtree_free_block(xfbt, cur, bno); + + if (bno + 1 == xfbt->highest_bno) + xfbt->highest_bno--; + + return 0; +} + +/* Return the minimum number of records for a btree block. */ +int +xfbtree_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + + return xfbt->minrecs[level != 0]; +} + +/* Return the maximum number of records for a btree block. */ +int +xfbtree_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + + return xfbt->maxrecs[level != 0]; +} + +/* If this log item is a buffer item that came from the xfbtree, return it. */ +static inline struct xfs_buf * +xfbtree_buf_match( + struct xfbtree *xfbt, + const struct xfs_log_item *lip) +{ + const struct xfs_buf_log_item *bli; + struct xfs_buf *bp; + + if (lip->li_type != XFS_LI_BUF) + return NULL; + + bli = container_of(lip, struct xfs_buf_log_item, bli_item); + bp = bli->bli_buf; + if (bp->b_target != xfbt->target) + return NULL; + + return bp; +} + +/* + * Commit changes to the incore btree immediately by writing all dirty xfbtree + * buffers to the backing xfile. This detaches all xfbtree buffers from the + * transaction, even on failure. The buffer locks are dropped between the + * delwri queue and submit, so the caller must synchronize btree access. + * + * Normally we'd let the buffers commit with the transaction and get written to + * the xfile via the log, but online repair stages ephemeral btrees in memory + * and uses the btree_staging functions to write new btrees to disk atomically. + * The in-memory btree (and its backing store) are discarded at the end of the + * repair phase, which means that xfbtree buffers cannot commit with the rest + * of a transaction. + * + * In other words, online repair only needs the transaction to collect buffer + * pointers and to avoid buffer deadlocks, not to guarantee consistency of + * updates. + */ +int +xfbtree_trans_commit( + struct xfbtree *xfbt, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *n; + bool tp_dirty = false; + int error = 0; + + /* + * For each xfbtree buffer attached to the transaction, write the dirty + * buffers to the xfile and release them. + */ + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip); + + if (!bp) { + if (test_bit(XFS_LI_DIRTY, &lip->li_flags)) + tp_dirty |= true; + continue; + } + + trace_xfbtree_trans_commit_buf(xfbt, bp); + + xmbuf_trans_bdetach(tp, bp); + + /* + * If the buffer fails verification, note the failure but + * continue walking the transaction items so that we remove all + * ephemeral btree buffers. + */ + if (!error) + error = xmbuf_finalize(bp); + + xfs_buf_relse(bp); + } + + /* + * Reset the transaction's dirty flag to reflect the dirty state of the + * log items that are still attached. + */ + tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) | + (tp_dirty ? XFS_TRANS_DIRTY : 0); + + return error; +} + +/* + * Cancel changes to the incore btree by detaching all the xfbtree buffers. + * Changes are not undone, so callers must not access the btree ever again. 
+ */ +void +xfbtree_trans_cancel( + struct xfbtree *xfbt, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *n; + bool tp_dirty = false; + + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip); + + if (!bp) { + if (test_bit(XFS_LI_DIRTY, &lip->li_flags)) + tp_dirty |= true; + continue; + } + + trace_xfbtree_trans_cancel_buf(xfbt, bp); + + xmbuf_trans_bdetach(tp, bp); + xfs_buf_relse(bp); + } + + /* + * Reset the transaction's dirty flag to reflect the dirty state of the + * log items that are still attached. + */ + tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) | + (tp_dirty ? XFS_TRANS_DIRTY : 0); +} diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h new file mode 100644 index 000000000000..1c3825786ec8 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_mem.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <[email protected]> + */ +#ifndef __XFS_BTREE_MEM_H__ +#define __XFS_BTREE_MEM_H__ + +typedef uint64_t xfbno_t; + +#define XFBNO_BLOCKSIZE (XMBUF_BLOCKSIZE) +#define XFBNO_BBSHIFT (XMBUF_BLOCKSHIFT - BBSHIFT) +#define XFBNO_BBSIZE (XFBNO_BLOCKSIZE >> BBSHIFT) + +static inline xfs_daddr_t xfbno_to_daddr(xfbno_t blkno) +{ + return blkno << XFBNO_BBSHIFT; +} + +static inline xfbno_t xfs_daddr_to_xfbno(xfs_daddr_t daddr) +{ + return daddr >> XFBNO_BBSHIFT; +} + +struct xfbtree { + /* buffer cache target for this in-memory btree */ + struct xfs_buftarg *target; + + /* Highest block number that has been written to. */ + xfbno_t highest_bno; + + /* Owner of this btree. */ + unsigned long long owner; + + /* Btree header */ + union xfs_btree_ptr root; + unsigned int nlevels; + + /* Minimum and maximum records per block. */ + unsigned int maxrecs[2]; + unsigned int minrecs[2]; +}; + +#ifdef CONFIG_XFS_BTREE_IN_MEM +static inline bool xfbtree_verify_bno(struct xfbtree *xfbt, xfbno_t bno) +{ + return xmbuf_verify_daddr(xfbt->target, xfbno_to_daddr(bno)); +} + +void xfbtree_set_root(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int inc); +void xfbtree_init_ptr_from_cur(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); +struct xfs_btree_cur *xfbtree_dup_cursor(struct xfs_btree_cur *cur); + +int xfbtree_get_minrecs(struct xfs_btree_cur *cur, int level); +int xfbtree_get_maxrecs(struct xfs_btree_cur *cur, int level); + +int xfbtree_alloc_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, union xfs_btree_ptr *ptr, + int *stat); +int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp); + +/* Callers must set xfbt->target and xfbt->owner before calling this */ +int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt, + struct xfs_buftarg *btp, const struct xfs_btree_ops *ops); +void xfbtree_destroy(struct xfbtree *xfbt); + +int xfbtree_trans_commit(struct xfbtree *xfbt, struct xfs_trans *tp); +void xfbtree_trans_cancel(struct xfbtree *xfbt, struct xfs_trans *tp); +#else +# define xfbtree_verify_bno(...) 
(false) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +#endif /* __XFS_BTREE_MEM_H__ */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index aeac9cae4ad4..6828e72824fb 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -15,6 +15,7 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_scrub.h" +#include "xfs_buf_mem.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -190,6 +191,10 @@ xchk_teardown( sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); } + if (sc->xmbtp) { + xmbuf_free(sc->xmbtp); + sc->xmbtp = NULL; + } if (sc->xfile) { xfile_destroy(sc->xfile); sc->xfile = NULL; diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index f99a3c21d02e..1247284c17a0 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -99,6 +99,9 @@ struct xfs_scrub { /* xfile used by the scrubbers; freed at teardown. */ struct xfile *xfile; + /* buffer target for in-memory btrees; also freed at teardown. */ + struct xfs_buftarg *xmbtp; + /* Lock flags for @ip. */ uint ilock_flags; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ff243e3176a5..7fc26e64368d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -21,6 +21,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_ag.h" +#include "xfs_buf_mem.h" struct kmem_cache *xfs_buf_cache; @@ -318,7 +319,9 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & _XBF_PAGES) + if (xfs_buftarg_is_mem(bp->b_target)) + xmbuf_unmap_page(bp); + else if (bp->b_flags & _XBF_PAGES) xfs_buf_free_pages(bp); else if (bp->b_flags & _XBF_KMEM) kfree(bp->b_addr); @@ -510,18 +513,18 @@ static const struct rhashtable_params xfs_buf_hash_params = { }; int -xfs_buf_hash_init( - struct xfs_perag *pag) +xfs_buf_cache_init( + struct xfs_buf_cache *bch) { - spin_lock_init(&pag->pag_buf_lock); - return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); + spin_lock_init(&bch->bc_lock); + return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); } void -xfs_buf_hash_destroy( - struct xfs_perag *pag) +xfs_buf_cache_destroy( + struct xfs_buf_cache *bch) { - rhashtable_destroy(&pag->pag_buf_hash); + rhashtable_destroy(&bch->bc_hash); } static int @@ -584,7 +587,7 @@ xfs_buf_find_lock( static inline int xfs_buf_lookup( - struct xfs_perag *pag, + struct xfs_buf_cache *bch, struct xfs_buf_map *map, xfs_buf_flags_t flags, struct xfs_buf **bpp) @@ -593,7 +596,7 @@ xfs_buf_lookup( int error; rcu_read_lock(); - bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); + bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { rcu_read_unlock(); return -ENOENT; @@ -618,6 +621,7 @@ xfs_buf_lookup( static int xfs_buf_find_insert( struct xfs_buftarg *btp, + struct xfs_buf_cache *bch, struct xfs_perag *pag, struct xfs_buf_map *cmap, struct xfs_buf_map *map, @@ -633,31 +637,33 @@ xfs_buf_find_insert( if (error) goto out_drop_pag; - /* - * For buffers that fit entirely within a single page, first attempt to - * allocate the memory from the heap to minimise memory usage. If we - * can't get heap memory for these small buffers, we fall back to using - * the page allocator. 
- */ - if (BBTOB(new_bp->b_length) >= PAGE_SIZE || - xfs_buf_alloc_kmem(new_bp, flags) < 0) { + if (xfs_buftarg_is_mem(new_bp->b_target)) { + error = xmbuf_map_page(new_bp); + } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE || + xfs_buf_alloc_kmem(new_bp, flags) < 0) { + /* + * For buffers that fit entirely within a single page, first + * attempt to allocate the memory from the heap to minimise + * memory usage. If we can't get heap memory for these small + * buffers, we fall back to using the page allocator. + */ error = xfs_buf_alloc_pages(new_bp, flags); - if (error) - goto out_free_buf; } + if (error) + goto out_free_buf; - spin_lock(&pag->pag_buf_lock); - bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash, + spin_lock(&bch->bc_lock); + bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { error = PTR_ERR(bp); - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); goto out_free_buf; } if (bp) { /* found an existing buffer */ atomic_inc(&bp->b_hold); - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); @@ -668,17 +674,40 @@ xfs_buf_find_insert( /* The new buffer keeps the perag reference until it is freed. */ new_bp->b_pag = pag; - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); *bpp = new_bp; return 0; out_free_buf: xfs_buf_free(new_bp); out_drop_pag: - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); return error; } +static inline struct xfs_perag * +xfs_buftarg_get_pag( + struct xfs_buftarg *btp, + const struct xfs_buf_map *map) +{ + struct xfs_mount *mp = btp->bt_mount; + + if (xfs_buftarg_is_mem(btp)) + return NULL; + return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); +} + +static inline struct xfs_buf_cache * +xfs_buftarg_buf_cache( + struct xfs_buftarg *btp, + struct xfs_perag *pag) +{ + if (pag) + return &pag->pag_bcache; + return btp->bt_cache; +} + /* * Assembles a buffer covering the specified range. The code is optimised for * cache hits, as metadata intensive workloads will see 3 orders of magnitude @@ -692,6 +721,7 @@ xfs_buf_get_map( xfs_buf_flags_t flags, struct xfs_buf **bpp) { + struct xfs_buf_cache *bch; struct xfs_perag *pag; struct xfs_buf *bp = NULL; struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; @@ -707,10 +737,10 @@ xfs_buf_get_map( if (error) return error; - pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); + pag = xfs_buftarg_get_pag(btp, &cmap); + bch = xfs_buftarg_buf_cache(btp, pag); - error = xfs_buf_lookup(pag, &cmap, flags, &bp); + error = xfs_buf_lookup(bch, &cmap, flags, &bp); if (error && error != -ENOENT) goto out_put_perag; @@ -722,13 +752,14 @@ xfs_buf_get_map( goto out_put_perag; /* xfs_buf_find_insert() consumes the perag reference. */ - error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, + error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps, flags, &bp); if (error) return error; } else { XFS_STATS_INC(btp->bt_mount, xb_get_locked); - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); } /* We do not hold a perag reference anymore. */ @@ -756,7 +787,8 @@ xfs_buf_get_map( return 0; out_put_perag: - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); return error; } @@ -903,6 +935,13 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; + /* + * Currently we don't have a good means or justification for performing + * xmbuf_map_page asynchronously, so we don't do readahead. 
+ */ + if (xfs_buftarg_is_mem(target)) + return; + xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, __this_address); @@ -968,7 +1007,10 @@ xfs_buf_get_uncached( if (error) return error; - error = xfs_buf_alloc_pages(bp, flags); + if (xfs_buftarg_is_mem(bp->b_target)) + error = xmbuf_map_page(bp); + else + error = xfs_buf_alloc_pages(bp, flags); if (error) goto fail_free_buf; @@ -1016,7 +1058,9 @@ static void xfs_buf_rele_cached( struct xfs_buf *bp) { + struct xfs_buftarg *btp = bp->b_target; struct xfs_perag *pag = bp->b_pag; + struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); bool release; bool freebuf = false; @@ -1035,7 +1079,7 @@ xfs_buf_rele_cached( * leading to a use-after-free scenario. */ spin_lock(&bp->b_lock); - release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock); if (!release) { /* * Drop the in-flight state if the buffer is already on the LRU @@ -1056,11 +1100,11 @@ xfs_buf_rele_cached( * buffer for the LRU and clear the (now stale) dispose list * state flag */ - if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) { + if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) { bp->b_state &= ~XFS_BSTATE_DISPOSE; atomic_inc(&bp->b_hold); } - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); } else { /* * most of the time buffers will already be removed from the @@ -1069,16 +1113,17 @@ xfs_buf_rele_cached( * was on was the disposal list */ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); + list_lru_del_obj(&btp->bt_lru, &bp->b_lru); } else { ASSERT(list_empty(&bp->b_lru)); } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, - xfs_buf_hash_params); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); + rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, + xfs_buf_hash_params); + spin_unlock(&bch->bc_lock); + if (pag) + xfs_perag_put(pag); freebuf = true; } @@ -1607,6 +1652,12 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ op |= REQ_META; + /* in-memory targets are directly mapped, no IO required. */ + if (xfs_buftarg_is_mem(bp->b_target)) { + xfs_buf_ioend(bp); + return; + } + /* * Walk all the vectors issuing IO on them. Set up the initial offset * into the buffer and the desired IO size before we start - @@ -1962,19 +2013,24 @@ xfs_buftarg_shrink_count( } void -xfs_free_buftarg( +xfs_destroy_buftarg( struct xfs_buftarg *btp) { shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); +} +void +xfs_free_buftarg( + struct xfs_buftarg *btp) +{ + xfs_destroy_buftarg(btp); fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) bdev_release(btp->bt_bdev_handle); - kfree(btp); } @@ -1997,6 +2053,45 @@ xfs_setsize_buftarg( return 0; } +int +xfs_init_buftarg( + struct xfs_buftarg *btp, + size_t logical_sectorsize, + const char *descr) +{ + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = logical_sectorsize; + btp->bt_logical_sectormask = logical_sectorsize - 1; + + /* + * Buffer IO error rate limiting. Limit it to no more than 10 messages + * per 30 seconds so as to not spam logs too much on repeated errors. 
+ */ + ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, + DEFAULT_RATELIMIT_BURST); + + if (list_lru_init(&btp->bt_lru)) + return -ENOMEM; + if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + goto out_destroy_lru; + + btp->bt_shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr); + if (!btp->bt_shrinker) + goto out_destroy_io_count; + btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker->private_data = btp; + shrinker_register(btp->bt_shrinker); + return 0; + +out_destroy_io_count: + percpu_counter_destroy(&btp->bt_io_count); +out_destroy_lru: + list_lru_destroy(&btp->bt_lru); + return -ENOMEM; +} + struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, @@ -2023,41 +2118,12 @@ xfs_alloc_buftarg( */ if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) goto error_free; - - /* Set up device logical sector size mask */ - btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); - btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; - - /* - * Buffer IO error rate limiting. Limit it to no more than 10 messages - * per 30 seconds so as to not spam logs too much on repeated errors. - */ - ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, - DEFAULT_RATELIMIT_BURST); - - if (list_lru_init(&btp->bt_lru)) + if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev), + mp->m_super->s_id)) goto error_free; - if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) - goto error_lru; - - btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", - mp->m_super->s_id); - if (!btp->bt_shrinker) - goto error_pcpu; - - btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; - btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; - btp->bt_shrinker->private_data = btp; - - shrinker_register(btp->bt_shrinker); - return btp; -error_pcpu: - percpu_counter_destroy(&btp->bt_io_count); -error_lru: - list_lru_destroy(&btp->bt_lru); error_free: kfree(btp); return NULL; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b9216dee7810..73249abca968 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -83,6 +83,14 @@ typedef unsigned int xfs_buf_flags_t; #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ #define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ +struct xfs_buf_cache { + spinlock_t bc_lock; + struct rhashtable bc_hash; +}; + +int xfs_buf_cache_init(struct xfs_buf_cache *bch); +void xfs_buf_cache_destroy(struct xfs_buf_cache *bch); + /* * The xfs_buftarg contains 2 notions of "sector size" - * @@ -101,6 +109,7 @@ struct xfs_buftarg { struct bdev_handle *bt_bdev_handle; struct block_device *bt_bdev; struct dax_device *bt_daxdev; + struct file *bt_file; u64 bt_dax_part_off; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; @@ -114,6 +123,9 @@ struct xfs_buftarg { struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; + + /* built-in cache, if we're not using the perag one */ + struct xfs_buf_cache bt_cache[]; }; #define XB_PAGES 2 @@ -379,4 +391,9 @@ int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); +/* for xfs_buf_mem.c only: */ +int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize, + const char *descr); +void xfs_destroy_buftarg(struct xfs_buftarg *btp); + #endif /* __XFS_BUF_H__ */ diff --git 
a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c new file mode 100644 index 000000000000..8ad38c64708e --- /dev/null +++ b/fs/xfs/xfs_buf_mem.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <[email protected]> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_buf.h" +#include "xfs_buf_mem.h" +#include "xfs_trace.h" +#include <linux/shmem_fs.h> +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_error.h" + +/* + * Buffer Cache for In-Memory Files + * ================================ + * + * Online fsck wants to create ephemeral ordered recordsets. The existing + * btree infrastructure can do this, but we need the buffer cache to target + * memory instead of block devices. + * + * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those + * requirements. Therefore, the xmbuf mechanism uses an unlinked shmem file to + * store our staging data. This file is not installed in the file descriptor + * table so that user programs cannot access the data, which means that the + * xmbuf must be freed with xmbuf_destroy. + * + * xmbufs assume that the caller will handle all required concurrency + * management; standard vfs locks (freezer and inode) are not taken. Reads + * and writes are satisfied directly from the page cache. + * + * The only supported block size is PAGE_SIZE, and we cannot use highmem. + */ + +/* + * shmem files used to back an in-memory buffer cache must not be exposed to + * userspace. Upper layers must coordinate access to the one handle returned + * by the constructor, so establish a separate lock class for xmbufs to avoid + * confusing lockdep. + */ +static struct lock_class_key xmbuf_i_mutex_key; + +/* + * Allocate a buffer cache target for a memory-backed file and set up the + * buffer target. + */ +int +xmbuf_alloc( + struct xfs_mount *mp, + const char *descr, + struct xfs_buftarg **btpp) +{ + struct file *file; + struct inode *inode; + struct xfs_buftarg *btp; + int error; + + btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL); + if (!btp) + return -ENOMEM; + + file = shmem_kernel_file_setup(descr, 0, 0); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_btp; + } + inode = file_inode(file); + + /* private file, private locking */ + lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key); + + /* + * We don't want to bother with kmapping data during repair, so don't + * allow highmem pages to back this mapping. + */ + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); + + /* ensure all writes are below EOF to avoid pagecache zeroing */ + i_size_write(inode, inode->i_sb->s_maxbytes); + + trace_xmbuf_create(btp); + + error = xfs_buf_cache_init(btp->bt_cache); + if (error) + goto out_file; + + /* Initialize buffer target */ + btp->bt_mount = mp; + btp->bt_dev = (dev_t)-1U; + btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */ + btp->bt_file = file; + btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE; + btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1; + + error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr); + if (error) + goto out_bcache; + + *btpp = btp; + return 0; + +out_bcache: + xfs_buf_cache_destroy(btp->bt_cache); +out_file: + fput(file); +out_free_btp: + kfree(btp); + return error; +} + +/* Free a buffer cache target for a memory-backed buffer cache. 
*/ +void +xmbuf_free( + struct xfs_buftarg *btp) +{ + ASSERT(xfs_buftarg_is_mem(btp)); + ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + + trace_xmbuf_free(btp); + + xfs_destroy_buftarg(btp); + xfs_buf_cache_destroy(btp->bt_cache); + fput(btp->bt_file); + kfree(btp); +} + +/* Directly map a shmem page into the buffer cache. */ +int +xmbuf_map_page( + struct xfs_buf *bp) +{ + struct inode *inode = file_inode(bp->b_target->bt_file); + struct folio *folio = NULL; + struct page *page; + loff_t pos = BBTOB(xfs_buf_daddr(bp)); + int error; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + if (bp->b_map_count != 1) + return -ENOMEM; + if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE) + return -ENOMEM; + if (offset_in_page(pos) != 0) { + ASSERT(offset_in_page(pos)); + return -ENOMEM; + } + + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE); + if (error) + return error; + + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); + return -EIO; + } + + page = folio_file_page(folio, pos >> PAGE_SHIFT); + + /* + * Mark the page dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xmbuf_unmap_page. + */ + set_page_dirty(page); + unlock_page(page); + + bp->b_addr = page_address(page); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = page; + bp->b_page_count = 1; + return 0; +} + +/* Unmap a shmem page that was mapped into the buffer cache. */ +void +xmbuf_unmap_page( + struct xfs_buf *bp) +{ + struct page *page = bp->b_pages[0]; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + put_page(page); + + bp->b_addr = NULL; + bp->b_pages[0] = NULL; + bp->b_pages = NULL; + bp->b_page_count = 0; +} + +/* Is this a valid daddr within the buftarg? */ +bool +xmbuf_verify_daddr( + struct xfs_buftarg *btp, + xfs_daddr_t daddr) +{ + struct inode *inode = file_inode(btp->bt_file); + + ASSERT(xfs_buftarg_is_mem(btp)); + + return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); +} + +/* Discard the page backing this buffer. */ +static void +xmbuf_stale( + struct xfs_buf *bp) +{ + struct inode *inode = file_inode(bp->b_target->bt_file); + loff_t pos; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + pos = BBTOB(xfs_buf_daddr(bp)); + shmem_truncate_range(inode, pos, pos + BBTOB(bp->b_length) - 1); +} + +/* + * Finalize a buffer -- discard the backing page if it's stale, or run the + * write verifier to detect problems. + */ +int +xmbuf_finalize( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + int error = 0; + + if (bp->b_flags & XBF_STALE) { + xmbuf_stale(bp); + return 0; + } + + /* + * Although this btree is ephemeral, validate the buffer structure so + * that we can detect memory corruption errors and software bugs. + */ + fa = bp->b_ops->verify_struct(bp); + if (fa) { + error = -EFSCORRUPTED; + xfs_verifier_error(bp, error, fa); + } + + return error; +} + +/* + * Detach this xmbuf buffer from the transaction by any means necessary. + * All buffers are direct-mapped, so they do not need bwrite. 
+ */ +void +xmbuf_trans_bdetach( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bli = bp->b_log_item; + + ASSERT(bli != NULL); + + bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED | + XFS_BLI_LOGGED | XFS_BLI_STALE); + clear_bit(XFS_LI_DIRTY, &bli->bli_item.li_flags); + + while (bp->b_log_item != NULL) + xfs_trans_bdetach(tp, bp); +} diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h new file mode 100644 index 000000000000..eed4a7b63232 --- /dev/null +++ b/fs/xfs/xfs_buf_mem.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <[email protected]> + */ +#ifndef __XFS_BUF_MEM_H__ +#define __XFS_BUF_MEM_H__ + +#define XMBUF_BLOCKSIZE (PAGE_SIZE) +#define XMBUF_BLOCKSHIFT (PAGE_SHIFT) + +#ifdef CONFIG_XFS_MEMORY_BUFS +static inline bool xfs_buftarg_is_mem(const struct xfs_buftarg *btp) +{ + return btp->bt_bdev == NULL; +} + +int xmbuf_alloc(struct xfs_mount *mp, const char *descr, + struct xfs_buftarg **btpp); +void xmbuf_free(struct xfs_buftarg *btp); + +int xmbuf_map_page(struct xfs_buf *bp); +void xmbuf_unmap_page(struct xfs_buf *bp); +bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr); +void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); +int xmbuf_finalize(struct xfs_buf *bp); +#else +# define xfs_buftarg_is_mem(...) (false) +# define xmbuf_map_page(...) (-ENOMEM) +# define xmbuf_unmap_page(...) ((void)0) +# define xmbuf_verify_daddr(...) (false) +#endif /* CONFIG_XFS_MEMORY_BUFS */ + +#endif /* __XFS_BUF_MEM_H__ */ diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index b28546b6fe34..b39f959146bc 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -527,6 +527,9 @@ xfs_btree_mark_sick( struct xfs_btree_cur *cur) { switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + /* no health state tracking for ephemeral btrees */ + return; case XFS_BTREE_TYPE_AG: ASSERT(cur->bc_ops->sick_mask); xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 6c44e6db4d86..e880aa48de68 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -505,9 +505,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } -int xfs_buf_hash_init(struct xfs_perag *pag); -void xfs_buf_hash_destroy(struct xfs_perag *pag); - extern void xfs_uuid_table_free(void); extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 8a5dc1538aa8..3f253884fe5b 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -36,6 +36,8 @@ #include "xfs_error.h" #include <linux/iomap.h> #include "xfs_iomap.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e876a47f1427..e85ea1fdb9c1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -79,6 +79,8 @@ union xfs_btree_ptr; struct xfs_dqtrx; struct xfs_icwalk; struct xfs_perag; +struct xfbtree; +struct xfs_btree_ops; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -640,6 +642,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bdetach); DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); 
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); @@ -2499,12 +2502,19 @@ TRACE_EVENT(xfs_btree_alloc_block, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_INODE: __entry->agno = 0; __entry->ino = cur->bc_ino.ip->i_ino; - } else { + break; + case XFS_BTREE_TYPE_AG: __entry->agno = cur->bc_ag.pag->pag_agno; __entry->ino = 0; + break; + case XFS_BTREE_TYPE_MEM: + __entry->agno = 0; + __entry->ino = 0; + break; } __assign_str(name, cur->bc_ops->name); __entry->error = error; @@ -4514,6 +4524,159 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents); #endif /* CONFIG_XFS_DRAIN_INTENTS */ +#ifdef CONFIG_XFS_MEMORY_BUFS +TRACE_EVENT(xmbuf_create, + TP_PROTO(struct xfs_buftarg *btp), + TP_ARGS(btp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, ino) + __array(char, pathname, 256) + ), + TP_fast_assign( + char pathname[257]; + char *path; + struct file *file = btp->bt_file; + + __entry->ino = file_inode(file)->i_ino; + memset(pathname, 0, sizeof(pathname)); + path = file_path(file, pathname, sizeof(pathname) - 1); + if (IS_ERR(path)) + path = "(unknown)"; + strncpy(__entry->pathname, path, sizeof(__entry->pathname)); + ), + TP_printk("xmino 0x%lx path '%s'", + __entry->ino, + __entry->pathname) +); + +TRACE_EVENT(xmbuf_free, + TP_PROTO(struct xfs_buftarg *btp), + TP_ARGS(btp), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes) + __field(loff_t, size) + ), + TP_fast_assign( + struct file *file = btp->bt_file; + struct inode *inode = file_inode(file); + + __entry->size = i_size_read(inode); + __entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes; + __entry->ino = inode->i_ino; + ), + TP_printk("xmino 0x%lx mem_bytes 0x%llx isize 0x%llx", + __entry->ino, + __entry->bytes, + __entry->size) +); +#endif /* CONFIG_XFS_MEMORY_BUFS */ + +#ifdef CONFIG_XFS_BTREE_IN_MEM +TRACE_EVENT(xfbtree_init, + TP_PROTO(struct xfs_mount *mp, struct xfbtree *xfbt, + const struct xfs_btree_ops *ops), + TP_ARGS(mp, xfbt, ops), + TP_STRUCT__entry( + __field(const void *, btree_ops) + __field(unsigned long, xfino) + __field(unsigned int, leaf_mxr) + __field(unsigned int, leaf_mnr) + __field(unsigned int, node_mxr) + __field(unsigned int, node_mnr) + __field(unsigned long long, owner) + ), + TP_fast_assign( + __entry->btree_ops = ops; + __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; + __entry->leaf_mxr = xfbt->maxrecs[0]; + __entry->node_mxr = xfbt->maxrecs[1]; + __entry->leaf_mnr = xfbt->minrecs[0]; + __entry->node_mnr = xfbt->minrecs[1]; + __entry->owner = xfbt->owner; + ), + TP_printk("xfino 0x%lx btree_ops %pS owner 0x%llx leaf_mxr %u leaf_mnr %u node_mxr %u node_mnr %u", + __entry->xfino, + __entry->btree_ops, + __entry->owner, + __entry->leaf_mxr, + __entry->leaf_mnr, + __entry->node_mxr, + __entry->node_mnr) +); + +DECLARE_EVENT_CLASS(xfbtree_buf_class, + TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), + TP_ARGS(xfbt, bp), + TP_STRUCT__entry( + __field(unsigned long, xfino) + __field(xfs_daddr_t, bno) + __field(int, nblks) + __field(int, hold) + __field(int, pincount) + __field(unsigned int, lockval) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; + __entry->bno = xfs_buf_daddr(bp); + __entry->nblks = bp->b_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + 
__entry->lockval = bp->b_sema.count; + __entry->flags = bp->b_flags; + ), + TP_printk("xfino 0x%lx daddr 0x%llx bbcount 0x%x hold %d pincount %d lock %d flags %s", + __entry->xfino, + (unsigned long long)__entry->bno, + __entry->nblks, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS)) +) + +#define DEFINE_XFBTREE_BUF_EVENT(name) \ +DEFINE_EVENT(xfbtree_buf_class, name, \ + TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), \ + TP_ARGS(xfbt, bp)) +DEFINE_XFBTREE_BUF_EVENT(xfbtree_create_root_buf); +DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_commit_buf); +DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_cancel_buf); + +DECLARE_EVENT_CLASS(xfbtree_freesp_class, + TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, + xfs_fileoff_t fileoff), + TP_ARGS(xfbt, cur, fileoff), + TP_STRUCT__entry( + __field(unsigned long, xfino) + __string(btname, cur->bc_ops->name) + __field(int, nlevels) + __field(xfs_fileoff_t, fileoff) + ), + TP_fast_assign( + __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; + __assign_str(btname, cur->bc_ops->name); + __entry->nlevels = cur->bc_nlevels; + __entry->fileoff = fileoff; + ), + TP_printk("xfino 0x%lx %sbt nlevels %d fileoff 0x%llx", + __entry->xfino, + __get_str(btname), + __entry->nlevels, + (unsigned long long)__entry->fileoff) +) + +#define DEFINE_XFBTREE_FREESP_EVENT(name) \ +DEFINE_EVENT(xfbtree_freesp_class, name, \ + TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, \ + xfs_fileoff_t fileoff), \ + TP_ARGS(xfbt, cur, fileoff)) +DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block); +DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block); +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 08ce757c7454..3f7e3a09a49f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -215,6 +215,7 @@ struct xfs_buf *xfs_trans_getsb(struct xfs_trans *); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6549e50d852c..e28ab74af4f0 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -393,6 +393,48 @@ xfs_trans_brelse( } /* + * Forcibly detach a buffer previously joined to the transaction. The caller + * will retain its locked reference to the buffer after this function returns. + * The buffer must be completely clean and must not be held to the transaction. + */ +void +xfs_trans_bdetach( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_log_item; + + ASSERT(tp != NULL); + ASSERT(bp->b_transp == tp); + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_bdetach(bip); + + /* + * Erase all recursion count, since we're removing this buffer from the + * transaction. + */ + bip->bli_recur = 0; + + /* + * The buffer must be completely clean. Specifically, it had better + * not be dirty, stale, logged, ordered, or held to the transaction. 
+ */ ASSERT(!test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); + ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY)); + ASSERT(!(bip->bli_flags & XFS_BLI_HOLD)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + + /* Unlink the log item from the transaction and drop the log item. */ + xfs_trans_del_item(&bip->bli_item); + xfs_buf_item_put(bip); + bp->b_transp = NULL; +} + +/* * Mark the buffer as not needing to be unlocked when the buf item's * iop_committing() routine is called. The buffer must already be locked * and associated with the given transaction.
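
For reference, a rough sketch of how the interfaces added by this patch are meant to fit together, following the updated steps in xfs-online-fsck-design.rst: allocate an xmbuf buffer target, initialize an xfbtree on it, update the btree through an ordinary cursor, flush the dirty buffers with xfbtree_trans_commit(), then tear everything down. This is a sketch under stated assumptions, not code from the series: xfs_rmapbt_mem_create() is only named in the design document, xfs_rmapbt_mem_cursor() is not part of this patch at all, and xrep_stage_rmaps() is a made-up caller; only the xmbuf_*/xfbtree_* calls and the sc->xmbtp field come from the hunks above.

/*
 * Sketch only: a hypothetical online-repair helper that stages rmap
 * records in an in-memory btree.  Everything except the two assumed
 * wrappers uses interfaces introduced in this patch.
 */
STATIC int
xrep_stage_rmaps(
	struct xfs_scrub	*sc)
{
	struct xfbtree		xfbt;
	struct xfs_btree_cur	*cur;
	int			error;

	/* Back the buffer cache with an unlinked shmem file (xfs_buf_mem.c). */
	error = xmbuf_alloc(sc->mp, "xrep_rmapbt", &sc->xmbtp);
	if (error)
		return error;

	/* Write the initial root block to the in-memory target. */
	error = xfs_rmapbt_mem_create(&xfbt, sc->mp, sc->xmbtp); /* assumed wrapper around xfbtree_init */
	if (error)
		goto out_xmbuf;

	/* Stage records through a normal btree cursor. */
	cur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &xfbt);	/* assumed helper */
	/* ... xfs_btree_insert() calls for each staged record ... */
	xfs_btree_del_cursor(cur, 0);

	/*
	 * Ephemeral buffers cannot go through the log, so write the dirty
	 * xmbuf buffers back to the shmem file and detach them from the
	 * transaction before the transaction commits.
	 */
	error = xfbtree_trans_commit(&xfbt, sc->tp);

	xfbtree_destroy(&xfbt);
out_xmbuf:
	xmbuf_free(sc->xmbtp);
	sc->xmbtp = NULL;
	return error;
}

On a cancellation path the caller would use xfbtree_trans_cancel() instead of xfbtree_trans_commit(); in the actual scrub code the xmbuf target is released from xchk_teardown() (see the scrub.c hunk above) rather than by the staging helper itself.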