diff options
Diffstat (limited to 'fs/xfs')
220 files changed, 27137 insertions, 7154 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index ed0bc8cbc703..d41edd30388b 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -124,12 +124,24 @@ config XFS_DRAIN_INTENTS bool select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL +config XFS_LIVE_HOOKS + bool + select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL + +config XFS_MEMORY_BUFS + bool + +config XFS_BTREE_IN_MEM + bool + config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n depends on XFS_FS depends on TMPFS && SHMEM + select XFS_LIVE_HOOKS select XFS_DRAIN_INTENTS + select XFS_MEMORY_BUFS help If you say Y here you will be able to check metadata on a mounted XFS filesystem. This feature is intended to reduce @@ -147,7 +159,7 @@ config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y depends on XFS_ONLINE_SCRUB - select XFS_DEBUG + select DEBUG_FS help If you say Y here, the kernel will gather usage data about the online metadata check subsystem. This includes the number @@ -164,6 +176,7 @@ config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" default n depends on XFS_FS && XFS_ONLINE_SCRUB + select XFS_BTREE_IN_MEM help If you say Y here you will be able to repair metadata on a mounted XFS filesystem. 
This feature is intended to reduce diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7762c01a85cf..76674ad5833e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -92,8 +92,7 @@ xfs-y += xfs_aops.o \ xfs_symlink.o \ xfs_sysfs.o \ xfs_trans.o \ - xfs_xattr.o \ - kmem.o + xfs_xattr.o # low-level transaction/log code xfs-y += xfs_log.o \ @@ -137,6 +136,9 @@ xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o endif xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o +xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o +xfs-$(CONFIG_XFS_MEMORY_BUFS) += xfs_buf_mem.o +xfs-$(CONFIG_XFS_BTREE_IN_MEM) += libxfs/xfs_btree_mem.o # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) @@ -145,6 +147,7 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) xfs-y += $(addprefix scrub/, \ trace.o \ + agb_bitmap.o \ agheader.o \ alloc.o \ attr.o \ @@ -158,6 +161,8 @@ xfs-y += $(addprefix scrub/, \ health.o \ ialloc.o \ inode.o \ + iscan.o \ + nlinks.o \ parent.o \ readdir.o \ refcount.o \ @@ -175,14 +180,39 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtsummary.o \ ) -xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + dqiterate.o \ + quota.o \ + quotacheck.o \ + ) # online repair ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + alloc_repair.o \ + bmap_repair.o \ + cow_repair.o \ + fscounters_repair.o \ + ialloc_repair.o \ + inode_repair.o \ + newbt.o \ + nlinks_repair.o \ + rcbag_btree.o \ + rcbag.o \ reap.o \ + refcount_repair.o \ repair.o \ + rmap_repair.o \ + ) + +xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rtbitmap_repair.o \ + ) + +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + quota_repair.o \ + quotacheck_repair.o \ ) endif endif diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c deleted file mode 100644 index c557a030acfe..000000000000 --- a/fs/xfs/kmem.c +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. 
- */ -#include "xfs.h" -#include "xfs_message.h" -#include "xfs_trace.h" - -void * -kmem_alloc(size_t size, xfs_km_flags_t flags) -{ - int retries = 0; - gfp_t lflags = kmem_flags_convert(flags); - void *ptr; - - trace_kmem_alloc(size, flags, _RET_IP_); - - do { - ptr = kmalloc(size, lflags); - if (ptr || (flags & KM_MAYFAIL)) - return ptr; - if (!(++retries % 100)) - xfs_err(NULL, - "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", - current->comm, current->pid, - (unsigned int)size, __func__, lflags); - memalloc_retry_wait(lflags); - } while (1); -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h deleted file mode 100644 index b987dc2c6851..000000000000 --- a/fs/xfs/kmem.h +++ /dev/null @@ -1,83 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - */ -#ifndef __XFS_SUPPORT_KMEM_H__ -#define __XFS_SUPPORT_KMEM_H__ - -#include <linux/slab.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmalloc.h> - -/* - * General memory allocation interfaces - */ - -typedef unsigned __bitwise xfs_km_flags_t; -#define KM_NOFS ((__force xfs_km_flags_t)0x0004u) -#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) -#define KM_ZERO ((__force xfs_km_flags_t)0x0010u) -#define KM_NOLOCKDEP ((__force xfs_km_flags_t)0x0020u) - -/* - * We use a special process flag to avoid recursive callbacks into - * the filesystem during transactions. We will also issue our own - * warnings, so we explicitly skip any generic ones (silly of us). - */ -static inline gfp_t -kmem_flags_convert(xfs_km_flags_t flags) -{ - gfp_t lflags; - - BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP)); - - lflags = GFP_KERNEL | __GFP_NOWARN; - if (flags & KM_NOFS) - lflags &= ~__GFP_FS; - - /* - * Default page/slab allocator behavior is to retry for ever - * for small allocations. 
We can override this behavior by using - * __GFP_RETRY_MAYFAIL which will tell the allocator to retry as long - * as it is feasible but rather fail than retry forever for all - * request sizes. - */ - if (flags & KM_MAYFAIL) - lflags |= __GFP_RETRY_MAYFAIL; - - if (flags & KM_ZERO) - lflags |= __GFP_ZERO; - - if (flags & KM_NOLOCKDEP) - lflags |= __GFP_NOLOCKDEP; - - return lflags; -} - -extern void *kmem_alloc(size_t, xfs_km_flags_t); -static inline void kmem_free(const void *ptr) -{ - kvfree(ptr); -} - - -static inline void * -kmem_zalloc(size_t size, xfs_km_flags_t flags) -{ - return kmem_alloc(size, flags | KM_ZERO); -} - -/* - * Zone interfaces - */ -static inline struct page * -kmem_to_page(void *addr) -{ - if (is_vmalloc_addr(addr)) - return vmalloc_to_page(addr); - return virt_to_page(addr); -} - -#endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index f9f4d694640d..dc1873f76bff 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -217,6 +217,7 @@ xfs_initialize_perag_data( */ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { xfs_alert(mp, "AGF corruption. Please run xfs_repair."); + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); error = -EFSCORRUPTED; goto out; } @@ -241,7 +242,7 @@ __xfs_free_perag( struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - kmem_free(pag); + kfree(pag); } /* @@ -263,7 +264,7 @@ xfs_free_perag( xfs_defer_drain_free(&pag->pag_intents_drain); cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_buf_hash_destroy(pag); + xfs_buf_cache_destroy(&pag->pag_bcache); /* drop the mount's active reference */ xfs_perag_rele(pag); @@ -332,6 +333,31 @@ xfs_agino_range( return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); } +/* + * Free perag within the specified AG range, it is only used to free unused + * perags under the error handling path. 
+ */ +void +xfs_free_unused_perag_range( + struct xfs_mount *mp, + xfs_agnumber_t agstart, + xfs_agnumber_t agend) +{ + struct xfs_perag *pag; + xfs_agnumber_t index; + + for (index = agstart; index < agend; index++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); + if (!pag) + break; + xfs_buf_cache_destroy(&pag->pag_bcache); + xfs_defer_drain_free(&pag->pag_intents_drain); + kfree(pag); + } +} + int xfs_initialize_perag( struct xfs_mount *mp, @@ -356,7 +382,7 @@ xfs_initialize_perag( continue; } - pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!pag) { error = -ENOMEM; goto out_unwind_new_pags; @@ -364,7 +390,7 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; - error = radix_tree_preload(GFP_NOFS); + error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (error) goto out_free_pag; @@ -391,9 +417,10 @@ xfs_initialize_perag( init_waitqueue_head(&pag->pag_active_wq); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; + xfs_hooks_init(&pag->pag_rmap_update_hooks); #endif /* __KERNEL__ */ - error = xfs_buf_hash_init(pag); + error = xfs_buf_cache_init(&pag->pag_bcache); if (error) goto out_remove_pag; @@ -424,19 +451,14 @@ xfs_initialize_perag( out_remove_pag: xfs_defer_drain_free(&pag->pag_intents_drain); + spin_lock(&mp->m_perag_lock); radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); out_free_pag: - kmem_free(pag); + kfree(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ - for (index = first_initialised; index < agcount; index++) { - pag = radix_tree_delete(&mp->m_perag_tree, index); - if (!pag) - break; - xfs_buf_hash_destroy(pag); - xfs_defer_drain_free(&pag->pag_intents_drain); - kmem_free(pag); - } + xfs_free_unused_perag_range(mp, first_initialised, agcount); return error; } @@ -471,7 +493,7 @@ xfs_btroot_init( struct xfs_buf *bp, struct 
aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno); + xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno); } /* Finish initializing a free space btree. */ @@ -529,7 +551,7 @@ xfs_freesp_init_recs( } /* - * Alloc btree root block init functions + * bnobt/cntbt btree root block init functions */ static void xfs_bnoroot_init( @@ -537,17 +559,7 @@ xfs_bnoroot_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno); - xfs_freesp_init_recs(mp, bp, id); -} - -static void -xfs_cntroot_init( - struct xfs_mount *mp, - struct xfs_buf *bp, - struct aghdr_init_data *id) -{ - xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno); + xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno); xfs_freesp_init_recs(mp, bp, id); } @@ -563,7 +575,7 @@ xfs_rmaproot_init( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_rmap_rec *rrec; - xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno); + xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 4, id->agno); /* * mark the AG header regions as static metadata The BNO @@ -658,14 +670,13 @@ xfs_agfblock_init( agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(id->agno); agf->agf_length = cpu_to_be32(id->agsize); - agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); - agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); - agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + agf->agf_bno_root = cpu_to_be32(XFS_BNO_BLOCK(mp)); + agf->agf_cnt_root = cpu_to_be32(XFS_CNT_BLOCK(mp)); + agf->agf_bno_level = cpu_to_be32(1); + agf->agf_cnt_level = cpu_to_be32(1); if (xfs_has_rmapbt(mp)) { - agf->agf_roots[XFS_BTNUM_RMAPi] = - cpu_to_be32(XFS_RMAP_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); + agf->agf_rmap_root = cpu_to_be32(XFS_RMAP_BLOCK(mp)); + agf->agf_rmap_level = cpu_to_be32(1); agf->agf_rmap_blocks = 
cpu_to_be32(1); } @@ -776,7 +787,7 @@ struct xfs_aghdr_grow_data { size_t numblks; const struct xfs_buf_ops *ops; aghdr_init_work_f work; - xfs_btnum_t type; + const struct xfs_btree_ops *bc_ops; bool need_init; }; @@ -830,13 +841,15 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, + .bc_ops = &xfs_bnobt_ops, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_cntbt_buf_ops, - .work = &xfs_cntroot_init, + .work = &xfs_bnoroot_init, + .bc_ops = &xfs_cntbt_ops, .need_init = true }, { /* INO root block */ @@ -844,7 +857,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_inobt_buf_ops, .work = &xfs_btroot_init, - .type = XFS_BTNUM_INO, + .bc_ops = &xfs_inobt_ops, .need_init = true }, { /* FINO root block */ @@ -852,7 +865,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, - .type = XFS_BTNUM_FINO, + .bc_ops = &xfs_finobt_ops, .need_init = xfs_has_finobt(mp) }, { /* RMAP root block */ @@ -860,6 +873,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_rmapbt_buf_ops, .work = &xfs_rmaproot_init, + .bc_ops = &xfs_rmapbt_ops, .need_init = xfs_has_rmapbt(mp) }, { /* REFC root block */ @@ -867,7 +881,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_refcountbt_buf_ops, .work = &xfs_btroot_init, - .type = XFS_BTNUM_REFC, + .bc_ops = &xfs_refcountbt_ops, .need_init = xfs_has_reflink(mp) }, { /* NULL terminating block */ @@ -885,7 +899,7 @@ xfs_ag_init_headers( id->daddr = dp->daddr; id->numblks = dp->numblks; - id->type = dp->type; + id->bc_ops = dp->bc_ops; error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops); if (error) break; @@ -930,8 +944,10 @@ xfs_ag_shrink_space( agf = agfbp->b_addr; aglen = be32_to_cpu(agi->agi_length); /* some extra paranoid checks 
before we shrink the ag */ - if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) + if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); return -EFSCORRUPTED; + } if (delta >= aglen) return -EINVAL; @@ -959,14 +975,23 @@ xfs_ag_shrink_space( if (error) { /* - * if extent allocation fails, need to roll the transaction to + * If extent allocation fails, need to roll the transaction to * ensure that the AGFL fixup has been committed anyway. + * + * We need to hold the AGF across the roll to ensure nothing can + * access the AG for allocation until the shrink is fully + * cleaned up. And due to the resetting of the AG block + * reservation space needing to lock the AGI, we also have to + * hold that so we don't get AGI/AGF lock order inversions in + * the error handling path. */ xfs_trans_bhold(*tpp, agfbp); + xfs_trans_bhold(*tpp, agibp); err2 = xfs_trans_roll(tpp); if (err2) return err2; xfs_trans_bjoin(*tpp, agfbp); + xfs_trans_bjoin(*tpp, agibp); goto resv_init_out; } @@ -984,7 +1009,7 @@ xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, + err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, XFS_AG_RESV_NONE, true); if (err2) goto resv_err; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 2e0aef87d633..35de09a2516c 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -36,8 +36,9 @@ struct xfs_perag { atomic_t pag_active_ref; /* active reference count */ wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */ unsigned long pag_opstate; - uint8_t pagf_levels[XFS_BTNUM_AGF]; - /* # of levels in bno & cnt btree */ + uint8_t pagf_bno_level; /* # of levels in bno btree */ + uint8_t pagf_cnt_level; /* # of levels in cnt btree */ + uint8_t pagf_rmap_level;/* # of levels in rmap btree */ uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ 
xfs_extlen_t pagf_longest; /* longest free space */ @@ -80,6 +81,18 @@ struct xfs_perag { */ uint16_t pag_checked; uint16_t pag_sick; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Alternate btree heights so that online repair won't trip the write + * verifiers while rebuilding the AG btrees. + */ + uint8_t pagf_repair_bno_level; + uint8_t pagf_repair_cnt_level; + uint8_t pagf_repair_refcount_level; + uint8_t pagf_repair_rmap_level; +#endif + spinlock_t pag_state_lock; spinlock_t pagb_lock; /* lock for pagb_tree */ @@ -94,9 +107,7 @@ struct xfs_perag { int pag_ici_reclaimable; /* reclaimable inodes */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ - /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ - struct rhashtable pag_buf_hash; + struct xfs_buf_cache pag_bcache; /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; @@ -109,6 +120,9 @@ struct xfs_perag { * inconsistencies. */ struct xfs_defer_drain pag_intents_drain; + + /* Hook to feed rmapbt updates to an active online repair. 
*/ + struct xfs_hooks pag_rmap_update_hooks; #endif /* __KERNEL__ */ }; @@ -133,6 +147,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) +void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart, + xfs_agnumber_t agend); int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); @@ -319,7 +335,7 @@ struct aghdr_init_data { /* per header data */ xfs_daddr_t daddr; /* header location */ size_t numblks; /* size of header */ - xfs_btnum_t type; /* type of btree root block */ + const struct xfs_btree_ops *bc_ops; /* btree ops */ }; int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 7fd1fea95552..da1057bd0e60 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -411,6 +411,8 @@ xfs_ag_resv_free_extent( fallthrough; case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + fallthrough; + case XFS_AG_RESV_IGNORE: return; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3069194527dd..9da52e92172a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -26,6 +26,7 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_bmap.h" +#include "xfs_health.h" struct kmem_cache *xfs_extfree_item_cache; @@ -150,23 +151,38 @@ xfs_alloc_ag_max_usable( return mp->m_sb.sb_agblocks - blocks; } + +static int +xfs_alloc_lookup( + struct xfs_btree_cur *cur, + xfs_lookup_t dir, + xfs_agblock_t bno, + xfs_extlen_t len, + int *stat) +{ + int error; + + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + error = xfs_btree_lookup(cur, dir, stat); + if (*stat == 1) + cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; + else + 
cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; + return error; +} + /* * Lookup the record equal to [bno, len] in the btree given by cur. */ -STATIC int /* error */ +static inline int /* error */ xfs_alloc_lookup_eq( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - int error; - - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); - cur->bc_ag.abt.active = (*stat == 1); - return error; + return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat); } /* @@ -180,13 +196,7 @@ xfs_alloc_lookup_ge( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - int error; - - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); - cur->bc_ag.abt.active = (*stat == 1); - return error; + return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat); } /* @@ -200,19 +210,14 @@ xfs_alloc_lookup_le( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - int error; - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); - cur->bc_ag.abt.active = (*stat == 1); - return error; + return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat); } static inline bool xfs_alloc_cur_active( struct xfs_btree_cur *cur) { - return cur && cur->bc_ag.abt.active; + return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE); } /* @@ -246,11 +251,9 @@ xfs_alloc_btrec_to_irec( /* Simple checks for free space records. 
*/ xfs_failaddr_t xfs_alloc_check_irec( - struct xfs_btree_cur *cur, - const struct xfs_alloc_rec_incore *irec) + struct xfs_perag *pag, + const struct xfs_alloc_rec_incore *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->ar_blockcount == 0) return __this_address; @@ -270,12 +273,12 @@ xfs_alloc_complain_bad_rec( struct xfs_mount *mp = cur->bc_mp; xfs_warn(mp, - "%s Freespace BTree record corruption in AG %d detected at %pS!", - cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", - cur->bc_ag.pag->pag_agno, fa); + "%sbt record corruption in AG %d detected at %pS!", + cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "start block 0x%x block count 0x%x", irec->ar_startblock, irec->ar_blockcount); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -299,7 +302,7 @@ xfs_alloc_get_rec( return error; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -499,14 +502,18 @@ xfs_alloc_fixup_trees( if (XFS_IS_CORRUPT(mp, i != 1 || nfbno1 != fbno || - nflen1 != flen)) + nflen1 != flen)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } #endif } else { if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } } /* * Look up the record in the by-block tree if necessary. 
@@ -518,14 +525,18 @@ xfs_alloc_fixup_trees( if (XFS_IS_CORRUPT(mp, i != 1 || nfbno1 != fbno || - nflen1 != flen)) + nflen1 != flen)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } #endif } else { if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } #ifdef DEBUG @@ -538,8 +549,10 @@ xfs_alloc_fixup_trees( if (XFS_IS_CORRUPT(mp, bnoblock->bb_numrecs != - cntblock->bb_numrecs)) + cntblock->bb_numrecs)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } #endif @@ -569,30 +582,40 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(cnt_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } /* * Add new by-size btree entry(s). */ if (nfbno1 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 0)) + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } } if (nfbno2 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 0)) + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } } /* * Fix up the by-block btree entry(s). 
@@ -603,8 +626,10 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(bno_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } else { /* * Update the by-block entry to start later|be shorter. @@ -618,12 +643,16 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 0)) + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(bno_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } return 0; } @@ -757,6 +786,8 @@ xfs_alloc_read_agfl( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); if (error) return error; xfs_buf_set_ref(bp, XFS_AGFL_REF); @@ -778,6 +809,7 @@ xfs_alloc_update_counters( if (unlikely(be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))) { xfs_buf_mark_corrupt(agbp); + xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF); return -EFSCORRUPTED; } @@ -830,8 +862,8 @@ xfs_alloc_cur_setup( * attempt a small allocation. */ if (!acur->cnt) - acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->pag, XFS_BTNUM_CNT); + acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); if (error) return error; @@ -840,11 +872,11 @@ xfs_alloc_cur_setup( * Allocate the bnobt left and right search cursors. 
*/ if (!acur->bnolt) - acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->pag, XFS_BTNUM_BNO); + acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); if (!acur->bnogt) - acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->pag, XFS_BTNUM_BNO); + acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); return i == 1 ? 0 : -ENOSPC; } @@ -886,15 +918,17 @@ xfs_alloc_cur_check( bool busy; unsigned busy_gen = 0; bool deactivate = false; - bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO; + bool isbnobt = xfs_btree_is_bno(cur->bc_ops); *new = 0; error = xfs_alloc_get_rec(cur, &bno, &len, &i); if (error) return error; - if (XFS_IS_CORRUPT(args->mp, i != 1)) + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } /* * Check minlen and deactivate a cntbt cursor if out of acceptable size @@ -960,9 +994,8 @@ xfs_alloc_cur_check( deactivate = true; out: if (deactivate) - cur->bc_ag.abt.active = false; - trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, - *new); + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; + trace_xfs_alloc_cur_check(cur, bno, len, diff, *new); return 0; } @@ -1100,6 +1133,7 @@ xfs_alloc_ag_vextent_small( if (error) goto error; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(ccur); error = -EFSCORRUPTED; goto error; } @@ -1134,6 +1168,7 @@ xfs_alloc_ag_vextent_small( *fbnop = args->agbno = fbno; *flenp = args->len = 1; if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { + xfs_btree_mark_sick(ccur); error = -EFSCORRUPTED; goto error; } @@ -1199,8 +1234,8 @@ xfs_alloc_ag_vextent_exact( /* * Allocate/initialize a cursor for the by-number freespace btree. 
*/ - bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). @@ -1220,6 +1255,7 @@ xfs_alloc_ag_vextent_exact( if (error) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1259,8 +1295,8 @@ xfs_alloc_ag_vextent_exact( * We are allocating agbno for args->len * Allocate/initialize a cursor for the by-size btree. */ - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); @@ -1332,7 +1368,7 @@ xfs_alloc_walk_iter( if (error) return error; if (i == 0) - cur->bc_ag.abt.active = false; + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; if (count > 0) count--; @@ -1446,7 +1482,7 @@ xfs_alloc_ag_vextent_locality( if (error) return error; if (i) { - acur->cnt->bc_ag.abt.active = true; + acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; fbcur = acur->cnt; fbinc = false; } @@ -1499,8 +1535,10 @@ xfs_alloc_ag_vextent_lastblock( error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); if (error) return error; - if (XFS_IS_CORRUPT(args->mp, i != 1)) + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(acur->cnt); return -EFSCORRUPTED; + } if (*len >= args->minlen) break; error = xfs_btree_increment(acur->cnt, 0, &i); @@ -1672,8 +1710,8 @@ restart: /* * Allocate and initialize a cursor for the by-size btree. 
*/ - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); bno_cur = NULL; /* @@ -1712,6 +1750,7 @@ restart: if (error) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1758,6 +1797,7 @@ restart: rlen != 0 && (rlen > flen || rbno + rlen > fbno + flen))) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1780,6 +1820,7 @@ restart: &i))) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1792,6 +1833,7 @@ restart: rlen != 0 && (rlen > flen || rbno + rlen > fbno + flen))) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1808,6 +1850,7 @@ restart: &i))) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1846,14 +1889,15 @@ restart: rlen = args->len; if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } /* * Allocate and initialize a cursor for the by-block tree. */ - bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, rbno, rlen, XFSA_FIXUP_CNT_OK))) goto error0; @@ -1865,6 +1909,7 @@ restart: if (XFS_IS_CORRUPT(args->mp, args->agbno + args->len > be32_to_cpu(agf->agf_length))) { + xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT); error = -EFSCORRUPTED; goto error0; } @@ -1926,7 +1971,7 @@ xfs_free_ag_extent( /* * Allocate and initialize a cursor for the by-block btree. 
*/ - bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); /* * Look for a neighboring block on the left (lower block numbers) * that is contiguous with this space. @@ -1940,6 +1985,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1955,6 +2001,7 @@ xfs_free_ag_extent( * Very bad. */ if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1973,6 +2020,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1988,6 +2036,7 @@ xfs_free_ag_extent( * Very bad. */ if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1996,7 +2045,7 @@ xfs_free_ag_extent( /* * Now allocate and initialize a cursor for the by-size tree. */ - cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); /* * Have both left and right contiguous neighbors. * Merge all three into a single free block. 
@@ -2008,12 +2057,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2023,12 +2074,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2038,6 +2091,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2047,6 +2101,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2066,6 +2121,7 @@ xfs_free_ag_extent( i != 1 || xxbno != ltbno || xxlen != ltlen)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2090,12 +2146,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2106,6 +2164,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2125,12 +2184,14 @@ xfs_free_ag_extent( if ((error = 
xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2153,6 +2214,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2165,12 +2227,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2269,23 +2333,41 @@ xfs_alloc_min_freelist( struct xfs_perag *pag) { /* AG btrees have at least 1 level. */ - static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; - const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; + const unsigned int bno_level = pag ? pag->pagf_bno_level : 1; + const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1; + const unsigned int rmap_level = pag ? pag->pagf_rmap_level : 1; unsigned int min_free; ASSERT(mp->m_alloc_maxlevels > 0); + /* + * For a btree shorter than the maximum height, the worst case is that + * every level gets split and a new level is added, then while inserting + * another entry to refill the AGFL, every level under the old root gets + * split again. 
This is: + * + * (full height split reservation) + (AGFL refill split height) + * = (current height + 1) + (current height - 1) + * = (new height) + (new height - 2) + * = 2 * new height - 2 + * + * For a btree of maximum height, the worst case is that every level + * under the root gets split, then while inserting another entry to + * refill the AGFL, every level under the root gets split again. This is + * also: + * + * 2 * (current height - 1) + * = 2 * (new height - 1) + * = 2 * new height - 2 + */ + /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, - mp->m_alloc_maxlevels); + min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2; /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, - mp->m_alloc_maxlevels); + min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2; /* space needed reverse mapping used space btree */ if (xfs_has_rmapbt(mp)) - min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); - + min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2; return min_free; } @@ -2493,7 +2575,7 @@ xfs_defer_agfl_block( trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); return 0; } @@ -2501,14 +2583,15 @@ xfs_defer_agfl_block( * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). 
*/ -int -__xfs_free_extent_later( +static int +xfs_defer_extent_free( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard) + bool skip_discard, + struct xfs_defer_pending **dfpp) { struct xfs_extent_free_item *xefi; struct xfs_mount *mp = tp->t_mountp; @@ -2556,10 +2639,105 @@ __xfs_free_extent_later( XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); + return 0; +} + +int +xfs_free_extent_later( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_filblks_t len, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, + bool skip_discard) +{ + struct xfs_defer_pending *dontcare = NULL; + + return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard, + &dontcare); +} + +/* + * Set up automatic freeing of unwritten space in the filesystem. + * + * This function attached a paused deferred extent free item to the + * transaction. Pausing means that the EFI will be logged in the next + * transaction commit, but the pending EFI will not be finished until the + * pending item is unpaused. + * + * If the system goes down after the EFI has been persisted to the log but + * before the pending item is unpaused, log recovery will find the EFI, fail to + * find the EFD, and free the space. + * + * If the pending item is unpaused, the next transaction commit will log an EFD + * without freeing the space. + * + * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the + * @args structure are set to the relevant values. 
+ */ +int +xfs_alloc_schedule_autoreap( + const struct xfs_alloc_arg *args, + bool skip_discard, + struct xfs_alloc_autoreap *aarp) +{ + int error; + + error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, + &args->oinfo, args->resv, skip_discard, &aarp->dfp); + if (error) + return error; + + xfs_defer_item_pause(args->tp, aarp->dfp); return 0; } +/* + * Cancel automatic freeing of unwritten space in the filesystem. + * + * Earlier, we created a paused deferred extent free item and attached it to + * this transaction so that we could automatically roll back a new space + * allocation if the system went down. Now we want to cancel the paused work + * item by marking the EFI stale so we don't actually free the space, unpausing + * the pending item and logging an EFD. + * + * The caller generally should have already mapped the space into the ondisk + * filesystem. If the reserved space was partially used, the caller must call + * xfs_free_extent_later to create a new EFI to free the unused space. + */ +void +xfs_alloc_cancel_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + struct xfs_defer_pending *dfp = aarp->dfp; + struct xfs_extent_free_item *xefi; + + if (!dfp) + return; + + list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) + xefi->xefi_flags |= XFS_EFI_CANCELLED; + + xfs_defer_item_unpause(tp, dfp); +} + +/* + * Commit automatic freeing of unwritten space in the filesystem. + * + * This unpauses an earlier _schedule_autoreap and commits to freeing the + * allocated space. Call this if none of the reserved space was used. 
+ */ +void +xfs_alloc_commit_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + if (aarp->dfp) + xfs_defer_item_unpause(tp, aarp->dfp); +} + #ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to @@ -2576,13 +2754,14 @@ xfs_exact_minlen_extent_available( xfs_extlen_t flen; int error = 0; - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, - args->pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp, + args->pag); error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); if (error) goto out; if (*stat == 0) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto out; } @@ -2872,8 +3051,8 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_versionnum), offsetof(xfs_agf_t, agf_seqno), offsetof(xfs_agf_t, agf_length), - offsetof(xfs_agf_t, agf_roots[0]), - offsetof(xfs_agf_t, agf_levels[0]), + offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */ + offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */ offsetof(xfs_agf_t, agf_flfirst), offsetof(xfs_agf_t, agf_fllast), offsetof(xfs_agf_t, agf_flcount), @@ -3052,12 +3231,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_freeblks) > agf_length) return __this_address; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > - mp->m_alloc_maxlevels || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > - mp->m_alloc_maxlevels) + if (be32_to_cpu(agf->agf_bno_level) < 1 || + be32_to_cpu(agf->agf_cnt_level) < 1 || + be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels || + be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels) return __this_address; if (xfs_has_lazysbcount(mp) && @@ -3068,9 +3245,8 @@ xfs_agf_verify( if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) return __this_address; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > - mp->m_rmap_maxlevels) + if 
(be32_to_cpu(agf->agf_rmap_level) < 1 || + be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels) return __this_address; } @@ -3153,6 +3329,8 @@ xfs_read_agf( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); if (error) return error; @@ -3194,12 +3372,9 @@ xfs_alloc_read_agf( pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); pag->pagf_longest = be32_to_cpu(agf->agf_longest); - pag->pagf_levels[XFS_BTNUM_BNOi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); - pag->pagf_levels[XFS_BTNUM_CNTi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); - pag->pagf_levels[XFS_BTNUM_RMAPi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); + pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); + pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); if (xfs_agfl_needs_reset(pag->pag_mount, agf)) set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); @@ -3228,10 +3403,8 @@ xfs_alloc_read_agf( ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); - ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); - ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); + ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level)); + ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level)); } #endif if (agfbpp) @@ -3780,17 +3953,23 @@ __xfs_free_extent( return -EIO; error = xfs_free_extent_fix_freelist(tp, pag, &agbp); - if (error) + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, 
XFS_SICK_AG_BNOBT); return error; + } + agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); error = -EFSCORRUPTED; goto err_release; } /* validate the extent size is legal now we have the agf locked */ if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); error = -EFSCORRUPTED; goto err_release; } @@ -3827,7 +4006,7 @@ xfs_alloc_query_range_helper( xfs_failaddr_t fa; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -3847,7 +4026,7 @@ xfs_alloc_query_range( union xfs_btree_irec high_brec = { .a = *high_rec }; struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn }; - ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + ASSERT(xfs_btree_is_bno(cur->bc_ops)); return xfs_btree_query_range(cur, &low_brec, &high_brec, xfs_alloc_query_range_helper, &query); } @@ -3861,7 +4040,7 @@ xfs_alloc_query_all( { struct xfs_alloc_query_range_info query; - ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + ASSERT(xfs_btree_is_bno(cur->bc_ops)); query.priv = priv; query.fn = fn; return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6bb8d295c321..0b956f8b9d5a 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -185,7 +185,7 @@ xfs_alloc_get_rec( union xfs_btree_rec; void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_alloc_rec_incore *irec); -xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_alloc_check_irec(struct xfs_perag *pag, const struct xfs_alloc_rec_incore *irec); int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, @@ -231,7 +231,7 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } -int __xfs_free_extent_later(struct xfs_trans *tp, 
xfs_fsblock_t bno, +int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, bool skip_discard); @@ -255,18 +255,18 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ +#define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */ -static inline int -xfs_free_extent_later( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) -{ - return __xfs_free_extent_later(tp, bno, len, oinfo, type, false); -} +struct xfs_alloc_autoreap { + struct xfs_defer_pending *dfp; +}; +int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args, + bool skip_discard, struct xfs_alloc_autoreap *aarp); +void xfs_alloc_cancel_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); +void xfs_alloc_commit_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); extern struct kmem_cache *xfs_extfree_item_cache; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index c65228efed4a..6ef5ddd89600 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -16,6 +16,7 @@ #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_ag.h" @@ -23,13 +24,22 @@ static struct kmem_cache *xfs_allocbt_cur_cache; STATIC struct xfs_btree_cur * -xfs_allocbt_dup_cursor( +xfs_bnobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); + return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, + cur->bc_ag.pag); } +STATIC struct xfs_btree_cur * 
+xfs_cntbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, + cur->bc_ag.pag); +} + + STATIC void xfs_allocbt_set_root( struct xfs_btree_cur *cur, @@ -38,13 +48,18 @@ xfs_allocbt_set_root( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - int btnum = cur->bc_btnum; ASSERT(ptr->s != 0); - agf->agf_roots[btnum] = ptr->s; - be32_add_cpu(&agf->agf_levels[btnum], inc); - cur->bc_ag.pag->pagf_levels[btnum] += inc; + if (xfs_btree_is_bno(cur->bc_ops)) { + agf->agf_bno_root = ptr->s; + be32_add_cpu(&agf->agf_bno_level, inc); + cur->bc_ag.pag->pagf_bno_level += inc; + } else { + agf->agf_cnt_root = ptr->s; + be32_add_cpu(&agf->agf_cnt_level, inc); + cur->bc_ag.pag->pagf_cnt_level += inc; + } xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -116,7 +131,7 @@ xfs_allocbt_update_lastrec( __be32 len; int numrecs; - ASSERT(cur->bc_btnum == XFS_BTNUM_CNT); + ASSERT(!xfs_btree_is_bno(cur->bc_ops)); switch (reason) { case LASTREC_UPDATE: @@ -226,7 +241,10 @@ xfs_allocbt_init_ptr_from_cur( ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); - ptr->s = agf->agf_roots[cur->bc_btnum]; + if (xfs_btree_is_bno(cur->bc_ops)) + ptr->s = agf->agf_bno_root; + else + ptr->s = agf->agf_cnt_root; } STATIC int64_t @@ -299,13 +317,12 @@ xfs_allocbt_verify( struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; - xfs_btnum_t btnum = XFS_BTNUM_BNOi; if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (xfs_has_crc(mp)) { - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; } @@ -320,15 +337,32 @@ xfs_allocbt_verify( * against. 
*/ level = be16_to_cpu(block->bb_level); - if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) - btnum = XFS_BTNUM_CNTi; if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_levels[btnum]) + unsigned int maxlevel, repair_maxlevel = 0; + + /* + * Online repair could be rewriting the free space btrees, so + * we'll validate against the larger of either tree while this + * is going on. + */ + if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) { + maxlevel = pag->pagf_cnt_level; +#ifdef CONFIG_XFS_ONLINE_REPAIR + repair_maxlevel = pag->pagf_repair_cnt_level; +#endif + } else { + maxlevel = pag->pagf_bno_level; +#ifdef CONFIG_XFS_ONLINE_REPAIR + repair_maxlevel = pag->pagf_repair_bno_level; +#endif + } + + if (level >= max(maxlevel, repair_maxlevel)) return __this_address; } else if (level >= mp->m_alloc_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); + return xfs_btree_agblock_verify(bp, mp->m_alloc_mxr[level != 0]); } static void @@ -337,7 +371,7 @@ xfs_allocbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_allocbt_verify(bp); @@ -361,7 +395,7 @@ xfs_allocbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } @@ -443,11 +477,19 @@ xfs_allocbt_keys_contiguous( be32_to_cpu(key2->alloc.ar_startblock)); } -static const struct xfs_btree_ops xfs_bnobt_ops = { +const struct xfs_btree_ops xfs_bnobt_ops = { + .name = "bno", + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, + + .lru_refs = XFS_ALLOC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_abtb_2), + .sick_mask = XFS_SICK_AG_BNOBT, - .dup_cursor = xfs_allocbt_dup_cursor, + .dup_cursor = xfs_bnobt_dup_cursor, .set_root = 
xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, @@ -466,11 +508,20 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .keys_contiguous = xfs_allocbt_keys_contiguous, }; -static const struct xfs_btree_ops xfs_cntbt_ops = { +const struct xfs_btree_ops xfs_cntbt_ops = { + .name = "cnt", + .type = XFS_BTREE_TYPE_AG, + .geom_flags = XFS_BTGEO_LASTREC_UPDATE, + .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, + + .lru_refs = XFS_ALLOC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_abtc_2), + .sick_mask = XFS_SICK_AG_CNTBT, - .dup_cursor = xfs_allocbt_dup_cursor, + .dup_cursor = xfs_cntbt_dup_cursor, .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, @@ -489,76 +540,55 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .keys_contiguous = NULL, /* not needed right now */ }; -/* Allocate most of a new allocation btree cursor. */ -STATIC struct xfs_btree_cur * -xfs_allocbt_init_common( +/* + * Allocate a new bnobt cursor. + * + * For staging cursors tp and agbp are NULL. 
+ */ +struct xfs_btree_cur * +xfs_bnobt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_perag *pag, - xfs_btnum_t btnum) + struct xfs_buf *agbp, + struct xfs_perag *pag) { struct xfs_btree_cur *cur; - ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - - cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels, - xfs_allocbt_cur_cache); - cur->bc_ag.abt.active = false; - - if (btnum == XFS_BTNUM_CNT) { - cur->bc_ops = &xfs_cntbt_ops; - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); - cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; - } else { - cur->bc_ops = &xfs_bnobt_ops; - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); - } - + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops, + mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; - if (xfs_has_crc(mp)) - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - + cur->bc_nlevels = be32_to_cpu(agf->agf_bno_level); + } return cur; } /* - * Allocate a new allocation btree cursor. + * Allocate a new cntbt cursor. + * + * For staging cursors tp and agbp are NULL. */ -struct xfs_btree_cur * /* new alloc btree cursor */ -xfs_allocbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agf structure */ - struct xfs_perag *pag, - xfs_btnum_t btnum) /* btree identifier */ -{ - struct xfs_agf *agf = agbp->b_addr; - struct xfs_btree_cur *cur; - - cur = xfs_allocbt_init_common(mp, tp, pag, btnum); - if (btnum == XFS_BTNUM_CNT) - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); - else - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); - - cur->bc_ag.agbp = agbp; - - return cur; -} - -/* Create a free space btree cursor with a fake root for staging. 
*/ struct xfs_btree_cur * -xfs_allocbt_stage_cursor( +xfs_cntbt_init_cursor( struct xfs_mount *mp, - struct xbtree_afakeroot *afake, - struct xfs_perag *pag, - xfs_btnum_t btnum) + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_perag *pag) { struct xfs_btree_cur *cur; - cur = xfs_allocbt_init_common(mp, NULL, pag, btnum); - xfs_btree_stage_afakeroot(cur, afake); + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops, + mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); + cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; + + cur->bc_nlevels = be32_to_cpu(agf->agf_cnt_level); + } return cur; } @@ -577,16 +607,16 @@ xfs_allocbt_commit_staged_btree( ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); - agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); - xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); - - if (cur->bc_btnum == XFS_BTNUM_BNO) { - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops); + if (xfs_btree_is_bno(cur->bc_ops)) { + agf->agf_bno_root = cpu_to_be32(afake->af_root); + agf->agf_bno_level = cpu_to_be32(afake->af_levels); } else { - cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE; - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops); + agf->agf_cnt_root = cpu_to_be32(afake->af_root); + agf->agf_cnt_level = cpu_to_be32(afake->af_levels); } + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); + + xfs_btree_commit_afakeroot(cur, tp, agbp); } /* Calculate number of records in an alloc btree block. 
*/ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 45df893ef6bb..155b47f231ab 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -47,12 +47,12 @@ struct xbtree_afakeroot; (maxrecs) * sizeof(xfs_alloc_key_t) + \ ((index) - 1) * sizeof(xfs_alloc_ptr_t))) -extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp, +struct xfs_btree_cur *xfs_bnobt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_perag *pag, xfs_btnum_t btnum); -struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag, - xfs_btnum_t btnum); + struct xfs_perag *pag); +struct xfs_btree_cur *xfs_cntbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_perag *pag); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e28d93d232de..673a4b6d2e8d 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -224,7 +224,7 @@ int xfs_attr_get_ilocked( struct xfs_da_args *args) { - ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; @@ -421,10 +421,10 @@ xfs_attr_complete_op( bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; args->op_flags &= ~XFS_DA_OP_REPLACE; - if (do_replace) { - args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + if (do_replace) return replace_state; - } + return XFS_DAS_DONE; } @@ -862,8 +862,11 @@ xfs_attr_lookup( if (!xfs_inode_hasattr(dp)) return -ENOATTR; - if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_sf_findname(args, NULL, NULL); + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) { + 
if (xfs_attr_sf_findname(args)) + return -EEXIST; + return -ENOATTR; + } if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_hasname(args, &bp); @@ -880,79 +883,35 @@ xfs_attr_lookup( return error; } -static int -xfs_attr_intent_init( +static void +xfs_attr_defer_add( struct xfs_da_args *args, - unsigned int op_flags, /* op flag (set or remove) */ - struct xfs_attr_intent **attr) /* new xfs_attr_intent */ + unsigned int op_flags) { struct xfs_attr_intent *new; - new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); + new = kmem_cache_zalloc(xfs_attr_intent_cache, + GFP_KERNEL | __GFP_NOFAIL); new->xattri_op_flags = op_flags; new->xattri_da_args = args; - *attr = new; - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_add( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); - if (error) - return error; + switch (op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + new->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REPLACE: + new->xattri_dela_state = xfs_attr_init_replace_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + new->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + default: + ASSERT(0); + } - new->xattri_dela_state = xfs_attr_init_add_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_replace( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_replace_state(args); - xfs_defer_add(args->trans, 
XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Removes an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_remove( - struct xfs_da_args *args) -{ - - struct xfs_attr_intent *new; - int error; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_remove_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); - - return 0; } /* @@ -1038,16 +997,16 @@ xfs_attr_set( error = xfs_attr_lookup(args); switch (error) { case -EEXIST: - /* if no value, we are performing a remove operation */ if (!args->value) { - error = xfs_attr_defer_remove(args); + /* if no value, we are performing a remove operation */ + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE); break; } + /* Pure create fails if the attr already exists */ if (args->attr_flags & XATTR_CREATE) goto out_trans_cancel; - - error = xfs_attr_defer_replace(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE); break; case -ENOATTR: /* Can't remove what isn't there. */ @@ -1057,14 +1016,11 @@ xfs_attr_set( /* Pure replace fails if no existing attr to replace. 
*/ if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; - - error = xfs_attr_defer_add(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET); break; default: goto out_trans_cancel; } - if (error) - goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the @@ -1097,10 +1053,9 @@ out_trans_cancel: static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) { - struct xfs_attr_shortform *sf; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; - return be16_to_cpu(sf->hdr.totsize); + return be16_to_cpu(sf->totsize); } /* @@ -1112,19 +1067,13 @@ xfs_attr_shortform_addname( struct xfs_da_args *args) { int newsize, forkoff; - int error; trace_xfs_attr_sf_addname(args); - error = xfs_attr_shortform_lookup(args); - switch (error) { - case -ENOATTR: - if (args->op_flags & XFS_DA_OP_REPLACE) - return error; - break; - case -EEXIST: - if (!(args->op_flags & XFS_DA_OP_REPLACE)) - return error; + if (xfs_attr_sf_findname(args)) { + int error; + + ASSERT(args->op_flags & XFS_DA_OP_REPLACE); error = xfs_attr_sf_removename(args); if (error) @@ -1137,11 +1086,8 @@ xfs_attr_shortform_addname( * around. 
*/ args->op_flags &= ~XFS_DA_OP_REPLACE; - break; - case 0: - break; - default: - return error; + } else { + ASSERT(!(args->op_flags & XFS_DA_OP_REPLACE)); } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2580ae47209a..ac904cc1a97b 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -29,6 +29,7 @@ #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_errortag.h" +#include "xfs_health.h" /* @@ -690,56 +691,32 @@ xfs_attr_shortform_create( ASSERT(ifp->if_bytes == 0); if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) ifp->if_format = XFS_DINODE_FMT_LOCAL; - xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); - hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data; + + hdr = xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); memset(hdr, 0, sizeof(*hdr)); hdr->totsize = cpu_to_be16(sizeof(*hdr)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } /* - * Return -EEXIST if attr is found, or -ENOATTR if not - * args: args containing attribute name and namelen - * sfep: If not null, pointer will be set to the last attr entry found on - -EEXIST. On -ENOATTR pointer is left at the last entry in the list - * basep: If not null, pointer is set to the byte offset of the entry in the - * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of - * the last entry in the list + * Return the entry if the attr in args is found, or NULL if not. 
*/ -int +struct xfs_attr_sf_entry * xfs_attr_sf_findname( - struct xfs_da_args *args, - struct xfs_attr_sf_entry **sfep, - unsigned int *basep) + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - unsigned int base = sizeof(struct xfs_attr_sf_hdr); - int size = 0; - int end; - int i; + struct xfs_attr_sf_hdr *sf = args->dp->i_af.if_data; + struct xfs_attr_sf_entry *sfe; - sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; - sfe = &sf->list[0]; - end = sf->hdr.count; - for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), - base += size, i++) { - size = xfs_attr_sf_entsize(sfe); - if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - continue; - break; + for (sfe = xfs_attr_sf_firstentry(sf); + sfe < xfs_attr_sf_endptr(sf); + sfe = xfs_attr_sf_nextentry(sfe)) { + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return sfe; } - if (sfep != NULL) - *sfep = sfe; - - if (basep != NULL) - *basep = base; - - if (i == end) - return -ENOATTR; - return -EEXIST; + return NULL; } /* @@ -751,38 +728,31 @@ xfs_attr_shortform_add( struct xfs_da_args *args, int forkoff) { - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; - int offset, size; - struct xfs_mount *mp; - struct xfs_inode *dp; - struct xfs_ifork *ifp; + int size; trace_xfs_attr_sf_add(args); - dp = args->dp; - mp = dp->i_mount; dp->i_forkoff = forkoff; - ifp = &dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) - ASSERT(0); + ASSERT(!xfs_attr_sf_findname(args)); - offset = (char *)sfe - (char *)sf; size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = (struct xfs_attr_shortform 
*)ifp->if_u1.if_data; - sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); + sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK); + sfe = xfs_attr_sf_endptr(sf); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); - sf->hdr.count++; - be16_add_cpu(&sf->hdr.totsize, size); + sf->count++; + be16_add_cpu(&sf->totsize, size); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); xfs_sbversion_add_attr2(mp, args->trans); @@ -811,48 +781,43 @@ int xfs_attr_sf_removename( struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; - int size = 0, end, totsize; - unsigned int base; - struct xfs_mount *mp; - struct xfs_inode *dp; - int error; + uint16_t totsize = be16_to_cpu(sf->totsize); + void *next, *end; + int size = 0; trace_xfs_attr_sf_remove(args); - dp = args->dp; - mp = dp->i_mount; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; - - error = xfs_attr_sf_findname(args, &sfe, &base); - - /* - * If we are recovering an operation, finding nothing to - * remove is not an error - it just means there was nothing - * to clean up. - */ - if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) - return 0; - if (error != -EEXIST) - return error; - size = xfs_attr_sf_entsize(sfe); + sfe = xfs_attr_sf_findname(args); + if (!sfe) { + /* + * If we are recovering an operation, finding nothing to remove + * is not an error, it just means there was nothing to clean up. 
+ */ + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; + return -ENOATTR; + } /* * Fix up the attribute fork data, covering the hole */ - end = base + size; - totsize = be16_to_cpu(sf->hdr.totsize); - if (end != totsize) - memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end); - sf->hdr.count--; - be16_add_cpu(&sf->hdr.totsize, -size); + size = xfs_attr_sf_entsize(sfe); + next = xfs_attr_sf_nextentry(sfe); + end = xfs_attr_sf_endptr(sf); + if (next < end) + memmove(sfe, next, end - next); + sf->count--; + totsize -= size; + sf->totsize = cpu_to_be16(totsize); /* * Fix up the start offset of the attribute fork */ - totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); @@ -860,7 +825,7 @@ xfs_attr_sf_removename( xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_forkoff); - ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE); @@ -874,33 +839,6 @@ xfs_attr_sf_removename( } /* - * Look up a name in a shortform attribute list structure. 
- */ -/*ARGSUSED*/ -int -xfs_attr_shortform_lookup(xfs_da_args_t *args) -{ - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - int i; - struct xfs_ifork *ifp; - - trace_xfs_attr_sf_lookup(args); - - ifp = &args->dp->i_af; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; - sfe = xfs_attr_sf_nextentry(sfe), i++) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - return -EEXIST; - } - return -ENOATTR; -} - -/* * Retrieve the attribute value and length. * * If args->valuelen is zero, only the length needs to be returned. Unlike a @@ -909,23 +847,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) */ int xfs_attr_shortform_getvalue( - struct xfs_da_args *args) + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - int i; + struct xfs_attr_sf_entry *sfe; ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; - sfe = xfs_attr_sf_nextentry(sfe), i++) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - return xfs_attr_copy_value(args, - &sfe->nameval[args->namelen], sfe->valuelen); - } - return -ENOATTR; + + trace_xfs_attr_sf_lookup(args); + + sfe = xfs_attr_sf_findname(args); + if (!sfe) + return -ENOATTR; + return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], + sfe->valuelen); } /* Convert from using the shortform to the leaf format. 
*/ @@ -933,26 +867,22 @@ int xfs_attr_shortform_to_leaf( struct xfs_da_args *args) { - struct xfs_inode *dp; - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; + int size = be16_to_cpu(sf->totsize); struct xfs_da_args nargs; char *tmpbuffer; - int error, i, size; + int error, i; xfs_dablk_t blkno; struct xfs_buf *bp; - struct xfs_ifork *ifp; trace_xfs_attr_sf_to_leaf(args); - dp = args->dp; - ifp = &dp->i_af; - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = be16_to_cpu(sf->hdr.totsize); - tmpbuffer = kmem_alloc(size, 0); - ASSERT(tmpbuffer != NULL); - memcpy(tmpbuffer, ifp->if_u1.if_data, size); - sf = (struct xfs_attr_shortform *)tmpbuffer; + tmpbuffer = kmalloc(size, GFP_KERNEL | __GFP_NOFAIL); + memcpy(tmpbuffer, ifp->if_data, size); + sf = (struct xfs_attr_sf_hdr *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); @@ -975,8 +905,8 @@ xfs_attr_shortform_to_leaf( nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; i++) { + sfe = xfs_attr_sf_firstentry(sf); + for (i = 0; i < sf->count; i++) { nargs.name = sfe->nameval; nargs.namelen = sfe->namelen; nargs.value = &sfe->nameval[nargs.namelen]; @@ -994,7 +924,7 @@ xfs_attr_shortform_to_leaf( } error = 0; out: - kmem_free(tmpbuffer); + kfree(tmpbuffer); return error; } @@ -1040,23 +970,16 @@ xfs_attr_shortform_allfit( return xfs_attr_shortform_bytesfit(dp, bytes); } -/* Verify the consistency of an inline attribute fork. */ +/* Verify the consistency of a raw inline attribute fork. 
*/ xfs_failaddr_t xfs_attr_shortform_verify( - struct xfs_inode *ip) + struct xfs_attr_sf_hdr *sfp, + size_t size) { - struct xfs_attr_shortform *sfp; - struct xfs_attr_sf_entry *sfep; + struct xfs_attr_sf_entry *sfep = xfs_attr_sf_firstentry(sfp); struct xfs_attr_sf_entry *next_sfep; char *endp; - struct xfs_ifork *ifp; int i; - int64_t size; - - ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); - ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); - sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = ifp->if_bytes; /* * Give up if the attribute is way too short. @@ -1067,8 +990,7 @@ xfs_attr_shortform_verify( endp = (char *)sfp + size; /* Check all reported entries */ - sfep = &sfp->list[0]; - for (i = 0; i < sfp->hdr.count; i++) { + for (i = 0; i < sfp->count; i++) { /* * struct xfs_attr_sf_entry has a variable length. * Check the fixed-offset parts of the structure are @@ -1137,7 +1059,7 @@ xfs_attr3_leaf_to_shortform( trace_xfs_attr_leaf_to_sf(args); - tmpbuffer = kmem_alloc(args->geo->blksize, 0); + tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); if (!tmpbuffer) return -ENOMEM; @@ -1203,7 +1125,7 @@ xfs_attr3_leaf_to_shortform( error = 0; out: - kmem_free(tmpbuffer); + kfree(tmpbuffer); return error; } @@ -1244,14 +1166,10 @@ xfs_attr3_leaf_to_node( if (error) goto out; - /* copy leaf to new buffer, update identifiers */ - xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); - bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); - if (xfs_has_crc(mp)) { - struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; - hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2)); - } + /* + * Copy leaf to new buffer and log it. 
+ */ + xfs_da_buf_copy(bp2, bp1, args->geo->blksize); xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* @@ -1615,7 +1533,7 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmem_alloc(args->geo->blksize, 0); + tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; @@ -1653,7 +1571,7 @@ xfs_attr3_leaf_compact( */ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); - kmem_free(tmpbuffer); + kfree(tmpbuffer); } /* @@ -2332,7 +2250,8 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0); + tmp_leaf = kzalloc(state->args->geo->blksize, + GFP_KERNEL | __GFP_NOFAIL); /* * Copy the header into the temp leaf so that all the stuff @@ -2372,7 +2291,7 @@ xfs_attr3_leaf_unbalance( } memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); savehdr = tmphdr; /* struct copy */ - kmem_free(tmp_leaf); + kfree(tmp_leaf); } xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); @@ -2425,6 +2344,7 @@ xfs_attr3_leaf_lookup_int( entries = xfs_attr3_leaf_entryp(leaf); if (ichdr.count >= args->geo->blksize / 8) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -2444,10 +2364,12 @@ xfs_attr3_leaf_lookup_int( } if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 368f4d9fa1d5..9b9948639c0f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -47,16 +47,14 @@ struct xfs_attr3_icleaf_hdr { */ void 
xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); -int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_sf_removename(struct xfs_da_args *args); -int xfs_attr_sf_findname(struct xfs_da_args *args, - struct xfs_attr_sf_entry **sfep, - unsigned int *basep); +struct xfs_attr_sf_entry *xfs_attr_sf_findname(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); -xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_sf_hdr *sfp, + size_t size); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d440393b40eb..ff0412828772 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -22,6 +22,7 @@ #include "xfs_attr_remote.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_health.h" #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ @@ -276,17 +277,18 @@ xfs_attr3_rmt_hdr_set( */ STATIC int xfs_attr_rmtval_copyout( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_ino_t ino, - int *offset, - int *valuelen, - uint8_t **dst) + struct xfs_mount *mp, + struct xfs_buf *bp, + struct xfs_inode *dp, + int *offset, + int *valuelen, + uint8_t **dst) { - char *src = bp->b_addr; - xfs_daddr_t bno = xfs_buf_daddr(bp); - int len = BBTOB(bp->b_length); - int blksize = mp->m_attr_geo->blksize; + char *src = bp->b_addr; + xfs_ino_t ino = dp->i_ino; + xfs_daddr_t bno = xfs_buf_daddr(bp); + int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; ASSERT(len >= blksize); @@ -302,6 +304,7 @@ xfs_attr_rmtval_copyout( xfs_alert(mp, "remote 
attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", bno, *offset, byte_cnt, ino); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } hdr_size = sizeof(struct xfs_attr3_rmt_hdr); @@ -418,10 +421,12 @@ xfs_attr_rmtval_get( dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, &bp, &xfs_attr3_rmt_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK); if (error) return error; - error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + error = xfs_attr_rmtval_copyout(mp, bp, args->dp, &offset, &valuelen, &dst); xfs_buf_relse(bp); @@ -545,11 +550,13 @@ xfs_attr_rmtval_stale( struct xfs_buf *bp; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) || - XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) + XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) { + xfs_bmap_mark_sick(ip, XFS_ATTR_FORK); return -EFSCORRUPTED; + } error = xfs_buf_incore(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map->br_startblock), @@ -659,8 +666,10 @@ xfs_attr_rmtval_invalidate( blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; - if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) + if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) { + xfs_bmap_mark_sick(args->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; + } error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 37578b369d9b..bc4422223024 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -7,14 +7,6 @@ #define __XFS_ATTR_SF_H__ /* - * Attribute storage when stored inside the inode. - * - * Small attribute lists are packed as tightly as possible so as - * to fit into the literal area of the inode. 
- */ -typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; - -/* * We generate this then sort it, attr_list() must return things in hash-order. */ typedef struct xfs_attr_sf_sort { @@ -41,11 +33,25 @@ static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep) return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen); } -/* next entry in struct */ +/* first entry in the SF attr fork */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_firstentry(struct xfs_attr_sf_hdr *hdr) +{ + return (struct xfs_attr_sf_entry *)(hdr + 1); +} + +/* next entry after sfep */ static inline struct xfs_attr_sf_entry * xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep) { return (void *)sfep + xfs_attr_sf_entsize(sfep); } +/* pointer to the space after the last entry, e.g. for adding a new one */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_endptr(struct xfs_attr_sf_hdr *sf) +{ + return (void *)sf + be16_to_cpu(sf->totsize); +} + #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 30c931b38853..656c95a22f2e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -21,7 +21,7 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_quota.h" @@ -36,6 +36,9 @@ #include "xfs_refcount.h" #include "xfs_icache.h" #include "xfs_iomap.h" +#include "xfs_health.h" +#include "xfs_bmap_item.h" +#include "xfs_symlink_remote.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -225,6 +228,28 @@ xfs_bmap_forkoff_reset( } } +static int +xfs_bmap_read_buf( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + struct xfs_buf **bpp) /* buffer for fsbno */ +{ + struct xfs_buf *bp; /* return value */ + int error; + + if (!xfs_verify_fsbno(mp, fsbno)) + return -EFSCORRUPTED; + 
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp, + &xfs_bmbt_buf_ops); + if (!error) { + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); + *bpp = bp; + } + return error; +} + #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( @@ -364,9 +389,9 @@ xfs_bmap_check_leaf_extents( bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); if (!bp) { bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + error = xfs_bmap_read_buf(mp, NULL, bno, &bp); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) goto error_norelse; } @@ -383,6 +408,7 @@ xfs_bmap_check_leaf_extents( pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -450,9 +476,9 @@ xfs_bmap_check_leaf_extents( bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); if (!bp) { bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + error = xfs_bmap_read_buf(mp, NULL, bno, &bp); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) goto error_norelse; } @@ -562,11 +588,14 @@ xfs_bmap_btree_to_extents( pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); #ifdef DEBUG - if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1))) + if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } #endif - error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + error = xfs_bmap_read_buf(mp, tp, cbno, &cbp); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); @@ -575,7 +604,7 @@ xfs_bmap_btree_to_extents( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = 
xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -634,14 +663,13 @@ xfs_bmap_extents_to_btree( * Fill in the root. */ block = ifp->if_broot; - xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, - XFS_BTNUM_BMAP, 1, 1, ip->i_ino, - XFS_BTREE_LONG_PTRS); + xfs_bmbt_init_block(ip, block, NULL, 1, 1); /* * Need a cursor. Can't allocate until bb_level is filled in. */ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; + if (wasdel) + cur->bc_flags |= XFS_BTREE_BMBT_WASDEL; /* * Convert to a btree with two levels, one record in root. */ @@ -667,7 +695,7 @@ xfs_bmap_extents_to_btree( goto out_root_realloc; } - cur->bc_ino.allocated++; + cur->bc_bmap.allocated++; ip->i_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, @@ -679,11 +707,8 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. 
*/ - abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); - xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp), - XFS_BTNUM_BMAP, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS); + xfs_bmbt_init_block(ip, ablock, abp, 0, 0); for_each_xfs_iext(ifp, &icur, &rec) { if (isnullstartblock(rec.br_startblock)) @@ -747,7 +772,7 @@ xfs_bmap_local_to_extents_empty( ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; ifp->if_format = XFS_DINODE_FMT_EXTENTS; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -832,7 +857,7 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags |= XFS_ILOG_CORE; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; rec.br_startoff = 0; @@ -878,6 +903,7 @@ xfs_bmap_add_attrfork_btree( goto error0; /* must be at least one entry */ if (XFS_IS_CORRUPT(mp, stat != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -887,7 +913,7 @@ xfs_bmap_add_attrfork_btree( xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return -ENOSPC; } - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); } return 0; @@ -915,7 +941,7 @@ xfs_bmap_add_attrfork_extents( error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, XFS_DATA_FORK); if (cur) { - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -960,6 +986,7 @@ xfs_bmap_add_attrfork_local( /* should only be called for types that support local format data */ ASSERT(0); + xfs_bmap_mark_sick(ip, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -1143,6 +1170,7 @@ xfs_iread_bmbt_block( (unsigned long long)ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block, sizeof(*block), __this_address); + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -1158,6 +1186,7 @@ xfs_iread_bmbt_block( xfs_inode_verifier_error(ip, -EFSCORRUPTED, 
"xfs_iread_extents(2)", frp, sizeof(*frp), fa); + xfs_bmap_mark_sick(ip, whichfork); return xfs_bmap_complain_bad_rec(ip, whichfork, fa, &new); } @@ -1189,7 +1218,7 @@ xfs_iread_extents( if (!xfs_need_iread_extents(ifp)) return 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ir.loaded = 0; xfs_iext_first(ifp, &ir.icur); @@ -1201,6 +1230,7 @@ xfs_iread_extents( goto out; if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto out; } @@ -1213,6 +1243,8 @@ xfs_iread_extents( smp_store_release(&ifp->if_needextents, 0); return 0; out: + if (xfs_metadata_is_sick(error)) + xfs_bmap_mark_sick(ip, whichfork); xfs_iext_destroy(ifp); return error; } @@ -1292,6 +1324,7 @@ xfs_bmap_last_before( break; default: ASSERT(0); + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -1388,8 +1421,10 @@ xfs_bmap_last_offset( if (ifp->if_format == XFS_DINODE_FMT_LOCAL) return 0; - if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) + if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); if (error || is_empty) @@ -1429,8 +1464,7 @@ xfs_bmap_add_extent_delay_real( ASSERT(whichfork != XFS_ATTR_FORK); ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!bma->cur || - (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); + ASSERT(!bma->cur || (bma->cur->bc_flags & XFS_BTREE_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -1528,6 +1562,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1535,6 +1570,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1542,6 +1578,7 @@ xfs_bmap_add_extent_delay_real( if 
(error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1571,6 +1608,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1604,6 +1642,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1632,6 +1671,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1639,6 +1679,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1673,6 +1714,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1698,6 +1740,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1705,6 +1748,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1721,7 +1765,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_ino.allocated : 0)); + (bma->cur ? 
bma->cur->bc_bmap.allocated : 0)); PREV.br_startoff = new_endoff; PREV.br_blockcount = temp; @@ -1749,6 +1793,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1785,6 +1830,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1792,6 +1838,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1808,7 +1855,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_ino.allocated : 0)); + (bma->cur ? bma->cur->bc_bmap.allocated : 0)); PREV.br_startblock = nullstartblock(da_new); PREV.br_blockcount = temp; @@ -1871,6 +1918,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1878,6 +1926,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1929,8 +1978,8 @@ xfs_bmap_add_extent_delay_real( xfs_mod_delalloc(mp, (int64_t)da_new - da_old); if (bma->cur) { - da_new += bma->cur->bc_ino.allocated; - bma->cur->bc_ino.allocated = 0; + da_new += bma->cur->bc_bmap.allocated; + bma->cur->bc_bmap.allocated = 0; } /* adjust for changes in reserved delayed indirect blocks */ @@ -2074,30 +2123,35 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = 
-EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2126,18 +2180,21 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2169,18 +2226,21 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2207,6 +2267,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2240,6 +2301,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2277,6 +2339,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ 
-2287,6 +2350,7 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2317,6 +2381,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2353,6 +2418,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2363,12 +2429,14 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2405,6 +2473,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2417,6 +2486,7 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2429,6 +2499,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2436,6 +2507,7 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2472,7 +2544,7 @@ xfs_bmap_add_extent_unwritten_real( /* clear out the allocated field, done with it now in any case. 
*/ if (cur) { - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; *curp = cur; } @@ -2651,7 +2723,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); + ASSERT(!cur || !(cur->bc_flags & XFS_BTREE_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2721,6 +2793,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2728,6 +2801,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2735,6 +2809,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2764,6 +2839,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2794,6 +2870,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2820,6 +2897,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2827,6 +2905,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2853,7 +2932,7 @@ xfs_bmap_add_extent_hole_real( /* clear out the allocated field, done with it now in any case. */ if (cur) - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: @@ -2989,7 +3068,7 @@ xfs_bmap_extsize_align( * If realtime, and the result isn't a multiple of the realtime * extent size we need to remove blocks until it is. 
*/ - if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) { + if (rt && (temp = xfs_extlen_to_rtxmod(mp, align_alen))) { /* * We're not covering the original request, or * we won't be able to once we fix the length. @@ -3016,7 +3095,7 @@ xfs_bmap_extsize_align( else { align_alen -= orig_off - align_off; align_off = orig_off; - align_alen -= align_alen % mp->m_sb.sb_rextsize; + align_alen -= xfs_extlen_to_rtxmod(mp, align_alen); } /* * Result doesn't cover the request, fail it. @@ -3044,7 +3123,8 @@ xfs_bmap_extsize_align( #define XFS_ALLOC_GAP_UNITS 4 -void +/* returns true if ap->blkno was modified */ +bool xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { @@ -3079,13 +3159,14 @@ xfs_bmap_adjacent( if (adjust && ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) ap->blkno += adjust; + return true; } /* * If not at eof, then compare the two neighbor blocks. * Figure out whether either one gives us a good starting point, * and pick the better one. */ - else if (!ap->eof) { + if (!ap->eof) { xfs_fsblock_t gotbno; /* right side block number */ xfs_fsblock_t gotdiff=0; /* right side difference */ xfs_fsblock_t prevbno; /* left side block number */ @@ -3165,14 +3246,21 @@ xfs_bmap_adjacent( * If both valid, pick the better one, else the only good * one, else ap->blkno is already set (to 0 or the inode block). */ - if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) { ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; - else if (prevbno != NULLFSBLOCK) + return true; + } + if (prevbno != NULLFSBLOCK) { ap->blkno = prevbno; - else if (gotbno != NULLFSBLOCK) + return true; + } + if (gotbno != NULLFSBLOCK) { ap->blkno = gotbno; + return true; + } } #undef ISVALID + return false; } int @@ -3263,11 +3351,14 @@ xfs_bmap_btalloc_select_lengths( } /* Update all inode and quota accounting for the allocation we just did. 
*/ -static void -xfs_bmap_btalloc_accounting( - struct xfs_bmalloca *ap, - struct xfs_alloc_arg *args) +void +xfs_bmap_alloc_account( + struct xfs_bmalloca *ap) { + bool isrt = XFS_IS_REALTIME_INODE(ap->ip) && + !(ap->flags & XFS_BMAPI_ATTRFORK); + uint fld; + if (ap->flags & XFS_BMAPI_COWFORK) { /* * COW fork blocks are in-core only and thus are treated as @@ -3279,7 +3370,7 @@ xfs_bmap_btalloc_accounting( * yet. */ if (ap->wasdel) { - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); return; } @@ -3291,22 +3382,25 @@ xfs_bmap_btalloc_accounting( * This essentially transfers the transaction quota reservation * to that of a delalloc extent. */ - ap->ip->i_delayed_blks += args->len; - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS, - -(long)args->len); + ap->ip->i_delayed_blks += ap->length; + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, isrt ? + XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS, + -(long)ap->length); return; } /* data/attr fork only */ - ap->ip->i_nblocks += args->len; + ap->ip->i_nblocks += ap->length; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { - ap->ip->i_delayed_blks -= args->len; - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + ap->ip->i_delayed_blks -= ap->length; + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT; + } else { + fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; } - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? 
XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, - args->len); + + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, fld, ap->length); } static int @@ -3380,7 +3474,7 @@ xfs_bmap_process_allocated_extent( ap->offset = orig_offset; else if (ap->offset + ap->length < orig_offset + orig_length) ap->offset = orig_offset + orig_length - ap->length; - xfs_bmap_btalloc_accounting(ap, args); + xfs_bmap_alloc_account(ap); } #ifdef DEBUG @@ -3883,14 +3977,18 @@ xfs_bmapi_read( ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE))); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (WARN_ON_ONCE(!ifp)) + if (WARN_ON_ONCE(!ifp)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } if (xfs_is_shutdown(mp)) return -EIO; @@ -4145,9 +4243,8 @@ xfs_bmapi_allocate( */ bma->nallocs++; - if (bma->cur) - bma->cur->bc_ino.flags = - bma->wasdel ? 
XFS_BTCUR_BMBT_WASDEL : 0; + if (bma->cur && bma->wasdel) + bma->cur->bc_flags |= XFS_BTREE_BMBT_WASDEL; bma->got.br_startoff = bma->offset; bma->got.br_startblock = bma->blkno; @@ -4354,7 +4451,7 @@ xfs_bmapi_write( ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!(flags & XFS_BMAPI_REMAP)); /* zeroing is for currently only for data extents, not metadata */ @@ -4371,6 +4468,7 @@ xfs_bmapi_write( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4598,9 +4696,11 @@ xfs_bmapi_convert_delalloc( error = -ENOSPC; if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) goto out_finish; - error = -EFSCORRUPTED; - if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) + if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) { + xfs_bmap_mark_sick(ip, whichfork); + error = -EFSCORRUPTED; goto out_finish; + } XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); XFS_STATS_INC(mp, xs_xstrat_quick); @@ -4651,7 +4751,7 @@ xfs_bmapi_remap( ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(len > 0); ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_NORMAP))); ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != @@ -4659,6 +4759,7 @@ xfs_bmapi_remap( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4678,10 +4779,8 @@ xfs_bmapi_remap( ip->i_nblocks += len; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, 
ip, whichfork); - cur->bc_ino.flags = 0; - } got.br_startoff = bno; got.br_startblock = startblock; @@ -4816,7 +4915,7 @@ xfs_bmap_del_extent_delay( XFS_STATS_INC(mp, xs_del_exlist); - isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + isrt = xfs_ifork_is_realtime(ip, whichfork); del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got->br_startoff + got->br_blockcount; da_old = startblockval(got->br_startblock); @@ -4826,12 +4925,8 @@ xfs_bmap_del_extent_delay( ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); - if (isrt) { - uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); - - do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_frextents(mp, rtexts); - } + if (isrt) + xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount)); /* * Update the inode delalloc counter now and wait to update the @@ -5014,7 +5109,6 @@ xfs_bmap_del_extent_real( xfs_fileoff_t del_endoff; /* first offset past del */ int do_fx; /* free extent at end of routine */ int error; /* error return value */ - int flags = 0;/* inode logging flags */ struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -5027,6 +5121,8 @@ xfs_bmap_del_extent_real( uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; + *logflagsp = 0; + mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); @@ -5039,7 +5135,6 @@ xfs_bmap_del_extent_real( ASSERT(got_endoff >= del_endoff); ASSERT(!isnullstartblock(got.br_startblock)); qfield = 0; - error = 0; /* * If it's the case where the directory code is running with no block @@ -5055,44 +5150,31 @@ xfs_bmap_del_extent_real( del->br_startoff > got.br_startoff && del_endoff < got_endoff) return -ENOSPC; - flags = XFS_ILOG_CORE; - if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_filblks_t len; - xfs_extlen_t mod; - - len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize, - &mod); - ASSERT(mod == 
0); - + *logflagsp = XFS_ILOG_CORE; + if (xfs_ifork_is_realtime(ip, whichfork)) { if (!(bflags & XFS_BMAPI_REMAP)) { - xfs_fsblock_t bno; - - bno = div_u64_rem(del->br_startblock, - mp->m_sb.sb_rextsize, &mod); - ASSERT(mod == 0); - - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + error = xfs_rtfree_blocks(tp, del->br_startblock, + del->br_blockcount); if (error) - goto done; + return error; } do_fx = 0; - nblks = len * mp->m_sb.sb_rextsize; qfield = XFS_TRANS_DQ_RTBCOUNT; } else { do_fx = 1; - nblks = del->br_blockcount; qfield = XFS_TRANS_DQ_BCOUNT; } + nblks = del->br_blockcount; del_endblock = del->br_startblock + del->br_blockcount; if (cur) { error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) - goto done; + return error; if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; } } @@ -5110,16 +5192,16 @@ xfs_bmap_del_extent_real( xfs_iext_prev(ifp, icur); ifp->if_nextents--; - flags |= XFS_ILOG_CORE; + *logflagsp |= XFS_ILOG_CORE; if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_btree_delete(cur, &i))) - goto done; + return error; if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; } break; case BMAP_LEFT_FILLING: @@ -5131,12 +5213,12 @@ xfs_bmap_del_extent_real( got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; break; case BMAP_RIGHT_FILLING: /* @@ -5145,12 +5227,12 @@ xfs_bmap_del_extent_real( got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) - goto 
done; + return error; break; case 0: /* @@ -5167,18 +5249,18 @@ xfs_bmap_del_extent_real( new.br_state = got.br_state; new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; + *logflagsp |= XFS_ILOG_CORE; if (cur) { error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; error = xfs_btree_increment(cur, 0, &i); if (error) - goto done; + return error; cur->bc_rec.b = new; error = xfs_btree_insert(cur, &i); if (error && error != -ENOSPC) - goto done; + return error; /* * If get no-space back from btree insert, it tried a * split, and we have a zero block reservation. Fix up @@ -5191,10 +5273,10 @@ xfs_bmap_del_extent_real( */ error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) - goto done; + return error; if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; } /* * Update the btree record back @@ -5202,22 +5284,21 @@ xfs_bmap_del_extent_real( */ error = xfs_bmbt_update(cur, &old); if (error) - goto done; + return error; /* * Reset the extent record back * to the original value. 
*/ xfs_iext_update_extent(ip, state, icur, &old); - flags = 0; - error = -ENOSPC; - goto done; + *logflagsp = 0; + return -ENOSPC; } if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; } } else - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); ifp->if_nextents++; xfs_iext_next(ifp, icur); @@ -5235,13 +5316,13 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - error = __xfs_free_extent_later(tp, del->br_startblock, + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, ((bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN)); if (error) - goto done; + return error; } } @@ -5256,9 +5337,7 @@ xfs_bmap_del_extent_real( if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); -done: - *logflagsp = flags; - return error; + return 0; } /* @@ -5267,7 +5346,7 @@ done: * that value. If not all extents in the block range can be removed then * *done is set. 
*/ -int /* error */ +static int __xfs_bunmapi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ @@ -5289,7 +5368,6 @@ __xfs_bunmapi( int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ - xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t end; struct xfs_iext_cursor icur; @@ -5300,12 +5378,14 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); ifp = xfs_ifork_ptr(ip, whichfork); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } if (xfs_is_shutdown(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(len > 0); ASSERT(nexts >= 0); @@ -5318,7 +5398,7 @@ __xfs_bunmapi( return 0; } XFS_STATS_INC(mp, xs_blk_unmap); - isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + isrt = xfs_ifork_is_realtime(ip, whichfork); end = start + len; if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) { @@ -5331,7 +5411,6 @@ __xfs_bunmapi( if (ifp->if_format == XFS_DINODE_FMT_BTREE) { ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; } else cur = NULL; @@ -5381,11 +5460,11 @@ __xfs_bunmapi( if (del.br_startoff + del.br_blockcount > end + 1) del.br_blockcount = end + 1 - del.br_startoff; - if (!isrt) + if (!isrt || (flags & XFS_BMAPI_REMAP)) goto delete; - sum = del.br_startblock + del.br_blockcount; - div_u64_rem(sum, mp->m_sb.sb_rextsize, &mod); + mod = xfs_rtb_to_rtxoff(mp, + del.br_startblock + del.br_blockcount); if (mod) { /* * Realtime extent not lined up at the end. @@ -5399,7 +5478,7 @@ __xfs_bunmapi( * This piece is unwritten, or we're not * using unwritten extents. Skip over it. 
*/ - ASSERT(end >= mod); + ASSERT((flags & XFS_BMAPI_REMAP) || end >= mod); end -= mod > del.br_blockcount ? del.br_blockcount : mod; if (end < got.br_startoff && @@ -5432,7 +5511,8 @@ __xfs_bunmapi( goto error0; goto nodelete; } - div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod); + + mod = xfs_rtb_to_rtxoff(mp, del.br_startblock); if (mod) { xfs_extlen_t off = mp->m_sb.sb_rextsize - mod; @@ -5568,7 +5648,7 @@ error0: xfs_trans_log_inode(tp, ip, logflags); if (cur) { if (!error) - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -5648,8 +5728,7 @@ xfs_bmse_merge( blockcount = left->br_blockcount + got->br_blockcount; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); ASSERT(xfs_bmse_can_merge(left, got, shift)); new = *left; @@ -5670,21 +5749,27 @@ xfs_bmse_merge( error = xfs_bmbt_lookup_eq(cur, got, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_btree_delete(cur, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_bmbt_update(cur, &new); if (error) @@ -5732,8 +5817,10 @@ xfs_bmap_shift_update_extent( error = xfs_bmbt_lookup_eq(cur, &prev, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_bmbt_update(cur, got); if (error) @@ -5771,28 +5858,28 @@ xfs_bmap_collapse_extents( if (XFS_IS_CORRUPT(mp, 
!xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; - } if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { *done = true; goto del_cursor; } if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } @@ -5850,7 +5937,7 @@ xfs_bmap_can_insert_extents( int is_empty; int error = 0; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); if (xfs_is_shutdown(ip->i_mount)) return -EIO; @@ -5886,22 +5973,21 @@ xfs_bmap_insert_extents( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; - } if (*next_fsb == NULLFSBLOCK) { xfs_iext_last(ifp, &icur); @@ -5917,11 +6003,13 @@ xfs_bmap_insert_extents( } } if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; 
goto del_cursor; } @@ -5989,6 +6077,7 @@ xfs_bmap_split_extent( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -6015,11 +6104,11 @@ xfs_bmap_split_extent( if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } @@ -6047,6 +6136,7 @@ xfs_bmap_split_extent( if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } @@ -6054,6 +6144,7 @@ xfs_bmap_split_extent( if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } @@ -6073,7 +6164,7 @@ xfs_bmap_split_extent( del_cursor: if (cur) { - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } @@ -6082,17 +6173,8 @@ del_cursor: return error; } -/* Deferred mapping is only for real extents in the data fork. */ -static bool -xfs_bmap_is_update_needed( - struct xfs_bmbt_irec *bmap) -{ - return bmap->br_startblock != HOLESTARTBLOCK && - bmap->br_startblock != DELAYSTARTBLOCK; -} - /* Record a bmap intent. 
*/ -static int +static inline void __xfs_bmap_add( struct xfs_trans *tp, enum xfs_bmap_intent_type type, @@ -6102,25 +6184,19 @@ __xfs_bmap_add( { struct xfs_bmap_intent *bi; - trace_xfs_bmap_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), - type, - XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), - ip->i_ino, whichfork, - bmap->br_startoff, - bmap->br_blockcount, - bmap->br_state); + if ((whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) || + bmap->br_startblock == HOLESTARTBLOCK || + bmap->br_startblock == DELAYSTARTBLOCK) + return; - bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; bi->bi_owner = ip; bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; - xfs_bmap_update_get_group(tp->t_mountp, bi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); - return 0; + xfs_bmap_defer_add(tp, bi); } /* Map an extent into a file. */ @@ -6128,12 +6204,10 @@ void xfs_bmap_map_extent( struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_bmap_is_update_needed(PREV)) - return; - - __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, whichfork, PREV); } /* Unmap an extent out of a file. 
*/ @@ -6141,12 +6215,10 @@ void xfs_bmap_unmap_extent( struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_bmap_is_update_needed(PREV)) - return; - - __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, whichfork, PREV); } /* @@ -6160,57 +6232,55 @@ xfs_bmap_finish_one( { struct xfs_bmbt_irec *bmap = &bi->bi_bmap; int error = 0; + int flags = 0; - ASSERT(tp->t_highest_agno == NULLAGNUMBER); + if (bi->bi_whichfork == XFS_ATTR_FORK) + flags |= XFS_BMAPI_ATTRFORK; - trace_xfs_bmap_deferred(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), - bi->bi_type, - XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), - bi->bi_owner->i_ino, bi->bi_whichfork, - bmap->br_startoff, bmap->br_blockcount, - bmap->br_state); + ASSERT(tp->t_highest_agno == NULLAGNUMBER); - if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) - return -EFSCORRUPTED; + trace_xfs_bmap_deferred(bi); - if (XFS_TEST_ERROR(false, tp->t_mountp, - XFS_ERRTAG_BMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (bi->bi_type) { case XFS_BMAP_MAP: + if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN) + flags |= XFS_BMAPI_PREALLOC; error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, - bmap->br_blockcount, bmap->br_startblock, 0); + bmap->br_blockcount, bmap->br_startblock, + flags); bmap->br_blockcount = 0; break; case XFS_BMAP_UNMAP: error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, - &bmap->br_blockcount, XFS_BMAPI_REMAP, 1); + &bmap->br_blockcount, flags | XFS_BMAPI_REMAP, + 1); break; default: ASSERT(0); + xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork); error = -EFSCORRUPTED; } return error; } -/* Check that an inode's extent does not have invalid flags or bad ranges. */ +/* Check that an extent does not have invalid flags or bad ranges. 
*/ xfs_failaddr_t -xfs_bmap_validate_extent( - struct xfs_inode *ip, +xfs_bmap_validate_extent_raw( + struct xfs_mount *mp, + bool rtfile, int whichfork, struct xfs_bmbt_irec *irec) { - struct xfs_mount *mp = ip->i_mount; - if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) return __this_address; - if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) { - if (!xfs_verify_rtext(mp, irec->br_startblock, - irec->br_blockcount)) + if (rtfile && whichfork == XFS_DATA_FORK) { + if (!xfs_verify_rtbext(mp, irec->br_startblock, + irec->br_blockcount)) return __this_address; } else { if (!xfs_verify_fsbext(mp, irec->br_startblock, @@ -6238,3 +6308,96 @@ xfs_bmap_intent_destroy_cache(void) kmem_cache_destroy(xfs_bmap_intent_cache); xfs_bmap_intent_cache = NULL; } + +/* Check that an inode's extent does not have invalid flags or bad ranges. */ +xfs_failaddr_t +xfs_bmap_validate_extent( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *irec) +{ + return xfs_bmap_validate_extent_raw(ip->i_mount, + XFS_IS_REALTIME_INODE(ip), whichfork, irec); +} + +/* + * Used in xfs_itruncate_extents(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +/* + * Unmap every extent in part of an inode's fork. We don't do any higher level + * invalidation work at all. 
+ */ +int +xfs_bunmapi_range( + struct xfs_trans **tpp, + struct xfs_inode *ip, + uint32_t flags, + xfs_fileoff_t startoff, + xfs_fileoff_t endoff) +{ + xfs_filblks_t unmap_len = endoff - startoff + 1; + int error = 0; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + + while (unmap_len > 0) { + ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); + error = __xfs_bunmapi(*tpp, ip, startoff, &unmap_len, flags, + XFS_ITRUNC_MAX_EXTENTS); + if (error) + goto out; + + /* free the just unmapped extents */ + error = xfs_defer_finish(tpp); + if (error) + goto out; + } +out: + return error; +} + +struct xfs_bmap_query_range { + xfs_bmap_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_bmap_query_range_helper( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_bmap_query_range *query = priv; + struct xfs_bmbt_irec irec; + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + fa = xfs_bmap_validate_extent(cur->bc_ino.ip, cur->bc_ino.whichfork, + &irec); + if (fa) { + xfs_btree_mark_sick(cur); + return xfs_bmap_complain_bad_rec(cur->bc_ino.ip, + cur->bc_ino.whichfork, fa, &irec); + } + + return query->fn(cur, &irec, query->priv); +} + +/* Find all bmaps. */ +int +xfs_bmap_query_all( + struct xfs_btree_cur *cur, + xfs_bmap_query_range_fn fn, + void *priv) +{ + struct xfs_bmap_query_range query = { + .priv = priv, + .fn = fn, + }; + + return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query); +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e33470e39728..f7662595309d 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -116,6 +116,8 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) return XFS_DATA_FORK; } +void xfs_bmap_alloc_account(struct xfs_bmalloca *ap); + /* * Special values for xfs_bmbt_irec_t br_startblock field. 
*/ @@ -190,9 +192,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); -int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, - xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); @@ -233,6 +232,10 @@ enum xfs_bmap_intent_type { XFS_BMAP_UNMAP, }; +#define XFS_BMAP_INTENT_STRINGS \ + { XFS_BMAP_MAP, "map" }, \ + { XFS_BMAP_UNMAP, "unmap" } + struct xfs_bmap_intent { struct list_head bi_list; enum xfs_bmap_intent_type bi_type; @@ -242,14 +245,11 @@ struct xfs_bmap_intent { struct xfs_bmbt_irec bi_bmap; }; -void xfs_bmap_update_get_group(struct xfs_mount *mp, - struct xfs_bmap_intent *bi); - int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); + int whichfork, struct xfs_bmbt_irec *imap); void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); + int whichfork, struct xfs_bmbt_irec *imap); static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { @@ -263,6 +263,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork) } } +xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile, + int whichfork, struct xfs_bmbt_irec *irec); xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, @@ -271,10 +273,20 @@ int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t 
startblock, uint32_t flags); +int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip, + uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff); extern struct kmem_cache *xfs_bmap_intent_cache; int __init xfs_bmap_intent_init_cache(void); void xfs_bmap_intent_destroy_cache(void); +typedef int (*xfs_bmap_query_range_fn)( + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *rec, + void *priv); + +int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn, + void *priv); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index bf3f1b36fdd2..f5d84dcb58da 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -15,6 +15,7 @@ #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" @@ -25,6 +26,22 @@ static struct kmem_cache *xfs_bmbt_cur_cache; +void +xfs_bmbt_init_block( + struct xfs_inode *ip, + struct xfs_btree_block *buf, + struct xfs_buf *bp, + __u16 level, + __u16 numrecs) +{ + if (bp) + xfs_btree_init_buf(ip->i_mount, bp, &xfs_bmbt_ops, level, + numrecs, ip->i_ino); + else + xfs_btree_init_block(ip->i_mount, buf, &xfs_bmbt_ops, level, + numrecs, ip->i_ino); +} + /* * Convert on-disk form of btree root to in-memory form. */ @@ -43,9 +60,7 @@ xfs_bmdr_to_bmbt( xfs_bmbt_key_t *tkp; __be64 *tpp; - xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, - XFS_BTNUM_BMAP, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS); + xfs_bmbt_init_block(ip, rblock, NULL, 0, 0); rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; @@ -170,13 +185,8 @@ xfs_bmbt_dup_cursor( new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ino.ip, cur->bc_ino.whichfork); - - /* - * Copy the firstblock, dfops, and flags values, - * since init cursor doesn't get them. 
- */ - new->bc_ino.flags = cur->bc_ino.flags; - + new->bc_flags |= (cur->bc_flags & + (XFS_BTREE_BMBT_INVALID_OWNER | XFS_BTREE_BMBT_WASDEL)); return new; } @@ -188,10 +198,10 @@ xfs_bmbt_update_cursor( ASSERT((dst->bc_tp->t_highest_agno != NULLAGNUMBER) || (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME)); - dst->bc_ino.allocated += src->bc_ino.allocated; + dst->bc_bmap.allocated += src->bc_bmap.allocated; dst->bc_tp->t_highest_agno = src->bc_tp->t_highest_agno; - src->bc_ino.allocated = 0; + src->bc_bmap.allocated = 0; } STATIC int @@ -210,7 +220,7 @@ xfs_bmbt_alloc_block( xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino, cur->bc_ino.whichfork); args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL; + args.wasdel = cur->bc_flags & XFS_BTREE_BMBT_WASDEL; if (!args.wasdel && args.tp->t_blk_res == 0) return -ENOSPC; @@ -246,7 +256,7 @@ xfs_bmbt_alloc_block( } ASSERT(args.len == 1); - cur->bc_ino.allocated++; + cur->bc_bmap.allocated++; cur->bc_ino.ip->i_nblocks++; xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip, @@ -272,7 +282,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -288,10 +298,7 @@ xfs_bmbt_get_minrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -306,10 +313,7 @@ xfs_bmbt_get_maxrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 
0); @@ -365,14 +369,6 @@ xfs_bmbt_init_rec_from_cur( xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); } -STATIC void -xfs_bmbt_init_ptr_from_cur( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) -{ - ptr->l = 0; -} - STATIC int64_t xfs_bmbt_key_diff( struct xfs_btree_cur *cur, @@ -424,7 +420,7 @@ xfs_bmbt_verify( * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. */ - fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); if (fa) return fa; } @@ -440,7 +436,7 @@ xfs_bmbt_verify( if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) return __this_address; - return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]); + return xfs_btree_fsblock_verify(bp, mp->m_bmap_dmxr[level != 0]); } static void @@ -449,7 +445,7 @@ xfs_bmbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_lblock_verify_crc(bp)) + if (!xfs_btree_fsblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_bmbt_verify(bp); @@ -473,7 +469,7 @@ xfs_bmbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_lblock_calc_crc(bp); + xfs_btree_fsblock_calc_crc(bp); } const struct xfs_buf_ops xfs_bmbt_buf_ops = { @@ -520,9 +516,16 @@ xfs_bmbt_keys_contiguous( be64_to_cpu(key2->bmbt.br_startoff)); } -static const struct xfs_btree_ops xfs_bmbt_ops = { +const struct xfs_btree_ops xfs_bmbt_ops = { + .name = "bmap", + .type = XFS_BTREE_TYPE_INODE, + .rec_len = sizeof(xfs_bmbt_rec_t), .key_len = sizeof(xfs_bmbt_key_t), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_BMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2), .dup_cursor = xfs_bmbt_dup_cursor, .update_cursor = xfs_bmbt_update_cursor, @@ -534,7 +537,6 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_key_from_rec = xfs_bmbt_init_key_from_rec, .init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec, .init_rec_from_cur = 
xfs_bmbt_init_rec_from_cur, - .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, .diff_two_keys = xfs_bmbt_diff_two_keys, .buf_ops = &xfs_bmbt_buf_ops, @@ -544,35 +546,45 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { }; /* - * Allocate a new bmap btree cursor. + * Create a new bmap btree cursor. + * + * For staging cursors, -1 is passed in as whichfork. */ -struct xfs_btree_cur * /* new bmap btree cursor */ +struct xfs_btree_cur * xfs_bmbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* inode owning the btree */ - int whichfork) /* data or attr fork */ + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; - ASSERT(whichfork != XFS_COW_FORK); - - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, - mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); - cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); + unsigned int maxlevels; - cur->bc_ops = &xfs_bmbt_ops; - cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; - if (xfs_has_crc(mp)) - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + ASSERT(whichfork != XFS_COW_FORK); - cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); + /* + * The Data fork always has larger maxlevel, so use that for staging + * cursors.
+ */ + switch (whichfork) { + case XFS_STAGING_FORK: + maxlevels = mp->m_bm_maxlevels[XFS_DATA_FORK]; + break; + default: + maxlevels = mp->m_bm_maxlevels[whichfork]; + break; + } + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bmbt_ops, maxlevels, + xfs_bmbt_cur_cache); cur->bc_ino.ip = ip; - cur->bc_ino.allocated = 0; - cur->bc_ino.flags = 0; cur->bc_ino.whichfork = whichfork; + cur->bc_bmap.allocated = 0; + if (whichfork != XFS_STAGING_FORK) { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); + } return cur; } @@ -588,6 +600,49 @@ xfs_bmbt_block_maxrecs( } /* + * Swap in the new inode fork root. Once we pass this point the newly rebuilt + * mappings are in place and we have to kill off any old btree blocks. + */ +void +xfs_bmbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + static const short brootflag[2] = {XFS_ILOG_DBROOT, XFS_ILOG_ABROOT}; + static const short extflag[2] = {XFS_ILOG_DEXT, XFS_ILOG_AEXT}; + int flags = XFS_ILOG_CORE; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(whichfork != XFS_COW_FORK); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, whichfork); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + switch (ifp->if_format) { + case XFS_DINODE_FMT_EXTENTS: + flags |= extflag[whichfork]; + break; + case XFS_DINODE_FMT_BTREE: + flags |= brootflag[whichfork]; + break; + default: + ASSERT(0); + break; + } + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, whichfork); +} + +/* * Calculate number of records in a bmap btree block. 
*/ int @@ -670,7 +725,7 @@ xfs_bmbt_change_owner( ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); - cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; + cur->bc_flags |= XFS_BTREE_BMBT_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 3e7a40a83835..de1b73f1225c 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -11,6 +11,7 @@ struct xfs_btree_block; struct xfs_mount; struct xfs_inode; struct xfs_trans; +struct xbtree_ifakeroot; /* * Btree block header size depends on a superblock flag. @@ -106,6 +107,8 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, int whichfork); extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, unsigned long long len); @@ -115,4 +118,7 @@ unsigned int xfs_bmbt_maxlevels_ondisk(void); int __init xfs_bmbt_init_cur_cache(void); void xfs_bmbt_destroy_cur_cache(void); +void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf, + struct xfs_buf *bp, __u16 level, __u16 numrecs); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 6a6503ab0cd7..d29547572a68 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -27,28 +27,24 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_health.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" /* * Btree magic numbers. 
*/ -static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { - { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, - XFS_FIBT_MAGIC, 0 }, - { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, - XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, - XFS_REFC_CRC_MAGIC } -}; - uint32_t xfs_btree_magic( - int crc, - xfs_btnum_t btnum) + struct xfs_mount *mp, + const struct xfs_btree_ops *ops) { - uint32_t magic = xfs_magics[crc][btnum]; + int idx = xfs_has_crc(mp) ? 1 : 0; + __be32 magic = ops->buf_ops->magic[idx]; /* Ensure we asked for crc for crc-only magics. */ ASSERT(magic != 0); - return magic; + return be32_to_cpu(magic); } /* @@ -63,10 +59,8 @@ xfs_btree_magic( * bytes. */ static inline xfs_failaddr_t -xfs_btree_check_lblock_siblings( +xfs_btree_check_fsblock_siblings( struct xfs_mount *mp, - struct xfs_btree_cur *cur, - int level, xfs_fsblock_t fsb, __be64 dsibling) { @@ -78,22 +72,33 @@ xfs_btree_check_lblock_siblings( sibling = be64_to_cpu(dsibling); if (sibling == fsb) return __this_address; - if (level >= 0) { - if (!xfs_btree_check_lptr(cur, sibling, level + 1)) - return __this_address; - } else { - if (!xfs_verify_fsbno(mp, sibling)) - return __this_address; - } + if (!xfs_verify_fsbno(mp, sibling)) + return __this_address; + return NULL; +} + +static inline xfs_failaddr_t +xfs_btree_check_memblock_siblings( + struct xfs_buftarg *btp, + xfbno_t bno, + __be64 dsibling) +{ + xfbno_t sibling; + if (dsibling == cpu_to_be64(NULLFSBLOCK)) + return NULL; + + sibling = be64_to_cpu(dsibling); + if (sibling == bno) + return __this_address; + if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling))) + return __this_address; return NULL; } static inline xfs_failaddr_t -xfs_btree_check_sblock_siblings( +xfs_btree_check_agblock_siblings( struct xfs_perag *pag, - struct xfs_btree_cur *cur, - int level, xfs_agblock_t agbno, __be32 dsibling) { @@ -105,34 +110,21 @@ xfs_btree_check_sblock_siblings( sibling = be32_to_cpu(dsibling); if (sibling 
== agbno) return __this_address; - if (level >= 0) { - if (!xfs_btree_check_sptr(cur, sibling, level + 1)) - return __this_address; - } else { - if (!xfs_verify_agbno(pag, sibling)) - return __this_address; - } + if (!xfs_verify_agbno(pag, sibling)) + return __this_address; return NULL; } -/* - * Check a long btree block header. Return the address of the failing check, - * or NULL if everything is ok. - */ -xfs_failaddr_t -__xfs_btree_check_lblock( +static xfs_failaddr_t +__xfs_btree_check_lblock_hdr( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_has_crc(mp); - xfs_failaddr_t fa; - xfs_fsblock_t fsb = NULLFSBLOCK; - if (crc) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.l.bb_blkno != @@ -142,7 +134,7 @@ __xfs_btree_check_lblock( return __this_address; } - if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) + if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) return __this_address; if (be16_to_cpu(block->bb_level) != level) return __this_address; @@ -150,44 +142,83 @@ __xfs_btree_check_lblock( cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (bp) - fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + return NULL; +} + +/* + * Check a long btree block header. Return the address of the failing check, + * or NULL if everything is ok. + */ +static xfs_failaddr_t +__xfs_btree_check_fsblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_failaddr_t fa; + xfs_fsblock_t fsb; + + fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); + if (fa) + return fa; - fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + /* + * For inode-rooted btrees, the root block sits in the inode fork. 
In + * that case bp is NULL, and the block must not have any siblings. + */ + if (!bp) { + if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK)) + return __this_address; + if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK)) + return __this_address; + return NULL; + } + + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); if (!fa) - fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } -/* Check a long btree block header. */ -static int -xfs_btree_check_lblock( +/* + * Check an in-memory btree block header. Return the address of the failing + * check, or NULL if everything is ok. + */ +static xfs_failaddr_t +__xfs_btree_check_memblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { - struct xfs_mount *mp = cur->bc_mp; + struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; xfs_failaddr_t fa; + xfbno_t bno; - fa = __xfs_btree_check_lblock(cur, block, level, bp); - if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) { - if (bp) - trace_xfs_btree_corrupt(bp, _RET_IP_); - return -EFSCORRUPTED; - } - return 0; + fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); + if (fa) + return fa; + + bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_rightsib); + return fa; } /* * Check a short btree block header. Return the address of the failing check, * or NULL if everything is ok. 
*/ -xfs_failaddr_t -__xfs_btree_check_sblock( +static xfs_failaddr_t +__xfs_btree_check_agblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, @@ -195,20 +226,17 @@ __xfs_btree_check_sblock( { struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = cur->bc_ag.pag; - xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_has_crc(mp); xfs_failaddr_t fa; - xfs_agblock_t agbno = NULLAGBLOCK; + xfs_agblock_t agbno; - if (crc) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (block->bb_u.s.bb_blkno != - cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) + if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; } - if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) + if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) return __this_address; if (be16_to_cpu(block->bb_level) != level) return __this_address; @@ -216,36 +244,45 @@ __xfs_btree_check_sblock( cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (bp) - agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - - fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_agblock_siblings(pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + fa = xfs_btree_check_agblock_siblings(pag, agbno, block->bb_u.s.bb_rightsib); return fa; } -/* Check a short btree block header. */ -STATIC int -xfs_btree_check_sblock( +/* + * Internal btree block check. + * + * Return NULL if the block is ok or the address of the failed check otherwise. 
+ */ +xfs_failaddr_t +__xfs_btree_check_block( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { - struct xfs_mount *mp = cur->bc_mp; - xfs_failaddr_t fa; - - fa = __xfs_btree_check_sblock(cur, block, level, bp); - if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) { - if (bp) - trace_xfs_btree_corrupt(bp, _RET_IP_); - return -EFSCORRUPTED; + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + return __xfs_btree_check_memblock(cur, block, level, bp); + case XFS_BTREE_TYPE_AG: + return __xfs_btree_check_agblock(cur, block, level, bp); + case XFS_BTREE_TYPE_INODE: + return __xfs_btree_check_fsblock(cur, block, level, bp); + default: + ASSERT(0); + return __this_address; } - return 0; +} + +static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN) + return XFS_ERRTAG_BTREE_CHECK_SBLOCK; + return XFS_ERRTAG_BTREE_CHECK_LBLOCK; } /* @@ -258,34 +295,49 @@ xfs_btree_check_block( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block, if any */ { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return xfs_btree_check_lblock(cur, block, level, bp); - else - return xfs_btree_check_sblock(cur, block, level, bp); -} + struct xfs_mount *mp = cur->bc_mp; + xfs_failaddr_t fa; -/* Check that this long pointer is valid and points within the fs. */ -bool -xfs_btree_check_lptr( - struct xfs_btree_cur *cur, - xfs_fsblock_t fsbno, - int level) -{ - if (level <= 0) - return false; - return xfs_verify_fsbno(cur->bc_mp, fsbno); + fa = __xfs_btree_check_block(cur, block, level, bp); + if (XFS_IS_CORRUPT(mp, fa != NULL) || + XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + return 0; } -/* Check that this short pointer is valid and points within the AG. 
*/ -bool -xfs_btree_check_sptr( - struct xfs_btree_cur *cur, - xfs_agblock_t agbno, - int level) +int +__xfs_btree_check_ptr( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int index, + int level) { if (level <= 0) - return false; - return xfs_verify_agbno(cur->bc_ag.pag, agbno); + return -EFSCORRUPTED; + + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + if (!xfbtree_verify_bno(cur->bc_mem.xfbtree, + be64_to_cpu((&ptr->l)[index]))) + return -EFSCORRUPTED; + break; + case XFS_BTREE_TYPE_INODE: + if (!xfs_verify_fsbno(cur->bc_mp, + be64_to_cpu((&ptr->l)[index]))) + return -EFSCORRUPTED; + break; + case XFS_BTREE_TYPE_AG: + if (!xfs_verify_agbno(cur->bc_ag.pag, + be32_to_cpu((&ptr->s)[index]))) + return -EFSCORRUPTED; + break; + } + + return 0; } /* @@ -299,26 +351,35 @@ xfs_btree_check_ptr( int index, int level) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), - level)) - return 0; - xfs_err(cur->bc_mp, -"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", + int error; + + error = __xfs_btree_check_ptr(cur, ptr, index, level); + if (error) { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + xfs_err(cur->bc_mp, +"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.", + cur->bc_ops->name, cur->bc_flags, level, index, + __this_address); + break; + case XFS_BTREE_TYPE_INODE: + xfs_err(cur->bc_mp, +"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", cur->bc_ino.ip->i_ino, - cur->bc_ino.whichfork, cur->bc_btnum, + cur->bc_ino.whichfork, cur->bc_ops->name, level, index); - } else { - if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), - level)) - return 0; - xfs_err(cur->bc_mp, -"AG %u: Corrupt btree %d pointer at level %d index %d.", - cur->bc_ag.pag->pag_agno, cur->bc_btnum, + break; + case XFS_BTREE_TYPE_AG: + xfs_err(cur->bc_mp, +"AG %u: Corrupt %sbt pointer at level %d index %d.", + cur->bc_ag.pag->pag_agno, 
cur->bc_ops->name, level, index); + break; + } + xfs_btree_mark_sick(cur); } - return -EFSCORRUPTED; + return error; } #ifdef DEBUG @@ -336,7 +397,7 @@ xfs_btree_check_ptr( * it to disk. */ void -xfs_btree_lblock_calc_crc( +xfs_btree_fsblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -350,7 +411,7 @@ xfs_btree_lblock_calc_crc( } bool -xfs_btree_lblock_verify_crc( +xfs_btree_fsblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -374,7 +435,7 @@ xfs_btree_lblock_verify_crc( * it to disk. */ void -xfs_btree_sblock_calc_crc( +xfs_btree_agblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -388,7 +449,7 @@ xfs_btree_sblock_calc_crc( } bool -xfs_btree_sblock_verify_crc( +xfs_btree_agblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -410,6 +471,17 @@ xfs_btree_free_block( { int error; + trace_xfs_btree_free_block(cur, bp); + + /* + * Don't allow block freeing for a staging cursor, because staging + * cursors do not support regular btree modifications. + */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { + ASSERT(0); + return -EFSCORRUPTED; + } + error = cur->bc_ops->free_block(cur, bp); if (!error) { xfs_trans_binval(cur->bc_tp, bp); @@ -448,33 +520,70 @@ xfs_btree_del_cursor( * zero, then we should be shut down or on our way to shutdown due to * cancelling a dirty transaction on error. 
*/ - ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || + ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); - if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) - kmem_free(cur->bc_ops); - if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) - xfs_perag_put(cur->bc_ag.pag); + + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: + if (cur->bc_ag.pag) + xfs_perag_put(cur->bc_ag.pag); + break; + case XFS_BTREE_TYPE_INODE: + /* nothing to do */ + break; + case XFS_BTREE_TYPE_MEM: + if (cur->bc_mem.pag) + xfs_perag_put(cur->bc_mem.pag); + break; + } + kmem_cache_free(cur->bc_cache, cur); } +/* Return the buffer target for this btree's buffer. */ +static inline struct xfs_buftarg * +xfs_btree_buftarg( + struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) + return cur->bc_mem.xfbtree->target; + return cur->bc_mp->m_ddev_targp; +} + +/* Return the block size (in units of 512b sectors) for this btree. */ +static inline unsigned int +xfs_btree_bbsize( + struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) + return XFBNO_BBSIZE; + return cur->bc_mp->m_bsize; +} + /* * Duplicate the btree cursor. * Allocate a new one, copy the record, re-get the buffers. 
*/ -int /* error */ +int /* error */ xfs_btree_dup_cursor( - struct xfs_btree_cur *cur, /* input cursor */ - struct xfs_btree_cur **ncur) /* output cursor */ + struct xfs_btree_cur *cur, /* input cursor */ + struct xfs_btree_cur **ncur) /* output cursor */ { - struct xfs_buf *bp; /* btree block's buffer pointer */ - int error; /* error return value */ - int i; /* level number of btree block */ - xfs_mount_t *mp; /* mount structure for filesystem */ - struct xfs_btree_cur *new; /* new cursor value */ - xfs_trans_t *tp; /* transaction pointer, can be NULL */ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_trans *tp = cur->bc_tp; + struct xfs_buf *bp; + struct xfs_btree_cur *new; + int error; + int i; - tp = cur->bc_tp; - mp = cur->bc_mp; + /* + * Don't allow staging cursors to be duplicated because they're supposed + * to be kept private to a single thread. + */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { + ASSERT(0); + return -EFSCORRUPTED; + } /* * Allocate a new cursor like the old one. @@ -494,10 +603,13 @@ xfs_btree_dup_cursor( new->bc_levels[i].ra = cur->bc_levels[i].ra; bp = cur->bc_levels[i].bp; if (bp) { - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - xfs_buf_daddr(bp), mp->m_bsize, - 0, &bp, - cur->bc_ops->buf_ops); + error = xfs_trans_read_buf(mp, tp, + xfs_btree_buftarg(cur), + xfs_buf_daddr(bp), + xfs_btree_bbsize(cur), 0, &bp, + cur->bc_ops->buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(new); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; @@ -539,7 +651,7 @@ xfs_btree_dup_cursor( * record, key or pointer (xfs_btree_*_addr). Note that all addressing * inside the btree block is done using indices starting at one, not zero! * - * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing + * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing * overlapping intervals. 
In such a tree, records are still sorted lowest to * highest and indexed by the smallest key value that refers to the record. * However, nodes are different: each pointer has two associated keys -- one @@ -589,26 +701,17 @@ xfs_btree_dup_cursor( */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { + if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_LBLOCK_CRC_LEN; return XFS_BTREE_LBLOCK_LEN; } - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_SBLOCK_CRC_LEN; return XFS_BTREE_SBLOCK_LEN; } /* - * Return size of btree block pointers for this btree instance. - */ -static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) -{ - return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? - sizeof(__be64) : sizeof(__be32); -} - -/* * Calculate offset of the n-th record in a btree block. */ STATIC size_t @@ -655,7 +758,7 @@ xfs_btree_ptr_offset( { return xfs_btree_block_len(cur) + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + - (n - 1) * xfs_btree_ptr_len(cur); + (n - 1) * cur->bc_ops->ptr_len; } /* @@ -718,7 +821,7 @@ struct xfs_ifork * xfs_btree_ifork_ptr( struct xfs_btree_cur *cur) { - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); if (cur->bc_flags & XFS_BTREE_STAGING) return cur->bc_ino.ifake->if_fork; @@ -750,8 +853,7 @@ xfs_btree_get_block( int level, /* level in btree */ struct xfs_buf **bpp) /* buffer containing the block */ { - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - (level == cur->bc_nlevels - 1)) { + if (xfs_btree_at_iroot(cur, level)) { *bpp = NULL; return xfs_btree_get_iroot(cur); } @@ -856,95 +958,52 @@ xfs_btree_offsets( } } -/* - * Get a buffer for the block, return it read in. - * Long-form addressing. 
- */ -int -xfs_btree_read_bufl( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - struct xfs_buf **bpp, /* buffer for fsbno */ - int refval, /* ref count value for buffer */ - const struct xfs_buf_ops *ops) -{ - struct xfs_buf *bp; /* return value */ - xfs_daddr_t d; /* real disk block address */ - int error; - - if (!xfs_verify_fsbno(mp, fsbno)) - return -EFSCORRUPTED; - d = XFS_FSB_TO_DADDR(mp, fsbno); - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, 0, &bp, ops); - if (error) - return error; - if (bp) - xfs_buf_set_ref(bp, refval); - *bpp = bp; - return 0; -} - -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Long-form addressing. - */ -/* ARGSUSED */ -void -xfs_btree_reada_bufl( - struct xfs_mount *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops) +STATIC int +xfs_btree_readahead_fsblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) { - xfs_daddr_t d; + struct xfs_mount *mp = cur->bc_mp; + xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + int rval = 0; - ASSERT(fsbno != NULLFSBLOCK); - d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); -} + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { + xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left), + mp->m_bsize, cur->bc_ops->buf_ops); + rval++; + } -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Short-form addressing. 
- */ -/* ARGSUSED */ -void -xfs_btree_reada_bufs( - struct xfs_mount *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops) -{ - xfs_daddr_t d; + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { + xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right), + mp->m_bsize, cur->bc_ops->buf_ops); + rval++; + } - ASSERT(agno != NULLAGNUMBER); - ASSERT(agbno != NULLAGBLOCK); - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); + return rval; } STATIC int -xfs_btree_readahead_lblock( +xfs_btree_readahead_memblock( struct xfs_btree_cur *cur, int lr, struct xfs_btree_block *block) { + struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; + xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); int rval = 0; - xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); - xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { - xfs_btree_reada_bufl(cur->bc_mp, left, 1, - cur->bc_ops->buf_ops); + xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE, + cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { - xfs_btree_reada_bufl(cur->bc_mp, right, 1, - cur->bc_ops->buf_ops); + xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE, + cur->bc_ops->buf_ops); rval++; } @@ -952,25 +1011,28 @@ xfs_btree_readahead_lblock( } STATIC int -xfs_btree_readahead_sblock( +xfs_btree_readahead_agblock( struct xfs_btree_cur *cur, int lr, - struct xfs_btree_block *block) + struct xfs_btree_block *block) { - int rval = 0; + struct xfs_mount *mp = cur->bc_mp; + xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); xfs_agblock_t right = 
be32_to_cpu(block->bb_u.s.bb_rightsib); - + int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, - left, 1, cur->bc_ops->buf_ops); + xfs_buf_readahead(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, left), + mp->m_bsize, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, - right, 1, cur->bc_ops->buf_ops); + xfs_buf_readahead(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, right), + mp->m_bsize, cur->bc_ops->buf_ops); rval++; } @@ -993,8 +1055,7 @@ xfs_btree_readahead( * No readahead needed if we are at the root level and the * btree root is stored in the inode. */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - (lev == cur->bc_nlevels - 1)) + if (xfs_btree_at_iroot(cur, lev)) return 0; if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) @@ -1003,9 +1064,17 @@ xfs_btree_readahead( cur->bc_levels[lev].ra |= lr; block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return xfs_btree_readahead_lblock(cur, lr, block); - return xfs_btree_readahead_sblock(cur, lr, block); + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: + return xfs_btree_readahead_agblock(cur, lr, block); + case XFS_BTREE_TYPE_INODE: + return xfs_btree_readahead_fsblock(cur, lr, block); + case XFS_BTREE_TYPE_MEM: + return xfs_btree_readahead_memblock(cur, lr, block); + default: + ASSERT(0); + return 0; + } } STATIC int @@ -1014,23 +1083,24 @@ xfs_btree_ptr_to_daddr( const union xfs_btree_ptr *ptr, xfs_daddr_t *daddr) { - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; int error; error = xfs_btree_check_ptr(cur, ptr, 0, 1); if (error) return error; - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - fsbno = be64_to_cpu(ptr->l); - *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); - } else { - agbno = be32_to_cpu(ptr->s); + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: *daddr = 
XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno); + be32_to_cpu(ptr->s)); + break; + case XFS_BTREE_TYPE_INODE: + *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + break; + case XFS_BTREE_TYPE_MEM: + *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l)); + break; } - return 0; } @@ -1050,8 +1120,9 @@ xfs_btree_readahead_ptr( if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) return; - xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr, - cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); + xfs_buf_readahead(xfs_btree_buftarg(cur), daddr, + xfs_btree_bbsize(cur) * count, + cur->bc_ops->buf_ops); } /* @@ -1072,7 +1143,7 @@ xfs_btree_setbuf( cur->bc_levels[lev].ra = 0; b = XFS_BUF_TO_BLOCK(bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) @@ -1090,7 +1161,7 @@ xfs_btree_ptr_is_null( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return ptr->l == cpu_to_be64(NULLFSBLOCK); else return ptr->s == cpu_to_be32(NULLAGBLOCK); @@ -1101,12 +1172,23 @@ xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ptr->l = cpu_to_be64(NULLFSBLOCK); else ptr->s = cpu_to_be32(NULLAGBLOCK); } +static inline bool +xfs_btree_ptrs_equal( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr1, + union xfs_btree_ptr *ptr2) +{ + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) + return ptr1->l == ptr2->l; + return ptr1->s == ptr2->s; +} + /* * Get/set/init sibling pointers */ @@ -1119,7 +1201,7 @@ xfs_btree_get_sibling( { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if 
(cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (lr == XFS_BB_RIGHTSIB) ptr->l = block->bb_u.l.bb_rightsib; else @@ -1141,7 +1223,7 @@ xfs_btree_set_sibling( { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (lr == XFS_BB_RIGHTSIB) block->bb_u.l.bb_rightsib = ptr->l; else @@ -1154,25 +1236,24 @@ xfs_btree_set_sibling( } } -void -xfs_btree_init_block_int( +static void +__xfs_btree_init_block( struct xfs_mount *mp, struct xfs_btree_block *buf, + const struct xfs_btree_ops *ops, xfs_daddr_t blkno, - xfs_btnum_t btnum, __u16 level, __u16 numrecs, - __u64 owner, - unsigned int flags) + __u64 owner) { - int crc = xfs_has_crc(mp); - __u32 magic = xfs_btree_magic(crc, btnum); + bool crc = xfs_has_crc(mp); + __u32 magic = xfs_btree_magic(mp, ops); buf->bb_magic = cpu_to_be32(magic); buf->bb_level = cpu_to_be16(level); buf->bb_numrecs = cpu_to_be16(numrecs); - if (flags & XFS_BTREE_LONG_PTRS) { + if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); if (crc) { @@ -1183,14 +1264,12 @@ xfs_btree_init_block_int( buf->bb_u.l.bb_lsn = 0; } } else { - /* owner is a 32 bit value on short blocks */ - __u32 __owner = (__u32)owner; - buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); if (crc) { buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); - buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + /* owner is a 32 bit value on short blocks */ + buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner); uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.s.bb_lsn = 0; } @@ -1199,15 +1278,46 @@ xfs_btree_init_block_int( void xfs_btree_init_block( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_btnum_t btnum, - __u16 level, - __u16 numrecs, - __u64 owner) + struct xfs_mount *mp, + struct xfs_btree_block *block, + const struct 
xfs_btree_ops *ops, + __u16 level, + __u16 numrecs, + __u64 owner) { - xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp), - btnum, level, numrecs, owner, 0); + __xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level, + numrecs, owner); +} + +void +xfs_btree_init_buf( + struct xfs_mount *mp, + struct xfs_buf *bp, + const struct xfs_btree_ops *ops, + __u16 level, + __u16 numrecs, + __u64 owner) +{ + __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops, + xfs_buf_daddr(bp), level, numrecs, owner); + bp->b_ops = ops->buf_ops; +} + +static inline __u64 +xfs_btree_owner( + struct xfs_btree_cur *cur) +{ + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + return cur->bc_mem.xfbtree->owner; + case XFS_BTREE_TYPE_INODE: + return cur->bc_ino.ip->i_ino; + case XFS_BTREE_TYPE_AG: + return cur->bc_ag.pag->pag_agno; + default: + ASSERT(0); + return 0; + } } void @@ -1217,22 +1327,8 @@ xfs_btree_init_block_cur( int level, int numrecs) { - __u64 owner; - - /* - * we can pull the owner from the cursor right now as the different - * owners align directly with the pointer size of the btree. This may - * change in future, but is safe for current users of the generic btree - * code. 
- */ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - owner = cur->bc_ino.ip->i_ino; - else - owner = cur->bc_ag.pag->pag_agno; - - xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), - xfs_buf_daddr(bp), cur->bc_btnum, level, - numrecs, owner, cur->bc_flags); + xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs, + xfs_btree_owner(cur)); } /* @@ -1250,7 +1346,7 @@ xfs_btree_is_lastrec( if (level > 0) return 0; - if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LASTREC_UPDATE)) return 0; xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); @@ -1265,41 +1361,27 @@ xfs_btree_buf_to_ptr( struct xfs_buf *bp, union xfs_btree_ptr *ptr) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, - xfs_buf_daddr(bp))); - else { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp))); + break; + case XFS_BTREE_TYPE_INODE: + ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))); + break; + case XFS_BTREE_TYPE_MEM: + ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp))); + break; } } -STATIC void +static inline void xfs_btree_set_refs( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - switch (cur->bc_btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); - break; - case XFS_BTNUM_INO: - case XFS_BTNUM_FINO: - xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); - break; - case XFS_BTNUM_BMAP: - xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); - break; - case XFS_BTNUM_RMAP: - xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); - break; - case XFS_BTNUM_REFC: - xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); - break; - default: - ASSERT(0); - } + xfs_buf_set_ref(bp, cur->bc_ops->lru_refs); } int @@ -1309,15 +1391,14 @@ xfs_btree_get_buf_block( struct xfs_btree_block **block, struct xfs_buf **bpp) { - struct xfs_mount *mp = cur->bc_mp; - xfs_daddr_t d; - int error; + xfs_daddr_t 
d; + int error; error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, - 0, bpp); + error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d, + xfs_btree_bbsize(cur), 0, bpp); if (error) return error; @@ -1330,7 +1411,7 @@ xfs_btree_get_buf_block( * Read in the buffer at the given ptr and return the buffer and * the block pointer within the buffer. */ -STATIC int +int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, @@ -1348,9 +1429,11 @@ xfs_btree_read_buf_block( error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp, - cur->bc_ops->buf_ops); + error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d, + xfs_btree_bbsize(cur), flags, bpp, + cur->bc_ops->buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) return error; @@ -1398,7 +1481,7 @@ xfs_btree_copy_ptrs( int numptrs) { ASSERT(numptrs >= 0); - memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); + memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len); } /* @@ -1454,8 +1537,8 @@ xfs_btree_shift_ptrs( ASSERT(numptrs >= 0); ASSERT(dir == 1 || dir == -1); - dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); - memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); + dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len); + memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len); } /* @@ -1566,7 +1649,7 @@ xfs_btree_log_block( if (bp) { int nbits; - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (xfs_has_crc(cur->bc_mp)) { /* * We don't log the CRC when updating a btree * block but instead recreate it during log @@ -1581,7 +1664,7 @@ xfs_btree_log_block( nbits = XFS_BB_NUM_BITS; } xfs_btree_offsets(fields, - (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ? 
loffsets : soffsets, nbits, &first, &last); xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); @@ -1658,9 +1741,10 @@ xfs_btree_increment( * confused or have the tree root in an inode. */ if (lev == cur->bc_nlevels) { - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) goto out0; ASSERT(0); + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1751,9 +1835,10 @@ xfs_btree_decrement( * or the root of the tree is in an inode. */ if (lev == cur->bc_nlevels) { - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) goto out0; ASSERT(0); + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1786,6 +1871,33 @@ error0: return error; } +/* + * Check the btree block owner now that we have the context to know who the + * real owner is. + */ +static inline xfs_failaddr_t +xfs_btree_check_block_owner( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block) +{ + __u64 owner; + + if (!xfs_has_crc(cur->bc_mp) || + (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER)) + return NULL; + + owner = xfs_btree_owner(cur); + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { + if (be64_to_cpu(block->bb_u.l.bb_owner) != owner) + return __this_address; + } else { + if (be32_to_cpu(block->bb_u.s.bb_owner) != owner) + return __this_address; + } + + return NULL; +} + int xfs_btree_lookup_get_block( struct xfs_btree_cur *cur, /* btree cursor */ @@ -1798,8 +1910,7 @@ xfs_btree_lookup_get_block( int error = 0; /* special case the root block if in an inode */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - (level == cur->bc_nlevels - 1)) { + if (xfs_btree_at_iroot(cur, level)) { *blkp = xfs_btree_get_iroot(cur); return 0; } @@ -1824,11 +1935,7 @@ xfs_btree_lookup_get_block( return error; /* Check the inode owner since the verifiers don't. 
*/ - if (xfs_has_crc(cur->bc_mp) && - !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && - (cur->bc_flags & XFS_BTREE_LONG_PTRS) && - be64_to_cpu((*blkp)->bb_u.l.bb_owner) != - cur->bc_ino.ip->i_ino) + if (xfs_btree_check_block_owner(cur, *blkp) != NULL) goto out_bad; /* Did we get the level we were looking for? */ @@ -1846,6 +1953,7 @@ out_bad: *blkp = NULL; xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -1872,6 +1980,27 @@ xfs_lookup_get_search_key( } /* + * Initialize a pointer to the root block. + */ +void +xfs_btree_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + /* + * Inode-rooted btrees call xfs_btree_get_iroot to find the root + * in xfs_btree_lookup_get_block and don't need a pointer here. + */ + ptr->l = 0; + } else if (cur->bc_flags & XFS_BTREE_STAGING) { + ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root); + } else { + cur->bc_ops->init_ptr_from_cur(cur, ptr); + } +} + +/* * Lookup the record. The cursor is made to point to it, based on dir. * stat is set to 0 if can't find any such record, 1 for success. */ @@ -1892,14 +2021,16 @@ xfs_btree_lookup( XFS_BTREE_STATS_INC(cur, lookup); /* No such thing as a zero-level tree. 
*/ - if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) + if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } block = NULL; keyno = 0; /* initialise start pointer from cursor */ - cur->bc_ops->init_ptr_from_cur(cur, &ptr); + xfs_btree_init_ptr_from_cur(cur, &ptr); pp = &ptr; /* @@ -1936,6 +2067,7 @@ xfs_btree_lookup( XFS_ERRLEVEL_LOW, cur->bc_mp, block, sizeof(*block)); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -2012,8 +2144,10 @@ xfs_btree_lookup( error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } *stat = 1; return 0; } @@ -2040,7 +2174,7 @@ xfs_btree_high_key_from_key( struct xfs_btree_cur *cur, union xfs_btree_key *key) { - ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); + ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); return (union xfs_btree_key *)((char *)key + (cur->bc_ops->key_len / 2)); } @@ -2061,7 +2195,7 @@ xfs_btree_get_leaf_keys( rec = xfs_btree_rec_addr(cur, 1, block); cur->bc_ops->init_key_from_rec(key, rec); - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { @@ -2088,7 +2222,7 @@ xfs_btree_get_node_keys( union xfs_btree_key *high; int n; - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { memcpy(key, xfs_btree_key_addr(cur, 1, block), cur->bc_ops->key_len / 2); @@ -2132,7 +2266,7 @@ xfs_btree_needs_key_update( struct xfs_btree_cur *cur, int ptr) { - return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1; + return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1; } /* @@ -2156,7 +2290,7 @@ __xfs_btree_updkeys( struct xfs_buf *bp; int ptr; - ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); + 
ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); /* Exit if there aren't any parent levels to update. */ if (level + 1 >= cur->bc_nlevels) @@ -2225,7 +2359,7 @@ xfs_btree_update_keys( ASSERT(level >= 0); block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) return __xfs_btree_updkeys(cur, level, block, bp, false); /* @@ -2332,8 +2466,7 @@ xfs_btree_lshift( int error; /* error return value */ int i; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - level == cur->bc_nlevels - 1) + if (xfs_btree_at_iroot(cur, level)) goto out0; /* Set up variables for this block as "right". */ @@ -2460,12 +2593,13 @@ xfs_btree_lshift( * Using a temporary cursor, update the parent key values of the * block on the left. */ - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -2527,8 +2661,7 @@ xfs_btree_rshift( int error; /* error return value */ int i; /* loop counter */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - (level == cur->bc_nlevels - 1)) + if (xfs_btree_at_iroot(cur, level)) goto out0; /* Set up variables for this block as "left". */ @@ -2636,6 +2769,7 @@ xfs_btree_rshift( goto error0; i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -2645,7 +2779,7 @@ xfs_btree_rshift( goto error1; /* Update the parent high keys of the left block, if needed. 
*/ - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_update_keys(cur, level); if (error) goto error1; @@ -2673,6 +2807,32 @@ error1: return error; } +static inline int +xfs_btree_alloc_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *hint_block, + union xfs_btree_ptr *new_block, + int *stat) +{ + int error; + + /* + * Don't allow block allocation for a staging cursor, because staging + * cursors do not support regular btree modifications. + * + * Bulk loading uses a separate callback to obtain new blocks from a + * preallocated list, which prevents ENOSPC failures during loading. + */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); + trace_xfs_btree_alloc_block(cur, new_block, *stat, error); + return error; +} + /* * Split cur/level block in half. * Return new block number and the key to its first @@ -2716,7 +2876,7 @@ __xfs_btree_split( xfs_btree_buf_to_ptr(cur, lbp, &lptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); + error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat); if (error) goto error0; if (*stat == 0) @@ -2823,7 +2983,7 @@ __xfs_btree_split( } /* Update the parent high keys of the left block, if needed. 
*/ - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_update_keys(cur, level); if (error) goto error0; @@ -2941,7 +3101,7 @@ xfs_btree_split( struct xfs_btree_split_args args; DECLARE_COMPLETION_ONSTACK(done); - if (cur->bc_btnum != XFS_BTNUM_BMAP || + if (!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_tp->t_highest_agno == NULLAGNUMBER) return __xfs_btree_split(cur, level, ptrp, key, curp, stat); @@ -2963,7 +3123,6 @@ xfs_btree_split( #define xfs_btree_split __xfs_btree_split #endif /* __KERNEL__ */ - /* * Copy the old inode root contents into a real block and make the * broot point to it. @@ -2988,7 +3147,7 @@ xfs_btree_new_iroot( XFS_BTREE_STATS_INC(cur, newroot); - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); level = cur->bc_nlevels - 1; @@ -2996,7 +3155,7 @@ xfs_btree_new_iroot( pp = xfs_btree_ptr_addr(cur, 1, block); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); + error = xfs_btree_alloc_block(cur, pp, &nptr, stat); if (error) goto error0; if (*stat == 0) @@ -3014,9 +3173,9 @@ xfs_btree_new_iroot( * In that case have to also ensure the blkno remains correct */ memcpy(cblock, block, xfs_btree_block_len(cur)); - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (xfs_has_crc(cur->bc_mp)) { __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) cblock->bb_u.l.bb_blkno = bno; else cblock->bb_u.s.bb_blkno = bno; @@ -3069,6 +3228,21 @@ error0: return error; } +static void +xfs_btree_set_root( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) +{ + if (cur->bc_flags & XFS_BTREE_STAGING) { + /* Update the btree root information for a per-AG fake root. 
*/ + cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s); + cur->bc_ag.afake->af_levels += inc; + } else { + cur->bc_ops->set_root(cur, ptr, inc); + } +} + /* * Allocate a new root block, fill it in. */ @@ -3093,10 +3267,10 @@ xfs_btree_new_root( XFS_BTREE_STATS_INC(cur, newroot); /* initialise our start point from the cursor */ - cur->bc_ops->init_ptr_from_cur(cur, &rptr); + xfs_btree_init_ptr_from_cur(cur, &rptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); + error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); if (error) goto error0; if (*stat == 0) @@ -3109,7 +3283,7 @@ xfs_btree_new_root( goto error0; /* Set the root in the holding structure increasing the level by 1. */ - cur->bc_ops->set_root(cur, &lptr, 1); + xfs_btree_set_root(cur, &lptr, 1); /* * At the previous root level there are now two blocks: the old root, @@ -3213,8 +3387,7 @@ xfs_btree_make_block_unfull( { int error = 0; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - level == cur->bc_nlevels - 1) { + if (xfs_btree_at_iroot(cur, level)) { struct xfs_inode *ip = cur->bc_ino.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { @@ -3299,8 +3472,8 @@ xfs_btree_insrec( * If we have an external root pointer, and we've made it to the * root level, allocate a new root block and we're done. 
*/ - if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - (level >= cur->bc_nlevels)) { + if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE && + level >= cur->bc_nlevels) { error = xfs_btree_new_root(cur, stat); xfs_btree_set_ptr_null(cur, ptrp); @@ -3524,6 +3697,7 @@ xfs_btree_insert( } if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -3537,7 +3711,8 @@ xfs_btree_insert( if (pcur != cur && (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { /* Save the state from the cursor before we trash it */ - if (cur->bc_ops->update_cursor) + if (cur->bc_ops->update_cursor && + !(cur->bc_flags & XFS_BTREE_STAGING)) cur->bc_ops->update_cursor(pcur, cur); cur->bc_nlevels = pcur->bc_nlevels; xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); @@ -3586,7 +3761,7 @@ xfs_btree_kill_iroot( #endif int i; - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_nlevels > 1); /* @@ -3680,7 +3855,7 @@ xfs_btree_kill_root( * Update the root pointer, decreasing the level by 1 and then * free the old root. */ - cur->bc_ops->set_root(cur, newroot, -1); + xfs_btree_set_root(cur, newroot, -1); error = xfs_btree_free_block(cur, bp); if (error) @@ -3822,27 +3997,25 @@ xfs_btree_delrec( * Try to get rid of the next level down. If we can't then there's * nothing left to do. 
*/ - if (level == cur->bc_nlevels - 1) { - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { - xfs_iroot_realloc(cur->bc_ino.ip, -1, - cur->bc_ino.whichfork); + if (xfs_btree_at_iroot(cur, level)) { + xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); - error = xfs_btree_kill_iroot(cur); - if (error) - goto error0; + error = xfs_btree_kill_iroot(cur); + if (error) + goto error0; - error = xfs_btree_dec_cursor(cur, level, stat); - if (error) - goto error0; - *stat = 1; - return 0; - } + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + *stat = 1; + return 0; + } - /* - * If this is the root level, and there's only one entry left, - * and it's NOT the leaf level, then we can get rid of this - * level. - */ + /* + * If this is the root level, and there's only one entry left, and it's + * NOT the leaf level, then we can get rid of this level. + */ + if (level == cur->bc_nlevels - 1) { if (numrecs == 1 && level > 0) { union xfs_btree_ptr *pp; /* @@ -3891,7 +4064,7 @@ xfs_btree_delrec( xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { /* * One child of root, need to get a chance to copy its contents * into the root and delete it. 
Can't go up to next level, @@ -3931,6 +4104,7 @@ xfs_btree_delrec( */ i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -3939,12 +4113,14 @@ xfs_btree_delrec( if (error) goto error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -3992,6 +4168,7 @@ xfs_btree_delrec( if (!xfs_btree_ptr_is_null(cur, &lptr)) { i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4000,6 +4177,7 @@ xfs_btree_delrec( if (error) goto error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4017,6 +4195,7 @@ xfs_btree_delrec( */ i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4026,6 +4205,7 @@ xfs_btree_delrec( goto error0; i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4201,8 +4381,8 @@ xfs_btree_delrec( * If we joined with the right neighbor and there's a level above * us, increment the cursor at that level. */ - else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || - (level + 1 < cur->bc_nlevels)) { + else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE || + level + 1 < cur->bc_nlevels) { error = xfs_btree_increment(cur, level + 1, &i); if (error) goto error0; @@ -4270,7 +4450,7 @@ xfs_btree_delete( * If we combined blocks as part of deleting the record, delrec won't * have updated the parent high keys so we have to do that here. 
*/ - if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) { + if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) { error = xfs_btree_updkeys_force(cur, 0); if (error) goto error0; @@ -4344,7 +4524,7 @@ xfs_btree_visit_block( { struct xfs_btree_block *block; struct xfs_buf *bp; - union xfs_btree_ptr rptr; + union xfs_btree_ptr rptr, bufptr; int error; /* do right sibling readahead */ @@ -4367,15 +4547,12 @@ xfs_btree_visit_block( * return the same block without checking if the right sibling points * back to us and creates a cyclic reference in the btree. */ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, - xfs_buf_daddr(bp))) - return -EFSCORRUPTED; - } else { - if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, - xfs_buf_daddr(bp))) - return -EFSCORRUPTED; + xfs_btree_buf_to_ptr(cur, bp, &bufptr); + if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; } + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4393,7 +4570,7 @@ xfs_btree_visit_blocks( struct xfs_btree_block *block = NULL; int error = 0; - cur->bc_ops->init_ptr_from_cur(cur, &lptr); + xfs_btree_init_ptr_from_cur(cur, &lptr); /* for each level */ for (level = cur->bc_nlevels - 1; level >= 0; level--) { @@ -4471,7 +4648,7 @@ xfs_btree_block_change_owner( /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); @@ -4489,7 +4666,7 @@ xfs_btree_block_change_owner( * though, so everything is consistent in memory. 
*/ if (!bp) { - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(level == cur->bc_nlevels - 1); return 0; } @@ -4523,7 +4700,7 @@ xfs_btree_change_owner( /* Verify the v5 fields of a long-format btree block. */ xfs_failaddr_t -xfs_btree_lblock_v5hdr_verify( +xfs_btree_fsblock_v5hdr_verify( struct xfs_buf *bp, uint64_t owner) { @@ -4544,7 +4721,7 @@ xfs_btree_lblock_v5hdr_verify( /* Verify a long-format btree block. */ xfs_failaddr_t -xfs_btree_lblock_verify( +xfs_btree_fsblock_verify( struct xfs_buf *bp, unsigned int max_recs) { @@ -4553,28 +4730,60 @@ xfs_btree_lblock_verify( xfs_fsblock_t fsb; xfs_failaddr_t fa; + ASSERT(!xfs_buftarg_is_mem(bp->b_target)); + /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); if (!fa) - fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } +/* Verify an in-memory btree block. 
*/ +xfs_failaddr_t +xfs_btree_memblock_verify( + struct xfs_buf *bp, + unsigned int max_recs) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buftarg *btp = bp->b_target; + xfs_failaddr_t fa; + xfbno_t bno; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return __this_address; + + /* sibling pointer verification */ + bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_leftsib); + if (fa) + return fa; + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_rightsib); + if (fa) + return fa; + + return NULL; +} /** - * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format + * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format * btree block * * @bp: buffer containing the btree block */ xfs_failaddr_t -xfs_btree_sblock_v5hdr_verify( +xfs_btree_agblock_v5hdr_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; @@ -4593,13 +4802,13 @@ xfs_btree_sblock_v5hdr_verify( } /** - * xfs_btree_sblock_verify() -- verify a short-format btree block + * xfs_btree_agblock_verify() -- verify a short-format btree block * * @bp: buffer containing the btree block * @max_recs: maximum records allowed in this btree node */ xfs_failaddr_t -xfs_btree_sblock_verify( +xfs_btree_agblock_verify( struct xfs_buf *bp, unsigned int max_recs) { @@ -4608,16 +4817,18 @@ xfs_btree_sblock_verify( xfs_agblock_t agbno; xfs_failaddr_t fa; + ASSERT(!xfs_buftarg_is_mem(bp->b_target)); + /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, + fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(bp->b_pag, 
NULL, -1, agbno, + fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, block->bb_u.s.bb_rightsib); return fa; } @@ -4815,7 +5026,7 @@ xfs_btree_overlapped_query_range( /* Load the root of the btree. */ level = cur->bc_nlevels - 1; - cur->bc_ops->init_ptr_from_cur(cur, &ptr); + xfs_btree_init_ptr_from_cur(cur, &ptr); error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); if (error) return error; @@ -4966,7 +5177,7 @@ xfs_btree_query_range( if (!xfs_btree_keycmp_le(cur, &low_key, &high_key)) return -EINVAL; - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, @@ -5020,7 +5231,7 @@ xfs_btree_diff_two_ptrs( const union xfs_btree_ptr *a, const union xfs_btree_ptr *b) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); } @@ -5074,7 +5285,7 @@ xfs_btree_has_records_helper( key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key, &rec_key, info->key_mask); if (key_contig == XBTREE_KEY_OVERLAP && - !(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return -EFSCORRUPTED; if (key_contig == XBTREE_KEY_GAP) return -ECANCELED; @@ -5168,7 +5379,7 @@ xfs_btree_has_more_records( return true; /* There are more record blocks. */ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); else return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); @@ -5212,3 +5423,30 @@ xfs_btree_destroy_cur_caches(void) xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); } + +/* Move the btree cursor before the first record. 
*/ +int +xfs_btree_goto_left_edge( + struct xfs_btree_cur *cur) +{ + int stat = 0; + int error; + + memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) + return 0; + + error = xfs_btree_decrement(cur, 0, &stat); + if (error) + return error; + if (stat != 0) { + ASSERT(0); + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4d68a58be160..f93374278aa1 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -55,15 +55,8 @@ union xfs_btree_rec { #define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) #define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) -#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) -#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) -#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) -#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) -#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) -#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) -#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) - -uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); +struct xfs_btree_ops; +uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops); /* * For logging record fields. 
@@ -86,9 +79,11 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); * Generic stats interface */ #define XFS_BTREE_STATS_INC(cur, stat) \ - XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) + XFS_STATS_INC_OFF((cur)->bc_mp, \ + (cur)->bc_ops->statoff + __XBTS_ ## stat) #define XFS_BTREE_STATS_ADD(cur, stat, val) \ - XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) + XFS_STATS_ADD_OFF((cur)->bc_mp, \ + (cur)->bc_ops->statoff + __XBTS_ ## stat, val) enum xbtree_key_contig { XBTREE_KEY_GAP = 0, @@ -111,10 +106,37 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) return XBTREE_KEY_OVERLAP; } +#define XFS_BTREE_LONG_PTR_LEN (sizeof(__be64)) +#define XFS_BTREE_SHORT_PTR_LEN (sizeof(__be32)) + +enum xfs_btree_type { + XFS_BTREE_TYPE_AG, + XFS_BTREE_TYPE_INODE, + XFS_BTREE_TYPE_MEM, +}; + struct xfs_btree_ops { - /* size of the key and record structures */ - size_t key_len; - size_t rec_len; + const char *name; + + /* Type of btree - AG-rooted or inode-rooted */ + enum xfs_btree_type type; + + /* XFS_BTGEO_* flags that determine the geometry of the btree */ + unsigned int geom_flags; + + /* size of the key, pointer, and record structures */ + size_t key_len; + size_t ptr_len; + size_t rec_len; + + /* LRU refcount to set on each btree buffer created */ + unsigned int lru_refs; + + /* offset of btree stats array */ + unsigned int statoff; + + /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */ + unsigned int sick_mask; /* cursor operations */ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); @@ -199,6 +221,10 @@ struct xfs_btree_ops { const union xfs_btree_key *mask); }; +/* btree geometry flags */ +#define XFS_BTGEO_LASTREC_UPDATE (1U << 0) /* track last rec externally */ +#define XFS_BTGEO_OVERLAPPING (1U << 1) /* overlapping intervals */ + /* * Reasons for the update_lastrec method to be called. 
*/ @@ -215,39 +241,6 @@ union xfs_btree_irec { struct xfs_refcount_irec rc; }; -/* Per-AG btree information. */ -struct xfs_btree_cur_ag { - struct xfs_perag *pag; - union { - struct xfs_buf *agbp; - struct xbtree_afakeroot *afake; /* for staging cursor */ - }; - union { - struct { - unsigned int nr_ops; /* # record updates */ - unsigned int shape_changes; /* # of extent splits */ - } refc; - struct { - bool active; /* allocation cursor state */ - } abt; - }; -}; - -/* Btree-in-inode cursor information */ -struct xfs_btree_cur_ino { - struct xfs_inode *ip; - struct xbtree_ifakeroot *ifake; /* for staging cursor */ - int allocated; - short forksize; - char whichfork; - char flags; -/* We are converting a delalloc reservation */ -#define XFS_BTCUR_BMBT_WASDEL (1 << 0) - -/* For extent swap, ignore owner check in verifier */ -#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1) -}; - struct xfs_btree_level { /* buffer pointer */ struct xfs_buf *bp; @@ -272,21 +265,38 @@ struct xfs_btree_cur const struct xfs_btree_ops *bc_ops; struct kmem_cache *bc_cache; /* cursor cache */ unsigned int bc_flags; /* btree features - below */ - xfs_btnum_t bc_btnum; /* identifies which btree type */ union xfs_btree_irec bc_rec; /* current insert/search record value */ uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_maxlevels; /* maximum levels for this btree type */ - int bc_statoff; /* offset of btree stats array */ - /* - * Short btree pointers need an agno to be able to turn the pointers - * into physical addresses for IO, so the btree cursor switches between - * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the - * cursor. 
- */ + /* per-type information */ union { - struct xfs_btree_cur_ag bc_ag; - struct xfs_btree_cur_ino bc_ino; + struct { + struct xfs_inode *ip; + short forksize; + char whichfork; + struct xbtree_ifakeroot *ifake; /* for staging cursor */ + } bc_ino; + struct { + struct xfs_perag *pag; + struct xfs_buf *agbp; + struct xbtree_afakeroot *afake; /* for staging cursor */ + } bc_ag; + struct { + struct xfbtree *xfbtree; + struct xfs_perag *pag; + } bc_mem; + }; + + /* per-format private data */ + union { + struct { + int allocated; + } bc_bmap; /* bmapbt */ + struct { + unsigned int nr_ops; /* # record updates */ + unsigned int shape_changes; /* # of extent splits */ + } bc_refc; /* refcountbt */ }; /* Must be at the end of the struct! */ @@ -304,18 +314,22 @@ xfs_btree_cur_sizeof(unsigned int nlevels) return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels); } -/* cursor flags */ -#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ -#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ -#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ -#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ -#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ +/* cursor state flags */ /* * The root of this btree is a fakeroot structure so that we can stage a btree * rebuild without leaving it accessible via primary metadata. The ops struct * is dynamically allocated and must be freed when the cursor is deleted. 
*/ -#define XFS_BTREE_STAGING (1<<5) +#define XFS_BTREE_STAGING (1U << 0) + +/* We are converting a delalloc reservation (only for bmbt btrees) */ +#define XFS_BTREE_BMBT_WASDEL (1U << 1) + +/* For extent swap, ignore owner check in verifier (only for bmbt btrees) */ +#define XFS_BTREE_BMBT_INVALID_OWNER (1U << 2) + +/* Cursor is active (only for allocbt btrees) */ +#define XFS_BTREE_ALLOCBT_ACTIVE (1U << 3) #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 @@ -325,14 +339,10 @@ xfs_btree_cur_sizeof(unsigned int nlevels) */ #define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) -/* - * Internal long and short btree block checks. They return NULL if the - * block is ok or the address of the failed check otherwise. - */ -xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur, - struct xfs_btree_block *block, int level, struct xfs_buf *bp); -xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur, +xfs_failaddr_t __xfs_btree_check_block(struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp); +int __xfs_btree_check_ptr(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int index, int level); /* * Check that block header is ok. @@ -345,24 +355,6 @@ xfs_btree_check_block( struct xfs_buf *bp); /* buffer containing block, if any */ /* - * Check that (long) pointer is ok. - */ -bool /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_lptr( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_fsblock_t fsbno, /* btree block disk address */ - int level); /* btree block level */ - -/* - * Check that (short) pointer is ok. - */ -bool /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sptr( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t agbno, /* btree block disk address */ - int level); /* btree block level */ - -/* * Delete the btree cursor. 
*/ void @@ -392,63 +384,14 @@ xfs_btree_offsets( int *last); /* output: last byte offset */ /* - * Get a buffer for the block, return it read in. - * Long-form addressing. - */ -int /* error */ -xfs_btree_read_bufl( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - struct xfs_buf **bpp, /* buffer for fsbno */ - int refval, /* ref count value for buffer */ - const struct xfs_buf_ops *ops); - -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Long-form addressing. - */ -void /* error */ -xfs_btree_reada_bufl( - struct xfs_mount *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops); - -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Short-form addressing. - */ -void /* error */ -xfs_btree_reada_bufs( - struct xfs_mount *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops); - -/* * Initialise a new btree block header */ -void -xfs_btree_init_block( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_btnum_t btnum, - __u16 level, - __u16 numrecs, - __u64 owner); - -void -xfs_btree_init_block_int( - struct xfs_mount *mp, - struct xfs_btree_block *buf, - xfs_daddr_t blkno, - xfs_btnum_t btnum, - __u16 level, - __u16 numrecs, - __u64 owner, - unsigned int flags); +void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp, + const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, + __u64 owner); +void xfs_btree_init_block(struct xfs_mount *mp, + struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, + __u16 level, __u16 numrecs, __u64 owner); /* * Common btree core entry points. 
@@ -467,10 +410,10 @@ int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner, /* * btree block CRC helpers */ -void xfs_btree_lblock_calc_crc(struct xfs_buf *); -bool xfs_btree_lblock_verify_crc(struct xfs_buf *); -void xfs_btree_sblock_calc_crc(struct xfs_buf *); -bool xfs_btree_sblock_verify_crc(struct xfs_buf *); +void xfs_btree_fsblock_calc_crc(struct xfs_buf *); +bool xfs_btree_fsblock_verify_crc(struct xfs_buf *); +void xfs_btree_agblock_calc_crc(struct xfs_buf *); +bool xfs_btree_agblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. @@ -510,12 +453,14 @@ static inline int xfs_btree_get_level(const struct xfs_btree_block *block) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) -xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); -xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp, +xfs_failaddr_t xfs_btree_agblock_v5hdr_verify(struct xfs_buf *bp); +xfs_failaddr_t xfs_btree_agblock_verify(struct xfs_buf *bp, unsigned int max_recs); -xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, +xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp, uint64_t owner); -xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, +xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp, + unsigned int max_recs); +xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp, unsigned int max_recs); unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits, @@ -690,7 +635,7 @@ xfs_btree_islastblock( block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } @@ -700,6 +645,9 @@ void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, const 
union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp); +int xfs_btree_read_buf_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int flags, + struct xfs_btree_block **block, struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); @@ -711,21 +659,28 @@ void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, void xfs_btree_copy_keys(struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, const union xfs_btree_key *src_key, int numkeys); +void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); static inline struct xfs_btree_cur * xfs_btree_alloc_cursor( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_btnum_t btnum, + const struct xfs_btree_ops *ops, uint8_t maxlevels, struct kmem_cache *cache) { struct xfs_btree_cur *cur; - cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL); + ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN || + ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN); + + /* BMBT allocations can come through from non-transactional context. */ + cur = kmem_cache_zalloc(cache, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + cur->bc_ops = ops; cur->bc_tp = tp; cur->bc_mp = mp; - cur->bc_btnum = btnum; cur->bc_maxlevels = maxlevels; cur->bc_cache = cache; @@ -735,4 +690,16 @@ xfs_btree_alloc_cursor( int __init xfs_btree_init_cur_caches(void); void xfs_btree_destroy_cur_caches(void); +int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); + +/* Does this level of the cursor point to the inode root (and not a block)? 
*/ +static inline bool +xfs_btree_at_iroot( + const struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_ops->type == XFS_BTREE_TYPE_INODE && + level == cur->bc_nlevels - 1; +} + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c new file mode 100644 index 000000000000..036061fe32cc --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_mem.c @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_error.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_ag.h" +#include "xfs_buf_item.h" +#include "xfs_trace.h" + +/* Set the root of an in-memory btree. */ +void +xfbtree_set_root( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) +{ + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + cur->bc_mem.xfbtree->root = *ptr; + cur->bc_mem.xfbtree->nlevels += inc; +} + +/* Initialize a pointer from the in-memory btree header. */ +void +xfbtree_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + *ptr = cur->bc_mem.xfbtree->root; +} + +/* Duplicate an in-memory btree cursor. 
*/ +struct xfs_btree_cur * +xfbtree_dup_cursor( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_cur *ncur; + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ops, + cur->bc_maxlevels, cur->bc_cache); + ncur->bc_flags = cur->bc_flags; + ncur->bc_nlevels = cur->bc_nlevels; + ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree; + + if (cur->bc_mem.pag) + ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag); + + return ncur; +} + +/* Close the btree xfile and release all resources. */ +void +xfbtree_destroy( + struct xfbtree *xfbt) +{ + xfs_buftarg_drain(xfbt->target); +} + +/* Compute the number of bytes available for records. */ +static inline unsigned int +xfbtree_rec_bytes( + struct xfs_mount *mp, + const struct xfs_btree_ops *ops) +{ + return XMBUF_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; +} + +/* Initialize an empty leaf block as the btree root. */ +STATIC int +xfbtree_init_leaf_block( + struct xfs_mount *mp, + struct xfbtree *xfbt, + const struct xfs_btree_ops *ops) +{ + struct xfs_buf *bp; + xfbno_t bno = xfbt->highest_bno++; + int error; + + error = xfs_buf_get(xfbt->target, xfbno_to_daddr(bno), XFBNO_BBSIZE, + &bp); + if (error) + return error; + + trace_xfbtree_create_root_buf(xfbt, bp); + + bp->b_ops = ops->buf_ops; + xfs_btree_init_buf(mp, bp, ops, 0, 0, xfbt->owner); + xfs_buf_relse(bp); + + xfbt->root.l = cpu_to_be64(bno); + return 0; +} + +/* + * Create an in-memory btree root that can be used with the given xmbuf. + * Callers must set xfbt->owner. 
+ */ +int +xfbtree_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + const struct xfs_btree_ops *ops) +{ + unsigned int blocklen = xfbtree_rec_bytes(mp, ops); + unsigned int keyptr_len; + int error; + + /* Requires a long-format CRC-format btree */ + if (!xfs_has_crc(mp)) { + ASSERT(xfs_has_crc(mp)); + return -EINVAL; + } + if (ops->ptr_len != XFS_BTREE_LONG_PTR_LEN) { + ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN); + return -EINVAL; + } + + memset(xfbt, 0, sizeof(*xfbt)); + xfbt->target = btp; + + /* Set up min/maxrecs for this btree. */ + keyptr_len = ops->key_len + sizeof(__be64); + xfbt->maxrecs[0] = blocklen / ops->rec_len; + xfbt->maxrecs[1] = blocklen / keyptr_len; + xfbt->minrecs[0] = xfbt->maxrecs[0] / 2; + xfbt->minrecs[1] = xfbt->maxrecs[1] / 2; + xfbt->highest_bno = 0; + xfbt->nlevels = 1; + + /* Initialize the empty btree. */ + error = xfbtree_init_leaf_block(mp, xfbt, ops); + if (error) + goto err_freesp; + + trace_xfbtree_init(mp, xfbt, ops); + + return 0; + +err_freesp: + xfs_buftarg_drain(xfbt->target); + return error; +} + +/* Allocate a block to our in-memory btree. */ +int +xfbtree_alloc_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + xfbno_t bno = xfbt->highest_bno++; + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + trace_xfbtree_alloc_block(xfbt, cur, bno); + + /* Fail if the block address exceeds the maximum for the buftarg. */ + if (!xfbtree_verify_bno(xfbt, bno)) { + ASSERT(xfbtree_verify_bno(xfbt, bno)); + *stat = 0; + return 0; + } + + new->l = cpu_to_be64(bno); + *stat = 1; + return 0; +} + +/* Free a block from our in-memory btree. 
*/ +int +xfbtree_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + xfs_daddr_t daddr = xfs_buf_daddr(bp); + xfbno_t bno = xfs_daddr_to_xfbno(daddr); + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + trace_xfbtree_free_block(xfbt, cur, bno); + + if (bno + 1 == xfbt->highest_bno) + xfbt->highest_bno--; + + return 0; +} + +/* Return the minimum number of records for a btree block. */ +int +xfbtree_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + + return xfbt->minrecs[level != 0]; +} + +/* Return the maximum number of records for a btree block. */ +int +xfbtree_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + + return xfbt->maxrecs[level != 0]; +} + +/* If this log item is a buffer item that came from the xfbtree, return it. */ +static inline struct xfs_buf * +xfbtree_buf_match( + struct xfbtree *xfbt, + const struct xfs_log_item *lip) +{ + const struct xfs_buf_log_item *bli; + struct xfs_buf *bp; + + if (lip->li_type != XFS_LI_BUF) + return NULL; + + bli = container_of(lip, struct xfs_buf_log_item, bli_item); + bp = bli->bli_buf; + if (bp->b_target != xfbt->target) + return NULL; + + return bp; +} + +/* + * Commit changes to the incore btree immediately by writing all dirty xfbtree + * buffers to the backing xfile. This detaches all xfbtree buffers from the + * transaction, even on failure. The buffer locks are dropped between the + * delwri queue and submit, so the caller must synchronize btree access. + * + * Normally we'd let the buffers commit with the transaction and get written to + * the xfile via the log, but online repair stages ephemeral btrees in memory + * and uses the btree_staging functions to write new btrees to disk atomically. 
+ * The in-memory btree (and its backing store) are discarded at the end of the + * repair phase, which means that xfbtree buffers cannot commit with the rest + * of a transaction. + * + * In other words, online repair only needs the transaction to collect buffer + * pointers and to avoid buffer deadlocks, not to guarantee consistency of + * updates. + */ +int +xfbtree_trans_commit( + struct xfbtree *xfbt, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *n; + bool tp_dirty = false; + int error = 0; + + /* + * For each xfbtree buffer attached to the transaction, write the dirty + * buffers to the xfile and release them. + */ + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip); + + if (!bp) { + if (test_bit(XFS_LI_DIRTY, &lip->li_flags)) + tp_dirty |= true; + continue; + } + + trace_xfbtree_trans_commit_buf(xfbt, bp); + + xmbuf_trans_bdetach(tp, bp); + + /* + * If the buffer fails verification, note the failure but + * continue walking the transaction items so that we remove all + * ephemeral btree buffers. + */ + if (!error) + error = xmbuf_finalize(bp); + + xfs_buf_relse(bp); + } + + /* + * Reset the transaction's dirty flag to reflect the dirty state of the + * log items that are still attached. + */ + tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) | + (tp_dirty ? XFS_TRANS_DIRTY : 0); + + return error; +} + +/* + * Cancel changes to the incore btree by detaching all the xfbtree buffers. + * Changes are not undone, so callers must not access the btree ever again. 
+ */ +void +xfbtree_trans_cancel( + struct xfbtree *xfbt, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *n; + bool tp_dirty = false; + + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip); + + if (!bp) { + if (test_bit(XFS_LI_DIRTY, &lip->li_flags)) + tp_dirty |= true; + continue; + } + + trace_xfbtree_trans_cancel_buf(xfbt, bp); + + xmbuf_trans_bdetach(tp, bp); + xfs_buf_relse(bp); + } + + /* + * Reset the transaction's dirty flag to reflect the dirty state of the + * log items that are still attached. + */ + tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) | + (tp_dirty ? XFS_TRANS_DIRTY : 0); +} diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h new file mode 100644 index 000000000000..1c3825786ec8 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_mem.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_BTREE_MEM_H__ +#define __XFS_BTREE_MEM_H__ + +typedef uint64_t xfbno_t; + +#define XFBNO_BLOCKSIZE (XMBUF_BLOCKSIZE) +#define XFBNO_BBSHIFT (XMBUF_BLOCKSHIFT - BBSHIFT) +#define XFBNO_BBSIZE (XFBNO_BLOCKSIZE >> BBSHIFT) + +static inline xfs_daddr_t xfbno_to_daddr(xfbno_t blkno) +{ + return blkno << XFBNO_BBSHIFT; +} + +static inline xfbno_t xfs_daddr_to_xfbno(xfs_daddr_t daddr) +{ + return daddr >> XFBNO_BBSHIFT; +} + +struct xfbtree { + /* buffer cache target for this in-memory btree */ + struct xfs_buftarg *target; + + /* Highest block number that has been written to. */ + xfbno_t highest_bno; + + /* Owner of this btree. */ + unsigned long long owner; + + /* Btree header */ + union xfs_btree_ptr root; + unsigned int nlevels; + + /* Minimum and maximum records per block. 
*/ + unsigned int maxrecs[2]; + unsigned int minrecs[2]; +}; + +#ifdef CONFIG_XFS_BTREE_IN_MEM +static inline bool xfbtree_verify_bno(struct xfbtree *xfbt, xfbno_t bno) +{ + return xmbuf_verify_daddr(xfbt->target, xfbno_to_daddr(bno)); +} + +void xfbtree_set_root(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int inc); +void xfbtree_init_ptr_from_cur(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); +struct xfs_btree_cur *xfbtree_dup_cursor(struct xfs_btree_cur *cur); + +int xfbtree_get_minrecs(struct xfs_btree_cur *cur, int level); +int xfbtree_get_maxrecs(struct xfs_btree_cur *cur, int level); + +int xfbtree_alloc_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, union xfs_btree_ptr *ptr, + int *stat); +int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp); + +/* Callers must set xfbt->target and xfbt->owner before calling this */ +int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt, + struct xfs_buftarg *btp, const struct xfs_btree_ops *ops); +void xfbtree_destroy(struct xfbtree *xfbt); + +int xfbtree_trans_commit(struct xfbtree *xfbt, struct xfs_trans *tp); +void xfbtree_trans_cancel(struct xfbtree *xfbt, struct xfs_trans *tp); +#else +# define xfbtree_verify_bno(...) (false) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +#endif /* __XFS_BTREE_MEM_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index dd75e208b543..694929703152 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -39,63 +39,6 @@ */ /* - * Don't allow staging cursors to be duplicated because they're supposed to be - * kept private to a single thread. - */ -STATIC struct xfs_btree_cur * -xfs_btree_fakeroot_dup_cursor( - struct xfs_btree_cur *cur) -{ - ASSERT(0); - return NULL; -} - -/* - * Don't allow block allocation for a staging cursor, because staging cursors - * do not support regular btree modifications. 
- * - * Bulk loading uses a separate callback to obtain new blocks from a - * preallocated list, which prevents ENOSPC failures during loading. - */ -STATIC int -xfs_btree_fakeroot_alloc_block( - struct xfs_btree_cur *cur, - const union xfs_btree_ptr *start_bno, - union xfs_btree_ptr *new_bno, - int *stat) -{ - ASSERT(0); - return -EFSCORRUPTED; -} - -/* - * Don't allow block freeing for a staging cursor, because staging cursors - * do not support regular btree modifications. - */ -STATIC int -xfs_btree_fakeroot_free_block( - struct xfs_btree_cur *cur, - struct xfs_buf *bp) -{ - ASSERT(0); - return -EFSCORRUPTED; -} - -/* Initialize a pointer to the root block from the fakeroot. */ -STATIC void -xfs_btree_fakeroot_init_ptr_from_cur( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) -{ - struct xbtree_afakeroot *afake; - - ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - - afake = cur->bc_ag.afake; - ptr->s = cpu_to_be32(afake->af_root); -} - -/* * Bulk Loading for AG Btrees * ========================== * @@ -109,47 +52,20 @@ xfs_btree_fakeroot_init_ptr_from_cur( * cursor into a regular btree cursor. */ -/* Update the btree root information for a per-AG fake root. */ -STATIC void -xfs_btree_afakeroot_set_root( - struct xfs_btree_cur *cur, - const union xfs_btree_ptr *ptr, - int inc) -{ - struct xbtree_afakeroot *afake = cur->bc_ag.afake; - - ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - afake->af_root = be32_to_cpu(ptr->s); - afake->af_levels += inc; -} - /* * Initialize a AG-rooted btree cursor with the given AG btree fake root. - * The btree cursor's bc_ops will be overridden as needed to make the staging - * functionality work. 
*/ void xfs_btree_stage_afakeroot( struct xfs_btree_cur *cur, struct xbtree_afakeroot *afake) { - struct xfs_btree_ops *nops; - ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); - ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); + ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_tp == NULL); - nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); - memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); - nops->alloc_block = xfs_btree_fakeroot_alloc_block; - nops->free_block = xfs_btree_fakeroot_free_block; - nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; - nops->set_root = xfs_btree_afakeroot_set_root; - nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; - cur->bc_ag.afake = afake; cur->bc_nlevels = afake->af_levels; - cur->bc_ops = nops; cur->bc_flags |= XFS_BTREE_STAGING; } @@ -163,17 +79,15 @@ void xfs_btree_commit_afakeroot( struct xfs_btree_cur *cur, struct xfs_trans *tp, - struct xfs_buf *agbp, - const struct xfs_btree_ops *ops) + struct xfs_buf *agbp) { ASSERT(cur->bc_flags & XFS_BTREE_STAGING); ASSERT(cur->bc_tp == NULL); trace_xfs_btree_commit_afakeroot(cur); - kmem_free((void *)cur->bc_ops); + cur->bc_ag.afake = NULL; cur->bc_ag.agbp = agbp; - cur->bc_ops = ops; cur->bc_flags &= ~XFS_BTREE_STAGING; cur->bc_tp = tp; } @@ -211,29 +125,16 @@ xfs_btree_commit_afakeroot( void xfs_btree_stage_ifakeroot( struct xfs_btree_cur *cur, - struct xbtree_ifakeroot *ifake, - struct xfs_btree_ops **new_ops) + struct xbtree_ifakeroot *ifake) { - struct xfs_btree_ops *nops; - ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_tp == NULL); - nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); - memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); - nops->alloc_block = xfs_btree_fakeroot_alloc_block; - nops->free_block = xfs_btree_fakeroot_free_block; - nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; 
- nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; - cur->bc_ino.ifake = ifake; cur->bc_nlevels = ifake->if_levels; - cur->bc_ops = nops; + cur->bc_ino.forksize = ifake->if_fork_size; cur->bc_flags |= XFS_BTREE_STAGING; - - if (new_ops) - *new_ops = nops; } /* @@ -246,18 +147,15 @@ void xfs_btree_commit_ifakeroot( struct xfs_btree_cur *cur, struct xfs_trans *tp, - int whichfork, - const struct xfs_btree_ops *ops) + int whichfork) { ASSERT(cur->bc_flags & XFS_BTREE_STAGING); ASSERT(cur->bc_tp == NULL); trace_xfs_btree_commit_ifakeroot(cur); - kmem_free((void *)cur->bc_ops); cur->bc_ino.ifake = NULL; cur->bc_ino.whichfork = whichfork; - cur->bc_ops = ops; cur->bc_flags &= ~XFS_BTREE_STAGING; cur->bc_tp = tp; } @@ -333,20 +231,41 @@ xfs_btree_commit_ifakeroot( /* * Put a btree block that we're loading onto the ordered list and release it. * The btree blocks will be written to disk when bulk loading is finished. + * If we reach the dirty buffer threshold, flush them to disk before + * continuing. */ -static void +static int xfs_btree_bload_drop_buf( - struct list_head *buffers_list, - struct xfs_buf **bpp) + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + struct xfs_buf **bpp) { - if (*bpp == NULL) - return; + struct xfs_buf *bp = *bpp; + int error; + + if (!bp) + return 0; - if (!xfs_buf_delwri_queue(*bpp, buffers_list)) - ASSERT(0); + /* + * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent + * xfs_buf_read will not pointlessly reread the contents from the disk. 
+ */ + bp->b_flags |= XBF_DONE; - xfs_buf_relse(*bpp); + xfs_buf_delwri_queue_here(bp, buffers_list); + xfs_buf_relse(bp); *bpp = NULL; + bbl->nr_dirty++; + + if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty) + return 0; + + error = xfs_buf_delwri_submit(buffers_list); + if (error) + return error; + + bbl->nr_dirty = 0; + return 0; } /* @@ -376,23 +295,20 @@ xfs_btree_bload_prep_block( struct xfs_btree_block *new_block; int ret; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && - level == cur->bc_nlevels - 1) { + if (xfs_btree_at_iroot(cur, level)) { struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); size_t new_size; ASSERT(*bpp == NULL); /* Allocate a new incore btree root block. */ - new_size = bbl->iroot_size(cur, nr_this_block, priv); - ifp->if_broot = kmem_zalloc(new_size, 0); + new_size = bbl->iroot_size(cur, level, nr_this_block, priv); + ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); ifp->if_broot_bytes = (int)new_size; /* Initialize it and send it out. */ - xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, - XFS_BUF_DADDR_NULL, cur->bc_btnum, level, - nr_this_block, cur->bc_ino.ip->i_ino, - cur->bc_flags); + xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops, + level, nr_this_block, cur->bc_ino.ip->i_ino); *bpp = NULL; *blockp = ifp->if_broot; @@ -418,7 +334,10 @@ xfs_btree_bload_prep_block( */ if (*blockp) xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); - xfs_btree_bload_drop_buf(buffers_list, bpp); + + ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp); + if (ret) + return ret; /* Initialize the new btree block. */ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); @@ -436,22 +355,19 @@ STATIC int xfs_btree_bload_leaf( struct xfs_btree_cur *cur, unsigned int recs_this_block, - xfs_btree_bload_get_record_fn get_record, + xfs_btree_bload_get_records_fn get_records, struct xfs_btree_block *block, void *priv) { - unsigned int j; + unsigned int j = 1; int ret; /* Fill the leaf block with records. 
*/ - for (j = 1; j <= recs_this_block; j++) { - union xfs_btree_rec *block_rec; - - ret = get_record(cur, priv); - if (ret) + while (j <= recs_this_block) { + ret = get_records(cur, j, block, recs_this_block - j + 1, priv); + if (ret < 0) return ret; - block_rec = xfs_btree_rec_addr(cur, j, block); - cur->bc_ops->init_rec_from_cur(cur, block_rec); + j += ret; } return 0; @@ -485,7 +401,12 @@ xfs_btree_bload_node( ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); - ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + /* + * Read the lower-level block in case the buffer for it has + * been reclaimed. LRU refs will be set on the block, which is + * desirable if the new btree commits. + */ + ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block, &child_bp); if (ret) return ret; @@ -570,7 +491,14 @@ xfs_btree_bload_level_geometry( unsigned int desired_npb; unsigned int maxnr; - maxnr = cur->bc_ops->get_maxrecs(cur, level); + /* + * Compute the absolute maximum number of records that we can store in + * the ondisk block or inode root. 
+ */ + if (cur->bc_ops->get_dmaxrecs) + maxnr = cur->bc_ops->get_dmaxrecs(cur, level); + else + maxnr = cur->bc_ops->get_maxrecs(cur, level); /* * Compute the number of blocks we need to fill each block with the @@ -671,7 +599,7 @@ xfs_btree_bload_compute_geometry( xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &level_blocks, &dontcare64); - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { /* * If all the items we want to store at this level * would fit in the inode root block, then we have our @@ -730,7 +658,7 @@ xfs_btree_bload_compute_geometry( return -EOVERFLOW; bbl->btree_height = cur->bc_nlevels; - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) bbl->nr_blocks = nr_blocks - 1; else bbl->nr_blocks = nr_blocks; @@ -764,6 +692,7 @@ xfs_btree_bload( cur->bc_nlevels = bbl->btree_height; xfs_btree_set_ptr_null(cur, &child_ptr); xfs_btree_set_ptr_null(cur, &ptr); + bbl->nr_dirty = 0; xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &blocks, &blocks_with_extra); @@ -789,7 +718,7 @@ xfs_btree_bload( trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, nr_this_block); - ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records, block, priv); if (ret) goto out; @@ -802,7 +731,10 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; /* Populate the internal btree nodes. 
*/ for (level = 1; level < cur->bc_nlevels; level++) { @@ -844,12 +776,16 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); } /* Initialize the new root. */ - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); cur->bc_ino.ifake->if_levels = cur->bc_nlevels; cur->bc_ino.ifake->if_blocks = total_blocks - 1; diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index f0d2976050ae..0c9c2ffb127a 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -22,7 +22,7 @@ struct xbtree_afakeroot { void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur, struct xbtree_afakeroot *afake); void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, - struct xfs_buf *agbp, const struct xfs_btree_ops *ops); + struct xfs_buf *agbp); /* Fake root for an inode-rooted btree. */ struct xbtree_ifakeroot { @@ -37,35 +37,33 @@ struct xbtree_ifakeroot { /* Number of bytes available for this fork in the inode. */ unsigned int if_fork_size; - - /* Fork format. */ - unsigned int if_format; - - /* Number of records. */ - unsigned int if_extents; }; /* Cursor interactions with fake roots for inode-rooted btrees. */ void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur, - struct xbtree_ifakeroot *ifake, - struct xfs_btree_ops **new_ops); + struct xbtree_ifakeroot *ifake); void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, - int whichfork, const struct xfs_btree_ops *ops); + int whichfork); /* Bulk loading of staged btrees. 
*/ -typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur, + unsigned int idx, struct xfs_btree_block *block, + unsigned int nr_wanted, void *priv); typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, void *priv); typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, - unsigned int nr_this_level, void *priv); + unsigned int level, unsigned int nr_this_level, void *priv); struct xfs_btree_bload { /* - * This function will be called nr_records times to load records into - * the btree. The function does this by setting the cursor's bc_rec - * field in in-core format. Records must be returned in sort order. + * This function will be called to load @nr_wanted records into the + * btree. The implementation does this by setting the cursor's bc_rec + * field in in-core format and using init_rec_from_cur to set the + * records in the btree block. Records must be returned in sort order. + * The function must return the number of records loaded or the usual + * negative errno. */ - xfs_btree_bload_get_record_fn get_record; + xfs_btree_bload_get_records_fn get_records; /* * This function will be called nr_blocks times to obtain a pointer @@ -77,8 +75,7 @@ struct xfs_btree_bload { /* * This function should return the size of the in-core btree root - * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree - * types. + * block. It is only necessary for XFS_BTREE_TYPE_INODE btrees. */ xfs_btree_bload_iroot_size_fn iroot_size; @@ -113,6 +110,16 @@ struct xfs_btree_bload { * height of the new btree. */ unsigned int btree_height; + + /* + * Flush the new btree block buffer list to disk after this many blocks + * have been formatted. Zero prohibits writing any buffers until all + * blocks have been formatted. + */ + uint16_t max_dirty; + + /* Number of dirty buffers. 
*/ + uint16_t nr_dirty; }; int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e576560b46e9..718d071bb21a 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -23,6 +23,7 @@ #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_errortag.h" +#include "xfs_health.h" /* * xfs_da_btree.c @@ -85,7 +86,8 @@ xfs_da_state_alloc( { struct xfs_da_state *state; - state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL); + state = kmem_cache_zalloc(xfs_da_state_cache, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); state->args = args; state->mp = args->dp->i_mount; return state; @@ -352,6 +354,8 @@ const struct xfs_buf_ops xfs_da3_node_buf_ops = { static int xfs_da3_node_set_type( struct xfs_trans *tp, + struct xfs_inode *dp, + int whichfork, struct xfs_buf *bp) { struct xfs_da_blkinfo *info = bp->b_addr; @@ -373,6 +377,7 @@ xfs_da3_node_set_type( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp, info, sizeof(*info)); xfs_trans_brelse(tp, bp); + xfs_dirattr_mark_sick(dp, whichfork); return -EFSCORRUPTED; } } @@ -391,7 +396,7 @@ xfs_da3_node_read( &xfs_da3_node_buf_ops); if (error || !*bpp || !tp) return error; - return xfs_da3_node_set_type(tp, *bpp); + return xfs_da3_node_set_type(tp, dp, whichfork, *bpp); } int @@ -408,6 +413,8 @@ xfs_da3_node_read_mapped( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno, XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0, bpp, &xfs_da3_node_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(dp, whichfork); if (error || !*bpp) return error; @@ -418,7 +425,26 @@ xfs_da3_node_read_mapped( if (!tp) return 0; - return xfs_da3_node_set_type(tp, *bpp); + return xfs_da3_node_set_type(tp, dp, whichfork, *bpp); +} + +/* + * Copy src directory/attr leaf/node buffer to the dst. + * For v5 file systems make sure the right blkno is stamped in. 
+ */ +void +xfs_da_buf_copy( + struct xfs_buf *dst, + struct xfs_buf *src, + size_t size) +{ + struct xfs_da3_blkinfo *da3 = dst->b_addr; + + memcpy(dst->b_addr, src->b_addr, size); + dst->b_ops = src->b_ops; + xfs_trans_buf_copy_type(dst, src); + if (xfs_has_crc(dst->b_mount)) + da3->blkno = cpu_to_be64(xfs_buf_daddr(dst)); } /*======================================================================== @@ -612,6 +638,7 @@ xfs_da3_split( if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { xfs_buf_mark_corrupt(oldblk->bp); + xfs_da_mark_sick(state->args); error = -EFSCORRUPTED; goto out; } @@ -625,6 +652,7 @@ xfs_da3_split( if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { xfs_buf_mark_corrupt(oldblk->bp); + xfs_da_mark_sick(state->args); error = -EFSCORRUPTED; goto out; } @@ -690,12 +718,6 @@ xfs_da3_root_split( btree = icnodehdr.btree; size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); level = icnodehdr.level; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. - */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { struct xfs_dir3_icleaf_hdr leafhdr; @@ -707,31 +729,17 @@ xfs_da3_root_split( size = (int)((char *)&leafhdr.ents[leafhdr.count] - (char *)leaf); level = 0; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. - */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } /* - * we can copy most of the information in the node from one block to - * another, but for CRC enabled headers we have to make sure that the - * block specific identifiers are kept intact. We update the buffer - * directly for this. + * Copy old root to new buffer and log it. 
*/ - memcpy(node, oldroot, size); - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || - oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { - struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; - - node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - } + xfs_da_buf_copy(bp, blk1->bp, size); xfs_trans_log_buf(tp, bp, 0, size - 1); - bp->b_ops = blk1->bp->b_ops; - xfs_trans_buf_copy_type(bp, blk1->bp); + /* + * Update blk1 to point to new buffer. + */ blk1->bp = bp; blk1->blkno = blkno; @@ -1220,21 +1228,14 @@ xfs_da3_root_join( xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* - * This could be copying a leaf back into the root block in the case of - * there only being a single leaf block left in the tree. Hence we have - * to update the b_ops pointer as well to match the buffer type change - * that could occur. For dir3 blocks we also need to update the block - * number in the buffer header. + * Copy child to root buffer and log it. */ - memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); - root_blk->bp->b_ops = bp->b_ops; - xfs_trans_buf_copy_type(root_blk->bp, bp); - if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { - struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; - da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp)); - } + xfs_da_buf_copy(root_blk->bp, bp, args->geo->blksize); xfs_trans_log_buf(args->trans, root_blk->bp, 0, args->geo->blksize - 1); + /* + * Now we can drop the child buffer. + */ error = xfs_da_shrink_inode(args, child, bp); return error; } @@ -1643,6 +1644,7 @@ xfs_da3_node_lookup_int( if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { xfs_buf_mark_corrupt(blk->bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -1658,6 +1660,7 @@ xfs_da3_node_lookup_int( /* Tree taller than we can handle; bail out! 
*/ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { xfs_buf_mark_corrupt(blk->bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -1666,6 +1669,7 @@ xfs_da3_node_lookup_int( expected_level = nodehdr.level - 1; else if (expected_level != nodehdr.level) { xfs_buf_mark_corrupt(blk->bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } else expected_level--; @@ -1717,12 +1721,16 @@ xfs_da3_node_lookup_int( } /* We can't point back to the root. */ - if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) + if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } } - if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) + if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } /* * A leaf block that ends in the hashval that we are interested in @@ -1740,6 +1748,7 @@ xfs_da3_node_lookup_int( args->blkno = blk->blkno; } else { ASSERT(0); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } if (((retval == -ENOENT) || (retval == -ENOATTR)) && @@ -2190,7 +2199,8 @@ xfs_da_grow_inode_int( * If we didn't get it and the block might work if fragmented, * try without the CONTIG flag. Loop until we get it all. */ - mapp = kmem_alloc(sizeof(*mapp) * count, 0); + mapp = kmalloc(sizeof(*mapp) * count, + GFP_KERNEL | __GFP_NOFAIL); for (b = *bno, mapi = 0; b < *bno + count; ) { c = (int)(*bno + count - b); nmap = min(XFS_BMAP_MAX_NMAP, c); @@ -2227,7 +2237,7 @@ xfs_da_grow_inode_int( out_free_map: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } @@ -2305,8 +2315,10 @@ xfs_da3_swap_lastblock( error = xfs_bmap_last_before(tp, dp, &lastoff, w); if (error) return error; - if (XFS_IS_CORRUPT(mp, lastoff == 0)) + if (XFS_IS_CORRUPT(mp, lastoff == 0)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } /* * Read the last block in the btree space. */ @@ -2317,9 +2329,10 @@ xfs_da3_swap_lastblock( /* * Copy the last block into the dead buffer and log it. 
*/ - memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + xfs_da_buf_copy(dead_buf, last_buf, args->geo->blksize); xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; + /* * Get values from the moved block. */ @@ -2355,6 +2368,7 @@ xfs_da3_swap_lastblock( if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->forw) != last_blkno || sib_info->magic != dead_info->magic)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2375,6 +2389,7 @@ xfs_da3_swap_lastblock( if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->back) != last_blkno || sib_info->magic != dead_info->magic)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2397,6 +2412,7 @@ xfs_da3_swap_lastblock( xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, level >= 0 && level != par_hdr.level + 1)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2408,6 +2424,7 @@ xfs_da3_swap_lastblock( entno++) continue; if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2433,6 +2450,7 @@ xfs_da3_swap_lastblock( xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (XFS_IS_CORRUPT(mp, par_blkno == 0)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2442,6 +2460,7 @@ xfs_da3_swap_lastblock( par_node = par_buf->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2525,7 +2544,8 @@ xfs_dabuf_map( int error = 0, nirecs, i; if (nfsb > 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS); + irecs = kzalloc(sizeof(irec) * nfsb, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); nirecs = nfsb; error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs, @@ -2538,7 +2558,8 @@ xfs_dabuf_map( * larger one that needs to be free by the caller. 
*/ if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS); + map = kzalloc(nirecs * sizeof(struct xfs_buf_map), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); if (!map) { error = -ENOMEM; goto out_free_irecs; @@ -2564,12 +2585,13 @@ xfs_dabuf_map( *nmaps = nirecs; out_free_irecs: if (irecs != &irec) - kmem_free(irecs); + kfree(irecs); return error; invalid_mapping: /* Caller ok with no mapping. */ if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) { + xfs_dirattr_mark_sick(dp, whichfork); error = -EFSCORRUPTED; if (xfs_error_level >= XFS_ERRLEVEL_LOW) { xfs_alert(mp, "%s: bno %u inode %llu", @@ -2620,7 +2642,7 @@ xfs_da_get_buf( out_free: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } @@ -2651,6 +2673,8 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0, &bp, ops); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(dp, whichfork); if (error) goto out_free; @@ -2661,7 +2685,7 @@ xfs_da_read_buf( *bpp = bp; out_free: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } @@ -2692,7 +2716,7 @@ xfs_da_reada_buf( out_free: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ffa3df5b2893..706baf36e175 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -219,6 +219,8 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); +void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src, + size_t size); uint xfs_da_hashname(const uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index f9015f88eca7..060e5c96b70f 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ 
b/fs/xfs/libxfs/xfs_da_format.h @@ -159,6 +159,17 @@ struct xfs_da3_intnode { #define XFS_DIR3_FT_MAX 9 +#define XFS_DIR3_FTYPE_STR \ + { XFS_DIR3_FT_UNKNOWN, "unknown" }, \ + { XFS_DIR3_FT_REG_FILE, "file" }, \ + { XFS_DIR3_FT_DIR, "directory" }, \ + { XFS_DIR3_FT_CHRDEV, "char" }, \ + { XFS_DIR3_FT_BLKDEV, "block" }, \ + { XFS_DIR3_FT_FIFO, "fifo" }, \ + { XFS_DIR3_FT_SOCK, "sock" }, \ + { XFS_DIR3_FT_SYMLINK, "symlink" }, \ + { XFS_DIR3_FT_WHT, "whiteout" } + /* * Byte offset in data block and shortform entry. */ @@ -578,20 +589,25 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) #define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ /* - * Entries are packed toward the top as tight as possible. - */ -struct xfs_attr_shortform { - struct xfs_attr_sf_hdr { /* constant-structure header block */ - __be16 totsize; /* total bytes in shortform list */ - __u8 count; /* count of active entries */ - __u8 padding; - } hdr; - struct xfs_attr_sf_entry { - uint8_t namelen; /* actual length of name (no NULL) */ - uint8_t valuelen; /* actual length of value (no NULL) */ - uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - uint8_t nameval[]; /* name & value bytes concatenated */ - } list[]; /* variable sized array */ + * Attribute storage when stored inside the inode. + * + * Small attribute lists are packed as tightly as possible so as to fit into the + * literal area of the inode. + * + * These "shortform" attribute forks consist of a single xfs_attr_sf_hdr header + * followed by zero or more xfs_attr_sf_entry structures. 
+ */ +struct xfs_attr_sf_hdr { /* constant-structure header block */ + __be16 totsize; /* total bytes in shortform list */ + __u8 count; /* count of active entries */ + __u8 padding; +}; + +struct xfs_attr_sf_entry { + __u8 namelen; /* actual length of name (no NULL) */ + __u8 valuelen; /* actual length of value (no NULL) */ + __u8 flags; /* flags bits (XFS_ATTR_*) */ + __u8 nameval[]; /* name & value bytes concatenated */ }; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index bcfb6a4203cd..c13276095cc0 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -26,6 +26,7 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" +#include "xfs_trans_priv.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -181,16 +182,89 @@ static struct kmem_cache *xfs_defer_pending_cache; * Note that the continuation requested between t2 and t3 is likely to * reoccur. */ +STATIC struct xfs_log_item * +xfs_defer_barrier_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return NULL; +} + +STATIC void +xfs_defer_barrier_abort_intent( + struct xfs_log_item *intent) +{ + /* empty */ +} -static const struct xfs_defer_op_type *defer_op_types[] = { - [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type, - [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, - [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, - [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, +STATIC struct xfs_log_item * +xfs_defer_barrier_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + return NULL; +} + +STATIC int +xfs_defer_barrier_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) 
+{ + ASSERT(0); + return -EFSCORRUPTED; +} + +STATIC void +xfs_defer_barrier_cancel_item( + struct list_head *item) +{ + ASSERT(0); +} + +static const struct xfs_defer_op_type xfs_barrier_defer_type = { + .max_items = 1, + .create_intent = xfs_defer_barrier_create_intent, + .abort_intent = xfs_defer_barrier_abort_intent, + .create_done = xfs_defer_barrier_create_done, + .finish_item = xfs_defer_barrier_finish_item, + .cancel_item = xfs_defer_barrier_cancel_item, }; +/* Create a log intent done item for a log intent item. */ +static inline void +xfs_defer_create_done( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + struct xfs_log_item *lip; + + /* If there is no log intent item, there can be no log done item. */ + if (!dfp->dfp_intent) + return; + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the log intent item and frees the log done item + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + if (!lip) + return; + + tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); + dfp->dfp_done = lip; +} + /* * Ensure there's a log intent item associated with this deferred work item if * the operation must be restarted on crash. 
Returns 1 if there's a log item; @@ -202,18 +276,21 @@ xfs_defer_create_intent( struct xfs_defer_pending *dfp, bool sort) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct xfs_log_item *lip; if (dfp->dfp_intent) return 1; - lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, + sort); if (!lip) return 0; if (IS_ERR(lip)) return PTR_ERR(lip); + tp->t_flags |= XFS_TRANS_DIRTY; + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_intent = lip; return 1; } @@ -245,26 +322,60 @@ xfs_defer_create_intents( return ret; } +static inline void +xfs_defer_pending_abort( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + trace_xfs_defer_pending_abort(mp, dfp); + + if (dfp->dfp_intent && !dfp->dfp_done) { + dfp->dfp_ops->abort_intent(dfp->dfp_intent); + dfp->dfp_intent = NULL; + } +} + +static inline void +xfs_defer_pending_cancel_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + struct list_head *pwi; + struct list_head *n; + + trace_xfs_defer_cancel_list(mp, dfp); + + list_del(&dfp->dfp_list); + list_for_each_safe(pwi, n, &dfp->dfp_work) { + list_del(pwi); + dfp->dfp_count--; + trace_xfs_defer_cancel_item(mp, dfp, pwi); + dfp->dfp_ops->cancel_item(pwi); + } + ASSERT(dfp->dfp_count == 0); + kmem_cache_free(xfs_defer_pending_cache, dfp); +} + +STATIC void +xfs_defer_pending_abort_list( + struct xfs_mount *mp, + struct list_head *dop_list) +{ + struct xfs_defer_pending *dfp; + + /* Abort intent items that don't have a done item. */ + list_for_each_entry(dfp, dop_list, dfp_list) + xfs_defer_pending_abort(mp, dfp); +} + /* Abort all the intents that were committed. 
*/ STATIC void xfs_defer_trans_abort( struct xfs_trans *tp, struct list_head *dop_pending) { - struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; - trace_xfs_defer_trans_abort(tp, _RET_IP_); - - /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_pending, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(tp->t_mountp, dfp); - if (dfp->dfp_intent && !dfp->dfp_done) { - ops->abort_intent(dfp->dfp_intent); - dfp->dfp_intent = NULL; - } - } + xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); } /* @@ -382,27 +493,31 @@ xfs_defer_cancel_list( { struct xfs_defer_pending *dfp; struct xfs_defer_pending *pli; - struct list_head *pwi; - struct list_head *n; - const struct xfs_defer_op_type *ops; /* * Free the pending items. Caller should already have arranged * for the intent items to be released. */ - list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_cancel_list(mp, dfp); - list_del(&dfp->dfp_list); - list_for_each_safe(pwi, n, &dfp->dfp_work) { - list_del(pwi); - dfp->dfp_count--; - trace_xfs_defer_cancel_item(mp, dfp, pwi); - ops->cancel_item(pwi); - } - ASSERT(dfp->dfp_count == 0); - kmem_cache_free(xfs_defer_pending_cache, dfp); + list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) + xfs_defer_pending_cancel_work(mp, dfp); +} + +static inline void +xfs_defer_relog_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + struct xfs_log_item *lip; + + xfs_defer_create_done(tp, dfp); + + lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); + if (lip) { + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); } + dfp->dfp_done = NULL; + dfp->dfp_intent = lip; } /* @@ -410,7 +525,7 @@ xfs_defer_cancel_list( * done item to release the intent item; and then log a new intent item. * The caller should provide a fresh transaction and roll it after we're done. 
*/ -static int +static void xfs_defer_relog( struct xfs_trans **tpp, struct list_head *dfops) @@ -449,31 +564,28 @@ xfs_defer_relog( trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); - dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); - } - if ((*tpp)->t_flags & XFS_TRANS_DIRTY) - return xfs_defer_trans_roll(tpp); - return 0; + xfs_defer_relog_intent(*tpp, dfp); + } } /* * Log an intent-done item for the first pending intent, and finish the work * items. */ -static int +int xfs_defer_finish_one( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + const struct xfs_defer_op_type *ops = dfp->dfp_ops; struct xfs_btree_cur *state = NULL; struct list_head *li, *n; int error; trace_xfs_defer_pending_finish(tp->t_mountp, dfp); - dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + xfs_defer_create_done(tp, dfp); list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; @@ -510,6 +622,24 @@ out: return error; } +/* Move all paused deferred work from @tp to @paused_list. */ +static void +xfs_defer_isolate_paused( + struct xfs_trans *tp, + struct list_head *paused_list) +{ + struct xfs_defer_pending *dfp; + struct xfs_defer_pending *pli; + + list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { + if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) + continue; + + list_move_tail(&dfp->dfp_list, paused_list); + trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); + } +} + /* * Finish all the pending work. 
This involves logging intent items for * any work items that wandered in since the last transaction roll (if @@ -525,6 +655,7 @@ xfs_defer_finish_noroll( struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); + LIST_HEAD(dop_paused); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -543,6 +674,8 @@ xfs_defer_finish_noroll( */ int has_intents = xfs_defer_create_intents(*tp); + xfs_defer_isolate_paused(*tp, &dop_paused); + list_splice_init(&(*tp)->t_dfops, &dop_pending); if (has_intents < 0) { @@ -555,22 +688,33 @@ xfs_defer_finish_noroll( goto out_shutdown; /* Relog intent items to keep the log moving. */ - error = xfs_defer_relog(tp, &dop_pending); - if (error) - goto out_shutdown; + xfs_defer_relog(tp, &dop_pending); + xfs_defer_relog(tp, &dop_paused); + + if ((*tp)->t_flags & XFS_TRANS_DIRTY) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + } } - dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, - dfp_list); + dfp = list_first_entry_or_null(&dop_pending, + struct xfs_defer_pending, dfp_list); + if (!dfp) + break; error = xfs_defer_finish_one(*tp, dfp); if (error && error != -EAGAIN) goto out_shutdown; } + /* Requeue the paused items in the outgoing transaction. */ + list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); + trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; out_shutdown: + list_splice_tail_init(&dop_paused, &dop_pending); xfs_defer_trans_abort(*tp, &dop_pending); xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); trace_xfs_defer_finish_error(*tp, error); @@ -583,6 +727,9 @@ int xfs_defer_finish( struct xfs_trans **tp) { +#ifdef DEBUG + struct xfs_defer_pending *dfp; +#endif int error; /* @@ -602,7 +749,10 @@ xfs_defer_finish( } /* Reset LOWMODE now that we've finished all the dfops. 
*/ - ASSERT(list_empty(&(*tp)->t_dfops)); +#ifdef DEBUG + list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); +#endif (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -614,48 +764,160 @@ xfs_defer_cancel( struct xfs_mount *mp = tp->t_mountp; trace_xfs_defer_cancel(tp, _RET_IP_); + xfs_defer_trans_abort(tp, &tp->t_dfops); xfs_defer_cancel_list(mp, &tp->t_dfops); } +/* + * Return the last pending work item attached to this transaction if it matches + * the deferred op type. + */ +static inline struct xfs_defer_pending * +xfs_defer_find_last( + struct xfs_trans *tp, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp = NULL; + + /* No dfops at all? */ + if (list_empty(&tp->t_dfops)) + return NULL; + + dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, + dfp_list); + + /* Wrong type? */ + if (dfp->dfp_ops != ops) + return NULL; + return dfp; +} + +/* + * Decide if we can add a deferred work item to the last dfops item attached + * to the transaction. + */ +static inline bool +xfs_defer_can_append( + struct xfs_defer_pending *dfp, + const struct xfs_defer_op_type *ops) +{ + /* Already logged? */ + if (dfp->dfp_intent) + return false; + + /* Paused items cannot absorb more work */ + if (dfp->dfp_flags & XFS_DEFER_PAUSED) + return NULL; + + /* Already full? */ + if (ops->max_items && dfp->dfp_count >= ops->max_items) + return false; + + return true; +} + +/* Create a new pending item at the end of the transaction list. */ +static inline struct xfs_defer_pending * +xfs_defer_alloc( + struct list_head *dfops, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_KERNEL | __GFP_NOFAIL); + dfp->dfp_ops = ops; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, dfops); + + return dfp; +} + /* Add an item for later deferred processing. 
*/ -void +struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, - enum xfs_defer_ops_type type, - struct list_head *li) + struct list_head *li, + const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; - const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); - /* - * Add the item to a pending item at the end of the intake list. - * If the last pending item has the same type, reuse it. Else, - * create a new pending item at the end of the intake list. - */ - if (!list_empty(&tp->t_dfops)) { - dfp = list_last_entry(&tp->t_dfops, - struct xfs_defer_pending, dfp_list); - if (dfp->dfp_type != type || - (ops->max_items && dfp->dfp_count >= ops->max_items)) - dfp = NULL; - } - if (!dfp) { - dfp = kmem_cache_zalloc(xfs_defer_pending_cache, - GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_type = type; - dfp->dfp_intent = NULL; - dfp->dfp_done = NULL; - dfp->dfp_count = 0; - INIT_LIST_HEAD(&dfp->dfp_work); - list_add_tail(&dfp->dfp_list, &tp->t_dfops); - } + dfp = xfs_defer_find_last(tp, ops); + if (!dfp || !xfs_defer_can_append(dfp, ops)) + dfp = xfs_defer_alloc(&tp->t_dfops, ops); - list_add_tail(li, &dfp->dfp_work); + xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); - dfp->dfp_count++; + return dfp; +} + +/* + * Add a defer ops barrier to force two otherwise adjacent deferred work items + * to be tracked separately and have separate log items. + */ +void +xfs_defer_add_barrier( + struct xfs_trans *tp) +{ + struct xfs_defer_pending *dfp; + + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* If the last defer op added was a barrier, we're done. 
*/ + dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); + if (dfp) + return; + + dfp = xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type); + + trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); +} + +/* + * Create a pending deferred work item to replay the recovered intent item + * and add it to the list. + */ +void +xfs_defer_start_recovery( + struct xfs_log_item *lip, + struct list_head *r_dfops, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops); + + dfp->dfp_intent = lip; +} + +/* + * Cancel a deferred work item created to recover a log intent item. @dfp + * will be freed after this function returns. + */ +void +xfs_defer_cancel_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + xfs_defer_pending_abort(mp, dfp); + xfs_defer_pending_cancel_work(mp, dfp); +} + +/* Replay the deferred work item created from a recovered log intent item. */ +int +xfs_defer_finish_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + const struct xfs_defer_op_type *ops = dfp->dfp_ops; + int error; + + /* dfp is freed by recover_work and must not be accessed afterwards */ + error = ops->recover_work(dfp, capture_list); + if (error) + trace_xlog_intent_recovery_failed(mp, ops, error); + return error; } /* @@ -712,7 +974,7 @@ xfs_defer_ops_capture( return ERR_PTR(error); /* Create an object to capture the defer ops. */ - dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); + dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); @@ -744,7 +1006,7 @@ xfs_defer_ops_capture( * transaction.
*/ for (i = 0; i < dfc->dfc_held.dr_inos; i++) { - ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL); ihold(VFS_I(dfc->dfc_held.dr_ip[i])); } @@ -756,12 +1018,13 @@ xfs_defer_ops_capture( /* Release all resources that we used to capture deferred ops. */ void -xfs_defer_ops_capture_free( +xfs_defer_ops_capture_abort( struct xfs_mount *mp, struct xfs_defer_capture *dfc) { unsigned short i; + xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) @@ -770,7 +1033,7 @@ xfs_defer_ops_capture_free( for (i = 0; i < dfc->dfc_held.dr_inos; i++) xfs_irele(dfc->dfc_held.dr_ip[i]); - kmem_free(dfc); + kfree(dfc); } /* @@ -802,7 +1065,7 @@ xfs_defer_ops_capture_and_commit( /* Commit the transaction and add the capture structure to the list. */ error = xfs_trans_commit(tp); if (error) { - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); return error; } @@ -846,7 +1109,7 @@ xfs_defer_ops_continue( list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); tp->t_flags |= dfc->dfc_tpflags; - kmem_free(dfc); + kfree(dfc); } /* Release the resources captured and continued during recovery. */ @@ -930,3 +1193,36 @@ xfs_defer_destroy_item_caches(void) xfs_rmap_intent_destroy_cache(); xfs_defer_destroy_cache(); } + +/* + * Mark a deferred work item so that it will be requeued indefinitely without + * being finished. Caller must ensure there are no data dependencies on this + * work item in the meantime. + */ +void +xfs_defer_item_pause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); + + dfp->dfp_flags |= XFS_DEFER_PAUSED; + + trace_xfs_defer_item_pause(tp->t_mountp, dfp); +} + +/* + * Release a paused deferred work item so that it will be finished during the + * next transaction roll. 
+ */ +void +xfs_defer_item_unpause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); + + dfp->dfp_flags &= ~XFS_DEFER_PAUSED; + + trace_xfs_defer_item_unpause(tp->t_mountp, dfp); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 114a3a4930a3..18a9fb92dde8 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -11,19 +11,6 @@ struct xfs_defer_op_type; struct xfs_defer_capture; /* - * Header for deferred operation list. - */ -enum xfs_defer_ops_type { - XFS_DEFER_OPS_TYPE_BMAP, - XFS_DEFER_OPS_TYPE_REFCOUNT, - XFS_DEFER_OPS_TYPE_RMAP, - XFS_DEFER_OPS_TYPE_FREE, - XFS_DEFER_OPS_TYPE_AGFL_FREE, - XFS_DEFER_OPS_TYPE_ATTR, - XFS_DEFER_OPS_TYPE_MAX, -}; - -/* * Save a log intent item and a list of extents, so that we can replay * whatever action had to happen to the extent list and file the log done * item. @@ -33,19 +20,35 @@ struct xfs_defer_pending { struct list_head dfp_work; /* work items */ struct xfs_log_item *dfp_intent; /* log intent item */ struct xfs_log_item *dfp_done; /* log done item */ + const struct xfs_defer_op_type *dfp_ops; unsigned int dfp_count; /* # extent items */ - enum xfs_defer_ops_type dfp_type; + unsigned int dfp_flags; }; -void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, - struct list_head *h); +/* + * Create a log intent item for this deferred item, but don't actually finish + * the work. Caller must clear this before the final transaction commit. 
+ */ +#define XFS_DEFER_PAUSED (1U << 0) + +#define XFS_DEFER_PENDING_STRINGS \ + { XFS_DEFER_PAUSED, "paused" } + +void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); +void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); + +struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, struct list_head *h, + const struct xfs_defer_op_type *ops); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); +int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); void xfs_defer_cancel(struct xfs_trans *); void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { + const char *name; + unsigned int max_items; struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort); void (*abort_intent)(struct xfs_log_item *intent); @@ -56,7 +59,11 @@ struct xfs_defer_op_type { void (*finish_cleanup)(struct xfs_trans *tp, struct xfs_btree_cur *state, int error); void (*cancel_item)(struct list_head *item); - unsigned int max_items; + int (*recover_work)(struct xfs_defer_pending *dfp, + struct list_head *capture_list); + struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp, + struct xfs_log_item *intent, + struct xfs_log_item *done_item); }; extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; @@ -121,11 +128,29 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, struct list_head *capture_list); void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, struct xfs_defer_resources *dres); -void xfs_defer_ops_capture_free(struct xfs_mount *mp, +void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); +void xfs_defer_start_recovery(struct xfs_log_item *lip, + struct list_head *r_dfops, const struct 
xfs_defer_op_type *ops); +void xfs_defer_cancel_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp); +int xfs_defer_finish_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp, struct list_head *capture_list); + +static inline void +xfs_defer_add_item( + struct xfs_defer_pending *dfp, + struct list_head *work) +{ + list_add_tail(work, &dfp->dfp_work); + dfp->dfp_count++; +} + int __init xfs_defer_init_item_caches(void); void xfs_defer_destroy_item_caches(void); +void xfs_defer_add_barrier(struct xfs_trans *tp); + #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index f5462fd582d5..4821519efad4 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -18,6 +18,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_health.h" const struct xfs_name xfs_name_dotdot = { .name = (const unsigned char *)"..", @@ -25,6 +26,12 @@ const struct xfs_name xfs_name_dotdot = { .type = XFS_DIR3_FT_DIR, }; +const struct xfs_name xfs_name_dot = { + .name = (const unsigned char *)".", + .len = 1, + .type = XFS_DIR3_FT_DIR, +}; + /* * Convert inode mode to directory entry filetype */ @@ -104,13 +111,13 @@ xfs_da_mount( ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE); - mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_MAYFAIL); - mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_MAYFAIL); + mp->m_dir_geo = kzalloc(sizeof(struct xfs_da_geometry), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); + mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!mp->m_dir_geo || !mp->m_attr_geo) { - kmem_free(mp->m_dir_geo); - kmem_free(mp->m_attr_geo); + kfree(mp->m_dir_geo); + kfree(mp->m_attr_geo); return -ENOMEM; } @@ -178,8 +185,8 @@ void xfs_da_unmount( struct xfs_mount *mp) { - kmem_free(mp->m_dir_geo); - kmem_free(mp->m_attr_geo); + 
kfree(mp->m_dir_geo); + kfree(mp->m_attr_geo); } /* @@ -196,7 +203,7 @@ xfs_dir_isempty( return 1; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; return !sfp->count; } @@ -236,7 +243,7 @@ xfs_dir_init( if (error) return error; - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -244,7 +251,7 @@ xfs_dir_init( args->dp = dp; args->trans = tp; error = xfs_dir2_sf_create(args, pdp->i_ino); - kmem_free(args); + kfree(args); return error; } @@ -273,7 +280,7 @@ xfs_dir_createname( XFS_STATS_INC(dp->i_mount, xs_dir_create); } - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -313,7 +320,7 @@ xfs_dir_createname( rval = xfs_dir2_node_addname(args); out_free: - kmem_free(args); + kfree(args); return rval; } @@ -333,7 +340,8 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return -EEXIST; - args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); + args->value = kmalloc(len, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); if (!args->value) return -ENOMEM; @@ -364,15 +372,8 @@ xfs_dir_lookup( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); - /* - * We need to use KM_NOFS here so that lockdep will not throw false - * positive deadlock warnings on a non-transactional lookup path. It is - * safe to recurse into inode recalim in that case, but lockdep can't - * easily be taught about it. Hence KM_NOFS avoids having to add more - * lockdep Doing this avoids having to add a bunch of lockdep class - * annotations into the reclaim path for the ilock. 
- */ - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; @@ -419,7 +420,7 @@ out_check_rval: } out_free: xfs_iunlock(dp, lock_mode); - kmem_free(args); + kfree(args); return rval; } @@ -441,7 +442,7 @@ xfs_dir_removename( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -477,7 +478,7 @@ xfs_dir_removename( else rval = xfs_dir2_node_removename(args); out_free: - kmem_free(args); + kfree(args); return rval; } @@ -502,7 +503,7 @@ xfs_dir_replace( if (rval) return rval; - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -538,7 +539,7 @@ xfs_dir_replace( else rval = xfs_dir2_node_replace(args); out_free: - kmem_free(args); + kfree(args); return rval; } @@ -626,8 +627,10 @@ xfs_dir2_isblock( return 0; *isblock = true; - if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) + if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 19af22a16c41..8497d041f316 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -22,6 +22,19 @@ struct xfs_dir3_icfree_hdr; struct xfs_dir3_icleaf_hdr; extern const struct xfs_name xfs_name_dotdot; +extern const struct xfs_name xfs_name_dot; + +static inline bool +xfs_dir2_samename( + const struct xfs_name *n1, + const struct xfs_name *n2) +{ + if (n1 == n2) + return true; + if (n1->len != n2->len) + return false; + return !memcmp(n1->name, n2->name, n1->len); +} /* * Convert inode mode to directory entry filetype diff --git a/fs/xfs/libxfs/xfs_dir2_block.c 
b/fs/xfs/libxfs/xfs_dir2_block.c index 00f960a703b2..a2da007adb46 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -20,6 +20,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_health.h" /* * Local function prototypes. @@ -152,6 +153,7 @@ xfs_dir3_block_read( __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } @@ -1089,7 +1091,7 @@ xfs_dir2_sf_to_block( int newoffset; /* offset from current entry */ unsigned int offset = geo->data_entry_offset; xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ - xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */ + struct xfs_dir2_sf_hdr *oldsfp = ifp->if_data; xfs_dir2_sf_hdr_t *sfp; /* shortform header */ __be16 *tagp; /* end of data entry */ struct xfs_name name; @@ -1099,10 +1101,8 @@ xfs_dir2_sf_to_block( ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; - ASSERT(ifp->if_bytes == dp->i_disk_size); - ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(oldsfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); ASSERT(dp->i_df.if_nextents == 0); @@ -1110,7 +1110,7 @@ xfs_dir2_sf_to_block( * Copy the directory into a temporary buffer. * Then pitch the incore inode data so we can make extents. */ - sfp = kmem_alloc(ifp->if_bytes, 0); + sfp = kmalloc(ifp->if_bytes, GFP_KERNEL | __GFP_NOFAIL); memcpy(sfp, oldsfp, ifp->if_bytes); xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); @@ -1255,7 +1255,7 @@ xfs_dir2_sf_to_block( sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } /* Done with the temporary buffer */ - kmem_free(sfp); + kfree(sfp); /* * Sort the leaf entries by hash value. 
*/ @@ -1270,6 +1270,6 @@ xfs_dir2_sf_to_block( xfs_dir3_data_check(dp, bp); return 0; out_free: - kmem_free(sfp); + kfree(sfp); return error; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index dbcf58979a59..7a6d965bea71 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -18,6 +18,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_health.h" static xfs_failaddr_t xfs_dir2_data_freefind_verify( struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, @@ -433,6 +434,7 @@ xfs_dir3_data_read( __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } @@ -1198,6 +1200,7 @@ xfs_dir2_data_use_free( corrupt: xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount, hdr, sizeof(*hdr), __FILE__, __LINE__, fa); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index cb9e950a911d..08dda5ce9d91 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -19,6 +19,7 @@ #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_buf_item.h" +#include "xfs_health.h" /* * Local function declarations. @@ -1393,8 +1394,10 @@ xfs_dir2_leaf_removename( bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[db]) != oldbest) { xfs_buf_mark_corrupt(lbp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } + /* * Mark the former data entry unused. */ diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 7a03aeb9f4c9..be0b8834028c 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -20,6 +20,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_health.h" /* * Function declarations. 
@@ -231,6 +232,7 @@ __xfs_dir3_free_read( __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } @@ -443,6 +445,7 @@ xfs_dir2_leaf_to_node( if (be32_to_cpu(ltp->bestcount) > (uint)dp->i_disk_size / args->geo->blksize) { xfs_buf_mark_corrupt(lbp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -517,6 +520,7 @@ xfs_dir2_leafn_add( */ if (index < 0) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -736,6 +740,7 @@ xfs_dir2_leafn_lookup_for_addname( cpu_to_be16(NULLDATAOFF))) { if (curfdb != newfdb) xfs_trans_brelse(tp, curbp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } curfdb = newfdb; @@ -804,6 +809,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir3_leaf_check(dp, bp); if (leafhdr.count <= 0) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -1739,6 +1745,7 @@ xfs_dir2_node_add_datablk( } else { xfs_alert(mp, " ... fblk is NULL"); } + xfs_da_mark_sick(args); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 7404a9ff1a92..1db2e60ba827 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -175,7 +175,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); -extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, int64_t size); int xfs_dir2_sf_entsize(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, int len); void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8cd37e6e9d38..17a20384c8b7 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ 
b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -276,7 +276,7 @@ xfs_dir2_block_to_sf( * format the data into. Once we have formatted the data, we can free * the block and copy the formatted data into the inode literal area. */ - sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0); + sfp = kmalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | __GFP_NOFAIL); memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); /* @@ -350,7 +350,7 @@ xfs_dir2_block_to_sf( xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(sfp); + kfree(sfp); return error; } @@ -364,25 +364,23 @@ int /* error */ xfs_dir2_sf_addname( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int error; /* error return value */ int incr_isize; /* total change in size */ int new_isize; /* size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ int pick; /* which algorithm to use */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ trace_xfs_dir2_sf_addname(args); ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); - dp = args->dp; ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Compute entry (and change in) size. 
@@ -462,20 +460,17 @@ xfs_dir2_sf_addname_easy( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - int byteoff; /* byte offset in sf dir */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; + int byteoff = (int)((char *)sfep - (char *)sfp); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - byteoff = (int)((char *)sfep - (char *)sfp); /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), + sfp = xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); /* * Fill in the new entry. @@ -528,11 +523,10 @@ xfs_dir2_sf_addname_hard( /* * Copy the old directory to the stack buffer. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_disk_size; - buf = kmem_alloc(old_isize, 0); + buf = kmalloc(old_isize, GFP_KERNEL | __GFP_NOFAIL); oldsfp = (xfs_dir2_sf_hdr_t *)buf; - memcpy(oldsfp, sfp, old_isize); + memcpy(oldsfp, dp->i_df.if_data, old_isize); /* * Loop over the old directory finding the place we're going * to insert the new entry. @@ -556,11 +550,8 @@ xfs_dir2_sf_addname_hard( * the data. */ xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK); - xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); - /* - * Reset the pointer since the buffer was reallocated. - */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); + /* * Copy the first part of the directory, including the header. 
*/ @@ -585,7 +576,7 @@ xfs_dir2_sf_addname_hard( sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } - kmem_free(buf); + kfree(buf); dp->i_disk_size = new_isize; xfs_dir2_sf_check(args); } @@ -610,11 +601,10 @@ xfs_dir2_sf_addname_pick( int i; /* entry number */ xfs_dir2_data_aoff_t offset; /* data block offset */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int size; /* entry's data size */ int used; /* data bytes used */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir2_data_entsize(mp, args->namelen); offset = args->geo->data_first_offset; sfep = xfs_dir2_sf_firstentry(sfp); @@ -673,14 +663,13 @@ xfs_dir2_sf_check( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry number */ int i8count; /* number of big inode#s */ xfs_ino_t ino; /* entry inode number */ int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; offset = args->geo->data_first_offset; ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; @@ -707,11 +696,10 @@ xfs_dir2_sf_check( /* Verify the consistency of an inline directory. 
*/ xfs_failaddr_t xfs_dir2_sf_verify( - struct xfs_inode *ip) + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int64_t size) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; @@ -719,15 +707,9 @@ xfs_dir2_sf_verify( int i; int i8count; int offset; - int64_t size; int error; uint8_t filetype; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - - sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; - size = ifp->if_bytes; - /* * Give up if the directory is way too short. */ @@ -834,15 +816,13 @@ xfs_dir2_sf_create( ASSERT(dp->i_df.if_bytes == 0); i8count = pino > XFS_DIR2_MAX_SHORT_INUM; size = xfs_dir2_sf_hdr_size(i8count); + /* - * Make a buffer for the data. - */ - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - /* - * Fill in the header, + * Make a buffer for the data and fill in the header. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, size, XFS_DATA_FORK); sfp->i8count = i8count; + /* * Now can put in the inode number, since i8count is set. */ @@ -864,9 +844,9 @@ xfs_dir2_sf_lookup( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry index */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ @@ -877,8 +857,7 @@ xfs_dir2_sf_lookup( ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Special case for . 
@@ -940,13 +919,13 @@ xfs_dir2_sf_removename( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int byteoff; /* offset of removed entry */ int entsize; /* this entry's size */ int i; /* shortform entry index */ int newsize; /* new inode size */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_removename(args); @@ -954,8 +933,7 @@ xfs_dir2_sf_removename( oldsize = (int)dp->i_disk_size; ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == oldsize); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Loop over the old directory entries. @@ -992,11 +970,12 @@ xfs_dir2_sf_removename( */ sfp->count--; dp->i_disk_size = newsize; + /* * Reallocate, making it smaller. */ - xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); + /* * Are we changing inode number size? 
*/ @@ -1019,13 +998,12 @@ xfs_dir2_sf_replace_needblock( struct xfs_inode *dp, xfs_ino_t inum) { + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int newsize; - struct xfs_dir2_sf_hdr *sfp; if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL) return false; - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; return inum > XFS_DIR2_MAX_SHORT_INUM && @@ -1041,19 +1019,18 @@ xfs_dir2_sf_replace( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry index */ xfs_ino_t ino=0; /* entry old inode number */ int i8elevated; /* sf_toino8 set i8count=1 */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_replace(args); ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* @@ -1076,7 +1053,7 @@ xfs_dir2_sf_replace( */ xfs_dir2_sf_toino8(args); i8elevated = 1; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; } else i8elevated = 0; @@ -1157,11 +1134,11 @@ xfs_dir2_sf_toino4( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data; char *buf; /* old dir's buffer */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ @@ -1174,8 +1151,7 @@ xfs_dir2_sf_toino4( * Don't want xfs_idata_realloc copying the data here. 
*/ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, 0); - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL); ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); /* @@ -1188,7 +1164,7 @@ xfs_dir2_sf_toino4( * Reset our pointers, the data has moved. */ oldsfp = (xfs_dir2_sf_hdr_t *)buf; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Fill in the new header. */ @@ -1214,7 +1190,7 @@ xfs_dir2_sf_toino4( /* * Clean up the inode. */ - kmem_free(buf); + kfree(buf); dp->i_disk_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } @@ -1230,11 +1206,11 @@ xfs_dir2_sf_toino8( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data; char *buf; /* old dir's buffer */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ @@ -1247,8 +1223,7 @@ xfs_dir2_sf_toino8( * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, 0); - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL); ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* @@ -1261,7 +1236,7 @@ xfs_dir2_sf_toino8( * Reset our pointers, the data has moved. */ oldsfp = (xfs_dir2_sf_hdr_t *)buf; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Fill in the new header. */ @@ -1287,7 +1262,7 @@ xfs_dir2_sf_toino8( /* * Clean up the inode. 
*/ - kmem_free(buf); + kfree(buf); dp->i_disk_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 371dc07233e0..2b2f9050fbfb 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -98,7 +98,7 @@ typedef struct xfs_sb { uint32_t sb_blocksize; /* logical block size, bytes */ xfs_rfsblock_t sb_dblocks; /* number of data blocks */ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ - xfs_rtblock_t sb_rextents; /* number of realtime extents */ + xfs_rtbxlen_t sb_rextents; /* number of realtime extents */ uuid_t sb_uuid; /* user-visible file system unique id */ xfs_fsblock_t sb_logstart; /* starting block of log if internal */ xfs_ino_t sb_rootino; /* root inode number */ @@ -477,15 +477,9 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) /* - * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the - * arrays below. - */ -#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1) - -/* - * The second word of agf_levels in the first a.g. overlaps the EFS - * superblock's magic number. Since the magic numbers valid for EFS - * are > 64k, our value cannot be confused for an EFS superblock's. + * agf_cnt_level in the first AGF overlaps the EFS superblock's magic number. + * Since the magic numbers valid for EFS are > 64k, our value cannot be confused + * for an EFS superblock. 
*/ typedef struct xfs_agf { @@ -499,8 +493,13 @@ typedef struct xfs_agf { /* * Freespace and rmap information */ - __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ - __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ + __be32 agf_bno_root; /* bnobt root block */ + __be32 agf_cnt_root; /* cntbt root block */ + __be32 agf_rmap_root; /* rmapbt root block */ + + __be32 agf_bno_level; /* bnobt btree levels */ + __be32 agf_cnt_level; /* cntbt btree levels */ + __be32 agf_rmap_level; /* rmapbt btree levels */ __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ @@ -691,6 +690,22 @@ struct xfs_agfl { xfs_daddr_to_agno(mp, (d) + (len) - 1))) /* + * Realtime bitmap information is accessed by the word, which is currently + * stored in host-endian format. + */ +union xfs_rtword_raw { + __u32 old; +}; + +/* + * Realtime summary counts are accessed by the word, which is currently + * stored in host-endian format. + */ +union xfs_suminfo_raw { + __u32 old; +}; + +/* * XFS Timestamps * ============== * @@ -992,7 +1007,7 @@ enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ - ((char *)dip + xfs_dinode_size(dip->di_version)) + ((void *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ @@ -1140,34 +1155,6 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ -#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) -#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) -#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) -#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) - -/* - * RT Summary and bit manipulation macros. 
- */ -#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) -#define XFS_SUMOFFSTOBLOCK(mp,s) \ - (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) -#define XFS_SUMPTR(mp,bp,so) \ - ((xfs_suminfo_t *)((bp)->b_addr + \ - (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) - -#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) -#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) -#define XFS_BITTOWORD(mp,bi) \ - ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) - -#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) -#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) - -#define XFS_RTLOBIT(w) xfs_lowbit32(w) -#define XFS_RTHIBIT(w) xfs_highbit32(w) - -#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) - /* * Dquot and dquot block format definitions */ @@ -1270,6 +1257,9 @@ static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) #define XFS_DQ_GRACE_MIN ((int64_t)0) #define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) +/* Maximum id value for a quota record */ +#define XFS_DQ_ID_MAX (U32_MAX) + /* * This is the main portion of the on-disk representation of quota information * for a user. 
We pad this with some more expansion room to construct the on diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 6360073865db..ca1b17d01437 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -195,6 +195,8 @@ struct xfs_fsop_geom { #define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */ #define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */ #define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */ +#define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */ +#define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */ /* Output for XFS_FS_COUNTS */ typedef struct xfs_fsop_counts { @@ -292,6 +294,7 @@ struct xfs_ag_geometry { #define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */ #define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */ #define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */ +#define XFS_AG_GEOM_SICK_INODES (1 << 10) /* bad inodes were seen */ /* * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT @@ -709,9 +712,12 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */ #define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */ #define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */ +#define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */ +#define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */ +#define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 25 +#define XFS_SCRUB_TYPE_NR 28 /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1u << 0) diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 99e796256c5d..3c64b5f9bd68 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -26,21 +26,40 @@ * and the "sick" field tells us if that piece was found to need repairs. 
* Therefore we can conclude that for a given sick flag value: * - * - checked && sick => metadata needs repair - * - checked && !sick => metadata is ok - * - !checked => has not been examined since mount + * - checked && sick => metadata needs repair + * - checked && !sick => metadata is ok + * - !checked && sick => errors have been observed during normal operation, + * but the metadata has not been checked thoroughly + * - !checked && !sick => has not been examined since mount + * + * Evidence of health problems can be sorted into three basic categories: + * + * a) Primary evidence, which signals that something is defective within the + * general grouping of metadata. + * + * b) Secondary evidence, which are side effects of primary problem but are + * not themselves problems. These can be forgotten when the primary + * health problems are addressed. + * + * c) Indirect evidence, which points to something being wrong in another + * group, but we had to release resources and this is all that's left of + * that state. */ struct xfs_mount; struct xfs_perag; struct xfs_inode; struct xfs_fsop_geom; +struct xfs_btree_cur; +struct xfs_da_args; /* Observable health issues for metadata spanning the entire filesystem. */ #define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */ #define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */ #define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */ #define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */ +#define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */ +#define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */ /* Observable health issues for realtime volume metadata. 
*/ #define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */ @@ -57,6 +76,7 @@ struct xfs_fsop_geom; #define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */ #define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */ #define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */ +#define XFS_SICK_AG_INODES (1 << 10) /* inactivated bad inodes */ /* Observable health issues for inode metadata. */ #define XFS_SICK_INO_CORE (1 << 0) /* inode core */ @@ -68,11 +88,21 @@ struct xfs_fsop_geom; #define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */ #define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */ +#define XFS_SICK_INO_BMBTD_ZAPPED (1 << 8) /* data fork erased */ +#define XFS_SICK_INO_BMBTA_ZAPPED (1 << 9) /* attr fork erased */ +#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */ +#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */ + +/* Don't propagate sick status to ag health summary during inactivation */ +#define XFS_SICK_INO_FORGET (1 << 12) + /* Primary evidence of health problems in a given group. */ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ XFS_SICK_FS_UQUOTA | \ XFS_SICK_FS_GQUOTA | \ - XFS_SICK_FS_PQUOTA) + XFS_SICK_FS_PQUOTA | \ + XFS_SICK_FS_QUOTACHECK | \ + XFS_SICK_FS_NLINKS) #define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \ XFS_SICK_RT_SUMMARY) @@ -97,29 +127,91 @@ struct xfs_fsop_geom; XFS_SICK_INO_SYMLINK | \ XFS_SICK_INO_PARENT) -/* These functions must be provided by the xfs implementation. */ +#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \ + XFS_SICK_INO_BMBTA_ZAPPED | \ + XFS_SICK_INO_DIR_ZAPPED | \ + XFS_SICK_INO_SYMLINK_ZAPPED) + +/* Secondary state related to (but not primary evidence of) health problems. */ +#define XFS_SICK_FS_SECONDARY (0) +#define XFS_SICK_RT_SECONDARY (0) +#define XFS_SICK_AG_SECONDARY (0) +#define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET) + +/* Evidence of health problems elsewhere. 
*/ +#define XFS_SICK_FS_INDIRECT (0) +#define XFS_SICK_RT_INDIRECT (0) +#define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES) +#define XFS_SICK_INO_INDIRECT (0) + +/* All health masks. */ +#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \ + XFS_SICK_FS_SECONDARY | \ + XFS_SICK_FS_INDIRECT) + +#define XFS_SICK_RT_ALL (XFS_SICK_RT_PRIMARY | \ + XFS_SICK_RT_SECONDARY | \ + XFS_SICK_RT_INDIRECT) + +#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \ + XFS_SICK_AG_SECONDARY | \ + XFS_SICK_AG_INDIRECT) + +#define XFS_SICK_INO_ALL (XFS_SICK_INO_PRIMARY | \ + XFS_SICK_INO_SECONDARY | \ + XFS_SICK_INO_INDIRECT | \ + XFS_SICK_INO_ZAPPED) + +/* + * These functions must be provided by the xfs implementation. Function + * behavior with respect to the first argument should be as follows: + * + * xfs_*_mark_sick: Set the sick flags and do not set checked flags. + * Runtime code should call this upon encountering + * a corruption. + * + * xfs_*_mark_corrupt: Set the sick and checked flags simultaneously. + * Fsck tools should call this when corruption is + * found. + * + * xfs_*_mark_healthy: Clear the sick flags and set the checked flags. + * Fsck tools should call this after correcting errors. + * + * xfs_*_measure_sickness: Return the sick and check status in the provided + * out parameters. 
+ */ void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask); +void xfs_fs_mark_corrupt(struct xfs_mount *mp, unsigned int mask); void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask); void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick, unsigned int *checked); void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask); +void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask); void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask); void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick, unsigned int *checked); +void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno, + unsigned int mask); void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask); +void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask); void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask); void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick, unsigned int *checked); void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask); +void xfs_inode_mark_corrupt(struct xfs_inode *ip, unsigned int mask); void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask); void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick, unsigned int *checked); void xfs_health_unmount(struct xfs_mount *mp); +void xfs_bmap_mark_sick(struct xfs_inode *ip, int whichfork); +void xfs_btree_mark_sick(struct xfs_btree_cur *cur); +void xfs_dirattr_mark_sick(struct xfs_inode *ip, int whichfork); +void xfs_da_mark_sick(struct xfs_da_args *args); /* Now some helpers. 
*/ @@ -187,4 +279,7 @@ void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo); void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs); +#define xfs_metadata_is_sick(error) \ + (unlikely((error) == -EFSCORRUPTED || (error) == -EFSBADCRC)) + #endif /* __XFS_HEALTH_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b83e54c70906..e5ac3e5430c4 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -27,6 +27,7 @@ #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_health.h" /* * Lookup a record by ino in the btree given by cur. @@ -95,18 +96,28 @@ xfs_inobt_btrec_to_irec( irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } +/* Compute the freecount of an incore inode record. */ +uint8_t +xfs_inobt_rec_freecount( + const struct xfs_inobt_rec_incore *irec) +{ + uint64_t realfree = irec->ir_free; + + if (xfs_inobt_issparse(irec->ir_holemask)) + realfree &= xfs_inobt_irec_to_allocmask(irec); + return hweight64(realfree); +} + /* Simple checks for inode records. */ xfs_failaddr_t xfs_inobt_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec) { - uint64_t realfree; - /* Record has to be properly aligned within the AG. 
*/ - if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) + if (!xfs_verify_agino(pag, irec->ir_startino)) return __this_address; - if (!xfs_verify_agino(cur->bc_ag.pag, + if (!xfs_verify_agino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || @@ -115,12 +126,7 @@ xfs_inobt_check_irec( if (irec->ir_freecount > XFS_INODES_PER_CHUNK) return __this_address; - /* if there are no holes, return the first available offset */ - if (!xfs_inobt_issparse(irec->ir_holemask)) - realfree = irec->ir_free; - else - realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); - if (hweight64(realfree) != irec->ir_freecount) + if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount) return __this_address; return NULL; @@ -135,13 +141,13 @@ xfs_inobt_complain_bad_rec( struct xfs_mount *mp = cur->bc_mp; xfs_warn(mp, - "%s Inode BTree record corruption in AG %d detected at %pS!", - cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", - cur->bc_ag.pag->pag_agno, fa); + "%sbt record corruption in AG %d detected at %pS!", + cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, irec->ir_free, irec->ir_holemask); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -164,7 +170,7 @@ xfs_inobt_get_rec( return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - fa = xfs_inobt_check_irec(cur, irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); @@ -200,14 +206,17 @@ xfs_inobt_insert( struct xfs_buf *agbp, xfs_agino_t newino, xfs_agino_t newlen, - xfs_btnum_t btnum) + bool is_finobt) { struct xfs_btree_cur *cur; xfs_agino_t thisino; int i; int error; - cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); + if (is_finobt) + cur = xfs_finobt_init_cursor(pag, tp, agbp); + else + cur = xfs_inobt_init_cursor(pag, tp, agbp); for 
(thisino = newino; thisino < newino + newlen; @@ -523,16 +532,14 @@ __xfs_inobt_rec_merge( } /* - * Insert a new sparse inode chunk into the associated inode btree. The inode - * record for the sparse chunk is pre-aligned to a startino that should match - * any pre-existing sparse inode record in the tree. This allows sparse chunks - * to fill over time. + * Insert a new sparse inode chunk into the associated inode allocation btree. + * The inode record for the sparse chunk is pre-aligned to a startino that + * should match any pre-existing sparse inode record in the tree. This allows + * sparse chunks to fill over time. * - * This function supports two modes of handling preexisting records depending on - * the merge flag. If merge is true, the provided record is merged with the + * If no preexisting record exists, the provided record is inserted. + * If there is a preexisting record, the provided record is merged with the * existing record and updated in place. The merged record is returned in nrec. - * If merge is false, an existing record is replaced with the provided record. - * If no preexisting record exists, the provided record is always inserted. * * It is considered corruption if a merge is requested and not possible. Given * the sparse inode alignment constraints, this should never happen. @@ -542,9 +549,7 @@ xfs_inobt_insert_sprec( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - int btnum, - struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ - bool merge) /* merge or replace */ + struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. 
*/ { struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; @@ -552,7 +557,7 @@ xfs_inobt_insert_sprec( int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); + cur = xfs_inobt_init_cursor(pag, tp, agbp); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -566,6 +571,7 @@ xfs_inobt_insert_sprec( if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -574,45 +580,45 @@ xfs_inobt_insert_sprec( } /* - * A record exists at this startino. Merge or replace the record - * depending on what we've been asked to do. + * A record exists at this startino. Merge the records. */ - if (merge) { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto error; - } - if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { - error = -EFSCORRUPTED; - goto error; - } + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } + if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } - /* - * This should never fail. If we have coexisting records that - * cannot merge, something is seriously wrong. - */ - if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { - error = -EFSCORRUPTED; - goto error; - } + /* + * This should never fail. If we have coexisting records that + * cannot merge, something is seriously wrong. 
+ */ + if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } - trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, - rec.ir_holemask, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, + rec.ir_holemask, nrec->ir_startino, + nrec->ir_holemask); - /* merge to nrec to output the updated record */ - __xfs_inobt_rec_merge(nrec, &rec); + /* merge to nrec to output the updated record */ + __xfs_inobt_rec_merge(nrec, &rec); - trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, + nrec->ir_holemask); - error = xfs_inobt_rec_check_count(mp, nrec); - if (error) - goto error; - } + error = xfs_inobt_rec_check_count(mp, nrec); + if (error) + goto error; error = xfs_inobt_update(cur, nrec); if (error) @@ -627,6 +633,59 @@ error: } /* + * Insert a new sparse inode chunk into the free inode btree. The inode + * record for the sparse chunk is pre-aligned to a startino that should match + * any pre-existing sparse inode record in the tree. This allows sparse chunks + * to fill over time. + * + * The new record is always inserted, overwriting a pre-existing record if + * there is one. + */ +STATIC int +xfs_finobt_insert_sprec( + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. 
*/ +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_btree_cur *cur; + int error; + int i; + + cur = xfs_finobt_init_cursor(pag, tp, agbp); + + /* the new record is pre-aligned so we know where to look */ + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + /* if nothing there, insert a new record and return */ + if (i == 0) { + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, + nrec->ir_count, nrec->ir_freecount, + nrec->ir_free, &i); + if (error) + goto error; + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } + } else { + error = xfs_inobt_update(cur, nrec); + if (error) + goto error; + } + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + + +/* * Allocate new inodes in the allocation group specified by agbp. Returns 0 if * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so * the caller knows it can try another AG, a hard -ENOSPC when over the maximum @@ -852,8 +911,7 @@ sparse_alloc: * if necessary. If a merge does occur, rec is updated to the * merged record. */ - error = xfs_inobt_insert_sprec(pag, tp, agbp, - XFS_BTNUM_INO, &rec, true); + error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", @@ -877,21 +935,19 @@ sparse_alloc: * existing record with this one. 
*/ if (xfs_has_finobt(args.mp)) { - error = xfs_inobt_insert_sprec(pag, tp, agbp, - XFS_BTNUM_FINO, &rec, false); + error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec); if (error) return error; } } else { /* full chunk - insert new records to both btrees */ - error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, - XFS_BTNUM_INO); + error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false); if (error) return error; if (xfs_has_finobt(args.mp)) { error = xfs_inobt_insert(pag, tp, agbp, newino, - newlen, XFS_BTNUM_FINO); + newlen, true); if (error) return error; } @@ -944,8 +1000,10 @@ xfs_ialloc_next_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } return 0; @@ -969,8 +1027,10 @@ xfs_ialloc_get_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } return 0; @@ -1025,7 +1085,7 @@ xfs_dialloc_ag_inobt( ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. 
@@ -1048,6 +1108,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1056,6 +1117,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, j != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1214,6 +1276,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1223,6 +1286,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1232,6 +1296,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1292,8 +1357,10 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(lcur); return -EFSCORRUPTED; + } /* * See if we've landed in the parent inode record. 
The finobt @@ -1317,12 +1384,14 @@ xfs_dialloc_ag_finobt_near( if (error) goto error_rcur; if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) { + xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } } if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) { + xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } @@ -1378,8 +1447,10 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } } @@ -1390,14 +1461,18 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } @@ -1419,14 +1494,18 @@ xfs_dialloc_ag_update_inobt( error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); @@ -1435,8 +1514,10 @@ xfs_dialloc_ag_update_inobt( if (XFS_IS_CORRUPT(cur->bc_mp, rec.ir_free != frec->ir_free || - rec.ir_freecount != frec->ir_freecount)) + rec.ir_freecount != frec->ir_freecount)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return xfs_inobt_update(cur, &rec); } @@ -1478,7 +1559,7 @@ 
xfs_dialloc_ag( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(cur); if (error) @@ -1521,7 +1602,7 @@ xfs_dialloc_ag( * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ - icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + icur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(icur); if (error) @@ -1854,7 +1935,7 @@ xfs_difree_inode_chunk( return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1900,7 +1981,8 @@ xfs_difree_inode_chunk( ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE); + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, + false); if (error) return error; @@ -1937,7 +2019,7 @@ xfs_difree_inobt( /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(cur); if (error) @@ -1952,6 +2034,7 @@ xfs_difree_inobt( goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1962,6 +2045,7 @@ xfs_difree_inobt( goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -2062,7 +2146,7 @@ xfs_difree_finobt( int error; int i; - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) @@ -2074,6 +2158,7 @@ xfs_difree_finobt( * something is out of sync. 
*/ if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2100,6 +2185,7 @@ xfs_difree_finobt( if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2110,6 +2196,7 @@ xfs_difree_finobt( if (XFS_IS_CORRUPT(mp, rec.ir_free != ibtrec->ir_free || rec.ir_freecount != ibtrec->ir_freecount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2259,7 +2346,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. */ - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -2598,6 +2685,8 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); if (error) return error; if (tp) @@ -2739,7 +2828,7 @@ xfs_ialloc_count_inodes_rec( xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); - fa = xfs_inobt_check_irec(cur, &irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); @@ -2759,7 +2848,7 @@ xfs_ialloc_count_inodes( struct xfs_ialloc_count_inodes ci = {0}; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + ASSERT(xfs_btree_is_ino(cur->bc_ops)); error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci); if (error) return error; @@ -2976,7 +3065,7 @@ xfs_ialloc_check_shrink( if (!xfs_has_sparseinodes(pag->pag_mount)) return 0; - cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agibp); /* Look up the inobt record that would correspond to the new EOFS. 
*/ agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length); @@ -2989,6 +3078,7 @@ xfs_ialloc_check_shrink( goto out; if (!has) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT); error = -EFSCORRUPTED; goto out; } diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index fe824bb04a09..f1412183bb44 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -79,6 +79,7 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, */ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +uint8_t xfs_inobt_rec_freecount(const struct xfs_inobt_rec_incore *irec); /* * Inode chunk initialisation routine @@ -93,7 +94,7 @@ union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); -xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_inobt_check_irec(struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9258f01c0015..cc661fca6ff5 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -17,6 +17,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_rmap.h" @@ -37,7 +38,15 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_btnum); + cur->bc_ag.agbp); +} + +STATIC struct xfs_btree_cur * +xfs_finobt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, + cur->bc_ag.agbp); } STATIC void @@ -81,9 +90,9 @@ xfs_inobt_mod_blockcount( if (!xfs_has_inobtcounts(cur->bc_mp)) return; - if (cur->bc_btnum == XFS_BTNUM_FINO) + if 
(xfs_btree_is_fino(cur->bc_ops)) be32_add_cpu(&agi->agi_fblocks, howmuch); - else if (cur->bc_btnum == XFS_BTNUM_INO) + else be32_add_cpu(&agi->agi_iblocks, howmuch); xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS); } @@ -161,7 +170,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_INOBT, resv); + &XFS_RMAP_OINFO_INOBT, resv, false); } STATIC int @@ -300,7 +309,7 @@ xfs_inobt_verify( * xfs_perag_initialised_agi(pag)) if we ever do. */ if (xfs_has_crc(mp)) { - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; } @@ -310,7 +319,7 @@ xfs_inobt_verify( if (level >= M_IGEO(mp)->inobt_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, + return xfs_btree_agblock_verify(bp, M_IGEO(mp)->inobt_mxr[level != 0]); } @@ -320,7 +329,7 @@ xfs_inobt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_inobt_verify(bp); @@ -344,7 +353,7 @@ xfs_inobt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } @@ -398,9 +407,17 @@ xfs_inobt_keys_contiguous( be32_to_cpu(key2->inobt.ir_startino)); } -static const struct xfs_btree_ops xfs_inobt_ops = { +const struct xfs_btree_ops xfs_inobt_ops = { + .name = "ino", + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, + + .lru_refs = XFS_INO_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_ibt_2), + .sick_mask = XFS_SICK_AG_INOBT, .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_inobt_set_root, @@ -420,11 +437,19 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .keys_contiguous = xfs_inobt_keys_contiguous, }; -static const struct 
xfs_btree_ops xfs_finobt_ops = { +const struct xfs_btree_ops xfs_finobt_ops = { + .name = "fino", + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, - .dup_cursor = xfs_inobt_dup_cursor, + .lru_refs = XFS_INO_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_fibt_2), + .sick_mask = XFS_SICK_AG_FINOBT, + + .dup_cursor = xfs_finobt_dup_cursor, .set_root = xfs_finobt_set_root, .alloc_block = xfs_finobt_alloc_block, .free_block = xfs_finobt_free_block, @@ -443,65 +468,54 @@ static const struct xfs_btree_ops xfs_finobt_ops = { }; /* - * Initialize a new inode btree cursor. + * Create an inode btree cursor. + * + * For staging cursors tp and agbp are NULL. */ -static struct xfs_btree_cur * -xfs_inobt_init_common( +struct xfs_btree_cur * +xfs_inobt_init_cursor( struct xfs_perag *pag, - struct xfs_trans *tp, /* transaction pointer */ - xfs_btnum_t btnum) /* ialloc or free ino btree */ + struct xfs_trans *tp, + struct xfs_buf *agbp) { struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; - cur = xfs_btree_alloc_cursor(mp, tp, btnum, + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); - if (btnum == XFS_BTNUM_INO) { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); - cur->bc_ops = &xfs_inobt_ops; - } else { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); - cur->bc_ops = &xfs_finobt_ops; - } - - if (xfs_has_crc(mp)) - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agi *agi = agbp->b_addr; + + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + } return cur; } -/* Create an inode btree cursor. */ +/* + * Create a free inode btree cursor. + * + * For staging cursors tp and agbp are NULL. 
+ */ struct xfs_btree_cur * -xfs_inobt_init_cursor( +xfs_finobt_init_cursor( struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_btnum_t btnum) + struct xfs_buf *agbp) { + struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; - struct xfs_agi *agi = agbp->b_addr; - cur = xfs_inobt_init_common(pag, tp, btnum); - if (btnum == XFS_BTNUM_INO) - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - else - cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops, + M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); + cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.agbp = agbp; - return cur; -} + if (agbp) { + struct xfs_agi *agi = agbp->b_addr; -/* Create an inode btree cursor with a fake root for staging. */ -struct xfs_btree_cur * -xfs_inobt_stage_cursor( - struct xfs_perag *pag, - struct xbtree_afakeroot *afake, - xfs_btnum_t btnum) -{ - struct xfs_btree_cur *cur; - - cur = xfs_inobt_init_common(pag, NULL, btnum); - xfs_btree_stage_afakeroot(cur, afake); + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + } return cur; } @@ -521,7 +535,7 @@ xfs_inobt_commit_staged_btree( ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - if (cur->bc_btnum == XFS_BTNUM_INO) { + if (xfs_btree_is_ino(cur->bc_ops)) { fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; agi->agi_root = cpu_to_be32(afake->af_root); agi->agi_level = cpu_to_be32(afake->af_levels); @@ -530,7 +544,7 @@ xfs_inobt_commit_staged_btree( fields |= XFS_AGI_IBLOCKS; } xfs_ialloc_log_agi(tp, agbp, fields); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } else { fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; agi->agi_free_root = cpu_to_be32(afake->af_root); @@ -540,7 +554,7 @@ xfs_inobt_commit_staged_btree( fields |= XFS_AGI_IBLOCKS; } xfs_ialloc_log_agi(tp, agbp, fields); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } } @@ -721,45 
+735,21 @@ xfs_inobt_max_size( XFS_INODES_PER_CHUNK); } -/* Read AGI and create inobt cursor. */ -int -xfs_inobt_cur( - struct xfs_perag *pag, - struct xfs_trans *tp, - xfs_btnum_t which, - struct xfs_btree_cur **curpp, - struct xfs_buf **agi_bpp) -{ - struct xfs_btree_cur *cur; - int error; - - ASSERT(*agi_bpp == NULL); - ASSERT(*curpp == NULL); - - error = xfs_ialloc_read_agi(pag, tp, agi_bpp); - if (error) - return error; - - cur = xfs_inobt_init_cursor(pag, tp, *agi_bpp, which); - *curpp = cur; - return 0; -} - static int -xfs_inobt_count_blocks( +xfs_finobt_count_blocks( struct xfs_perag *pag, struct xfs_trans *tp, - xfs_btnum_t btnum, xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp = NULL; - struct xfs_btree_cur *cur = NULL; + struct xfs_btree_cur *cur; int error; - error = xfs_inobt_cur(pag, tp, btnum, &cur, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) return error; + cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_btree_count_blocks(cur, tree_blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); @@ -807,8 +797,7 @@ xfs_finobt_calc_reserves( if (xfs_has_inobtcounts(pag->pag_mount)) error = xfs_finobt_read_blocks(pag, tp, &tree_len); else - error = xfs_inobt_count_blocks(pag, tp, XFS_BTNUM_FINO, - &tree_len); + error = xfs_finobt_count_blocks(pag, tp, &tree_len); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 3262c3fe5ebe..6472ec1ecbb4 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -46,10 +46,10 @@ struct xfs_perag; (maxrecs) * sizeof(xfs_inobt_key_t) + \ ((index) - 1) * sizeof(xfs_inobt_ptr_t))) -extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag, - struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum); -struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag, - struct xbtree_afakeroot *afake, xfs_btnum_t btnum); +struct xfs_btree_cur *xfs_inobt_init_cursor(struct 
xfs_perag *pag, + struct xfs_trans *tp, struct xfs_buf *agbp); +struct xfs_btree_cur *xfs_finobt_init_cursor(struct xfs_perag *pag, + struct xfs_trans *tp, struct xfs_buf *agbp); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ @@ -66,9 +66,6 @@ int xfs_finobt_calc_reserves(struct xfs_perag *perag, struct xfs_trans *tp, xfs_extlen_t *ask, xfs_extlen_t *used); extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, unsigned long long len); -int xfs_inobt_cur(struct xfs_perag *pag, struct xfs_trans *tp, - xfs_btnum_t btnum, struct xfs_btree_cur **curpp, - struct xfs_buf **agi_bpp); void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 773cf4349428..8796f2b3e534 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -158,7 +158,7 @@ static void * xfs_iext_find_first_leaf( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height; if (!ifp->if_height) @@ -176,7 +176,7 @@ static void * xfs_iext_find_last_leaf( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; if (!ifp->if_height) @@ -306,7 +306,7 @@ xfs_iext_find_level( xfs_fileoff_t offset, int level) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; if (!ifp->if_height) @@ -394,20 +394,27 @@ xfs_iext_leaf_key( return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK; } +static inline void * +xfs_iext_alloc_node( + int size) +{ + return kzalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); +} + static void xfs_iext_grow( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS); + struct xfs_iext_node *node = xfs_iext_alloc_node(NODE_SIZE); 
int i; if (ifp->if_height == 1) { - struct xfs_iext_leaf *prev = ifp->if_u1.if_root; + struct xfs_iext_leaf *prev = ifp->if_data; node->keys[0] = xfs_iext_leaf_key(prev, 0); node->ptrs[0] = prev; } else { - struct xfs_iext_node *prev = ifp->if_u1.if_root; + struct xfs_iext_node *prev = ifp->if_data; ASSERT(ifp->if_height > 1); @@ -418,7 +425,7 @@ xfs_iext_grow( for (i = 1; i < KEYS_PER_NODE; i++) node->keys[i] = XFS_IEXT_KEY_INVALID; - ifp->if_u1.if_root = node; + ifp->if_data = node; ifp->if_height++; } @@ -430,7 +437,7 @@ xfs_iext_update_node( int level, void *ptr) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; for (height = ifp->if_height; height > level; height--) { @@ -454,7 +461,7 @@ xfs_iext_split_node( int *nr_entries) { struct xfs_iext_node *node = *nodep; - struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS); + struct xfs_iext_node *new = xfs_iext_alloc_node(NODE_SIZE); const int nr_move = KEYS_PER_NODE / 2; int nr_keep = nr_move + (KEYS_PER_NODE & 1); int i = 0; @@ -542,7 +549,7 @@ xfs_iext_split_leaf( int *nr_entries) { struct xfs_iext_leaf *leaf = cur->leaf; - struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS); + struct xfs_iext_leaf *new = xfs_iext_alloc_node(NODE_SIZE); const int nr_move = RECS_PER_LEAF / 2; int nr_keep = nr_move + (RECS_PER_LEAF & 1); int i; @@ -583,11 +590,11 @@ xfs_iext_alloc_root( { ASSERT(ifp->if_bytes == 0); - ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); + ifp->if_data = xfs_iext_alloc_node(sizeof(struct xfs_iext_rec)); ifp->if_height = 1; /* now that we have a node step into it */ - cur->leaf = ifp->if_u1.if_root; + cur->leaf = ifp->if_data; cur->pos = 0; } @@ -603,9 +610,10 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL); + new = krealloc(ifp->if_data, new_size, + GFP_KERNEL | 
__GFP_NOLOCKDEP | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); - ifp->if_u1.if_root = new; + ifp->if_data = new; cur->leaf = new; } @@ -622,13 +630,11 @@ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) } void -xfs_iext_insert( - struct xfs_inode *ip, +xfs_iext_insert_raw( + struct xfs_ifork *ifp, struct xfs_iext_cursor *cur, - struct xfs_bmbt_irec *irec, - int state) + struct xfs_bmbt_irec *irec) { - struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); xfs_fileoff_t offset = irec->br_startoff; struct xfs_iext_leaf *new = NULL; int nr_entries, i; @@ -662,12 +668,23 @@ xfs_iext_insert( xfs_iext_set(cur_rec(cur), irec); ifp->if_bytes += sizeof(struct xfs_iext_rec); - trace_xfs_iext_insert(ip, cur, state, _RET_IP_); - if (new) xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); } +void +xfs_iext_insert( + struct xfs_inode *ip, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec, + int state) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + + xfs_iext_insert_raw(ifp, cur, irec); + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); +} + static struct xfs_iext_node * xfs_iext_rebalance_node( struct xfs_iext_node *parent, @@ -734,7 +751,7 @@ xfs_iext_remove_node( again: ASSERT(node->ptrs[pos]); ASSERT(node->ptrs[pos] == victim); - kmem_free(victim); + kfree(victim); nr_entries = xfs_iext_node_nr_entries(node, pos) - 1; offset = node->keys[0]; @@ -777,10 +794,10 @@ again: * If we are at the root and only one entry is left we can just * free this node and update the root pointer. 
*/ - ASSERT(node == ifp->if_u1.if_root); - ifp->if_u1.if_root = node->ptrs[0]; + ASSERT(node == ifp->if_data); + ifp->if_data = node->ptrs[0]; ifp->if_height--; - kmem_free(node); + kfree(node); } } @@ -854,8 +871,8 @@ xfs_iext_free_last_leaf( struct xfs_ifork *ifp) { ifp->if_height--; - kmem_free(ifp->if_u1.if_root); - ifp->if_u1.if_root = NULL; + kfree(ifp->if_data); + ifp->if_data = NULL; } void @@ -872,7 +889,7 @@ xfs_iext_remove( trace_xfs_iext_remove(ip, cur, state, _RET_IP_); ASSERT(ifp->if_height > 0); - ASSERT(ifp->if_u1.if_root != NULL); + ASSERT(ifp->if_data != NULL); ASSERT(xfs_iext_valid(ifp, cur)); xfs_iext_inc_seq(ifp); @@ -1035,16 +1052,16 @@ xfs_iext_destroy_node( } } - kmem_free(node); + kfree(node); } void xfs_iext_destroy( struct xfs_ifork *ifp) { - xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height); + xfs_iext_destroy_node(ifp->if_data, ifp->if_height); ifp->if_bytes = 0; ifp->if_height = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 543f3748c2a3..d0dcce462bf4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -18,6 +18,7 @@ #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_dir2.h" +#include "xfs_health.h" #include <linux/iversion.h> @@ -132,9 +133,14 @@ xfs_imap_to_bp( struct xfs_imap *imap, struct xfs_buf **bpp) { - return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - imap->im_len, XBF_UNMAPPED, bpp, - &xfs_inode_buf_ops); + int error; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), + XFS_SICK_AG_INODES); + return error; } static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) @@ -510,6 +516,9 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; + if (nextents 
+ naextents == 0 && nblocks != 0) + return __this_address; + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5a2e7ddfa76d..7d660a973909 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -25,6 +25,8 @@ #include "xfs_attr_leaf.h" #include "xfs_types.h" #include "xfs_errortag.h" +#include "xfs_health.h" +#include "xfs_symlink_remote.h" struct kmem_cache *xfs_ifork_cache; @@ -50,12 +52,16 @@ xfs_init_local_fork( mem_size++; if (size) { - ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); - memcpy(ifp->if_u1.if_data, data, size); + char *new_data = kmalloc(mem_size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + + memcpy(new_data, data, size); if (zero_terminate) - ifp->if_u1.if_data[size] = '\0'; + new_data[size] = '\0'; + + ifp->if_data = new_data; } else { - ifp->if_u1.if_data = NULL; + ifp->if_data = NULL; } ifp->if_bytes = size; @@ -74,7 +80,7 @@ xfs_iformat_local( /* * If the size is unreasonable, then something * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. + * kmalloc() or memcpy() below. */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, @@ -84,6 +90,7 @@ xfs_iformat_local( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_local", dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } @@ -113,7 +120,7 @@ xfs_iformat_extents( /* * If the number of extents is unreasonable, then something is wrong and - * we just bail out rather than crash in kmem_alloc() or memcpy() below. + * we just bail out rather than crash in kmalloc() or memcpy() below. 
*/ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).", @@ -121,11 +128,12 @@ xfs_iformat_extents( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(1)", dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } ifp->if_bytes = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); @@ -140,6 +148,7 @@ xfs_iformat_extents( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(2)", dp, sizeof(*dp), fa); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return xfs_bmap_complain_bad_rec(ip, whichfork, fa, &new); } @@ -198,11 +207,13 @@ xfs_iformat_btree( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_btree", dfp, size, __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_NOFS); + ifp->if_broot = kmalloc(size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); ASSERT(ifp->if_broot != NULL); /* * Copy and convert from the on-disk structure @@ -212,7 +223,7 @@ xfs_iformat_btree( ifp->if_broot, size); ifp->if_bytes = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; return 0; } @@ -262,12 +273,14 @@ xfs_iformat_data_fork( default: xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } break; default: xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } } @@ -276,10 +289,9 @@ static uint16_t xfs_dfork_attr_shortform_size( struct xfs_dinode *dip) { - struct xfs_attr_shortform *atp = - (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip); + struct xfs_attr_sf_hdr *sf = XFS_DFORK_APTR(dip); - return 
be16_to_cpu(atp->hdr.totsize); + return be16_to_cpu(sf->totsize); } void @@ -340,6 +352,7 @@ xfs_iformat_attr_fork( default: xfs_inode_verifier_error(ip, error, __func__, dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); error = -EFSCORRUPTED; break; } @@ -397,7 +410,8 @@ xfs_iroot_realloc( */ if (ifp->if_broot_bytes == 0) { new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_NOFS); + ifp->if_broot = kmalloc(new_size, + GFP_KERNEL | __GFP_NOFAIL); ifp->if_broot_bytes = (int)new_size; return; } @@ -412,7 +426,7 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = krealloc(ifp->if_broot, new_size, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -438,7 +452,7 @@ xfs_iroot_realloc( else new_size = 0; if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_NOFS); + new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); /* * First copy over the btree block header. */ @@ -468,7 +482,7 @@ xfs_iroot_realloc( (int)new_size); memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); } - kmem_free(ifp->if_broot); + kfree(ifp->if_broot); ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; if (ifp->if_broot) @@ -486,14 +500,14 @@ xfs_iroot_realloc( * * If the amount of space needed has decreased below the size of the * inline buffer, then switch to using the inline buffer. Otherwise, - * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * use krealloc() or kmalloc() to adjust the size of the buffer * to what is needed. * * ip -- the inode whose if_data area is changing * byte_diff -- the change in the number of bytes, positive or negative, * requested for the if_data array. 
*/ -void +void * xfs_idata_realloc( struct xfs_inode *ip, int64_t byte_diff, @@ -505,34 +519,31 @@ xfs_idata_realloc( ASSERT(new_size >= 0); ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork)); - if (byte_diff == 0) - return; - - if (new_size == 0) { - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - ifp->if_bytes = 0; - return; + if (byte_diff) { + ifp->if_data = krealloc(ifp->if_data, new_size, + GFP_KERNEL | __GFP_NOFAIL); + if (new_size == 0) + ifp->if_data = NULL; + ifp->if_bytes = new_size; } - ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, - GFP_NOFS | __GFP_NOFAIL); - ifp->if_bytes = new_size; + return ifp->if_data; } +/* Free all memory and reset a fork back to its initial state. */ void xfs_idestroy_fork( struct xfs_ifork *ifp) { if (ifp->if_broot != NULL) { - kmem_free(ifp->if_broot); + kfree(ifp->if_broot); ifp->if_broot = NULL; } switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; + kfree(ifp->if_data); + ifp->if_data = NULL; break; case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -563,7 +574,7 @@ xfs_iextents_copy( struct xfs_bmbt_irec rec; int64_t copied = 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); ASSERT(ifp->if_bytes > 0); for_each_xfs_iext(ifp, &icur, &rec) { @@ -625,9 +636,9 @@ xfs_iflush_fork( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_data != NULL); ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork)); - memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(cp, ifp->if_data, ifp->if_bytes); } break; @@ -690,7 +701,7 @@ xfs_ifork_init_cow( return; ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } @@ -702,19 +713,27 @@ 
xfs_ifork_verify_local_data( xfs_failaddr_t fa = NULL; switch (VFS_I(ip)->i_mode & S_IFMT) { - case S_IFDIR: - fa = xfs_dir2_sf_verify(ip); + case S_IFDIR: { + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_dir2_sf_hdr *sfp = ifp->if_data; + + fa = xfs_dir2_sf_verify(mp, sfp, ifp->if_bytes); break; - case S_IFLNK: - fa = xfs_symlink_shortform_verify(ip); + } + case S_IFLNK: { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + + fa = xfs_symlink_shortform_verify(ifp->if_data, ifp->if_bytes); break; + } default: break; } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", - ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa); + ip->i_df.if_data, ip->i_df.if_bytes, fa); return -EFSCORRUPTED; } @@ -729,14 +748,17 @@ xfs_ifork_verify_local_attr( struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!xfs_inode_has_attr_fork(ip)) + if (!xfs_inode_has_attr_fork(ip)) { fa = __this_address; - else - fa = xfs_attr_shortform_verify(ip); + } else { + struct xfs_ifork *ifp = &ip->i_af; + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + fa = xfs_attr_shortform_verify(ifp->if_data, ifp->if_bytes); + } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp->if_u1.if_data, ifp->if_bytes, fa); + ifp->if_data, ifp->if_bytes, fa); return -EFSCORRUPTED; } @@ -792,3 +814,12 @@ xfs_iext_count_upgrade( return 0; } + +/* Decide if a file mapping is on the realtime device or not. */ +bool +xfs_ifork_is_realtime( + struct xfs_inode *ip, + int whichfork) +{ + return XFS_IS_REALTIME_INODE(ip) && whichfork != XFS_ATTR_FORK; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 96d307784c85..bd53eb951b65 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -13,14 +13,12 @@ struct xfs_dinode; * File incore extent information, present for each of data & attr forks. 
*/ struct xfs_ifork { - int64_t if_bytes; /* bytes in if_u1 */ + int64_t if_bytes; /* bytes in if_data */ struct xfs_btree_block *if_broot; /* file's incore btree root */ unsigned int if_seq; /* fork mod counter */ int if_height; /* height of the extent tree */ - union { - void *if_root; /* extent tree root */ - char *if_data; /* inline file data */ - } if_u1; + void *if_data; /* extent tree root or + inline data */ xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ @@ -170,7 +168,7 @@ int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_ifork *ifp); -void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, +void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); @@ -180,6 +178,9 @@ void xfs_init_local_fork(struct xfs_inode *ip, int whichfork, const void *data, int64_t size); xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp); +void xfs_iext_insert_raw(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec); void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *, int); void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *, @@ -259,6 +260,7 @@ int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, int nr_to_add); int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, uint nr_to_add); +bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork); /* returns true if the fork has extents but they are not read in yet. 
*/ static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 269573c82808..16872972e1e9 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -838,10 +838,12 @@ struct xfs_cud_log_format { #define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31) #define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30) +#define XFS_BMAP_EXTENT_REALTIME (1U << 29) #define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \ XFS_BMAP_EXTENT_ATTR_FORK | \ - XFS_BMAP_EXTENT_UNWRITTEN) + XFS_BMAP_EXTENT_UNWRITTEN | \ + XFS_BMAP_EXTENT_REALTIME) /* * This is the structure used to lay out an bui log item in the diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index a5100a11faf9..9fe7a9564bca 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -11,6 +11,7 @@ * define how recovery should work for that type of log item. */ struct xlog_recover_item; +struct xfs_defer_op_type; /* Sorting hat for log items as they're read in. 
*/ enum xlog_recover_reorder { @@ -153,4 +154,11 @@ xlog_recover_resv(const struct xfs_trans_res *r) return ret; } +struct xfs_defer_pending; + +void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, + xfs_lsn_t lsn, const struct xfs_defer_op_type *ops); +int xlog_recover_finish_intent(struct xfs_trans *tp, + struct xfs_defer_pending *dfp); + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index c4cc99b70dd3..81885a6a028e 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -7,16 +7,16 @@ #define __XFS_ONDISK_H #define XFS_CHECK_STRUCT_SIZE(structname, size) \ - BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ - #structname ") is wrong, expected " #size) + static_assert(sizeof(structname) == (size), \ + "XFS: sizeof(" #structname ") is wrong, expected " #size) #define XFS_CHECK_OFFSET(structname, member, off) \ - BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + static_assert(offsetof(structname, member) == (off), \ "XFS: offsetof(" #structname ", " #member ") is wrong, " \ "expected " #off) #define XFS_CHECK_VALUE(value, expected) \ - BUILD_BUG_ON_MSG((value) != (expected), \ + static_assert((value) == (expected), \ "XFS: value of " #value " is wrong, expected " #expected) static inline void __init @@ -72,6 +72,10 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4); + /* realtime structures */ + XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4); + XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4); + /* * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to * 4 bytes anyway so it's not obviously a problem. 
Hence for the moment @@ -89,13 +93,13 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); - XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4); + XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0); + XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, namelen, 0); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 646b3fa362ad..511c912d515c 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -23,6 +23,7 @@ #include "xfs_refcount.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_health.h" struct kmem_cache *xfs_refcount_intent_cache; @@ -123,11 +124,9 @@ xfs_refcount_btrec_to_irec( /* Simple checks for refcount records. 
*/ xfs_failaddr_t xfs_refcount_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) return __this_address; @@ -158,6 +157,7 @@ xfs_refcount_complain_bad_rec( xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -179,7 +179,7 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(cur, irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); @@ -240,6 +240,7 @@ xfs_refcount_insert( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -270,12 +271,14 @@ xfs_refcount_delete( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -400,6 +403,7 @@ xfs_refcount_split_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -427,6 +431,7 @@ xfs_refcount_split_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -472,6 +477,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -480,6 +486,7 @@ xfs_refcount_merge_center_extents( if (error) 
goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -489,6 +496,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -500,6 +508,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -544,6 +553,7 @@ xfs_refcount_merge_left_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -552,6 +562,7 @@ xfs_refcount_merge_left_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -563,6 +574,7 @@ xfs_refcount_merge_left_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -610,6 +622,7 @@ xfs_refcount_merge_right_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -618,6 +631,7 @@ xfs_refcount_merge_right_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -629,6 +643,7 @@ xfs_refcount_merge_right_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -676,6 +691,7 @@ xfs_refcount_find_left_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -695,6 +711,7 @@ xfs_refcount_find_left_extents( if (error) goto 
out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -769,6 +786,7 @@ xfs_refcount_find_right_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -788,6 +806,7 @@ xfs_refcount_find_right_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1058,7 +1077,7 @@ xfs_refcount_still_have_space( * to handle each of the shape changes to the refcount btree. */ overhead = xfs_allocfree_block_count(cur->bc_mp, - cur->bc_ag.refc.shape_changes); + cur->bc_refc.shape_changes); overhead += cur->bc_mp->m_refc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; @@ -1066,17 +1085,17 @@ xfs_refcount_still_have_space( * Only allow 2 refcount extent updates per transaction if the * refcount continue update "error" has been injected. */ - if (cur->bc_ag.refc.nr_ops > 2 && + if (cur->bc_refc.nr_ops > 2 && XFS_TEST_ERROR(false, cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; - if (cur->bc_ag.refc.nr_ops == 0) + if (cur->bc_refc.nr_ops == 0) return true; else if (overhead > cur->bc_tp->t_log_res) return false; - return cur->bc_tp->t_log_res - overhead > - cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + return cur->bc_tp->t_log_res - overhead > + cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* @@ -1136,7 +1155,7 @@ xfs_refcount_adjust_extents( * Either cover the hole (increment) or * delete the range (decrement). 
*/ - cur->bc_ag.refc.nr_ops++; + cur->bc_refc.nr_ops++; if (tmp.rc_refcount) { error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1144,6 +1163,7 @@ xfs_refcount_adjust_extents( goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1153,7 +1173,7 @@ xfs_refcount_adjust_extents( tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1182,6 +1202,7 @@ xfs_refcount_adjust_extents( */ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) || XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1195,7 +1216,7 @@ xfs_refcount_adjust_extents( ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &ext); - cur->bc_ag.refc.nr_ops++; + cur->bc_refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) @@ -1205,6 +1226,7 @@ xfs_refcount_adjust_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1215,7 +1237,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1283,7 +1305,7 @@ xfs_refcount_adjust( if (shape_changed) shape_changes++; if (shape_changes) - cur->bc_ag.refc.shape_changes++; + cur->bc_refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj); @@ -1329,8 +1351,10 @@ xfs_refcount_continue_op( struct xfs_perag *pag = cur->bc_ag.pag; if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, - ri->ri_blockcount))) + ri->ri_blockcount))) { + 
xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); @@ -1376,8 +1400,8 @@ xfs_refcount_finish_one( */ rcur = *pcur; if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { - nr_ops = rcur->bc_ag.refc.nr_ops; - shape_changes = rcur->bc_ag.refc.shape_changes; + nr_ops = rcur->bc_refc.nr_ops; + shape_changes = rcur->bc_refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -1389,8 +1413,8 @@ xfs_refcount_finish_one( return error; rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag); - rcur->bc_ag.refc.nr_ops = nr_ops; - rcur->bc_ag.refc.shape_changes = shape_changes; + rcur->bc_refc.nr_ops = nr_ops; + rcur->bc_refc.shape_changes = shape_changes; } *pcur = rcur; @@ -1451,14 +1475,14 @@ __xfs_refcount_add( blockcount); ri = kmem_cache_alloc(xfs_refcount_intent_cache, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; xfs_refcount_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); } /* @@ -1537,6 +1561,7 @@ xfs_refcount_find_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1554,6 +1579,7 @@ xfs_refcount_find_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1587,6 +1613,7 @@ xfs_refcount_find_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1684,6 +1711,7 @@ xfs_refcount_adjust_cow_extents( goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec && ext.rc_domain != XFS_REFC_DOMAIN_COW)) { + xfs_btree_mark_sick(cur); error = 
-EFSCORRUPTED; goto out_error; } @@ -1699,6 +1727,7 @@ xfs_refcount_adjust_cow_extents( /* Adding a CoW reservation, there should be nothing here. */ if (XFS_IS_CORRUPT(cur->bc_mp, agbno + aglen > ext.rc_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1716,6 +1745,7 @@ xfs_refcount_adjust_cow_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1723,14 +1753,17 @@ xfs_refcount_adjust_cow_extents( case XFS_REFCOUNT_ADJUST_COW_FREE: /* Removing a CoW reservation, there should be one extent. */ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1742,6 +1775,7 @@ xfs_refcount_adjust_cow_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1891,17 +1925,20 @@ xfs_refcount_recover_extent( struct xfs_refcount_recovery *rr; if (XFS_IS_CORRUPT(cur->bc_mp, - be32_to_cpu(rec->refc.rc_refcount) != 1)) + be32_to_cpu(rec->refc.rc_refcount) != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } rr = kmalloc(sizeof(struct xfs_refcount_recovery), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL || + if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { + xfs_btree_mark_sick(cur); kfree(rr); return -EFSCORRUPTED; } @@ -1985,7 +2022,7 @@ xfs_refcount_recover_cow_leftovers( /* Free the block. 
*/ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_trans; @@ -2033,6 +2070,47 @@ xfs_refcount_has_records( return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } +struct xfs_refcount_query_range_info { + xfs_refcount_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_refcount_query_range_helper( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_refcount_query_range_info *query = priv; + struct xfs_refcount_irec irec; + xfs_failaddr_t fa; + + xfs_refcount_btrec_to_irec(rec, &irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec); + if (fa) + return xfs_refcount_complain_bad_rec(cur, fa, &irec); + + return query->fn(cur, &irec, query->priv); +} + +/* Find all refcount records between two keys. */ +int +xfs_refcount_query_range( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_brec = { .rc = *low_rec }; + union xfs_btree_irec high_brec = { .rc = *high_rec }; + struct xfs_refcount_query_range_info query = { .priv = priv, .fn = fn }; + + return xfs_btree_query_range(cur, &low_brec, &high_brec, + xfs_refcount_query_range_helper, &query); +} + int __init xfs_refcount_intent_init_cache(void) { diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 783cd89ca195..9b56768a590c 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -117,7 +117,7 @@ extern int xfs_refcount_has_records(struct xfs_btree_cur *cur, union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); -xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct 
xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); @@ -127,4 +127,14 @@ extern struct kmem_cache *xfs_refcount_intent_cache; int __init xfs_refcount_intent_init_cache(void); void xfs_refcount_intent_destroy_cache(void); +typedef int (*xfs_refcount_query_range_fn)( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv); + +int xfs_refcount_query_range(struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, void *priv); + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 5c3987d8dc24..ca59f6c89f3e 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -16,6 +16,7 @@ #include "xfs_refcount.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_bit.h" @@ -77,8 +78,6 @@ xfs_refcountbt_alloc_block( xfs_refc_block(args.mp))); if (error) goto out_error; - trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, - args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; @@ -107,12 +106,10 @@ xfs_refcountbt_free_block( struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, - XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false); } STATIC int @@ -220,18 +217,29 @@ xfs_refcountbt_verify( if (!xfs_has_reflink(mp)) return __this_address; - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); 
if (fa) return fa; level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_refcount_level) + unsigned int maxlevel = pag->pagf_refcount_level; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the refcount btree, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_refcount_level); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_refc_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]); + return xfs_btree_agblock_verify(bp, mp->m_refc_mxr[level != 0]); } STATIC void @@ -240,7 +248,7 @@ xfs_refcountbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_refcountbt_verify(bp); @@ -264,7 +272,7 @@ xfs_refcountbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } @@ -310,9 +318,17 @@ xfs_refcountbt_keys_contiguous( be32_to_cpu(key2->refc.rc_startblock)); } -static const struct xfs_btree_ops xfs_refcountbt_ops = { +const struct xfs_btree_ops xfs_refcountbt_ops = { + .name = "refcount", + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, + + .lru_refs = XFS_REFC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2), + .sick_mask = XFS_SICK_AG_REFCNTBT, .dup_cursor = xfs_refcountbt_dup_cursor, .set_root = xfs_refcountbt_set_root, @@ -333,59 +349,32 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { }; /* - * Initialize a new refcount btree cursor. + * Create a new refcount btree cursor. + * + * For staging cursors tp and agbp are NULL. 
*/ -static struct xfs_btree_cur * -xfs_refcountbt_init_common( +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_buf *agbp, struct xfs_perag *pag) { struct xfs_btree_cur *cur; ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC, + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops, mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ag.pag = xfs_perag_hold(pag); - cur->bc_ag.refc.nr_ops = 0; - cur->bc_ag.refc.shape_changes = 0; - cur->bc_ops = &xfs_refcountbt_ops; - return cur; -} - -/* Create a btree cursor. */ -struct xfs_btree_cur * -xfs_refcountbt_init_cursor( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_buf *agbp, - struct xfs_perag *pag) -{ - struct xfs_agf *agf = agbp->b_addr; - struct xfs_btree_cur *cur; - - cur = xfs_refcountbt_init_common(mp, tp, pag); - cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + cur->bc_refc.nr_ops = 0; + cur->bc_refc.shape_changes = 0; cur->bc_ag.agbp = agbp; - return cur; -} + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; -/* Create a btree cursor with a fake root for staging. */ -struct xfs_btree_cur * -xfs_refcountbt_stage_cursor( - struct xfs_mount *mp, - struct xbtree_afakeroot *afake, - struct xfs_perag *pag) -{ - struct xfs_btree_cur *cur; - - cur = xfs_refcountbt_init_common(mp, NULL, pag); - xfs_btree_stage_afakeroot(cur, afake); + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + } return cur; } @@ -410,7 +399,7 @@ xfs_refcountbt_commit_staged_btree( xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS | XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } /* Calculate number of records in a refcount btree block. 
*/ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index d66b37259bed..1e0ab25f6c68 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -48,8 +48,6 @@ struct xbtree_afakeroot; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_perag *pag); -struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag); extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index fbb0b2637463..ef16f6f9cef6 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -23,6 +23,7 @@ #include "xfs_error.h" #include "xfs_inode.h" #include "xfs_ag.h" +#include "xfs_health.h" struct kmem_cache *xfs_rmap_intent_cache; @@ -56,8 +57,10 @@ xfs_rmap_lookup_le( error = xfs_rmap_get_rec(cur, irec, &get_stat); if (error) return error; - if (!get_stat) + if (!get_stat) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } @@ -132,6 +135,7 @@ xfs_rmap_insert( if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } @@ -145,6 +149,7 @@ xfs_rmap_insert( if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } @@ -174,6 +179,7 @@ xfs_rmap_delete( if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } @@ -182,6 +188,7 @@ xfs_rmap_delete( if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } @@ -208,10 +215,10 @@ xfs_rmap_btrec_to_irec( /* Simple checks for rmap records. 
*/ xfs_failaddr_t xfs_rmap_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_rmap_irec *irec) { - struct xfs_mount *mp = cur->bc_mp; + struct xfs_mount *mp = pag->pag_mount; bool is_inode; bool is_unwritten; bool is_bmbt; @@ -226,8 +233,8 @@ xfs_rmap_check_irec( return __this_address; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock, - irec->rm_blockcount)) + if (!xfs_verify_agbext(pag, irec->rm_startblock, + irec->rm_blockcount)) return __this_address; } @@ -262,6 +269,16 @@ xfs_rmap_check_irec( return NULL; } +static inline xfs_failaddr_t +xfs_rmap_check_btrec( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec) +{ + if (xfs_btree_is_mem_rmap(cur->bc_ops)) + return xfs_rmap_check_irec(cur->bc_mem.pag, irec); + return xfs_rmap_check_irec(cur->bc_ag.pag, irec); +} + static inline int xfs_rmap_complain_bad_rec( struct xfs_btree_cur *cur, @@ -270,13 +287,18 @@ xfs_rmap_complain_bad_rec( { struct xfs_mount *mp = cur->bc_mp; - xfs_warn(mp, - "Reverse Mapping BTree record corruption in AG %d detected at %pS!", - cur->bc_ag.pag->pag_agno, fa); + if (xfs_btree_is_mem_rmap(cur->bc_ops)) + xfs_warn(mp, + "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa); + else + xfs_warn(mp, + "Reverse Mapping BTree record corruption in AG %d detected at %pS!", + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, irec->rm_blockcount); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -299,7 +321,7 @@ xfs_rmap_get_rec( fa = xfs_rmap_btrec_to_irec(rec, irec); if (!fa) - fa = xfs_rmap_check_irec(cur, irec); + fa = xfs_rmap_check_btrec(cur, irec); if (fa) return xfs_rmap_complain_bad_rec(cur, fa, irec); @@ -512,7 +534,7 @@ xfs_rmap_lookup_le_range( */ static int xfs_rmap_free_check_owner( - struct xfs_mount *mp, + struct xfs_btree_cur 
*cur, uint64_t ltoff, struct xfs_rmap_irec *rec, xfs_filblks_t len, @@ -520,6 +542,7 @@ xfs_rmap_free_check_owner( uint64_t offset, unsigned int flags) { + struct xfs_mount *mp = cur->bc_mp; int error = 0; if (owner == XFS_RMAP_OWN_UNKNOWN) @@ -529,12 +552,14 @@ xfs_rmap_free_check_owner( if (XFS_IS_CORRUPT(mp, (flags & XFS_RMAP_UNWRITTEN) != (rec->rm_flags & XFS_RMAP_UNWRITTEN))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } /* Make sure the owner matches what we expect to find in the tree. */ if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } @@ -546,16 +571,19 @@ xfs_rmap_free_check_owner( if (flags & XFS_RMAP_BMBT_BLOCK) { if (XFS_IS_CORRUPT(mp, !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } } else { if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } if (XFS_IS_CORRUPT(mp, offset + len > ltoff + rec->rm_blockcount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } @@ -618,6 +646,7 @@ xfs_rmap_unmap( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -639,6 +668,7 @@ xfs_rmap_unmap( if (XFS_IS_CORRUPT(mp, bno < ltrec.rm_startblock + ltrec.rm_blockcount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -665,6 +695,7 @@ xfs_rmap_unmap( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -677,12 +708,13 @@ xfs_rmap_unmap( ltrec.rm_startblock > bno || ltrec.rm_startblock + ltrec.rm_blockcount < bno + len)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } /* Check owner information. 
*/ - error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner, + error = xfs_rmap_free_check_owner(cur, ltoff, &ltrec, len, owner, offset, flags); if (error) goto out_error; @@ -697,6 +729,7 @@ xfs_rmap_unmap( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -788,6 +821,86 @@ out_error: return error; } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of rmapbt live updates. If + * the compiler supports jump labels, the static branch will be replaced by a + * nop sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_rmap_hooks_switch); + +void +xfs_rmap_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_rmap_hooks_switch); +} + +void +xfs_rmap_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_rmap_hooks_switch); +} + +/* Call downstream hooks for a reverse mapping update. */ +static inline void +xfs_rmap_update_hook( + struct xfs_trans *tp, + struct xfs_perag *pag, + enum xfs_rmap_intent_type op, + xfs_agblock_t startblock, + xfs_extlen_t blockcount, + bool unwritten, + const struct xfs_owner_info *oinfo) +{ + if (xfs_hooks_switched_on(&xfs_rmap_hooks_switch)) { + struct xfs_rmap_update_params p = { + .startblock = startblock, + .blockcount = blockcount, + .unwritten = unwritten, + .oinfo = *oinfo, /* struct copy */ + }; + + if (pag) + xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p); + } +} + +/* Call the specified function during a reverse mapping update. 
*/ +int +xfs_rmap_hook_add( + struct xfs_perag *pag, + struct xfs_rmap_hook *hook) +{ + return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook); +} + +/* Stop calling the specified function during a reverse mapping update. */ +void +xfs_rmap_hook_del( + struct xfs_perag *pag, + struct xfs_rmap_hook *hook) +{ + xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook); +} + +/* Configure rmap update hook functions. */ +void +xfs_rmap_hook_setup( + struct xfs_rmap_hook *hook, + notifier_fn_t mod_fn) +{ + xfs_hook_setup(&hook->rmap_hook, mod_fn); +} +#else +# define xfs_rmap_update_hook(t, p, o, s, b, u, oi) do { } while (0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + /* * Remove a reference to an extent in the rmap btree. */ @@ -808,7 +921,7 @@ xfs_rmap_free( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); - + xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo); error = xfs_rmap_unmap(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -900,6 +1013,7 @@ xfs_rmap_map( if (XFS_IS_CORRUPT(mp, have_lt != 0 && ltrec.rm_startblock + ltrec.rm_blockcount > bno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -917,10 +1031,12 @@ xfs_rmap_map( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, have_gt != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -974,6 +1090,7 @@ xfs_rmap_map( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1021,6 +1138,7 @@ xfs_rmap_map( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1055,6 +1173,7 @@ xfs_rmap_alloc( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); + xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo); error 
= xfs_rmap_map(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -1116,6 +1235,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1153,12 +1273,14 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if (XFS_IS_CORRUPT(mp, LEFT.rm_startblock + LEFT.rm_blockcount > bno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1181,6 +1303,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1193,10 +1316,12 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1227,6 +1352,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1246,6 +1372,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1257,6 +1384,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1264,6 +1392,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1275,6 +1404,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1282,6 +1412,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1305,6 +1436,7 @@ xfs_rmap_convert( if (error) goto 
done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1312,6 +1444,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1331,6 +1464,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1342,6 +1476,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1349,6 +1484,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1419,6 +1555,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1461,6 +1598,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1476,6 +1614,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1509,6 +1648,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1522,6 +1662,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1534,6 +1675,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1606,6 +1748,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1634,6 +1777,7 @@ xfs_rmap_convert_shared( if (XFS_IS_CORRUPT(mp, LEFT.rm_startblock + LEFT.rm_blockcount > bno)) { + 
xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1652,10 +1796,12 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1706,6 +1852,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1732,6 +1879,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1758,6 +1906,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1781,6 +1930,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1816,6 +1966,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1861,6 +2012,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1896,6 +2048,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1934,6 +2087,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2023,6 +2177,7 @@ xfs_rmap_unmap_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2033,12 +2188,14 @@ xfs_rmap_unmap_shared( ltrec.rm_startblock > bno || ltrec.rm_startblock + ltrec.rm_blockcount < bno + len)) { 
+ xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } /* Make sure the owner matches what we expect to find in the tree. */ if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2047,16 +2204,19 @@ xfs_rmap_unmap_shared( if (XFS_IS_CORRUPT(mp, (flags & XFS_RMAP_UNWRITTEN) != (ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } /* Check the offset. */ if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2113,6 +2273,7 @@ xfs_rmap_unmap_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2142,6 +2303,7 @@ xfs_rmap_unmap_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2221,6 +2383,7 @@ xfs_rmap_map_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, have_gt != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2273,6 +2436,7 @@ xfs_rmap_map_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2335,15 +2499,12 @@ xfs_rmap_map_raw( { struct xfs_owner_info oinfo; - oinfo.oi_owner = rmap->rm_owner; - oinfo.oi_offset = rmap->rm_offset; - oinfo.oi_flags = 0; - if (rmap->rm_flags & XFS_RMAP_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + xfs_owner_info_pack(&oinfo, rmap->rm_owner, rmap->rm_offset, + rmap->rm_flags); - if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + if ((rmap->rm_flags & (XFS_RMAP_ATTR_FORK | 
XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) || + XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) return xfs_rmap_map(cur, rmap->rm_startblock, rmap->rm_blockcount, rmap->rm_flags & XFS_RMAP_UNWRITTEN, @@ -2373,7 +2534,7 @@ xfs_rmap_query_range_helper( fa = xfs_rmap_btrec_to_irec(rec, &irec); if (!fa) - fa = xfs_rmap_check_irec(cur, &irec); + fa = xfs_rmap_check_btrec(cur, &irec); if (fa) return xfs_rmap_complain_bad_rec(cur, fa, &irec); @@ -2428,6 +2589,38 @@ xfs_rmap_finish_one_cleanup( xfs_trans_brelse(tp, agbp); } +/* Commit an rmap operation into the ondisk tree. */ +int +__xfs_rmap_finish_intent( + struct xfs_btree_cur *rcur, + enum xfs_rmap_intent_type op, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + bool unwritten) +{ + switch (op) { + case XFS_RMAP_ALLOC: + case XFS_RMAP_MAP: + return xfs_rmap_map(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_MAP_SHARED: + return xfs_rmap_map_shared(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_FREE: + case XFS_RMAP_UNMAP: + return xfs_rmap_unmap(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_UNMAP_SHARED: + return xfs_rmap_unmap_shared(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_CONVERT: + return xfs_rmap_convert(rcur, bno, len, !unwritten, oinfo); + case XFS_RMAP_CONVERT_SHARED: + return xfs_rmap_convert_shared(rcur, bno, len, !unwritten, + oinfo); + default: + ASSERT(0); + return -EFSCORRUPTED; + } +} + /* * Process one of the deferred rmap operations. We pass back the * btree cursor to maintain our lock on the rmapbt between calls. @@ -2476,10 +2669,14 @@ xfs_rmap_finish_one( * allocate blocks. 
*/ error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp); - if (error) + if (error) { + xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL); return error; - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) + } + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { + xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL); return -EFSCORRUPTED; + } rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); } @@ -2490,39 +2687,14 @@ xfs_rmap_finish_one( unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - switch (ri->ri_type) { - case XFS_RMAP_ALLOC: - case XFS_RMAP_MAP: - error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount, - unwritten, &oinfo); - break; - case XFS_RMAP_MAP_SHARED: - error = xfs_rmap_map_shared(rcur, bno, - ri->ri_bmap.br_blockcount, unwritten, &oinfo); - break; - case XFS_RMAP_FREE: - case XFS_RMAP_UNMAP: - error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount, - unwritten, &oinfo); - break; - case XFS_RMAP_UNMAP_SHARED: - error = xfs_rmap_unmap_shared(rcur, bno, - ri->ri_bmap.br_blockcount, unwritten, &oinfo); - break; - case XFS_RMAP_CONVERT: - error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount, - !unwritten, &oinfo); - break; - case XFS_RMAP_CONVERT_SHARED: - error = xfs_rmap_convert_shared(rcur, bno, - ri->ri_bmap.br_blockcount, !unwritten, &oinfo); - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - } + error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno, + ri->ri_bmap.br_blockcount, &oinfo, unwritten); + if (error) + return error; - return error; + xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno, + ri->ri_bmap.br_blockcount, unwritten, &oinfo); + return 0; } /* @@ -2559,7 +2731,7 @@ __xfs_rmap_add( bmap->br_blockcount, bmap->br_state); - ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_owner = owner; @@ -2567,7 
+2739,7 @@ __xfs_rmap_add( ri->ri_bmap = *bmap; xfs_rmap_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); } /* Map an extent into a file. */ diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 3c98d9d50afb..9d01fe689497 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -186,6 +186,10 @@ void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error); int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur); +int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur, + enum xfs_rmap_intent_type op, xfs_agblock_t bno, + xfs_extlen_t len, const struct xfs_owner_info *oinfo, + bool unwritten); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, @@ -195,7 +199,7 @@ int xfs_rmap_compare(const struct xfs_rmap_irec *a, union xfs_btree_rec; xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); -xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag, const struct xfs_rmap_irec *irec); int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, @@ -235,4 +239,29 @@ extern struct kmem_cache *xfs_rmap_intent_cache; int __init xfs_rmap_intent_init_cache(void); void xfs_rmap_intent_destroy_cache(void); +/* + * Parameters for tracking reverse mapping changes. The hook function arg + * parameter is enum xfs_rmap_intent_type, and the rest is below. 
+ */ +struct xfs_rmap_update_params { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; + struct xfs_owner_info oinfo; + bool unwritten; +}; + +#ifdef CONFIG_XFS_LIVE_HOOKS + +struct xfs_rmap_hook { + struct xfs_hook rmap_hook; +}; + +void xfs_rmap_hook_disable(void); +void xfs_rmap_hook_enable(void); + +int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook); +void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook); +void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn); +#endif + #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 6c81b20e97d2..9e759efa81cc 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -16,11 +16,14 @@ #include "xfs_btree_staging.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" static struct kmem_cache *xfs_rmapbt_cur_cache; @@ -65,13 +68,12 @@ xfs_rmapbt_set_root( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - int btnum = cur->bc_btnum; ASSERT(ptr->s != 0); - agf->agf_roots[btnum] = ptr->s; - be32_add_cpu(&agf->agf_levels[btnum], inc); - cur->bc_ag.pag->pagf_levels[btnum] += inc; + agf->agf_rmap_root = ptr->s; + be32_add_cpu(&agf->agf_rmap_level, inc); + cur->bc_ag.pag->pagf_rmap_level += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -94,8 +96,6 @@ xfs_rmapbt_alloc_block( &bno, 1); if (error) return error; - - trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1); if (bno == NULLAGBLOCK) { *stat = 0; return 0; @@ -125,8 +125,6 @@ xfs_rmapbt_free_block( int error; bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); - trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno, - bno, 1); 
be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1); @@ -226,7 +224,7 @@ xfs_rmapbt_init_ptr_from_cur( ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); - ptr->s = agf->agf_roots[cur->bc_btnum]; + ptr->s = agf->agf_rmap_root; } /* @@ -340,18 +338,29 @@ xfs_rmapbt_verify( if (!xfs_has_rmapbt(mp)) return __this_address; - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) + unsigned int maxlevel = pag->pagf_rmap_level; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the free space btrees, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_rmap_level); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_rmap_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); + return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]); } static void @@ -360,7 +369,7 @@ xfs_rmapbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_rmapbt_verify(bp); @@ -384,7 +393,7 @@ xfs_rmapbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } @@ -476,9 +485,19 @@ xfs_rmapbt_keys_contiguous( be32_to_cpu(key2->rmap.rm_startblock)); } -static const struct xfs_btree_ops xfs_rmapbt_ops = { +const struct xfs_btree_ops xfs_rmapbt_ops = { + .name = "rmap", + .type = XFS_BTREE_TYPE_AG, + .geom_flags = XFS_BTGEO_OVERLAPPING, + .rec_len = sizeof(struct xfs_rmap_rec), + /* 
Overlapping btree; 2 keys per pointer. */ .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rmap_2), + .sick_mask = XFS_SICK_AG_RMAPBT, .dup_cursor = xfs_rmapbt_dup_cursor, .set_root = xfs_rmapbt_set_root, @@ -498,55 +517,176 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .keys_contiguous = xfs_rmapbt_keys_contiguous, }; -static struct xfs_btree_cur * -xfs_rmapbt_init_common( +/* + * Create a new reverse mapping btree cursor. + * + * For staging cursors tp and agbp are NULL. + */ +struct xfs_btree_cur * +xfs_rmapbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_buf *agbp, struct xfs_perag *pag) { struct xfs_btree_cur *cur; - /* Overlapping btree; 2 keys per pointer. */ - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); - cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); - cur->bc_ops = &xfs_rmapbt_ops; - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; + + cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level); + } return cur; } -/* Create a new reverse mapping btree cursor. */ +#ifdef CONFIG_XFS_BTREE_IN_MEM +static inline unsigned int +xfs_rmapbt_mem_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(__be64)); +} + +/* + * Validate an in-memory rmap btree block. Callers are allowed to generate an + * in-memory btree even if the ondisk feature is not enabled. 
+ */ +static xfs_failaddr_t +xfs_rmapbt_mem_verify( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + unsigned int level; + unsigned int maxrecs; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + + level = be16_to_cpu(block->bb_level); + if (level >= xfs_rmapbt_maxlevels_ondisk()) + return __this_address; + + maxrecs = xfs_rmapbt_mem_block_maxrecs( + XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0); + return xfs_btree_memblock_verify(bp, maxrecs); +} + +static void +xfs_rmapbt_mem_rw_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa = xfs_rmapbt_mem_verify(bp); + + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +/* skip crc checks on in-memory btrees to save time */ +static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = { + .name = "xfs_rmapbt_mem", + .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, + .verify_read = xfs_rmapbt_mem_rw_verify, + .verify_write = xfs_rmapbt_mem_rw_verify, + .verify_struct = xfs_rmapbt_mem_verify, +}; + +const struct xfs_btree_ops xfs_rmapbt_mem_ops = { + .name = "mem_rmap", + .type = XFS_BTREE_TYPE_MEM, + .geom_flags = XFS_BTGEO_OVERLAPPING, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. 
*/ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rmap_mem_2), + + .dup_cursor = xfbtree_dup_cursor, + .set_root = xfbtree_set_root, + .alloc_block = xfbtree_alloc_block, + .free_block = xfbtree_free_block, + .get_minrecs = xfbtree_get_minrecs, + .get_maxrecs = xfbtree_get_maxrecs, + .init_key_from_rec = xfs_rmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, + .key_diff = xfs_rmapbt_key_diff, + .buf_ops = &xfs_rmapbt_mem_buf_ops, + .diff_two_keys = xfs_rmapbt_diff_two_keys, + .keys_inorder = xfs_rmapbt_keys_inorder, + .recs_inorder = xfs_rmapbt_recs_inorder, + .keys_contiguous = xfs_rmapbt_keys_contiguous, +}; + +/* Create a cursor for an in-memory btree. */ struct xfs_btree_cur * -xfs_rmapbt_init_cursor( - struct xfs_mount *mp, +xfs_rmapbt_mem_cursor( + struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf *agbp, - struct xfs_perag *pag) + struct xfbtree *xfbt) { - struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; + struct xfs_mount *mp = pag->pag_mount; - cur = xfs_rmapbt_init_common(mp, tp, pag); - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); - cur->bc_ag.agbp = agbp; + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops, + xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache); + cur->bc_mem.xfbtree = xfbt; + cur->bc_nlevels = xfbt->nlevels; + + cur->bc_mem.pag = xfs_perag_hold(pag); return cur; } -/* Create a new reverse mapping btree cursor with a fake root for staging. */ -struct xfs_btree_cur * -xfs_rmapbt_stage_cursor( +/* Create an in-memory rmap btree. 
*/ +int +xfs_rmapbt_mem_init( struct xfs_mount *mp, - struct xbtree_afakeroot *afake, - struct xfs_perag *pag) + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + xfs_agnumber_t agno) { - struct xfs_btree_cur *cur; + xfbt->owner = agno; + return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops); +} - cur = xfs_rmapbt_init_common(mp, NULL, pag); - xfs_btree_stage_afakeroot(cur, afake); - return cur; +/* Compute the max possible height for reverse mapping btrees in memory. */ +static unsigned int +xfs_rmapbt_mem_maxlevels(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2; + + /* + * How tall can an in-memory rmap btree become if we filled the entire + * AG with rmap records? + */ + return xfs_btree_compute_maxlevels(minrecs, + XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec)); } +#else +# define xfs_rmapbt_mem_maxlevels() (0) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ /* * Install a new reverse mapping btree root. Caller is responsible for @@ -563,12 +703,12 @@ xfs_rmapbt_commit_staged_btree( ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); - agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + agf->agf_rmap_root = cpu_to_be32(afake->af_root); + agf->agf_rmap_level = cpu_to_be32(afake->af_levels); agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | XFS_AGF_RMAP_BLOCKS); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } /* Calculate number of records in a reverse mapping btree block. */ @@ -618,7 +758,8 @@ xfs_rmapbt_maxlevels_ondisk(void) * like if it consumes almost all the blocks in the AG due to maximal * sharing factor. 
*/ - return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS); + return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS), + xfs_rmapbt_mem_maxlevels()); } /* Compute the maximum height of an rmap btree. */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 3244715dd111..eb90d89e8086 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -10,6 +10,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; struct xbtree_afakeroot; +struct xfbtree; /* rmaps only exist on crc enabled filesystems */ #define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN @@ -44,8 +45,6 @@ struct xbtree_afakeroot; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_perag *pag); -struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag); void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); int xfs_rmapbt_maxrecs(int blocklen, int leaf); @@ -64,4 +63,9 @@ unsigned int xfs_rmapbt_maxlevels_ondisk(void); int __init xfs_rmapbt_init_cur_cache(void); void xfs_rmapbt_destroy_cur_cache(void); +struct xfs_btree_cur *xfs_rmapbt_mem_cursor(struct xfs_perag *pag, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int xfs_rmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp, xfs_agnumber_t agno); + #endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 396648acb5be..f246d6dbf4ec 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -16,6 +16,8 @@ #include "xfs_trans.h" #include "xfs_rtalloc.h" #include "xfs_error.h" +#include "xfs_rtbitmap.h" +#include "xfs_health.h" /* * Realtime allocator bitmap functions shared with userspace. 
@@ -46,43 +48,93 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { .verify_write = xfs_rtbuf_verify_write, }; +/* Release cached rt bitmap and summary buffers. */ +void +xfs_rtbuf_cache_relse( + struct xfs_rtalloc_args *args) +{ + if (args->rbmbp) { + xfs_trans_brelse(args->tp, args->rbmbp); + args->rbmbp = NULL; + args->rbmoff = NULLFILEOFF; + } + if (args->sumbp) { + xfs_trans_brelse(args->tp, args->sumbp); + args->sumbp = NULL; + args->sumoff = NULLFILEOFF; + } +} + /* * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ int xfs_rtbuf_get( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t block, /* block number in bitmap or summary */ - int issum, /* is summary not bitmap */ - struct xfs_buf **bpp) /* output: buffer for the block */ + struct xfs_rtalloc_args *args, + xfs_fileoff_t block, /* block number in bitmap or summary */ + int issum) /* is summary not bitmap */ { - struct xfs_buf *bp; /* block buffer, result */ - xfs_inode_t *ip; /* bitmap or summary inode */ - xfs_bmbt_irec_t map; - int nmap = 1; - int error; /* error value */ + struct xfs_mount *mp = args->mp; + struct xfs_buf **cbpp; /* cached block buffer */ + xfs_fileoff_t *coffp; /* cached block number */ + struct xfs_buf *bp; /* block buffer, result */ + struct xfs_inode *ip; /* bitmap or summary inode */ + struct xfs_bmbt_irec map; + enum xfs_blft type; + int nmap = 1; + int error; - ip = issum ? mp->m_rsumip : mp->m_rbmip; + if (issum) { + cbpp = &args->sumbp; + coffp = &args->sumoff; + ip = mp->m_rsumip; + type = XFS_BLFT_RTSUMMARY_BUF; + } else { + cbpp = &args->rbmbp; + coffp = &args->rbmoff; + ip = mp->m_rbmip; + type = XFS_BLFT_RTBITMAP_BUF; + } + + /* + * If we have a cached buffer, and the block number matches, use that. + */ + if (*cbpp && *coffp == block) + return 0; + + /* + * Otherwise we have to have to get the buffer. If there was an old + * one, get rid of it first. 
+ */ + if (*cbpp) { + xfs_trans_brelse(args->tp, *cbpp); + *cbpp = NULL; + } error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0); if (error) return error; - if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) + if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) { + xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY : + XFS_SICK_RT_BITMAP); return -EFSCORRUPTED; + } ASSERT(map.br_startblock != NULLFSBLOCK); - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp, &xfs_rtbuf_ops); + if (xfs_metadata_is_sick(error)) + xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY : + XFS_SICK_RT_BITMAP); if (error) return error; - xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF - : XFS_BLFT_RTBITMAP_BUF); - *bpp = bp; + xfs_trans_buf_set_type(args->tp, bp, type); + *cbpp = bp; + *coffp = block; return 0; } @@ -92,47 +144,44 @@ xfs_rtbuf_get( */ int xfs_rtfind_back( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to look at */ + xfs_rtxnum_t limit, /* last rtext to look at */ + xfs_rtxnum_t *rtx) /* out: start rtext found */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t firstbit; /* first useful bit in the word */ - xfs_rtblock_t i; /* current bit number rel. 
to start */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; /* error value */ + xfs_rtxnum_t firstbit; /* first useful bit in the word */ + xfs_rtxnum_t i; /* current bit number rel. to start */ + xfs_rtxnum_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + xfs_rtword_t incore; + unsigned int word; /* word number in the buffer */ /* * Compute and read in starting bitmap block for starting block. */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + block = xfs_rtx_to_rbmblock(mp, start); + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Get the first word's index & point to it. */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; + word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); len = start - limit + 1; /* * Compute match value, based on the bit at start: if 1 (free) * then all-ones, else all-zeroes. */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + incore = xfs_rtbitmap_getword(args, word); + want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0; /* * If the starting position is not word-aligned, deal with the * partial word. @@ -142,20 +191,19 @@ xfs_rtfind_back( * Calculate first (leftmost) bit number to look at, * and mask for all the relevant bits in this word. 
*/ - firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + firstbit = max_t(xfs_srtblock_t, bit - len + 1, 0); mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << firstbit; /* * Calculate the difference between the value there * and what we're looking for. */ - if ((wdiff = (*b ^ want) & mask)) { + if ((wdiff = (incore ^ want) & mask)) { /* * Different. Mark where we are and return. */ - xfs_trans_brelse(tp, bp); - i = bit - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; + i = bit - xfs_highbit32(wdiff); + *rtx = start - i + 1; return 0; } i = bit - firstbit + 1; @@ -167,19 +215,11 @@ xfs_rtfind_back( /* * If done with this block, get the previous one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, --block); + if (error) return error; - } - bufp = bp->b_addr; - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; + + word = mp->m_blockwsize - 1; } } else { /* @@ -195,13 +235,13 @@ xfs_rtfind_back( /* * Compute difference between actual and desired value. */ - if ((wdiff = *b ^ want)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = incore ^ want)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; + i += XFS_NBWORD - 1 - xfs_highbit32(wdiff); + *rtx = start - i + 1; return 0; } i += XFS_NBWORD; @@ -213,19 +253,11 @@ xfs_rtfind_back( /* * If done with this block, get the previous one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, --block); + if (error) return error; - } - bufp = bp->b_addr; - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. 
- */ - b--; + + word = mp->m_blockwsize - 1; } } /* @@ -242,13 +274,13 @@ xfs_rtfind_back( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ want) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ want) & mask)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; + i += XFS_NBWORD - 1 - xfs_highbit32(wdiff); + *rtx = start - i + 1; return 0; } else i = len; @@ -256,8 +288,7 @@ xfs_rtfind_back( /* * No match, return that we scanned the whole area. */ - xfs_trans_brelse(tp, bp); - *rtblock = start - i + 1; + *rtx = start - i + 1; return 0; } @@ -267,47 +298,44 @@ xfs_rtfind_back( */ int xfs_rtfind_forw( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to look at */ + xfs_rtxnum_t limit, /* last rtext to look at */ + xfs_rtxnum_t *rtx) /* out: start rtext found */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. 
to start */ - xfs_rtblock_t lastbit; /* last useful bit in the word */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; + xfs_rtxnum_t i; /* current bit number rel. to start */ + xfs_rtxnum_t lastbit;/* last useful bit in the word */ + xfs_rtxnum_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + xfs_rtword_t incore; + unsigned int word; /* word number in the buffer */ /* * Compute and read in starting bitmap block for starting block. */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + block = xfs_rtx_to_rbmblock(mp, start); + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Get the first word's index & point to it. */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; + word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); len = limit - start + 1; /* * Compute match value, based on the bit at start: if 1 (free) * then all-ones, else all-zeroes. */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + incore = xfs_rtbitmap_getword(args, word); + want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0; /* * If the starting position is not word-aligned, deal with the * partial word. @@ -317,19 +345,18 @@ xfs_rtfind_forw( * Calculate last (rightmost) bit number to look at, * and mask for all the relevant bits in this word. 
*/ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; /* * Calculate the difference between the value there * and what we're looking for. */ - if ((wdiff = (*b ^ want) & mask)) { + if ((wdiff = (incore ^ want) & mask)) { /* * Different. Mark where we are and return. */ - xfs_trans_brelse(tp, bp); - i = XFS_RTLOBIT(wdiff) - bit; - *rtblock = start + i - 1; + i = xfs_lowbit32(wdiff) - bit; + *rtx = start + i - 1; return 0; } i = lastbit - bit; @@ -337,22 +364,15 @@ xfs_rtfind_forw( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the previous one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the previous word in the buffer. - */ - b++; } } else { /* @@ -368,13 +388,13 @@ xfs_rtfind_forw( /* * Compute difference between actual and desired value. */ - if ((wdiff = *b ^ want)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = incore ^ want)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; + i += xfs_lowbit32(wdiff); + *rtx = start + i - 1; return 0; } i += XFS_NBWORD; @@ -382,22 +402,15 @@ xfs_rtfind_forw( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the next one. 
*/ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; } } /* @@ -412,13 +425,13 @@ xfs_rtfind_forw( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ want) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ want) & mask)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; + i += xfs_lowbit32(wdiff); + *rtx = start + i - 1; return 0; } else i = len; @@ -426,102 +439,95 @@ xfs_rtfind_forw( /* * No match, return that we scanned the whole area. */ - xfs_trans_brelse(tp, bp); - *rtblock = start + i - 1; + *rtx = start + i - 1; return 0; } +/* Log rtsummary counter at @infoword. */ +static inline void +xfs_trans_log_rtsummary( + struct xfs_rtalloc_args *args, + unsigned int infoword) +{ + struct xfs_buf *bp = args->sumbp; + size_t first, last; + + first = (void *)xfs_rsumblock_infoptr(args, infoword) - bp->b_addr; + last = first + sizeof(xfs_suminfo_t) - 1; + + xfs_trans_log_buf(args->tp, bp, first, last); +} + /* - * Read and/or modify the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - * - * Summary information is returned in *sum if specified. - * If no delta is specified, returns summary only. + * Modify the summary information for a given extent size, bitmap block + * combination. 
*/ int -xfs_rtmodify_summary_int( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ +xfs_rtmodify_summary( + struct xfs_rtalloc_args *args, + int log, /* log2 of extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + int delta) /* in/out: summary block number */ { - struct xfs_buf *bp; /* buffer for the summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ + struct xfs_mount *mp = args->mp; + xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + unsigned int infoword; + xfs_suminfo_t val; + int error; - /* - * Compute entry number in the summary file. - */ - so = XFS_SUMOFFS(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (*rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (*rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - *rbpp = bp; - *rsb = sb; - } - /* - * Point to the summary information, modify/log it, and/or copy it out. 
- */ - sp = XFS_SUMPTR(mp, bp, so); - if (delta) { - uint first = (uint)((char *)sp - (char *)bp->b_addr); - - *sp += delta; - if (mp->m_rsum_cache) { - if (*sp == 0 && log == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno]++; - if (*sp != 0 && log < mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; - } - xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1); + error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so)); + if (error) + return error; + + infoword = xfs_rtsumoffs_to_infoword(mp, so); + val = xfs_suminfo_add(args, infoword, delta); + + if (mp->m_rsum_cache) { + if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log; + if (val != 0 && log >= mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log + 1; } - if (sum) - *sum = *sp; + + xfs_trans_log_rtsummary(args, infoword); return 0; } +/* + * Read and return the summary information for a given extent size, bitmap block + * combination. + */ int -xfs_rtmodify_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ +xfs_rtget_summary( + struct xfs_rtalloc_args *args, + int log, /* log2 of extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ { - return xfs_rtmodify_summary_int(mp, tp, log, bbno, - delta, rbpp, rsb, NULL); + struct xfs_mount *mp = args->mp; + xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + int error; + + error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so)); + if (!error) + *sum = xfs_suminfo_get(args, xfs_rtsumoffs_to_infoword(mp, so)); + return error; +} + +/* Log rtbitmap block from the word @from to the byte before @next. 
*/ +static inline void +xfs_trans_log_rtbitmap( + struct xfs_rtalloc_args *args, + unsigned int from, + unsigned int next) +{ + struct xfs_buf *bp = args->rbmbp; + size_t first, last; + + first = (void *)xfs_rbmblock_wordptr(args, from) - bp->b_addr; + last = ((void *)xfs_rbmblock_wordptr(args, next) - 1) - bp->b_addr; + + xfs_trans_log_buf(args->tp, bp, first, last); } /* @@ -530,41 +536,37 @@ xfs_rtmodify_summary( */ int xfs_rtmodify_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to modify */ - xfs_extlen_t len, /* length of extent to modify */ - int val) /* 1 for free, 0 for allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to modify */ + xfs_rtxlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtword_t *first; /* first used word in the buffer */ - int i; /* current bit number rel. to start */ - int lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask o frelevant bits for value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t incore; + unsigned int firstword; /* first word used in the buffer */ + unsigned int word; /* word number in the buffer */ /* * Compute starting bitmap block number. 
*/ - block = XFS_BITTOBLOCK(mp, start); + block = xfs_rtx_to_rbmblock(mp, start); /* * Read the bitmap block, and point to its data. */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Compute the starting word's address, and starting bit. */ - word = XFS_BITTOWORD(mp, start); - first = b = &bufp[word]; + firstword = word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); /* * 0 (allocated) => all zeroes; 1 (free) => all ones. @@ -578,39 +580,33 @@ xfs_rtmodify_range( /* * Compute first bit not changed and mask of relevant bits. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; /* * Set/clear the active bits. */ + incore = xfs_rtbitmap_getword(args, word); if (val) - *b |= mask; + incore |= mask; else - *b &= ~mask; + incore &= ~mask; + xfs_rtbitmap_setword(args, word, incore); i = lastbit - bit; /* * Go on to the next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * Log the changed part of this block. * Get the next one. */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + xfs_trans_log_rtbitmap(args, firstword, word); + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - first = b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; + + firstword = word = 0; } } else { /* @@ -626,31 +622,23 @@ xfs_rtmodify_range( /* * Set the word value correctly. */ - *b = val; + xfs_rtbitmap_setword(args, word, val); i += XFS_NBWORD; /* * Go on to the next block if that's where the next word is * and we need the next word. 
*/ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * Log the changed part of this block. * Get the next one. */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + xfs_trans_log_rtbitmap(args, firstword, word); + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - first = b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; + + firstword = word = 0; } } /* @@ -665,18 +653,19 @@ xfs_rtmodify_range( /* * Set/clear the active bits. */ + incore = xfs_rtbitmap_getword(args, word); if (val) - *b |= mask; + incore |= mask; else - *b &= ~mask; - b++; + incore &= ~mask; + xfs_rtbitmap_setword(args, word, incore); + word++; } /* * Log any remaining changed bytes. */ - if (b > first) - xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp - 1)); + if (word > firstword) + xfs_trans_log_rtbitmap(args, firstword, word); return 0; } @@ -686,23 +675,21 @@ xfs_rtmodify_range( */ int xfs_rtfree_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to free */ - xfs_extlen_t len, /* length to free */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to free */ + xfs_rtxlen_t len) /* in/out: summary block number */ { - xfs_rtblock_t end; /* end of the freed extent */ - int error; /* error value */ - xfs_rtblock_t postblock; /* first block freed > end */ - xfs_rtblock_t preblock; /* first block freed < start */ + struct xfs_mount *mp = args->mp; + xfs_rtxnum_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtxnum_t postblock; /* first rtext freed > 
end */ + xfs_rtxnum_t preblock; /* first rtext freed < start */ end = start + len - 1; /* * Modify the bitmap to mark this extent freed. */ - error = xfs_rtmodify_range(mp, tp, start, len, 1); + error = xfs_rtmodify_range(args, start, len, 1); if (error) { return error; } @@ -711,15 +698,15 @@ xfs_rtfree_range( * We need to find the beginning and end of the extent so we can * properly update the summary. */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + error = xfs_rtfind_back(args, start, 0, &preblock); if (error) { return error; } /* * Find the next allocated block (end of allocated extent). */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); + error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + &postblock); if (error) return error; /* @@ -727,9 +714,9 @@ xfs_rtfree_range( * old extent, add summary data for them to be allocated. */ if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + xfs_highbit64(start - preblock), + xfs_rtx_to_rbmblock(mp, preblock), -1); if (error) { return error; } @@ -739,9 +726,9 @@ xfs_rtfree_range( * old extent, add summary data for them to be allocated. */ if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + xfs_highbit64(postblock - end), + xfs_rtx_to_rbmblock(mp, end + 1), -1); if (error) { return error; } @@ -750,10 +737,9 @@ xfs_rtfree_range( * Increment the summary information corresponding to the entire * (new) free extent. 
*/ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); - return error; + return xfs_rtmodify_summary(args, + xfs_highbit64(postblock + 1 - preblock), + xfs_rtx_to_rbmblock(mp, preblock), 1); } /* @@ -762,43 +748,39 @@ xfs_rtfree_range( */ int xfs_rtcheck_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int val, /* 1 for free, 0 for allocated */ - xfs_rtblock_t *new, /* out: first block not matching */ - int *stat) /* out: 1 for matches, 0 for not */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number of extent */ + xfs_rtxlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtxnum_t *new, /* out: first rtext not matching */ + int *stat) /* out: 1 for matches, 0 for not */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; + xfs_rtxnum_t i; /* current bit number rel. 
to start */ + xfs_rtxnum_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + xfs_rtword_t incore; + unsigned int word; /* word number in the buffer */ /* * Compute starting bitmap block number */ - block = XFS_BITTOBLOCK(mp, start); + block = xfs_rtx_to_rbmblock(mp, start); /* * Read the bitmap block. */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Compute the starting word's address, and starting bit. */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; + word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); /* * 0 (allocated) => all zero's; 1 (free) => all one's. @@ -812,7 +794,7 @@ xfs_rtcheck_range( /* * Compute first bit not examined. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); /* * Mask of relevant bits. */ @@ -820,12 +802,12 @@ xfs_rtcheck_range( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ val) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ val) & mask)) { /* * Different, compute first wrong bit and return. */ - xfs_trans_brelse(tp, bp); - i = XFS_RTLOBIT(wdiff) - bit; + i = xfs_lowbit32(wdiff) - bit; *new = start + i; *stat = 0; return 0; @@ -835,22 +817,15 @@ xfs_rtcheck_range( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the next one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the next word in the buffer. 
- */ - b++; } } else { /* @@ -866,12 +841,12 @@ xfs_rtcheck_range( /* * Compute difference between actual and desired value. */ - if ((wdiff = *b ^ val)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = incore ^ val)) { /* * Different, compute first wrong bit and return. */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *new = start + i; *stat = 0; return 0; @@ -881,22 +856,15 @@ xfs_rtcheck_range( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the next one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; } } /* @@ -911,12 +879,12 @@ xfs_rtcheck_range( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ val) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ val) & mask)) { /* * Different, compute first wrong bit and return. */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *new = start + i; *stat = 0; return 0; @@ -926,7 +894,6 @@ xfs_rtcheck_range( /* * Successful, return. */ - xfs_trans_brelse(tp, bp); *new = start + i; *stat = 1; return 0; @@ -936,58 +903,57 @@ xfs_rtcheck_range( /* * Check that the given extent (block range) is allocated already. 
*/ -STATIC int /* error */ +STATIC int xfs_rtcheck_alloc_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number of extent */ - xfs_extlen_t len) /* length of extent */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number of extent */ + xfs_rtxlen_t len) /* length of extent */ { - xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ - int stat; - int error; + xfs_rtxnum_t new; /* dummy for xfs_rtcheck_range */ + int stat; + int error; - error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat); + error = xfs_rtcheck_range(args, start, len, 0, &new, &stat); if (error) return error; ASSERT(stat); return 0; } #else -#define xfs_rtcheck_alloc_range(m,t,b,l) (0) +#define xfs_rtcheck_alloc_range(a,b,l) (0) #endif /* * Free an extent in the realtime subvolume. Length is expressed in * realtime extents, as is the block number. */ -int /* error */ +int xfs_rtfree_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to free */ - xfs_extlen_t len) /* length of extent freed */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtxnum_t start, /* starting rtext number to free */ + xfs_rtxlen_t len) /* length of extent freed */ { - int error; /* error value */ - xfs_mount_t *mp; /* file system mount structure */ - xfs_fsblock_t sb; /* summary file block number */ - struct xfs_buf *sumbp = NULL; /* summary file block buffer */ - struct timespec64 atime; - - mp = tp->t_mountp; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = tp, + }; + int error; + struct timespec64 atime; ASSERT(mp->m_rbmip->i_itemp != NULL); - ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL); - error = xfs_rtcheck_alloc_range(mp, tp, bno, len); + error = xfs_rtcheck_alloc_range(&args, start, len); if (error) return error; /* * Free the range of 
realtime blocks. */ - error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); - if (error) { - return error; - } + error = xfs_rtfree_range(&args, start, len); + if (error) + goto out; + /* * Mark more blocks free in the superblock. */ @@ -1002,11 +968,47 @@ xfs_rtfree_extent( mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; atime = inode_get_atime(VFS_I(mp->m_rbmip)); - *((uint64_t *)&atime) = 0; + atime.tv_sec = 0; inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } - return 0; + error = 0; +out: + xfs_rtbuf_cache_relse(&args); + return error; +} + +/* + * Free some blocks in the realtime subvolume. rtbno and rtlen are in units of + * rt blocks, not rt extents; must be aligned to the rt extent size; and rtlen + * cannot exceed XFS_MAX_BMBT_EXTLEN. + */ +int +xfs_rtfree_blocks( + struct xfs_trans *tp, + xfs_fsblock_t rtbno, + xfs_filblks_t rtlen) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_rtxnum_t start; + xfs_filblks_t len; + xfs_extlen_t mod; + + ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); + + len = xfs_rtb_to_rtxrem(mp, rtlen, &mod); + if (mod) { + ASSERT(mod == 0); + return -EIO; + } + + start = xfs_rtb_to_rtxrem(mp, rtbno, &mod); + if (mod) { + ASSERT(mod == 0); + return -EIO; + } + + return xfs_rtfree_extent(tp, start, len); } /* Find all the free records within a given range. */ @@ -1019,10 +1021,14 @@ xfs_rtalloc_query_range( xfs_rtalloc_query_range_fn fn, void *priv) { + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = tp, + }; struct xfs_rtalloc_rec rec; - xfs_rtblock_t rtstart; - xfs_rtblock_t rtend; - xfs_rtblock_t high_key; + xfs_rtxnum_t rtstart; + xfs_rtxnum_t rtend; + xfs_rtxnum_t high_key; int is_free; int error = 0; @@ -1038,13 +1044,13 @@ xfs_rtalloc_query_range( rtstart = low_rec->ar_startext; while (rtstart <= high_key) { /* Is the first block free? 
*/ - error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, + error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend, &is_free); if (error) break; /* How long does the extent go for? */ - error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend); + error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend); if (error) break; @@ -1060,6 +1066,7 @@ xfs_rtalloc_query_range( rtstart = rtend + 1; } + xfs_rtbuf_cache_relse(&args); return error; } @@ -1085,18 +1092,79 @@ int xfs_rtalloc_extent_is_free( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, - xfs_extlen_t len, + xfs_rtxnum_t start, + xfs_rtxlen_t len, bool *is_free) { - xfs_rtblock_t end; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = tp, + }; + xfs_rtxnum_t end; int matches; int error; - error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches); + error = xfs_rtcheck_range(&args, start, len, 1, &end, &matches); + xfs_rtbuf_cache_relse(&args); if (error) return error; *is_free = matches; return 0; } + +/* + * Compute the number of rtbitmap blocks needed to track the given number of rt + * extents. + */ +xfs_filblks_t +xfs_rtbitmap_blockcount( + struct xfs_mount *mp, + xfs_rtbxlen_t rtextents) +{ + return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize); +} + +/* + * Compute the number of rtbitmap words needed to populate every block of a + * bitmap that is large enough to track the given number of rt extents. + */ +unsigned long long +xfs_rtbitmap_wordcount( + struct xfs_mount *mp, + xfs_rtbxlen_t rtextents) +{ + xfs_filblks_t blocks; + + blocks = xfs_rtbitmap_blockcount(mp, rtextents); + return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; +} + +/* Compute the number of rtsummary blocks needed to track the given rt space. 
*/ +xfs_filblks_t +xfs_rtsummary_blockcount( + struct xfs_mount *mp, + unsigned int rsumlevels, + xfs_extlen_t rbmblocks) +{ + unsigned long long rsumwords; + + rsumwords = (unsigned long long)rsumlevels * rbmblocks; + return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG); +} + +/* + * Compute the number of rtsummary info words needed to populate every block of + * a summary file that is large enough to track the given rt space. + */ +unsigned long long +xfs_rtsummary_wordcount( + struct xfs_mount *mp, + unsigned int rsumlevels, + xfs_extlen_t rbmblocks) +{ + xfs_filblks_t blocks; + + blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks); + return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; +} diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h new file mode 100644 index 000000000000..152a66750af5 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ */ +#ifndef __XFS_RTBITMAP_H__ +#define __XFS_RTBITMAP_H__ + +struct xfs_rtalloc_args { + struct xfs_mount *mp; + struct xfs_trans *tp; + + struct xfs_buf *rbmbp; /* bitmap block buffer */ + struct xfs_buf *sumbp; /* summary block buffer */ + + xfs_fileoff_t rbmoff; /* bitmap block number */ + xfs_fileoff_t sumoff; /* summary block number */ +}; + +static inline xfs_rtblock_t +xfs_rtx_to_rtb( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + if (mp->m_rtxblklog >= 0) + return rtx << mp->m_rtxblklog; + + return rtx * mp->m_sb.sb_rextsize; +} + +static inline xfs_extlen_t +xfs_rtxlen_to_extlen( + struct xfs_mount *mp, + xfs_rtxlen_t rtxlen) +{ + if (mp->m_rtxblklog >= 0) + return rtxlen << mp->m_rtxblklog; + + return rtxlen * mp->m_sb.sb_rextsize; +} + +/* Compute the misalignment between an extent length and a realtime extent .*/ +static inline unsigned int +xfs_extlen_to_rtxmod( + struct xfs_mount *mp, + xfs_extlen_t len) +{ + if (mp->m_rtxblklog >= 0) + return len & mp->m_rtxblkmask; + + return len % mp->m_sb.sb_rextsize; +} + +static inline xfs_rtxlen_t +xfs_extlen_to_rtxlen( + struct xfs_mount *mp, + xfs_extlen_t len) +{ + if (mp->m_rtxblklog >= 0) + return len >> mp->m_rtxblklog; + + return len / mp->m_sb.sb_rextsize; +} + +/* Convert an rt block number into an rt extent number. */ +static inline xfs_rtxnum_t +xfs_rtb_to_rtx( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + if (likely(mp->m_rtxblklog >= 0)) + return rtbno >> mp->m_rtxblklog; + + return div_u64(rtbno, mp->m_sb.sb_rextsize); +} + +/* Return the offset of an rt block number within an rt extent. */ +static inline xfs_extlen_t +xfs_rtb_to_rtxoff( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + if (likely(mp->m_rtxblklog >= 0)) + return rtbno & mp->m_rtxblkmask; + + return do_div(rtbno, mp->m_sb.sb_rextsize); +} + +/* + * Crack an rt block number into an rt extent number and an offset within that + * rt extent. Returns the rt extent number directly and the offset in @off. 
+ */ +static inline xfs_rtxnum_t +xfs_rtb_to_rtxrem( + struct xfs_mount *mp, + xfs_rtblock_t rtbno, + xfs_extlen_t *off) +{ + if (likely(mp->m_rtxblklog >= 0)) { + *off = rtbno & mp->m_rtxblkmask; + return rtbno >> mp->m_rtxblklog; + } + + return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off); +} + +/* + * Convert an rt block number into an rt extent number, rounding up to the next + * rt extent if the rt block is not aligned to an rt extent boundary. + */ +static inline xfs_rtxnum_t +xfs_rtb_to_rtxup( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + if (likely(mp->m_rtxblklog >= 0)) { + if (rtbno & mp->m_rtxblkmask) + return (rtbno >> mp->m_rtxblklog) + 1; + return rtbno >> mp->m_rtxblklog; + } + + if (do_div(rtbno, mp->m_sb.sb_rextsize)) + rtbno++; + return rtbno; +} + +/* Round this rtblock up to the nearest rt extent size. */ +static inline xfs_rtblock_t +xfs_rtb_roundup_rtx( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return roundup_64(rtbno, mp->m_sb.sb_rextsize); +} + +/* Round this rtblock down to the nearest rt extent size. */ +static inline xfs_rtblock_t +xfs_rtb_rounddown_rtx( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return rounddown_64(rtbno, mp->m_sb.sb_rextsize); +} + +/* Convert an rt extent number to a file block offset in the rt bitmap file. */ +static inline xfs_fileoff_t +xfs_rtx_to_rbmblock( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + return rtx >> mp->m_blkbit_log; +} + +/* Convert an rt extent number to a word offset within an rt bitmap block. */ +static inline unsigned int +xfs_rtx_to_rbmword( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1); +} + +/* Convert a file block offset in the rt bitmap file to an rt extent number. */ +static inline xfs_rtxnum_t +xfs_rbmblock_to_rtx( + struct xfs_mount *mp, + xfs_fileoff_t rbmoff) +{ + return rbmoff << mp->m_blkbit_log; +} + +/* Return a pointer to a bitmap word within a rt bitmap block. 
*/ +static inline union xfs_rtword_raw * +xfs_rbmblock_wordptr( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_rtword_raw *words = args->rbmbp->b_addr; + + return words + index; +} + +/* Convert an ondisk bitmap word to its incore representation. */ +static inline xfs_rtword_t +xfs_rtbitmap_getword( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); + + return word->old; +} + +/* Set an ondisk bitmap word from an incore representation. */ +static inline void +xfs_rtbitmap_setword( + struct xfs_rtalloc_args *args, + unsigned int index, + xfs_rtword_t value) +{ + union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); + + word->old = value; +} + +/* + * Convert a rt extent length and rt bitmap block number to a xfs_suminfo_t + * offset within the rt summary file. + */ +static inline xfs_rtsumoff_t +xfs_rtsumoffs( + struct xfs_mount *mp, + int log2_len, + xfs_fileoff_t rbmoff) +{ + return log2_len * mp->m_sb.sb_rbmblocks + rbmoff; +} + +/* + * Convert an xfs_suminfo_t offset to a file block offset within the rt summary + * file. + */ +static inline xfs_fileoff_t +xfs_rtsumoffs_to_block( + struct xfs_mount *mp, + xfs_rtsumoff_t rsumoff) +{ + return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t)); +} + +/* + * Convert an xfs_suminfo_t offset to an info word offset within an rt summary + * block. + */ +static inline unsigned int +xfs_rtsumoffs_to_infoword( + struct xfs_mount *mp, + xfs_rtsumoff_t rsumoff) +{ + unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG; + + return rsumoff & mask; +} + +/* Return a pointer to a summary info word within a rt summary block. */ +static inline union xfs_suminfo_raw * +xfs_rsumblock_infoptr( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_suminfo_raw *info = args->sumbp->b_addr; + + return info + index; +} + +/* Get the current value of a summary counter. 
*/ +static inline xfs_suminfo_t +xfs_suminfo_get( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + + return info->old; +} + +/* Add to the current value of a summary counter and return the new value. */ +static inline xfs_suminfo_t +xfs_suminfo_add( + struct xfs_rtalloc_args *args, + unsigned int index, + int delta) +{ + union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + + info->old += delta; + return info->old; +} + +/* + * Functions for walking free space rtextents in the realtime bitmap. + */ +struct xfs_rtalloc_rec { + xfs_rtxnum_t ar_startext; + xfs_rtbxlen_t ar_extcount; +}; + +typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv); + +#ifdef CONFIG_XFS_RT +void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args); + +int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block, + int issum); + +static inline int +xfs_rtbitmap_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + return xfs_rtbuf_get(args, block, 0); +} + +static inline int +xfs_rtsummary_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + return xfs_rtbuf_get(args, block, 1); +} + +int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat); +int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); +int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); +int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxlen_t len, int val); +int xfs_rtget_summary(struct xfs_rtalloc_args *args, int log, + xfs_fileoff_t bbno, xfs_suminfo_t *sum); +int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, + xfs_fileoff_t bbno, int delta); +int 
xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxlen_t len); +int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, + const struct xfs_rtalloc_rec *low_rec, + const struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, void *priv); +int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, + void *priv); +int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtxnum_t start, xfs_rtxlen_t len, + bool *is_free); +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtxnum_t start, /* starting rtext number to free */ + xfs_rtxlen_t len); /* length of extent freed */ + +/* Same as above, but in units of rt blocks. */ +int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, + xfs_filblks_t rtlen); + +xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t + rtextents); +unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, + xfs_rtbxlen_t rtextents); + +xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, + unsigned int rsumlevels, xfs_extlen_t rbmblocks); +unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, + unsigned int rsumlevels, xfs_extlen_t rbmblocks); +#else /* CONFIG_XFS_RT */ +# define xfs_rtfree_extent(t,b,l) (-ENOSYS) +# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS) +# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS) +# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS) +# define xfs_rtbitmap_read_buf(a,b) (-ENOSYS) +# define xfs_rtsummary_read_buf(a,b) (-ENOSYS) +# define xfs_rtbuf_cache_relse(a) (0) +# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) +static inline xfs_filblks_t +xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) +{ + /* shut up gcc */ + return 0; +} +# define 
xfs_rtbitmap_wordcount(mp, r) (0) +# define xfs_rtsummary_blockcount(mp, l, b) (0) +# define xfs_rtsummary_wordcount(mp, l, b) (0) +#endif /* CONFIG_XFS_RT */ + +#endif /* __XFS_RTBITMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 6264daaab37b..73a4b895de67 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -25,6 +25,7 @@ #include "xfs_da_format.h" #include "xfs_health.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -508,8 +509,9 @@ xfs_validate_sb_common( rbmblocks = howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize); - if (sbp->sb_rextents != rexts || - sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + if (!xfs_validate_rtextents(rexts) || + sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_compute_rextslog(rexts) || sbp->sb_rbmblocks != rbmblocks) { xfs_notice(mp, "realtime geometry sanity check failed"); @@ -528,7 +530,8 @@ xfs_validate_sb_common( } if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit), - XFS_FSB_TO_B(mp, sbp->sb_width), 0, false)) + XFS_FSB_TO_B(mp, sbp->sb_width), 0, + xfs_buf_daddr(bp) == XFS_SB_DADDR, false)) return -EFSCORRUPTED; /* @@ -975,6 +978,8 @@ xfs_sb_mount_common( mp->m_blockmask = sbp->sb_blocksize - 1; mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; mp->m_blockwmask = mp->m_blockwsize - 1; + mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); + mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); @@ -1286,6 +1291,8 @@ xfs_sb_read_secondary( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_agno_mark_sick(mp, agno, XFS_SICK_AG_SB); if (error) return error; xfs_buf_set_ref(bp, XFS_SSB_REF); @@ 
-1317,8 +1324,10 @@ xfs_sb_get_secondary( } /* - * sunit, swidth, sectorsize(optional with 0) should be all in bytes, - * so users won't be confused by values in error messages. + * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users + * won't be confused by values in error messages. This function returns false + * if the stripe geometry is invalid and the caller is unable to repair the + * stripe configuration later in the mount process. */ bool xfs_validate_stripe_geometry( @@ -1326,20 +1335,21 @@ xfs_validate_stripe_geometry( __s64 sunit, __s64 swidth, int sectorsize, + bool may_repair, bool silent) { if (swidth > INT_MAX) { if (!silent) xfs_notice(mp, "stripe width (%lld) is too large", swidth); - return false; + goto check_override; } if (sunit > swidth) { if (!silent) xfs_notice(mp, "stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth); - return false; + goto check_override; } if (sectorsize && (int)sunit % sectorsize) { @@ -1347,21 +1357,21 @@ xfs_validate_stripe_geometry( xfs_notice(mp, "stripe unit (%lld) must be a multiple of the sector size (%d)", sunit, sectorsize); - return false; + goto check_override; } if (sunit && !swidth) { if (!silent) xfs_notice(mp, "invalid stripe unit (%lld) and stripe width of 0", sunit); - return false; + goto check_override; } if (!sunit && swidth) { if (!silent) xfs_notice(mp, "invalid stripe width (%lld) and stripe unit of 0", swidth); - return false; + goto check_override; } if (sunit && (int)swidth % (int)sunit) { @@ -1369,7 +1379,39 @@ xfs_validate_stripe_geometry( xfs_notice(mp, "stripe width (%lld) must be a multiple of the stripe unit (%lld)", swidth, sunit); - return false; + goto check_override; } return true; + +check_override: + if (!may_repair) + return false; + /* + * During mount, mp->m_dalign will not be set unless the sunit mount + * option was set. 
If it was set, ignore the bad stripe alignment values + * and allow the validation and overwrite later in the mount process to + * attempt to overwrite the bad stripe alignment values with the values + * supplied by mount options. + */ + if (!mp->m_dalign) + return false; + if (!silent) + xfs_notice(mp, +"Will try to correct with specified mount options sunit (%d) and swidth (%d)", + BBTOB(mp->m_dalign), BBTOB(mp->m_swidth)); + return true; +} + +/* + * Compute the maximum level number of the realtime summary file, as defined by + * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct + * use of rt volumes with more than 2^32 extents. + */ +uint8_t +xfs_compute_rextslog( + xfs_rtbxlen_t rtextents) +{ + if (!rtextents) + return 0; + return xfs_highbit64(rtextents); } diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index a5e14740ec9a..37b1ed1bc209 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -25,7 +25,7 @@ extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp); extern int xfs_update_secondary_sbs(struct xfs_mount *mp); -#define XFS_FS_GEOM_MAX_STRUCT_VER (4) +#define XFS_FS_GEOM_MAX_STRUCT_VER (5) extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version); extern int xfs_sb_read_secondary(struct xfs_mount *mp, @@ -35,7 +35,10 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); -extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp, - __s64 sunit, __s64 swidth, int sectorsize, bool silent); +bool xfs_validate_stripe_geometry(struct xfs_mount *mp, + __s64 sunit, __s64 swidth, int sectorsize, bool may_repair, + bool silent); + +uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c4381388c0c1..dfd61fa8332e 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -43,6 
+43,60 @@ extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; +/* btree ops */ +extern const struct xfs_btree_ops xfs_bnobt_ops; +extern const struct xfs_btree_ops xfs_cntbt_ops; +extern const struct xfs_btree_ops xfs_inobt_ops; +extern const struct xfs_btree_ops xfs_finobt_ops; +extern const struct xfs_btree_ops xfs_bmbt_ops; +extern const struct xfs_btree_ops xfs_refcountbt_ops; +extern const struct xfs_btree_ops xfs_rmapbt_ops; +extern const struct xfs_btree_ops xfs_rmapbt_mem_ops; + +static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_bnobt_ops; +} + +static inline bool xfs_btree_is_cnt(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_cntbt_ops; +} + +static inline bool xfs_btree_is_bmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_bmbt_ops; +} + +static inline bool xfs_btree_is_ino(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_inobt_ops; +} + +static inline bool xfs_btree_is_fino(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_finobt_ops; +} + +static inline bool xfs_btree_is_refcount(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_refcountbt_ops; +} + +static inline bool xfs_btree_is_rmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rmapbt_ops; +} + +#ifdef CONFIG_XFS_BTREE_IN_MEM +static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rmapbt_mem_ops; +} +#else +# define xfs_btree_is_mem_rmap(...) 
(false) +#endif + /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); @@ -128,19 +182,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ #define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ - -/* - * Symlink decoding/encoding functions - */ -int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); -int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, - uint32_t size, struct xfs_buf *bp); -bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, - uint32_t size, struct xfs_buf *bp); -void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_inode *ip, struct xfs_ifork *ifp); -xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip); - /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { /* Maximum inode count in this filesystem. 
*/ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index bdc777b9ec4a..ffb1317a9212 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -16,7 +16,10 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" - +#include "xfs_symlink_remote.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_health.h" /* * Each contiguous block has a header, so it is not just a simple pathlen @@ -175,7 +178,7 @@ xfs_symlink_local_to_remote( if (!xfs_has_crc(mp)) { bp->b_ops = NULL; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(bp->b_addr, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); return; } @@ -191,7 +194,7 @@ xfs_symlink_local_to_remote( buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); - memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(buf, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); } @@ -202,15 +205,11 @@ xfs_symlink_local_to_remote( */ xfs_failaddr_t xfs_symlink_shortform_verify( - struct xfs_inode *ip) + void *sfp, + int64_t size) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - char *sfp = (char *)ifp->if_u1.if_data; - int size = ifp->if_bytes; char *endp = sfp + size; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - /* * Zero length symlinks should never occur in memory as they are * never allowed to exist on disk. @@ -231,3 +230,153 @@ xfs_symlink_shortform_verify( return __this_address; return NULL; } + +/* Read a remote symlink target into the buffer. 
*/ +int +xfs_symlink_remote_read( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_buf *bp; + xfs_daddr_t d; + char *cur_chunk; + int pathlen = ip->i_disk_size; + int nmaps = XFS_SYMLINK_MAPS; + int byte_cnt; + int n; + int error = 0; + int fsblocks = 0; + int offset; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + fsblocks = xfs_symlink_blocks(mp, pathlen); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + goto out; + + offset = 0; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &bp, &xfs_symlink_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + if (error) + return error; + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) + byte_cnt = pathlen; + + cur_chunk = bp->b_addr; + if (xfs_has_crc(mp)) { + if (!xfs_symlink_hdr_ok(ip->i_ino, offset, + byte_cnt, bp)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + error = -EFSCORRUPTED; + xfs_alert(mp, +"symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)", + offset, byte_cnt, ip->i_ino); + xfs_buf_relse(bp); + goto out; + + } + + cur_chunk += sizeof(struct xfs_dsymlink_hdr); + } + + memcpy(link + offset, cur_chunk, byte_cnt); + + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_buf_relse(bp); + } + ASSERT(pathlen == 0); + + link[ip->i_disk_size] = '\0'; + error = 0; + + out: + return error; +} + +/* Write the symlink target into the inode. 
*/ +int +xfs_symlink_write_target( + struct xfs_trans *tp, + struct xfs_inode *ip, + const char *target_path, + int pathlen, + xfs_fsblock_t fs_blocks, + uint resblks) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_mount *mp = tp->t_mountp; + const char *cur_chunk; + struct xfs_buf *bp; + xfs_daddr_t d; + int byte_cnt; + int nmaps; + int offset = 0; + int n; + int error; + + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= xfs_inode_data_fork_size(ip)) { + xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); + + ip->i_disk_size = pathlen; + ip->i_df.if_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + return 0; + } + + nmaps = XFS_SYMLINK_MAPS; + error = xfs_bmapi_write(tp, ip, 0, fs_blocks, XFS_BMAPI_METADATA, + resblks, mval, &nmaps); + if (error) + return error; + + ip->i_disk_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + offset = 0; + for (n = 0; n < nmaps; n++) { + char *buf; + + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0, &bp); + if (error) + return error; + bp->b_ops = &xfs_symlink_buf_ops; + + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + byte_cnt = min(byte_cnt, pathlen); + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt, + bp); + + memcpy(buf, cur_chunk, byte_cnt); + + cur_chunk += byte_cnt; + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); + xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - + (char *)bp->b_addr); + } + ASSERT(pathlen == 0); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h new file mode 100644 index 000000000000..a63bd38ae4fa --- /dev/null +++ b/fs/xfs/libxfs/xfs_symlink_remote.h @@ -0,0 +1,26 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_SYMLINK_REMOTE_H +#define __XFS_SYMLINK_REMOTE_H + +/* + * Symlink decoding/encoding functions + */ +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); +int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); +xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); +int xfs_symlink_remote_read(struct xfs_inode *ip, char *link); +int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip, + const char *target_path, int pathlen, xfs_fsblock_t fs_blocks, + uint resblks); + +#endif /* __XFS_SYMLINK_REMOTE_H */ diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 70e97ea6eee7..69fc5b981352 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -31,7 +31,7 @@ xfs_trans_ijoin( { struct xfs_inode_log_item *iip; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; @@ -60,7 +60,7 @@ xfs_trans_ichgtime( struct timespec64 tv; ASSERT(tp); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); tv = current_time(inode); @@ -90,7 +90,7 @@ xfs_trans_log_inode( struct inode *inode = VFS_I(ip); ASSERT(iip); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); tp->t_flags |= XFS_TRANS_DIRTY; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 5b2f27cbdb80..6cd45e8c118d 100644 --- 
a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -19,6 +19,7 @@ #include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_trans_space.h" +#include "xfs_rtbitmap.h" #define _ALLOC true #define _FREE false @@ -217,11 +218,12 @@ xfs_rtalloc_block_count( struct xfs_mount *mp, unsigned int num_ops) { - unsigned int blksz = XFS_FSB_TO_B(mp, 1); - unsigned int rtbmp_bytes; + unsigned int rtbmp_blocks; + xfs_rtxlen_t rtxlen; - rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY; - return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; + rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN); + rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen); + return (rtbmp_blocks + 1) * num_ops; } /* diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 5c2765934732..c299b16c9365 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -148,10 +148,10 @@ xfs_verify_rtbno( /* Verify that a realtime device extent is fully contained inside the volume. */ bool -xfs_verify_rtext( +xfs_verify_rtbext( struct xfs_mount *mp, xfs_rtblock_t rtbno, - xfs_rtblock_t len) + xfs_filblks_t len) { if (rtbno + len <= rtbno) return false; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 851220021484..76eb9e328835 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -11,6 +11,7 @@ typedef uint32_t prid_t; /* project ID */ typedef uint32_t xfs_agblock_t; /* blockno in alloc. 
group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ +typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ typedef uint64_t xfs_extnum_t; /* # of extents in a file */ typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ @@ -18,6 +19,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ +typedef uint32_t xfs_rtsumoff_t; /* offset of an rtsummary info word */ typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ @@ -31,6 +33,8 @@ typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ typedef uint64_t xfs_fileoff_t; /* block number in a file */ typedef uint64_t xfs_filblks_t; /* number of blocks in a file */ +typedef uint64_t xfs_rtxnum_t; /* rtextent number */ +typedef uint64_t xfs_rtbxlen_t; /* rtbitmap extent length in rtextents */ typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ @@ -76,11 +80,13 @@ typedef void * xfs_failaddr_t; /* * Inode fork identifiers. */ -#define XFS_DATA_FORK 0 -#define XFS_ATTR_FORK 1 -#define XFS_COW_FORK 2 +#define XFS_STAGING_FORK (-1) /* fake fork for staging a btree */ +#define XFS_DATA_FORK (0) +#define XFS_ATTR_FORK (1) +#define XFS_COW_FORK (2) #define XFS_WHICHFORK_STRINGS \ + { XFS_STAGING_FORK, "staging" }, \ { XFS_DATA_FORK, "data" }, \ { XFS_ATTR_FORK, "attr" }, \ { XFS_COW_FORK, "cow" } @@ -110,24 +116,6 @@ typedef enum { { XFS_LOOKUP_LEi, "le" }, \ { XFS_LOOKUP_GEi, "ge" } -/* - * This enum is used in string mapping in xfs_trace.h and scrub/trace.h; - * please keep the TRACE_DEFINE_ENUMs for it up to date. 
- */ -typedef enum { - XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, - XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX -} xfs_btnum_t; - -#define XFS_BTNUM_STRINGS \ - { XFS_BTNUM_BNOi, "bnobt" }, \ - { XFS_BTNUM_CNTi, "cntbt" }, \ - { XFS_BTNUM_RMAPi, "rmapbt" }, \ - { XFS_BTNUM_BMAPi, "bmbt" }, \ - { XFS_BTNUM_INOi, "inobt" }, \ - { XFS_BTNUM_FINOi, "finobt" }, \ - { XFS_BTNUM_REFCi, "refcbt" } - struct xfs_name { const unsigned char *name; int len; @@ -145,6 +133,7 @@ typedef uint32_t xfs_dqid_t; */ #define XFS_NBBYLOG 3 /* log2(NBBY) */ #define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ +#define XFS_SUMINFOLOG 2 /* log2(sizeof(xfs_suminfo_t)) */ #define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) #define XFS_NBWORD (1 << XFS_NBWORDLOG) #define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) @@ -202,6 +191,13 @@ enum xfs_ag_resv_type { XFS_AG_RESV_AGFL, XFS_AG_RESV_METADATA, XFS_AG_RESV_RMAPBT, + + /* + * Don't increase fdblocks when freeing extent. This is a pony for + * the bnobt repair functions to re-free the free space without + * altering fdblocks. If you think you need this you're wrong. + */ + XFS_AG_RESV_IGNORE, }; /* Results of scanning a btree keyspace to check occupancy. 
*/ @@ -229,8 +225,8 @@ bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); -bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno, - xfs_rtblock_t len); +bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno, + xfs_filblks_t len); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min, @@ -239,4 +235,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off); bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off, xfs_fileoff_t len); +/* Do we support an rt volume having this number of rtextents? */ +static inline bool +xfs_validate_rtextents( + xfs_rtbxlen_t rtextents) +{ + /* No runt rt volumes */ + if (rtextents == 0) + return false; + + return true; +} + #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h deleted file mode 100644 index 79155eec341b..000000000000 --- a/fs/xfs/mrlock.h +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. 
- */ -#ifndef __XFS_SUPPORT_MRLOCK_H__ -#define __XFS_SUPPORT_MRLOCK_H__ - -#include <linux/rwsem.h> - -typedef struct { - struct rw_semaphore mr_lock; -#if defined(DEBUG) || defined(XFS_WARN) - int mr_writer; -#endif -} mrlock_t; - -#if defined(DEBUG) || defined(XFS_WARN) -#define mrinit(mrp, name) \ - do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) -#else -#define mrinit(mrp, name) \ - do { init_rwsem(&(mrp)->mr_lock); } while (0) -#endif - -#define mrlock_init(mrp, t,n,s) mrinit(mrp, n) -#define mrfree(mrp) do { } while (0) - -static inline void mraccess_nested(mrlock_t *mrp, int subclass) -{ - down_read_nested(&mrp->mr_lock, subclass); -} - -static inline void mrupdate_nested(mrlock_t *mrp, int subclass) -{ - down_write_nested(&mrp->mr_lock, subclass); -#if defined(DEBUG) || defined(XFS_WARN) - mrp->mr_writer = 1; -#endif -} - -static inline int mrtryaccess(mrlock_t *mrp) -{ - return down_read_trylock(&mrp->mr_lock); -} - -static inline int mrtryupdate(mrlock_t *mrp) -{ - if (!down_write_trylock(&mrp->mr_lock)) - return 0; -#if defined(DEBUG) || defined(XFS_WARN) - mrp->mr_writer = 1; -#endif - return 1; -} - -static inline void mrunlock_excl(mrlock_t *mrp) -{ -#if defined(DEBUG) || defined(XFS_WARN) - mrp->mr_writer = 0; -#endif - up_write(&mrp->mr_lock); -} - -static inline void mrunlock_shared(mrlock_t *mrp) -{ - up_read(&mrp->mr_lock); -} - -static inline void mrdemote(mrlock_t *mrp) -{ -#if defined(DEBUG) || defined(XFS_WARN) - mrp->mr_writer = 0; -#endif - downgrade_write(&mrp->mr_lock); -} - -#endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/scrub/agb_bitmap.c b/fs/xfs/scrub/agb_bitmap.c new file mode 100644 index 000000000000..573e4e062754 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "bitmap.h" +#include "scrub/agb_bitmap.h" + +/* + * Record all btree blocks seen while iterating all records of a btree. + * + * We know that the btree query_all function starts at the left edge and walks + * towards the right edge of the tree. Therefore, we know that we can walk up + * the btree cursor towards the root; if the pointer for a given level points + * to the first record/key in that block, we haven't seen this block before; + * and therefore we need to remember that we saw this block in the btree. + * + * So if our btree is: + * + * 4 + * / | \ + * 1 2 3 + * + * Pretend for this example that each leaf block has 100 btree records. For + * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we + * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so + * we record block 4. The list is [1, 4]. + * + * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit + * the loop. The list remains [1, 4]. + * + * For the 101st btree record, we've moved onto leaf block 2. Now + * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that + * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. + * + * For the 102nd record, bc_levels[0].ptr == 2, so we continue. + * + * For the 201st record, we've moved on to leaf block 3. + * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. + * + * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + */ + +/* Mark a btree block to the agblock bitmap. 
*/ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + +/* + * Record all the buffers pointed to by the btree cursor. Callers already + * engaged in a btree walk should call this function to capture the list of + * blocks going from the leaf towards the root. + */ +int +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + int i; + int error; + + for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { + error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h new file mode 100644 index 000000000000..e488e1f4f63d --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.h @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_AGB_BITMAP_H__ +#define __XFS_SCRUB_AGB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap32 agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->agbitmap, start, len); +} + +static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->agbitmap, start, len); +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap32_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap32_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + +static inline uint32_t xagb_bitmap_count_set_regions(struct xagb_bitmap *b) +{ + return xbitmap32_count_set_regions(&b->agbitmap); +} + +#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 6c6e5eba42c8..e954f07679dd 100644 --- 
a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -556,28 +556,28 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Check the AGF btree roots and levels */ - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); + agbno = be32_to_cpu(agf->agf_bno_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); + agbno = be32_to_cpu(agf->agf_cnt_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + level = be32_to_cpu(agf->agf_bno_level); if (level <= 0 || level > mp->m_alloc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + level = be32_to_cpu(agf->agf_cnt_level); if (level <= 0 || level > mp->m_alloc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); if (xfs_has_rmapbt(mp)) { - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); + agbno = be32_to_cpu(agf->agf_rmap_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + level = be32_to_cpu(agf->agf_rmap_level); if (level <= 0 || level > mp->m_rmap_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 876a2f41b063..427054b65b23 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -26,6 +26,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" #include "scrub/reap.h" /* Superblock */ @@ -72,7 +73,7 @@ xrep_superblock( /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); - return error; + return 0; } /* AGF */ @@ -173,8 +174,7 @@ xrep_agf_find_btrees( * We relied on the rmapbt to reconstruct the AGF. 
If we get a * different root then something's seriously wrong. */ - if (fab[XREP_AGF_RMAPBT].root != - be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi])) + if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root)) return -EFSCORRUPTED; /* We must find the refcountbt root if that feature is enabled. */ @@ -223,20 +223,14 @@ xrep_agf_set_roots( struct xfs_agf *agf, struct xrep_find_ag_btree *fab) { - agf->agf_roots[XFS_BTNUM_BNOi] = - cpu_to_be32(fab[XREP_AGF_BNOBT].root); - agf->agf_levels[XFS_BTNUM_BNOi] = - cpu_to_be32(fab[XREP_AGF_BNOBT].height); + agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root); + agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height); - agf->agf_roots[XFS_BTNUM_CNTi] = - cpu_to_be32(fab[XREP_AGF_CNTBT].root); - agf->agf_levels[XFS_BTNUM_CNTi] = - cpu_to_be32(fab[XREP_AGF_CNTBT].height); + agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root); + agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height); - agf->agf_roots[XFS_BTNUM_RMAPi] = - cpu_to_be32(fab[XREP_AGF_RMAPBT].root); - agf->agf_levels[XFS_BTNUM_RMAPi] = - cpu_to_be32(fab[XREP_AGF_RMAPBT].height); + agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root); + agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height); if (xfs_has_reflink(sc->mp)) { agf->agf_refcount_root = @@ -261,8 +255,7 @@ xrep_agf_calc_from_btrees( int error; /* Update the AGF counters from the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa); if (error) goto err; @@ -275,8 +268,7 @@ xrep_agf_calc_from_btrees( agf->agf_longest = cpu_to_be32(raa.longest); /* Update the AGF counters from the cntbt. 
*/ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -332,16 +324,13 @@ xrep_agf_commit_new( pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_longest = be32_to_cpu(agf->agf_longest); - pag->pagf_levels[XFS_BTNUM_BNOi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); - pag->pagf_levels[XFS_BTNUM_CNTi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); - pag->pagf_levels[XFS_BTNUM_RMAPi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); + pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); + pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGF. v5 filesystems only. */ @@ -494,12 +483,11 @@ xrep_agfl_walk_rmap( /* Strike out the blocks that are cross-linked according to the rmapbt. */ STATIC int xrep_agfl_check_extent( - uint64_t start, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xrep_agfl *ra = priv; - xfs_agblock_t agbno = start; xfs_agblock_t last_agbno = agbno + len - 1; int error; @@ -559,16 +547,14 @@ xrep_agfl_collect_blocks( goto out_bmp; /* Find all blocks currently being used by the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) goto out_bmp; /* Find all blocks currently being used by the cntbt. 
*/ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) @@ -647,8 +633,8 @@ struct xrep_agfl_fill { /* Fill the AGFL with whatever blocks are in this extent. */ static int xrep_agfl_fill( - uint64_t start, - uint64_t len, + uint32_t start, + uint32_t len, void *priv) { struct xrep_agfl_fill *af = priv; @@ -789,6 +775,9 @@ xrep_agfl( /* Dump any AGFL overflow. */ error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); + if (error) + goto err; + err: xagb_bitmap_destroy(&agfl_extents); return error; @@ -905,7 +894,7 @@ xrep_agi_calc_from_btrees( xfs_agino_t freecount; int error; - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; @@ -925,8 +914,7 @@ xrep_agi_calc_from_btrees( if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { xfs_agblock_t blocks; - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, - XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -962,7 +950,7 @@ xrep_agi_commit_new( pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGI. 
*/ diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 279af72b1671..d1b8a4997dd2 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -9,13 +9,16 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_ag.h" +#include "scrub/repair.h" /* * Set us up to scrub free space btrees. @@ -24,10 +27,19 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + int error; + if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); - return xchk_setup_ag_btree(sc, false); + error = xchk_setup_ag_btree(sc, false); + if (error) + return error; + + if (xchk_could_repair(sc)) + return xrep_setup_ag_allocbt(sc); + + return 0; } /* Free space btree scrubber. */ @@ -127,7 +139,7 @@ xchk_allocbt_rec( struct xchk_alloc *ca = bs->private; xfs_alloc_btrec_to_irec(rec, &irec); - if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { + if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -138,31 +150,27 @@ xchk_allocbt_rec( return 0; } -/* Scrub the freespace btrees for some AG. */ -STATIC int +/* Scrub one of the freespace btrees for some AG. */ +int xchk_allocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xchk_alloc ca = { }; struct xfs_btree_cur *cur; - cur = which == XFS_BTNUM_BNO ? 
sc->sa.bno_cur : sc->sa.cnt_cur; - return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); -} - -int -xchk_bnobt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_BNO); -} + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_BNOBT: + cur = sc->sa.bno_cur; + break; + case XFS_SCRUB_TYPE_CNTBT: + cur = sc->sa.cnt_cur; + break; + default: + ASSERT(0); + return -EIO; + } -int -xchk_cntbt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_CNT); + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); } /* xref check that the extent is not free */ diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c new file mode 100644 index 000000000000..d421b253923e --- /dev/null +++ b/fs/xfs/scrub/alloc_repair.c @@ -0,0 +1,933 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_refcount.h" +#include "xfs_extent_busy.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Free Space Btree Repair + * ======================= + * + * The reverse mappings are supposed to record all 
space usage for the entire + * AG. Therefore, we can recreate the free extent records in an AG by looking + * for gaps in the physical extents recorded in the rmapbt. These records are + * staged in @free_records. Identifying the gaps is more difficult on a + * reflink filesystem because rmap records are allowed to overlap. + * + * Because the final step of building a new index is to free the space used by + * the old index, repair needs to find that space. Unfortunately, all + * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share + * the same rmapbt owner code (OWN_AG), so this is not straightforward. + * + * The scan of the reverse mapping information records the space used by OWN_AG + * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While + * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to + * record all visited rmap btree blocks and all blocks owned by the AGFL. + * + * After that is where the definitions of old_allocbt_blocks shifts. This + * expression identifies possible former bnobt/cntbt blocks: + * + * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks); + * + * Substituting from above definitions, that becomes: + * + * old_allocbt_blocks & ~not_allocbt_blocks + * + * The OWN_AG bitmap itself isn't needed after this point, so what we really do + * instead is: + * + * old_allocbt_blocks &= ~not_allocbt_blocks; + * + * After this point, @old_allocbt_blocks is a bitmap of alleged former + * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first + * parameter in place to avoid copying records around. + * + * Next, some of the space described by @free_records are diverted to the newbt + * reservation and used to format new btree blocks. The remaining records are + * written to the new btree indices. We reconstruct both bnobt and cntbt at + * the same time since we've already done all the work. 
+ * + * We use the prefix 'xrep_abt' here because we regenerate both free space + * allocation btrees at the same time. + */ + +struct xrep_abt { + /* Blocks owned by the rmapbt or the agfl. */ + struct xagb_bitmap not_allocbt_blocks; + + /* All OWN_AG blocks. */ + struct xagb_bitmap old_allocbt_blocks; + + /* + * New bnobt information. All btree block reservations are added to + * the reservation list in new_bnobt. + */ + struct xrep_newbt new_bnobt; + + /* new cntbt information */ + struct xrep_newbt new_cntbt; + + /* Free space extents. */ + struct xfarray *free_records; + + struct xfs_scrub *sc; + + /* Number of non-null records in @free_records. */ + uint64_t nr_real_records; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; + + /* + * Next block we anticipate seeing in the rmap records. If the next + * rmap record is greater than next_agbno, we have found unused space. + */ + xfs_agblock_t next_agbno; + + /* Number of free blocks in this AG. */ + xfs_agblock_t nr_blocks; + + /* Longest free extent we found in the AG. */ + xfs_agblock_t longest; +}; + +/* Set up to repair AG free space btrees. */ +int +xrep_setup_ag_allocbt( + struct xfs_scrub *sc) +{ + unsigned int busy_gen; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. + */ + busy_gen = READ_ONCE(sc->sa.pag->pagb_gen); + if (xfs_extent_busy_list_empty(sc->sa.pag)) + return 0; + + return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0); +} + +/* Check for any obvious conflicts in the free extent. */ +STATIC int +xrep_abt_check_free_ext( + struct xfs_scrub *sc, + const struct xfs_alloc_rec_incore *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. 
*/ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->ar_startblock, rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be shared or CoW staging. */ + if (sc->sa.refc_cur) { + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_COW, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Stash a free space record for all the space since the last bno we found + * all the way up to @end. + */ +static int +xrep_abt_stash( + struct xrep_abt *ra, + xfs_agblock_t end) +{ + struct xfs_alloc_rec_incore arec = { + .ar_startblock = ra->next_agbno, + .ar_blockcount = end - ra->next_agbno, + }; + struct xfs_scrub *sc = ra->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xrep_abt_check_free_ext(ra->sc, &arec); + if (error) + return error; + + trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec); + + error = xfarray_append(ra->free_records, &arec); + if (error) + return error; + + ra->nr_blocks += arec.ar_blockcount; + return 0; +} + +/* Record extents that aren't in use from gaps in the rmap records. */ +STATIC int +xrep_abt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_abt *ra = priv; + int error; + + /* Record all the OWN_AG blocks... */ + if (rec->rm_owner == XFS_RMAP_OWN_AG) { + error = xagb_bitmap_set(&ra->old_allocbt_blocks, + rec->rm_startblock, rec->rm_blockcount); + if (error) + return error; + } + + /* ...and all the rmapbt blocks... 
*/ + error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur); + if (error) + return error; + + /* ...and all the free space. */ + if (rec->rm_startblock > ra->next_agbno) { + error = xrep_abt_stash(ra, rec->rm_startblock); + if (error) + return error; + } + + /* + * rmap records can overlap on reflink filesystems, so project + * next_agbno as far out into the AG space as we currently know about. + */ + ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* Collect an AGFL block for the not-to-release list. */ +static int +xrep_abt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1); +} + +/* + * Compare two free space extents by block number. We want to sort in order of + * increasing block number. + */ +static int +xrep_bnobt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_startblock > bp->ar_startblock) + return 1; + else if (ap->ar_startblock < bp->ar_startblock) + return -1; + return 0; +} + +/* + * Re-sort the free extents by block number so that we can put the records into + * the bnobt in the correct order. Make sure the records do not overlap in + * physical space. + */ +STATIC int +xrep_bnobt_sort_records( + struct xrep_abt *ra) +{ + struct xfs_alloc_rec_incore arec; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0); + if (error) + return error; + + while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) { + if (arec.ar_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = arec.ar_startblock + arec.ar_blockcount; + } + + return error; +} + +/* + * Compare two free space extents by length and then block number. 
We want + * to sort first in order of increasing length and then in order of increasing + * block number. + */ +static int +xrep_cntbt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_blockcount > bp->ar_blockcount) + return 1; + else if (ap->ar_blockcount < bp->ar_blockcount) + return -1; + return xrep_bnobt_extent_cmp(a, b); +} + +/* + * Sort the free extents by length so so that we can put the records into the + * cntbt in the correct order. Don't let userspace kill us if we're resorting + * after allocating btree blocks. + */ +STATIC int +xrep_cntbt_sort_records( + struct xrep_abt *ra, + bool is_resort) +{ + return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp, + is_resort ? 0 : XFARRAY_SORT_KILLABLE); +} + +/* + * Iterate all reverse mappings to find (1) the gaps between rmap records (all + * unowned space), (2) the OWN_AG extents (which encompass the free space + * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL + * blocks. The free space is (1) + (2) - (3) - (4). + */ +STATIC int +xrep_abt_find_freespace( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_buf *agfl_bp; + xfs_agblock_t agend; + int error; + + xagb_bitmap_init(&ra->not_allocbt_blocks); + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Iterate all the reverse mappings to find gaps in the physical + * mappings, all the OWN_AG blocks, and all the rmapbt extents. + */ + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra); + if (error) + goto err; + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(agf->agf_length); + if (ra->next_agbno < agend) { + error = xrep_abt_stash(ra, agend); + if (error) + goto err; + } + + /* Collect all the AGFL blocks. 
*/ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + goto err; + + error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra); + if (error) + goto err_agfl; + + /* Compute the old bnobt/cntbt blocks. */ + error = xagb_bitmap_disunion(&ra->old_allocbt_blocks, + &ra->not_allocbt_blocks); + if (error) + goto err_agfl; + + ra->nr_real_records = xfarray_length(ra->free_records); +err_agfl: + xfs_trans_brelse(sc->tp, agfl_bp); +err: + xchk_ag_btcur_free(&sc->sa); + xagb_bitmap_destroy(&ra->not_allocbt_blocks); + return error; +} + +/* + * We're going to use the observed free space records to reserve blocks for the + * new free space btrees, so we play an iterative game where we try to converge + * on the number of blocks we need: + * + * 1. Estimate how many blocks we'll need to store the records. + * 2. If the first free record has more blocks than we need, we're done. + * We will have to re-sort the records prior to building the cntbt. + * 3. If that record has exactly the number of blocks we need, null out the + * record. We're done. + * 4. Otherwise, we still need more blocks. Null out the record, subtract its + * length from the number of blocks we need, and go back to step 1. + * + * Fortunately, we don't have to do any transaction work to play this game, so + * we don't have to tear down the staging cursors. + */ +STATIC int +xrep_abt_reserve_space( + struct xrep_abt *ra, + struct xfs_btree_cur *bno_cur, + struct xfs_btree_cur *cnt_cur, + bool *needs_resort) +{ + struct xfs_scrub *sc = ra->sc; + xfarray_idx_t record_nr; + unsigned int allocated = 0; + int error = 0; + + record_nr = xfarray_length(ra->free_records) - 1; + do { + struct xfs_alloc_rec_incore arec; + uint64_t required; + unsigned int desired; + unsigned int len; + + /* Compute how many blocks we'll need. 
*/ + error = xfs_btree_bload_compute_geometry(cnt_cur, + &ra->new_cntbt.bload, ra->nr_real_records); + if (error) + break; + + error = xfs_btree_bload_compute_geometry(bno_cur, + &ra->new_bnobt.bload, ra->nr_real_records); + if (error) + break; + + /* How many btree blocks do we need to store all records? */ + required = ra->new_bnobt.bload.nr_blocks + + ra->new_cntbt.bload.nr_blocks; + ASSERT(required < INT_MAX); + + /* If we've reserved enough blocks, we're done. */ + if (allocated >= required) + break; + + desired = required - allocated; + + /* We need space but there's none left; bye! */ + if (ra->nr_real_records == 0) { + error = -ENOSPC; + break; + } + + /* Grab the first record from the list. */ + error = xfarray_load(ra->free_records, record_nr, &arec); + if (error) + break; + + ASSERT(arec.ar_blockcount <= UINT_MAX); + len = min_t(unsigned int, arec.ar_blockcount, desired); + + trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno, + arec.ar_startblock, len, XFS_RMAP_OWN_AG); + + error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, + arec.ar_startblock, len); + if (error) + break; + allocated += len; + ra->nr_blocks -= len; + + if (arec.ar_blockcount > desired) { + /* + * Record has more space than we need. The number of + * free records doesn't change, so shrink the free + * record, inform the caller that the records are no + * longer sorted by length, and exit. + */ + arec.ar_startblock += desired; + arec.ar_blockcount -= desired; + error = xfarray_store(ra->free_records, record_nr, + &arec); + if (error) + break; + + *needs_resort = true; + return 0; + } + + /* + * We're going to use up the entire record, so unset it and + * move on to the next one. This changes the number of free + * records (but doesn't break the sorting order), so we must + * go around the loop once more to re-run _bload_init. 
+ */ + error = xfarray_unset(ra->free_records, record_nr); + if (error) + break; + ra->nr_real_records--; + record_nr--; + } while (1); + + return error; +} + +STATIC int +xrep_abt_dispose_one( + struct xrep_abt *ra, + struct xrep_newbt_resv *resv) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + xfs_agblock_t free_agbno = resv->agbno + resv->used; + xfs_extlen_t free_aglen = resv->len - resv->used; + int error; + + ASSERT(pag == resv->pag); + + /* Add a deferred rmap for each extent we used. */ + if (resv->used > 0) + xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno, + resv->used, XFS_RMAP_OWN_AG); + + /* + * For each reserved btree block we didn't use, add it to the free + * space btree. We didn't touch fdblocks when we reserved them, so + * we don't touch it now. + */ + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, ra->new_bnobt.oinfo.oi_owner); + + error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, + &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); + if (error) + return error; + + return xrep_defer_finish(sc); +} + +/* + * Deal with all the space we reserved. Blocks that were allocated for the + * free space btrees need to have a (deferred) rmap added for the OWN_AG + * allocation, and blocks that didn't get used can be freed via the usual + * (deferred) means. + */ +STATIC void +xrep_abt_dispose_reservations( + struct xrep_abt *ra, + int error) +{ + struct xrep_newbt_resv *resv, *n; + + if (error) + goto junkit; + + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + error = xrep_abt_dispose_one(ra, resv); + if (error) + goto junkit; + } + +junkit: + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + xfs_perag_put(resv->pag); + list_del(&resv->list); + kfree(resv); + } + + xrep_newbt_cancel(&ra->new_bnobt); + xrep_newbt_cancel(&ra->new_cntbt); +} + +/* Retrieve free space data for bulk load. 
*/ +STATIC int +xrep_abt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a; + struct xrep_abt *ra = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load_next(ra->free_records, &ra->array_cur, + arec); + if (error) + return error; + + ra->longest = max(ra->longest, arec->ar_blockcount); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_abt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr); +} + +/* + * Reset the AGF counters to reflect the free space btrees that we just + * rebuilt, then reinitialize the per-AG data. + */ +STATIC int +xrep_abt_reset_counters( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + unsigned int freesp_btreeblks = 0; + + /* + * Compute the contribution to agf_btreeblks for the new free space + * btrees. This is the computed btree size minus anything we didn't + * use. + */ + freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1; + freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1; + + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt); + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt); + + /* + * The AGF header contains extra information related to the free space + * btrees, so we must update those fields here. 
+ */ + agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks + + (be32_to_cpu(agf->agf_rmap_blocks) - 1)); + agf->agf_freeblks = cpu_to_be32(ra->nr_blocks); + agf->agf_longest = cpu_to_be32(ra->longest); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS | + XFS_AGF_LONGEST | + XFS_AGF_FREEBLKS); + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the allocbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_bno_level = pag->pagf_bno_level; + pag->pagf_repair_cnt_level = pag->pagf_cnt_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected free space information to stage new free space btrees. + * If this is successful we'll return with the new btree root + * information logged to the repair transaction but not yet committed. + */ +STATIC int +xrep_abt_build_new_trees( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + struct xfs_perag *pag = sc->sa.pag; + bool needs_resort = false; + int error; + + /* + * Sort the free extents by length so that we can set up the free space + * btrees in as few extents as possible. This reduces the amount of + * deferred rmap / free work we have to do at the end. 
+ */ + error = xrep_cntbt_sort_records(ra, false); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + xrep_newbt_init_bare(&ra->new_bnobt, sc); + xrep_newbt_init_bare(&ra->new_cntbt, sc); + + ra->new_bnobt.bload.get_records = xrep_abt_get_records; + ra->new_cntbt.bload.get_records = xrep_abt_get_records; + + ra->new_bnobt.bload.claim_block = xrep_abt_claim_block; + ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; + + /* Allocate cursors for the staged btrees. */ + bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake); + + cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake); + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btrees. */ + error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort); + if (error) + goto err_cur; + + /* + * If we need to re-sort the free extents by length, do so so that we + * can put the records into the cntbt in the correct order. + */ + if (needs_resort) { + error = xrep_cntbt_sort_records(ra, needs_resort); + if (error) + goto err_cur; + } + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the alternate incore btree + * height so that we don't trip the verifiers when writing the new + * btree blocks to disk. + */ + pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height; + pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height; + + /* Load the free space by length tree. 
*/ + ra->array_cur = XFARRAY_CURSOR_INIT; + ra->longest = 0; + error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra); + if (error) + goto err_levels; + + error = xrep_bnobt_sort_records(ra); + if (error) + return error; + + /* Load the free space by block number tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra); + if (error) + goto err_levels; + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(bno_cur, 0); + xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(cnt_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_abt_reset_counters(ra); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + xrep_abt_dispose_reservations(ra, error); + + return xrep_roll_ag_trans(sc); + +err_levels: + pag->pagf_repair_bno_level = 0; + pag->pagf_repair_cnt_level = 0; +err_cur: + xfs_btree_del_cursor(cnt_cur, error); + xfs_btree_del_cursor(bno_cur, error); +err_newbt: + xrep_abt_dispose_reservations(ra, error); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_abt_remove_old_trees( + struct xrep_abt *ra) +{ + struct xfs_perag *pag = ra->sc->sa.pag; + int error; + + /* Free the old btree blocks if they're not in use. */ + error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks, + &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); + if (error) + return error; + + /* + * Now that we've zapped all the old allocbt blocks we can turn off + * the alternate height mechanism. + */ + pag->pagf_repair_bno_level = 0; + pag->pagf_repair_cnt_level = 0; + return 0; +} + +/* Repair the freespace btrees for some AG. 
*/ +int +xrep_allocbt( + struct xfs_scrub *sc) +{ + struct xrep_abt *ra; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS); + if (!ra) + return -ENOMEM; + ra->sc = sc; + + /* We rebuild both data structures. */ + sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. In theory we cleared this before we started, but + * let's not risk the filesystem. + */ + if (!xfs_extent_busy_list_empty(sc->sa.pag)) { + error = -EDEADLOCK; + goto out_ra; + } + + /* Set up enough storage to handle maximally fragmented free space. */ + descr = xchk_xfile_ag_descr(sc, "free space records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2, + sizeof(struct xfs_alloc_rec_incore), + &ra->free_records); + kfree(descr); + if (error) + goto out_ra; + + /* Collect the free space data and find the old btree blocks. */ + xagb_bitmap_init(&ra->old_allocbt_blocks); + error = xrep_abt_find_freespace(ra); + if (error) + goto out_bitmap; + + /* Rebuild the free space information. */ + error = xrep_abt_build_new_trees(ra); + if (error) + goto out_bitmap; + + /* Kill the old trees. */ + error = xrep_abt_remove_old_trees(ra); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ra->old_allocbt_blocks); + xfarray_destroy(ra->free_records); +out_ra: + kfree(ra); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_allocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. 
+ */ + sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; + error = xchk_allocbt(sc); + if (error) + goto out; + + sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; + error = xchk_allocbt(sc); +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 6c16d9530cca..83c7feb38714 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -527,28 +527,23 @@ xchk_xattr_check_sf( struct xfs_scrub *sc) { struct xchk_xattr_buf *ab = sc->buf; - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; + struct xfs_ifork *ifp = &sc->ip->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; + struct xfs_attr_sf_entry *sfe = xfs_attr_sf_firstentry(sf); struct xfs_attr_sf_entry *next; - struct xfs_ifork *ifp; - unsigned char *end; + unsigned char *end = ifp->if_data + ifp->if_bytes; int i; int error = 0; - ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); - bitmap_zero(ab->usedmap, ifp->if_bytes); - sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data; - end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; - xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr)); + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*sf)); - sfe = &sf->list[0]; if ((unsigned char *)sfe > end) { xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); return 0; } - for (i = 0; i < sf->hdr.count; i++) { + for (i = 0; i < sf->count; i++) { unsigned char *name = sfe->nameval; unsigned char *value = &sfe->nameval[sfe->namelen]; diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index e0c89a9a0ca0..0cb8d43912a8 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -16,7 +16,9 @@ #include <linux/interval_tree_generic.h> -struct xbitmap_node { +/* u64 bitmap */ + +struct xbitmap64_node { struct rb_node bn_rbnode; /* First set bit of this interval and subtree. */ @@ -39,72 +41,72 @@ struct xbitmap_node { * forward-declare them anyway for clarity. 
*/ static inline void -xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root); static inline void -xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline struct xbitmap_node * -xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start, uint64_t last); -static inline struct xbitmap_node * -xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start, uint64_t last); -INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, - __bn_subtree_last, START, LAST, static inline, xbitmap_tree) +INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline, xbitmap64_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ -#define for_each_xbitmap_extent(bn, bitmap) \ +#define for_each_xbitmap64_extent(bn, bitmap) \ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ - struct xbitmap_node, bn_rbnode); \ + struct xbitmap64_node, bn_rbnode); \ (bn) != NULL; \ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ - struct xbitmap_node, bn_rbnode)) + struct xbitmap64_node, bn_rbnode)) /* Clear a range of this bitmap. 
*/ int -xbitmap_clear( - struct xbitmap *bitmap, +xbitmap64_clear( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *bn; - struct xbitmap_node *new_bn; + struct xbitmap64_node *bn; + struct xbitmap64_node *new_bn; uint64_t last = start + len - 1; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) { if (bn->bn_start < start && bn->bn_last > last) { uint64_t old_last = bn->bn_last; /* overlaps with the entire clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); /* add an extent */ - new_bn = kmalloc(sizeof(struct xbitmap_node), + new_bn = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!new_bn) return -ENOMEM; new_bn->bn_start = last + 1; new_bn->bn_last = old_last; - xbitmap_tree_insert(new_bn, &bitmap->xb_root); + xbitmap64_tree_insert(new_bn, &bitmap->xb_root); } else if (bn->bn_start < start) { /* overlaps with the left side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); } else if (bn->bn_last > last) { /* overlaps with the right side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_start = last + 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); break; } else { /* in the middle of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } @@ -114,59 +116,59 @@ xbitmap_clear( /* Set a range of this bitmap. 
*/ int -xbitmap_set( - struct xbitmap *bitmap, +xbitmap64_set( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *left; - struct xbitmap_node *right; + struct xbitmap64_node *left; + struct xbitmap64_node *right; uint64_t last = start + len - 1; int error; /* Is this whole range already set? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); if (left && left->bn_start <= start && left->bn_last >= last) return 0; /* Clear out everything in the range we want to set. */ - error = xbitmap_clear(bitmap, start, len); + error = xbitmap64_clear(bitmap, start, len); if (error) return error; /* Do we have a left-adjacent extent? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); ASSERT(!left || left->bn_last + 1 == start); /* Do we have a right-adjacent extent? */ - right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); ASSERT(!right || right->bn_start == last + 1); if (left && right) { /* combine left and right adjacent extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); left->bn_last = right->bn_last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); kfree(right); } else if (left) { /* combine with left extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } else if (right) { /* combine with right extent */ - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); 
right->bn_start = start; - xbitmap_tree_insert(right, &bitmap->xb_root); + xbitmap64_tree_insert(right, &bitmap->xb_root); } else { /* add an extent */ - left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!left) return -ENOMEM; left->bn_start = start; left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } return 0; @@ -174,21 +176,21 @@ xbitmap_set( /* Free everything related to this bitmap. */ void -xbitmap_destroy( - struct xbitmap *bitmap) +xbitmap64_destroy( + struct xbitmap64 *bitmap) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { - xbitmap_tree_remove(bn, &bitmap->xb_root); + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } /* Set up a per-AG block bitmap. */ void -xbitmap_init( - struct xbitmap *bitmap) +xbitmap64_init( + struct xbitmap64 *bitmap) { bitmap->xb_root = RB_ROOT_CACHED; } @@ -208,18 +210,18 @@ xbitmap_init( * This is the logical equivalent of bitmap &= ~sub. */ int -xbitmap_disunion( - struct xbitmap *bitmap, - struct xbitmap *sub) +xbitmap64_disunion( + struct xbitmap64 *bitmap, + struct xbitmap64 *sub) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; int error; - if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) + if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub)) return 0; - for_each_xbitmap_extent(bn, sub) { - error = xbitmap_clear(bitmap, bn->bn_start, + for_each_xbitmap64_extent(bn, sub) { + error = xbitmap64_clear(bitmap, bn->bn_start, bn->bn_last - bn->bn_start + 1); if (error) return error; @@ -228,88 +230,273 @@ xbitmap_disunion( return 0; } +/* How many bits are set in this bitmap? 
*/ +uint64_t +xbitmap64_hweight( + struct xbitmap64 *bitmap) +{ + struct xbitmap64_node *bn; + uint64_t ret = 0; + + for_each_xbitmap64_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; + + return ret; +} + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap64_walk( + struct xbitmap64 *bitmap, + xbitmap64_walk_fn fn, + void *priv) +{ + struct xbitmap64_node *bn; + int error = 0; + + for_each_xbitmap64_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap64_empty( + struct xbitmap64 *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap64_test( + struct xbitmap64 *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap64_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} + +/* u32 bitmap */ + +struct xbitmap32_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint32_t bn_start; + + /* Last set bit of this interval. */ + uint32_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint32_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint32_t parameters. */ + /* - * Record all btree blocks seen while iterating all records of a btree. - * - * We know that the btree query_all function starts at the left edge and walks - * towards the right edge of the tree. 
Therefore, we know that we can walk up - * the btree cursor towards the root; if the pointer for a given level points - * to the first record/key in that block, we haven't seen this block before; - * and therefore we need to remember that we saw this block in the btree. - * - * So if our btree is: - * - * 4 - * / | \ - * 1 2 3 - * - * Pretend for this example that each leaf block has 100 btree records. For - * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we - * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so - * we record block 4. The list is [1, 4]. - * - * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit - * the loop. The list remains [1, 4]. - * - * For the 101st btree record, we've moved onto leaf block 2. Now - * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that - * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. - * - * For the 102nd record, bc_levels[0].ptr == 2, so we continue. - * - * For the 201st record, we've moved on to leaf block 3. - * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. - * - * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. */ +static inline void +xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root); -/* Mark a btree block to the agblock bitmap. 
*/ -STATIC int -xagb_bitmap_visit_btblock( - struct xfs_btree_cur *cur, - int level, - void *priv) +static inline void +xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start, + uint32_t last); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start, + uint32_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t, + __bn_subtree_last, START, LAST, static inline, xbitmap32_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap32_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap32_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap32_node, bn_rbnode)) + +/* Clear a range of this bitmap. */ +int +xbitmap32_clear( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) { - struct xagb_bitmap *bitmap = priv; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; + struct xbitmap32_node *bn; + struct xbitmap32_node *new_bn; + uint32_t last = start + len - 1; - xfs_btree_get_block(cur, level, &bp); - if (!bp) - return 0; + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint32_t old_last = bn->bn_last; - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + /* overlaps with the entire clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap32_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap32_tree_insert(new_bn, &bitmap->xb_root); + } 
else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } - return xagb_bitmap_set(bitmap, agbno, 1); + return 0; } -/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +/* Set a range of this bitmap. */ int -xagb_bitmap_set_btblocks( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) +xbitmap32_set( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) +{ + struct xbitmap32_node *left; + struct xbitmap32_node *right; + uint32_t last = start + len - 1; + int error; + + /* Is this whole range already set? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap32_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? 
*/ + right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); + + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + xbitmap32_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap32_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap32_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } + + return 0; +} + +/* Free everything related to this bitmap. */ +void +xbitmap32_destroy( + struct xbitmap32 *bitmap) { - return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, - XFS_BTREE_VISIT_ALL, bitmap); + struct xbitmap32_node *bn; + + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U))) { + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } +} + +/* Set up a per-AG block bitmap. */ +void +xbitmap32_init( + struct xbitmap32 *bitmap) +{ + bitmap->xb_root = RB_ROOT_CACHED; } /* - * Record all the buffers pointed to by the btree cursor. Callers already - * engaged in a btree walk should call this function to capture the list of - * blocks going from the leaf towards the root. + * Remove all the blocks mentioned in @sub from the extents in @bitmap. 
+ * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @bitmap; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sub. This routine subtracts all the extents + * mentioned in sub from all the extents linked in @bitmap, which leaves + * @bitmap as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @bitmap can be reaped. + * + * This is the logical equivalent of bitmap &= ~sub. */ int -xagb_bitmap_set_btcur_path( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) +xbitmap32_disunion( + struct xbitmap32 *bitmap, + struct xbitmap32 *sub) { - int i; + struct xbitmap32_node *bn; int error; - for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { - error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub)) + return 0; + + for_each_xbitmap32_extent(bn, sub) { + error = xbitmap32_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); if (error) return error; } @@ -318,14 +505,14 @@ xagb_bitmap_set_btcur_path( } /* How many bits are set in this bitmap? */ -uint64_t -xbitmap_hweight( - struct xbitmap *bitmap) +uint32_t +xbitmap32_hweight( + struct xbitmap32 *bitmap) { - struct xbitmap_node *bn; - uint64_t ret = 0; + struct xbitmap32_node *bn; + uint32_t ret = 0; - for_each_xbitmap_extent(bn, bitmap) + for_each_xbitmap32_extent(bn, bitmap) ret += bn->bn_last - bn->bn_start + 1; return ret; @@ -333,15 +520,15 @@ xbitmap_hweight( /* Call a function for every run of set bits in this bitmap. 
*/ int -xbitmap_walk( - struct xbitmap *bitmap, - xbitmap_walk_fn fn, +xbitmap32_walk( + struct xbitmap32 *bitmap, + xbitmap32_walk_fn fn, void *priv) { - struct xbitmap_node *bn; + struct xbitmap32_node *bn; int error = 0; - for_each_xbitmap_extent(bn, bitmap) { + for_each_xbitmap32_extent(bn, bitmap) { error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); if (error) break; @@ -352,23 +539,23 @@ xbitmap_walk( /* Does this bitmap have no bits set at all? */ bool -xbitmap_empty( - struct xbitmap *bitmap) +xbitmap32_empty( + struct xbitmap32 *bitmap) { return bitmap->xb_root.rb_root.rb_node == NULL; } /* Is the start of the range set or clear? And for how long? */ bool -xbitmap_test( - struct xbitmap *bitmap, - uint64_t start, - uint64_t *len) +xbitmap32_test( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t *len) { - struct xbitmap_node *bn; - uint64_t last = start + *len - 1; + struct xbitmap32_node *bn; + uint32_t last = start + *len - 1; - bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); if (!bn) return false; if (bn->bn_start <= start) { @@ -379,3 +566,17 @@ xbitmap_test( *len = bn->bn_start - start; return false; } + +/* Count the number of set regions in this bitmap. 
*/ +uint32_t +xbitmap32_count_set_regions( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + uint32_t nr = 0; + + for_each_xbitmap32_extent(bn, bitmap) + nr++; + + return nr; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 4fe58bad6734..710c1ac5e323 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,17 +6,19 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xbitmap { +/* u64 bitmap */ + +struct xbitmap64 { struct rb_root_cached xb_root; }; -void xbitmap_init(struct xbitmap *bitmap); -void xbitmap_destroy(struct xbitmap *bitmap); +void xbitmap64_init(struct xbitmap64 *bitmap); +void xbitmap64_destroy(struct xbitmap64 *bitmap); -int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); -uint64_t xbitmap_hweight(struct xbitmap *bitmap); +int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub); +uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap); /* * Return codes for the bitmap iterator functions are 0 to continue iterating, @@ -25,84 +27,41 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap); * iteration, because neither bitmap iterator ever generates that error code on * its own. Callers must not modify the bitmap while walking it. 
*/ -typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); -int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, +typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn, void *priv); -bool xbitmap_empty(struct xbitmap *bitmap); -bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); +bool xbitmap64_empty(struct xbitmap64 *bitmap); +bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len); -/* Bitmaps, but for type-checked for xfs_agblock_t */ +/* u32 bitmap */ -struct xagb_bitmap { - struct xbitmap agbitmap; +struct xbitmap32 { + struct rb_root_cached xb_root; }; -static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) -{ - xbitmap_init(&bitmap->agbitmap); -} - -static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) -{ - xbitmap_destroy(&bitmap->agbitmap); -} - -static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap_clear(&bitmap->agbitmap, start, len); -} -static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap_set(&bitmap->agbitmap, start, len); -} - -static inline bool -xagb_bitmap_test( - struct xagb_bitmap *bitmap, - xfs_agblock_t start, - xfs_extlen_t *len) -{ - uint64_t biglen = *len; - bool ret; - - ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); - - if (start + biglen >= UINT_MAX) { - ASSERT(0); - biglen = UINT_MAX - start; - } - - *len = biglen; - return ret; -} +void xbitmap32_init(struct xbitmap32 *bitmap); +void xbitmap32_destroy(struct xbitmap32 *bitmap); -static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, - struct xagb_bitmap *sub) -{ - return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); -} +int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_set(struct xbitmap32 *bitmap, 
uint32_t start, uint32_t len); +int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub); +uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap); -static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) -{ - return xbitmap_hweight(&bitmap->agbitmap); -} -static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) -{ - return xbitmap_empty(&bitmap->agbitmap); -} +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. + */ +typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv); +int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn, + void *priv); -static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, - xbitmap_walk_fn fn, void *priv) -{ - return xbitmap_walk(&bitmap->agbitmap, fn, priv); -} +bool xbitmap32_empty(struct xbitmap32 *bitmap); +bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len); -int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); -int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); +uint32_t xbitmap32_count_set_regions(struct xbitmap32 *bitmap); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 75588915572e..24a15bf784f1 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -19,9 +19,11 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/health.h" #include "xfs_ag.h" /* Set us up with an inode's bmap. 
*/ @@ -48,9 +50,18 @@ xchk_setup_inode_bmap( if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + bool is_repair = xchk_could_repair(sc); xchk_ilock(sc, XFS_MMAPLOCK_EXCL); + /* Break all our leases, we're going to mess with things. */ + if (is_repair) { + error = xfs_break_layouts(VFS_I(sc->ip), + &sc->ilock_flags, BREAK_WRITE); + if (error) + goto out; + } + inode_dio_wait(VFS_I(sc->ip)); /* @@ -71,6 +82,15 @@ xchk_setup_inode_bmap( error = filemap_fdatawait_keep_errors(mapping); if (error && (error != -ENOSPC && error != -EIO)) goto out; + + /* Drop the page cache if we're repairing block mappings. */ + if (is_repair) { + error = invalidate_inode_pages2( + VFS_I(sc->ip)->i_mapping); + if (error) + goto out; + } + } /* Got the inode, lock it and we're ready to go. */ @@ -78,6 +98,10 @@ xchk_setup_inode_bmap( if (error) goto out; + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode */ @@ -410,7 +434,7 @@ xchk_bmap_iextent( /* Make sure the extent points to a valid place. */ if (info->is_rt && - !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount)) + !xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount)) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (!info->is_rt && @@ -633,6 +657,82 @@ xchk_bmap_check_ag_rmaps( } /* + * Decide if we want to scan the reverse mappings to determine if the attr + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_attrfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_af; + + /* + * If the dinode repair found a bad attr fork, it will reset the fork + * to extents format with zero records and wait for this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. 
+ */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * Files can have an attr fork in EXTENTS format with zero records for + * several reasons: + * + * a) an attr set created a fork but ran out of space + * b) attr replace deleted an old attr but failed during the set step + * c) the data fork was in btree format when all attrs were deleted, so + * the fork was left in place + * d) the inode repair code zapped the fork + * + * Only in case (d) do we want to scan the rmapbt to see if we need to + * rebuild the attr fork. The fork zap code clears all DAC permission + * bits and zeroes the uid and gid, so avoid the scan if any of those + * three conditions are not met. + */ + if ((VFS_I(ip)->i_mode & 0777) != 0) + return false; + if (!uid_eq(VFS_I(ip)->i_uid, GLOBAL_ROOT_UID)) + return false; + if (!gid_eq(VFS_I(ip)->i_gid, GLOBAL_ROOT_GID)) + return false; + + return true; +} + +/* + * Decide if we want to scan the reverse mappings to determine if the data + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_datafork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_df; + + /* Don't support realtime rmap checks yet. */ + if (XFS_IS_REALTIME_INODE(ip)) + return false; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * If we encounter an empty data fork along with evidence that the fork + * might not really be empty, we need to scan the reverse mappings to + * decide if we're going to rebuild the fork. Data forks with nonzero + * file size are scanned. 
+ */ + return i_size_read(VFS_I(ip)) != 0; +} + +/* * Decide if we want to walk every rmap btree in the fs to make sure that each * rmap for this file fork has corresponding bmbt entries. */ @@ -641,7 +741,6 @@ xchk_bmap_want_check_rmaps( struct xchk_bmap_info *info) { struct xfs_scrub *sc = info->sc; - struct xfs_ifork *ifp; if (!xfs_has_rmapbt(sc->mp)) return false; @@ -650,28 +749,10 @@ xchk_bmap_want_check_rmaps( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return false; - /* Don't support realtime rmap checks yet. */ - if (info->is_rt) - return false; - - /* - * The inode repair code zaps broken inode forks by resetting them back - * to EXTENTS format and zero extent records. If we encounter a fork - * in this state along with evidence that the fork isn't supposed to be - * empty, we need to scan the reverse mappings to decide if we're going - * to rebuild the fork. Data forks with nonzero file size are scanned. - * xattr forks are never empty of content, so they are always scanned. - */ - ifp = xfs_ifork_ptr(sc->ip, info->whichfork); - if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { - if (info->whichfork == XFS_DATA_FORK && - i_size_read(VFS_I(sc->ip)) == 0) - return false; - - return true; - } + if (info->whichfork == XFS_ATTR_FORK) + return xchk_bmap_check_empty_attrfork(sc->ip); - return false; + return xchk_bmap_check_empty_datafork(sc->ip); } /* Make sure each rmap has a corresponding bmbt entry. 
*/ @@ -843,7 +924,7 @@ xchk_bmap( if (!ifp) return -ENOENT; - info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); + info.is_rt = xfs_ifork_is_realtime(ip, whichfork); info.whichfork = whichfork; info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip); info.sc = sc; @@ -939,7 +1020,20 @@ int xchk_bmap_data( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_DATA_FORK); + int error; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTD_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_DATA_FORK); + if (error) + return error; + + /* If the data fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTD_ZAPPED); + return 0; } /* Scrub an inode's attr fork. */ @@ -947,7 +1041,27 @@ int xchk_bmap_attr( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_ATTR_FORK); + int error; + + /* + * If the attr fork has been zapped, it's possible that forkoff was + * reset to zero and hence sc->ip->i_afp is NULL. We don't want the + * NULL ifp check in xchk_bmap to conclude that the attr fork is ok, + * so short circuit that logic by setting the corruption flag and + * returning immediately. + */ + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTA_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_ATTR_FORK); + if (error) + return error; + + /* If the attr fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTA_ZAPPED); + return 0; } /* Scrub an inode's CoW fork. */ diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c new file mode 100644 index 000000000000..1e656fab5e41 --- /dev/null +++ b/fs/xfs/scrub/bmap_repair.c @@ -0,0 +1,873 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_reflink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Fork Block Mapping (BMBT) Repair + * ====================================== + * + * Gather all the rmap records for the inode and fork we're fixing, reset the + * incore fork, then recreate the btree. + */ + +enum reflink_scan_state { + RLS_IRRELEVANT = -1, /* not applicable to this file */ + RLS_UNKNOWN, /* shared extent scans required */ + RLS_SET_IFLAG, /* iflag must be set */ +}; + +struct xrep_bmap { + /* Old bmbt blocks */ + struct xfsb_bitmap old_bmbt_blocks; + + /* New fork. */ + struct xrep_newbt new_bmapbt; + + /* List of new bmap records. */ + struct xfarray *bmap_records; + + struct xfs_scrub *sc; + + /* How many blocks did we find allocated to this file? */ + xfs_rfsblock_t nblocks; + + /* How many bmbt blocks did we find for this fork? */ + xfs_rfsblock_t old_bmbt_block_count; + + /* get_records()'s position in the free space record array. 
*/ + xfarray_idx_t array_cur; + + /* How many real (non-hole, non-delalloc) mappings do we have? */ + uint64_t real_mappings; + + /* Which fork are we fixing? */ + int whichfork; + + /* Should the REFLINK flag be set when the repair is over? */ + enum reflink_scan_state reflink_scan; + + /* Do we allow unwritten extents? */ + bool allow_unwritten; +}; + +/* Is this space extent shared? Flag the inode if it is. */ +STATIC int +xrep_bmap_discover_shared( + struct xrep_bmap *rb, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + struct xfs_scrub *sc = rb->sc; + xfs_agblock_t agbno; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, + &fbno, &flen, false); + if (error) + return error; + + if (fbno != NULLAGBLOCK) + rb->reflink_scan = RLS_SET_IFLAG; + + return 0; +} + +/* Remember this reverse-mapping as a series of bmap records. */ +STATIC int +xrep_bmap_from_rmap( + struct xrep_bmap *rb, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + bool unwritten) +{ + struct xfs_bmbt_irec irec = { + .br_startoff = startoff, + .br_startblock = startblock, + .br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM, + }; + struct xfs_bmbt_rec rbe; + struct xfs_scrub *sc = rb->sc; + int error = 0; + + /* + * If we're repairing the data fork of a non-reflinked regular file on + * a reflink filesystem, we need to figure out if this space extent is + * shared. 
+ */ + if (rb->reflink_scan == RLS_UNKNOWN && !unwritten) { + error = xrep_bmap_discover_shared(rb, startblock, blockcount); + if (error) + return error; + } + + do { + xfs_failaddr_t fa; + + irec.br_blockcount = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + fa = xfs_bmap_validate_extent(sc->ip, rb->whichfork, &irec); + if (fa) + return -EFSCORRUPTED; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(sc->ip, rb->whichfork, &irec); + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + + rb->real_mappings++; + + irec.br_startblock += irec.br_blockcount; + irec.br_startoff += irec.br_blockcount; + blockcount -= irec.br_blockcount; + } while (blockcount > 0); + + return 0; +} + +/* Check for any obvious errors or conflicts in the file mapping. */ +STATIC int +xrep_bmap_check_fork_rmap( + struct xrep_bmap *rb, + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec) +{ + struct xfs_scrub *sc = rb->sc; + enum xbtree_recpacking outcome; + int error; + + /* + * Data extents for rt files are never stored on the data device, but + * everything else (xattrs, bmbt blocks) can be. + */ + if (XFS_IS_REALTIME_INODE(sc->ip) && + !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) + return -EFSCORRUPTED; + + /* Check that this is within the AG. */ + if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Check the file offset range. */ + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* No contradictory flags. */ + if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) && + (rec->rm_flags & XFS_RMAP_UNWRITTEN)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. 
*/ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rm_startblock, rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record extents that belong to this inode's fork. */ +STATIC int +xrep_bmap_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_bmap *rb = priv; + struct xfs_mount *mp = cur->bc_mp; + xfs_fsblock_t fsbno; + int error = 0; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + if (rec->rm_owner != rb->sc->ip->i_ino) + return 0; + + error = xrep_bmap_check_fork_rmap(rb, cur, rec); + if (error) + return error; + + /* + * Record all blocks allocated to this file even if the extent isn't + * for the fork we're rebuilding so that we can reset di_nblocks later. + */ + rb->nblocks += rec->rm_blockcount; + + /* If this rmap isn't for the fork we want, we're done. */ + if (rb->whichfork == XFS_DATA_FORK && + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rb->whichfork == XFS_ATTR_FORK && + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + + /* Reject unwritten extents if we don't allow those. */ + if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten) + return -EFSCORRUPTED; + + fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, + rec->rm_startblock); + + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { + rb->old_bmbt_block_count += rec->rm_blockcount; + return xfsb_bitmap_set(&rb->old_bmbt_blocks, fsbno, + rec->rm_blockcount); + } + + return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno, + rec->rm_blockcount, + rec->rm_flags & XFS_RMAP_UNWRITTEN); +} + +/* + * Compare two block mapping records. 
We want to sort in order of increasing + * file offset. + */ +static int +xrep_bmap_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_bmbt_rec *ba = a; + const struct xfs_bmbt_rec *bb = b; + xfs_fileoff_t ao = xfs_bmbt_disk_get_startoff(ba); + xfs_fileoff_t bo = xfs_bmbt_disk_get_startoff(bb); + + if (ao > bo) + return 1; + else if (ao < bo) + return -1; + return 0; +} + +/* + * Sort the bmap extents by fork offset or else the records will be in the + * wrong order. Ensure there are no overlaps in the file offset ranges. + */ +STATIC int +xrep_bmap_sort_records( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t next_off = 0; + xfarray_idx_t array_cur; + int error; + + error = xfarray_sort(rb->bmap_records, xrep_bmap_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + if (irec.br_startoff < next_off) + return -EFSCORRUPTED; + + next_off = irec.br_startoff + irec.br_blockcount; + } + + return 0; +} + +/* Scan one AG for reverse mappings that we can turn into extent maps. */ +STATIC int +xrep_bmap_scan_ag( + struct xrep_bmap *rb, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_bmap_walk_rmap, rb); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Find the delalloc extents from the old incore extent tree. 
*/ +STATIC int +xrep_bmap_find_delalloc( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + struct xfs_iext_cursor icur; + struct xfs_bmbt_rec rbe; + struct xfs_inode *ip = rb->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, rb->whichfork); + int error = 0; + + /* + * Skip this scan if we don't expect to find delayed allocation + * reservations in this fork. + */ + if (rb->whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0) + return 0; + + for_each_xfs_iext(ifp, &icur, &irec) { + if (!isnullstartblock(irec.br_startblock)) + continue; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(ip, rb->whichfork, &irec); + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + } + + return 0; +} + +/* + * Collect block mappings for this fork of this inode and decide if we have + * enough space to rebuild. Caller is responsible for cleaning up the list if + * anything goes wrong. + */ +STATIC int +xrep_bmap_find_mappings( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error = 0; + + /* Iterate the rmaps for extents. */ + for_each_perag(sc->mp, agno, pag) { + error = xrep_bmap_scan_ag(rb, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + return xrep_bmap_find_delalloc(rb); +} + +/* Retrieve real extent mappings for bulk loading the bmap btree. 
*/ +STATIC int +xrep_bmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_bmbt_rec rec; + struct xfs_bmbt_irec *irec = &cur->bc_rec.b; + struct xrep_bmap *rb = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(rb->bmap_records, rb->array_cur++, + &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, irec); + } while (isnullstartblock(irec->br_startblock)); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_bmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_bmap *rb = priv; + + return xrep_newbt_claim_block(cur, &rb->new_bmapbt, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_bmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + ASSERT(level > 0); + + return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level); +} + +/* Update the inode counters. */ +STATIC int +xrep_bmap_reset_counters( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int64_t delta; + + if (rb->reflink_scan == RLS_SET_IFLAG) + sc->ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + + /* + * Update the inode block counts to reflect the extents we found in the + * rmapbt. + */ + delta = ifake->if_blocks - rb->old_bmbt_block_count; + sc->ip->i_nblocks = rb->nblocks + delta; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + + /* + * Adjust the quota counts by the difference in size between the old + * and new bmbt. 
+ */ + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, delta); + return 0; +} + +/* + * Create a new iext tree and load it with block mappings. If the inode is + * in extents format, that's all we need to do to commit the new mappings. + * If it is in btree format, this takes care of preloading the incore tree. + */ +STATIC int +xrep_bmap_extents_load( + struct xrep_bmap *rb) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp = rb->new_bmapbt.ifake.if_fork; + xfarray_idx_t array_cur; + int error; + + ASSERT(ifp->if_bytes == 0); + + /* Add all the mappings (incl. delalloc) to the incore extent tree. */ + xfs_iext_first(ifp, &icur); + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + xfs_iext_insert_raw(ifp, &icur, &irec); + if (!isnullstartblock(irec.br_startblock)) + ifp->if_nextents++; + + xfs_iext_next(ifp, &icur); + } + + return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork, + ifp->if_nextents); +} + +/* + * Reserve new btree blocks, bulk load the bmap records into the ondisk btree, + * and load the incore extent tree. + */ +STATIC int +xrep_bmap_btree_load( + struct xrep_bmap *rb, + struct xfs_btree_cur *bmap_cur) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(bmap_cur, + &rb->new_bmapbt.bload, rb->real_mappings); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Guess how many blocks we're going to need to rebuild an entire bmap + * from the number of extents we found, and pump up our transaction to + * have sufficient block reservation. We're allowed to exceed file + * quota to repair inconsistent metadata. 
+ */ + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, + rb->new_bmapbt.bload.nr_blocks, 0, true); + if (error) + return error; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rb->new_bmapbt, + rb->new_bmapbt.bload.nr_blocks); + if (error) + return error; + + /* Add all observed bmap records. */ + rb->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bmap_cur, &rb->new_bmapbt.bload, rb); + if (error) + return error; + + /* + * Load the new bmap records into the new incore extent tree to + * preserve delalloc reservations for regular files. The directory + * code loads the extent tree during xfs_dir_open and assumes + * thereafter that it remains loaded, so we must not violate that + * assumption. + */ + return xrep_bmap_extents_load(rb); +} + +/* + * Use the collected bmap information to stage a new bmap fork. If this is + * successful we'll return with the new fork information logged to the repair + * transaction but not yet committed. The caller must ensure that the inode + * is joined to the transaction; the inode will be joined to a clean + * transaction when the function returns. + */ +STATIC int +xrep_bmap_build_new_fork( + struct xrep_bmap *rb) +{ + struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *bmap_cur; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int error; + + error = xrep_bmap_sort_records(rb); + if (error) + return error; + + /* + * Prepare to construct the new fork by initializing the new btree + * structure and creating a fake ifork in the ifakeroot structure. 
+ */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + error = xrep_newbt_init_inode(&rb->new_bmapbt, sc, rb->whichfork, + &oinfo); + if (error) + return error; + + rb->new_bmapbt.bload.get_records = xrep_bmap_get_records; + rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block; + rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size; + + /* + * Allocate a new bmap btree cursor for reloading an inode block mapping + * data structure. + */ + bmap_cur = xfs_bmbt_init_cursor(sc->mp, NULL, sc->ip, XFS_STAGING_FORK); + xfs_btree_stage_ifakeroot(bmap_cur, ifake); + + /* + * Figure out the size and format of the new fork, then fill it with + * all the bmap records we've found. Join the inode to the transaction + * so that we can roll the transaction while holding the inode locked. + */ + if (rb->real_mappings <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) { + ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS; + error = xrep_bmap_extents_load(rb); + } else { + ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE; + error = xrep_bmap_btree_load(rb, bmap_cur); + } + if (error) + goto err_cur; + + /* + * Install the new fork in the inode. After this point the old mapping + * data are no longer accessible and the new tree is live. We delete + * the cursor immediately after committing the staged root because the + * staged fork might be in extents format. + */ + xfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork); + xfs_btree_del_cursor(bmap_cur, 0); + + /* Reset the inode counters now that we've changed the fork. */ + error = xrep_bmap_reset_counters(rb); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. 
*/ + error = xrep_newbt_commit(&rb->new_bmapbt); + if (error) + return error; + + return xrep_roll_trans(sc); + +err_cur: + if (bmap_cur) + xfs_btree_del_cursor(bmap_cur, error); +err_newbt: + xrep_newbt_cancel(&rb->new_bmapbt); + return error; +} + +/* + * Now that we've logged the new inode btree, invalidate all of the old blocks + * and free them, if there were any. + */ +STATIC int +xrep_bmap_remove_old_tree( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_owner_info oinfo; + + /* Free the old bmbt blocks if they're not in use. */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + return xrep_reap_fsblocks(sc, &rb->old_bmbt_blocks, &oinfo); +} + +/* Check for garbage inputs. Returns -ECANCELED if there's nothing to do. */ +STATIC int +xrep_bmap_check_inputs( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK); + + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + + /* No fork means nothing to rebuild. */ + if (!ifp) + return -ECANCELED; + + /* + * We only know how to repair extent mappings, which is to say that we + * only support extents and btree fork format. Repairs to a local + * format fork require a higher level repair function, so we do not + * have any work to do here. + */ + switch (ifp->if_format) { + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + return -ECANCELED; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return -EFSCORRUPTED; + } + + if (whichfork == XFS_ATTR_FORK) + return 0; + + /* Only files, symlinks, and directories get to have data forks. */ + switch (VFS_I(sc->ip)->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + /* ok */ + break; + default: + return -EINVAL; + } + + /* Don't know how to rebuild realtime data forks. 
*/ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + return 0; +} + +/* Set up the initial state of the reflink scan. */ +static inline enum reflink_scan_state +xrep_bmap_init_reflink_scan( + struct xfs_scrub *sc, + int whichfork) +{ + /* cannot share on non-reflink filesystem */ + if (!xfs_has_reflink(sc->mp)) + return RLS_IRRELEVANT; + + /* preserve flag if it's already set */ + if (xfs_is_reflink_inode(sc->ip)) + return RLS_SET_IFLAG; + + /* can only share regular files */ + if (!S_ISREG(VFS_I(sc->ip)->i_mode)) + return RLS_IRRELEVANT; + + /* cannot share attr fork extents */ + if (whichfork != XFS_DATA_FORK) + return RLS_IRRELEVANT; + + /* cannot share realtime extents */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return RLS_IRRELEVANT; + + return RLS_UNKNOWN; +} + +/* Repair an inode fork. */ +int +xrep_bmap( + struct xfs_scrub *sc, + int whichfork, + bool allow_unwritten) +{ + struct xrep_bmap *rb; + char *descr; + unsigned int max_bmbt_recs; + bool large_extcount; + int error = 0; + + error = xrep_bmap_check_inputs(sc, whichfork); + if (error == -ECANCELED) + return 0; + if (error) + return error; + + rb = kzalloc(sizeof(struct xrep_bmap), XCHK_GFP_FLAGS); + if (!rb) + return -ENOMEM; + rb->sc = sc; + rb->whichfork = whichfork; + rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork); + rb->allow_unwritten = allow_unwritten; + + /* Set up enough storage to handle the max records for this fork. */ + large_extcount = xfs_has_large_extent_counts(sc->mp); + max_bmbt_recs = xfs_iext_max_nextents(large_extcount, whichfork); + descr = xchk_xfile_ino_descr(sc, "%s fork mapping records", + whichfork == XFS_DATA_FORK ? "data" : "attr"); + error = xfarray_create(descr, max_bmbt_recs, + sizeof(struct xfs_bmbt_rec), &rb->bmap_records); + kfree(descr); + if (error) + goto out_rb; + + /* Collect all reverse mappings for this fork's extents. 
*/ + xfsb_bitmap_init(&rb->old_bmbt_blocks); + error = xrep_bmap_find_mappings(rb); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the bmap information. */ + error = xrep_bmap_build_new_fork(rb); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_bmap_remove_old_tree(rb); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rb->old_bmbt_blocks); + xfarray_destroy(rb->bmap_records); +out_rb: + kfree(rb); + return error; +} + +/* Repair an inode's data fork. */ +int +xrep_bmap_data( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_DATA_FORK, true); +} + +/* Repair an inode's attr fork. */ +int +xrep_bmap_attr( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_ATTR_FORK, false); +} diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 1935b9ce1885..fe678a0438bc 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -47,7 +47,7 @@ __xchk_btree_process_error( *error = 0; fallthrough; default: - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) trace_xchk_ifork_btree_op_error(sc, cur, level, *error, ret_ip); else @@ -91,7 +91,7 @@ __xchk_btree_set_corrupt( { sc->sm->sm_flags |= errflag; - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) trace_xchk_ifork_btree_error(sc, cur, level, ret_ip); else @@ -168,7 +168,7 @@ xchk_btree_rec( if (xfs_btree_keycmp_lt(cur, &key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, 1); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Is high_key(rec) no larger than the parent high key? */ @@ -215,7 +215,7 @@ xchk_btree_key( if (xfs_btree_keycmp_lt(cur, key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, level); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Is this block's high key no larger than the parent high key? 
*/ @@ -236,22 +236,18 @@ xchk_btree_ptr_ok( int level, union xfs_btree_ptr *ptr) { - bool res; - /* A btree rooted in an inode has no block pointer to the root. */ - if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == bs->cur->bc_nlevels) return true; /* Otherwise, check the pointers. */ - if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) - res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level); - else - res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level); - if (!res) + if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) { xchk_btree_set_corrupt(bs->sc, bs->cur, level); + return false; + } - return res; + return true; } /* Check that a btree block's sibling matches what we expect it. */ @@ -374,18 +370,21 @@ xchk_btree_check_block_owner( { xfs_agnumber_t agno; xfs_agblock_t agbno; - xfs_btnum_t btnum; bool init_sa; int error = 0; if (!bs->cur) return 0; - btnum = bs->cur->bc_btnum; agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr); agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr); - init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS; + /* + * If the btree being examined is not itself a per-AG btree, initialize + * sc->sa so that we can check for the presence of an ownership record + * in the rmap btree for the AG containing the block. + */ + init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG; if (init_sa) { error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, @@ -399,11 +398,11 @@ xchk_btree_check_block_owner( * have to nullify it (to shut down further block owner checks) if * self-xref encounters problems. 
*/ - if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO) + if (!bs->sc->sa.bno_cur && xfs_btree_is_bno(bs->cur->bc_ops)) bs->cur = NULL; xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo); - if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) + if (!bs->sc->sa.rmap_cur && xfs_btree_is_rmap(bs->cur->bc_ops)) bs->cur = NULL; out_free: @@ -429,7 +428,7 @@ xchk_btree_check_owner( * up. */ if (bp == NULL) { - if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE) xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } @@ -442,7 +441,7 @@ xchk_btree_check_owner( * duplicate cursors. Therefore, save the buffer daddr for * later scanning. */ - if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { + if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) { struct check_owner *co; co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); @@ -475,7 +474,7 @@ xchk_btree_check_iroot_minrecs( * existing filesystems, so instead we disable the check for data fork * bmap btrees when there's an attr fork. */ - if (bs->cur->bc_btnum == XFS_BTNUM_BMAP && + if (xfs_btree_is_bmap(bs->cur->bc_ops) && bs->cur->bc_ino.whichfork == XFS_DATA_FORK && xfs_inode_has_attr_fork(bs->sc->ip)) return false; @@ -508,7 +507,7 @@ xchk_btree_check_minrecs( * child block might be less than the standard minrecs, but that's ok * provided that there's only one direct child of the root. */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == cur->bc_nlevels - 2) { struct xfs_btree_block *root_block; struct xfs_buf *root_bp; @@ -562,7 +561,7 @@ xchk_btree_block_check_keys( return; } - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Make sure the high key of this block matches the parent. 
*/ @@ -585,7 +584,6 @@ xchk_btree_get_block( struct xfs_btree_block **pblock, struct xfs_buf **pbp) { - xfs_failaddr_t failed_at; int error; *pblock = NULL; @@ -597,13 +595,7 @@ xchk_btree_get_block( return error; xfs_btree_get_block(bs->cur, level, pbp); - if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) - failed_at = __xfs_btree_check_lblock(bs->cur, *pblock, - level, *pbp); - else - failed_at = __xfs_btree_check_sblock(bs->cur, *pblock, - level, *pbp); - if (failed_at) { + if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) { xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } @@ -664,7 +656,7 @@ xchk_btree_block_keys( if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys)) xchk_btree_set_corrupt(bs->sc, cur, 1); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Get high keys */ @@ -728,7 +720,7 @@ xchk_btree( * error codes for us. */ level = cur->bc_nlevels - 1; - cur->bc_ops->init_ptr_from_cur(cur, &ptr); + xfs_btree_init_ptr_from_cur(cur, &ptr); if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr)) goto out; error = xchk_btree_get_block(bs, level, &ptr, &block, &bp); diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index de24532fe083..47a20cf5205f 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -25,9 +25,12 @@ #include "xfs_trans_priv.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_quota.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -81,6 +84,15 @@ __xchk_process_error( sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), sc->sm, *error); break; + case -ECANCELED: + /* + * ECANCELED here means that the caller set one of the scrub + * outcome flags (corrupt, xfail, xcorrupt) and wants to exit + * quickly. Set error to zero and do not continue. 
+ */ + trace_xchk_op_error(sc, agno, bno, *error, ret_ip); + *error = 0; + break; case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ @@ -88,8 +100,7 @@ __xchk_process_error( *error = 0; fallthrough; default: - trace_xchk_op_error(sc, agno, bno, *error, - ret_ip); + trace_xchk_op_error(sc, agno, bno, *error, ret_ip); break; } return false; @@ -135,6 +146,16 @@ __xchk_fblock_process_error( /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; + case -ECANCELED: + /* + * ECANCELED here means that the caller set one of the scrub + * outcome flags (corrupt, xfail, xcorrupt) and wants to exit + * quickly. Set error to zero and do not continue. + */ + trace_xchk_file_op_error(sc, whichfork, offset, *error, + ret_ip); + *error = 0; + break; case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ @@ -226,6 +247,19 @@ xchk_block_set_corrupt( trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address); } +#ifdef CONFIG_XFS_QUOTA +/* Record a corrupt quota counter. */ +void +xchk_qcheck_set_corrupt( + struct xfs_scrub *sc, + unsigned int dqtype, + xfs_dqid_t id) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_qcheck_error(sc, dqtype, id, __return_address); +} +#endif + /* Record a corruption while cross-referencing. */ void xchk_block_xref_set_corrupt( @@ -426,7 +460,7 @@ xchk_perag_read_headers( * Grab the AG headers for the attached perag structure and wait for pending * intents to drain. */ -static int +int xchk_perag_drain_and_lock( struct xfs_scrub *sc) { @@ -554,46 +588,50 @@ xchk_ag_btcur_init( { struct xfs_mount *mp = sc->mp; - if (sa->agf_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { + if (sa->agf_bp) { /* Set up a bnobt cursor for cross-referencing. 
*/ - sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sa->pag, XFS_BTNUM_BNO); - } + sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur, + XFS_SCRUB_TYPE_BNOBT); - if (sa->agf_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { /* Set up a cntbt cursor for cross-referencing. */ - sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sa->pag, XFS_BTNUM_CNT); - } - - /* Set up a inobt cursor for cross-referencing. */ - if (sa->agi_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { - sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, - XFS_BTNUM_INO); - } - - /* Set up a finobt cursor for cross-referencing. */ - if (sa->agi_bp && xfs_has_finobt(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { - sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, - XFS_BTNUM_FINO); - } - - /* Set up a rmapbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_has_rmapbt(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { - sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur, + XFS_SCRUB_TYPE_CNTBT); + + /* Set up a rmapbt cursor for cross-referencing. */ + if (xfs_has_rmapbt(mp)) { + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, + sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur, + XFS_SCRUB_TYPE_RMAPBT); + } + + /* Set up a refcountbt cursor for cross-referencing. */ + if (xfs_has_reflink(mp)) { + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur, + XFS_SCRUB_TYPE_REFCNTBT); + } } - /* Set up a refcountbt cursor for cross-referencing. 
*/ - if (sa->agf_bp && xfs_has_reflink(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { - sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, - sa->agf_bp, sa->pag); + if (sa->agi_bp) { + /* Set up a inobt cursor for cross-referencing. */ + sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, + sa->agi_bp); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur, + XFS_SCRUB_TYPE_INOBT); + + /* Set up a finobt cursor for cross-referencing. */ + if (xfs_has_finobt(mp)) { + sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp, + sa->agi_bp); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur, + XFS_SCRUB_TYPE_FINOBT); + } } } @@ -604,6 +642,7 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); + xrep_reset_perag_resv(sc); if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; @@ -651,6 +690,13 @@ xchk_trans_cancel( sc->tp = NULL; } +int +xchk_trans_alloc_empty( + struct xfs_scrub *sc) +{ + return xfs_trans_alloc_empty(sc->mp, &sc->tp); +} + /* * Grab an empty transaction so that we can re-grab locked buffers if * one of our btrees turns out to be cyclic. @@ -670,7 +716,7 @@ xchk_trans_alloc( return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, resblks, 0, 0, &sc->tp); - return xfs_trans_alloc_empty(sc->mp, &sc->tp); + return xchk_trans_alloc_empty(sc); } /* Set us up with a transaction and an empty context. */ @@ -733,6 +779,8 @@ xchk_iget( xfs_ino_t inum, struct xfs_inode **ipp) { + ASSERT(sc->tp != NULL); + return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); } @@ -816,6 +864,26 @@ again: return 0; } +#ifdef CONFIG_XFS_QUOTA +/* + * Try to attach dquots to this inode if we think we might want to repair it. + * Callers must not hold any ILOCKs. If the dquots are broken and cannot be + * attached, a quotacheck will be scheduled. 
+ */ +int +xchk_ino_dqattach( + struct xfs_scrub *sc) +{ + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + if (!xchk_could_repair(sc)) + return 0; + + return xrep_ino_dqattach(sc); +} +#endif + /* Install an inode that we opened by handle for scrubbing. */ int xchk_install_handle_inode( @@ -882,8 +950,8 @@ xchk_iget_for_scrubbing( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_inode(sc, ip); if (error == -ENOENT) @@ -976,9 +1044,7 @@ xchk_irele( struct xfs_scrub *sc, struct xfs_inode *ip) { - if (current->journal_info != NULL) { - ASSERT(current->journal_info == sc->tp); - + if (sc->tp) { /* * If we are in a transaction, we /cannot/ drop the inode * ourselves, because the VFS will trigger writeback, which @@ -1027,6 +1093,11 @@ xchk_setup_inode_contents( error = xchk_trans_alloc(sc, resblks); if (error) goto out; + + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode for us */ @@ -1132,6 +1203,7 @@ xchk_metadata_inode_subtype( unsigned int scrub_type) { __u32 smtype = sc->sm->sm_type; + unsigned int sick_mask = sc->sick_mask; int error; sc->sm->sm_type = scrub_type; @@ -1149,6 +1221,7 @@ xchk_metadata_inode_subtype( break; } + sc->sick_mask = sick_mask; sc->sm->sm_type = smtype; return error; } @@ -1228,6 +1301,15 @@ xchk_fsgates_enable( if (scrub_fsgates & XCHK_FSGATES_DRAIN) xfs_drain_wait_enable(); + if (scrub_fsgates & XCHK_FSGATES_QUOTA) + xfs_dqtrx_hook_enable(); + + if (scrub_fsgates & XCHK_FSGATES_DIRENTS) + xfs_dir_hook_enable(); + + if (scrub_fsgates & XCHK_FSGATES_RMAP) + xfs_rmap_hook_enable(); + sc->flags |= scrub_fsgates; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index cabdc0e16838..89f7bbec887e 100644 --- 
a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -32,6 +32,7 @@ xchk_should_terminate( } int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); +int xchk_trans_alloc_empty(struct xfs_scrub *sc); void xchk_trans_cancel(struct xfs_scrub *sc); bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, @@ -54,6 +55,10 @@ void xchk_block_set_corrupt(struct xfs_scrub *sc, void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino); void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork, xfs_fileoff_t offset); +#ifdef CONFIG_XFS_QUOTA +void xchk_qcheck_set_corrupt(struct xfs_scrub *sc, unsigned int dqtype, + xfs_dqid_t id); +#endif void xchk_block_xref_set_corrupt(struct xfs_scrub *sc, struct xfs_buf *bp); @@ -103,19 +108,33 @@ xchk_setup_rtsummary(struct xfs_scrub *sc) } #endif #ifdef CONFIG_XFS_QUOTA +int xchk_ino_dqattach(struct xfs_scrub *sc); int xchk_setup_quota(struct xfs_scrub *sc); +int xchk_setup_quotacheck(struct xfs_scrub *sc); #else static inline int +xchk_ino_dqattach(struct xfs_scrub *sc) +{ + return 0; +} +static inline int xchk_setup_quota(struct xfs_scrub *sc) { return -ENOENT; } +static inline int +xchk_setup_quotacheck(struct xfs_scrub *sc) +{ + return -ENOENT; +} #endif int xchk_setup_fscounters(struct xfs_scrub *sc); +int xchk_setup_nlinks(struct xfs_scrub *sc); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); +int xchk_perag_drain_and_lock(struct xfs_scrub *sc); /* * Grab all AG resources, treating the inability to grab the perag structure as @@ -151,6 +170,11 @@ void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); +/* + * Grab the inode at @inum. The caller must have created a scrub transaction + * so that we can confirm the inumber by walking the inobt and not deadlock on + * a loop in the inobt. 
+ */ int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_buf **agi_bpp, struct xfs_inode **ipp); @@ -158,6 +182,26 @@ void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip); int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip); /* + * Safe version of (untrusted) xchk_iget that uses an empty transaction to + * avoid deadlocking on loops in the inobt. This should only be used in a + * scrub or repair setup routine, and only prior to grabbing a transaction. + */ +static inline int +xchk_iget_safe(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp) +{ + int error; + + ASSERT(sc->tp == NULL); + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + error = xchk_iget(sc, inum, ipp); + xchk_trans_cancel(sc); + return error; +} + +/* * Don't bother cross-referencing if we already found corruption or cross * referencing discrepancies. */ @@ -167,6 +211,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT); } +bool xchk_dir_looks_zapped(struct xfs_inode *dp); + #ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. */ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) @@ -175,8 +221,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT | XFS_SCRUB_OFLAG_PREEN); } + +/* + * "Should we prepare for a repair?" + * + * Return true if the caller permits us to repair metadata and we're not + * setting up for a post-repair evaluation. 
+ */ +static inline bool xchk_could_repair(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc->flags & XREP_ALREADY_FIXED); +} #else # define xchk_needs_repair(sc) (false) +# define xchk_could_repair(sc) (false) #endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); @@ -188,6 +247,16 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); #define xchk_xfile_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ (sc)->mp->m_super->s_id, ##__VA_ARGS__) +#define xchk_xfile_ag_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ + ##__VA_ARGS__) +#define xchk_xfile_ino_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ + ##__VA_ARGS__) /* * Setting up a hook to wait for intents to drain is costly -- we have to take diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c new file mode 100644 index 000000000000..4de3f0f40f48 --- /dev/null +++ b/fs/xfs/scrub/cow_repair.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_icache.h" +#include "xfs_refcount_btree.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/off_bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/reap.h" + +/* + * CoW Fork Mapping Repair + * ======================= + * + * Although CoW staging extents are owned by incore CoW inode forks, on disk + * they are owned by the refcount btree. The ondisk metadata does not record + * any ownership information, which limits what we can do to repair the + * mappings in the CoW fork. At most, we can replace ifork mappings that lack + * an entry in the refcount btree or are described by a reverse mapping record + * whose owner is not OWN_COW. + * + * Replacing extents is also tricky -- we can't touch written CoW fork extents + * since they are undergoing writeback, and delalloc extents do not require + * repair since they only exist incore. Hence the most we can do is find the + * bad parts of unwritten mappings, allocate a replacement set of blocks, and + * replace the incore mapping. We use the regular reaping process to unmap + * or free the discarded blocks, as appropriate. + */ +struct xrep_cow { + struct xfs_scrub *sc; + + /* Bitmap of file offset ranges that need replacing. */ + struct xoff_bitmap bad_fileoffs; + + /* Bitmap of fsblocks that were removed from the CoW fork. 
*/ + struct xfsb_bitmap old_cowfork_fsblocks; + + /* CoW fork mappings used to scan for bad CoW staging extents. */ + struct xfs_bmbt_irec irec; + + /* refcount btree block number of irec.br_startblock */ + unsigned int irec_startbno; + + /* refcount btree block number of the next refcount record we expect */ + unsigned int next_bno; +}; + +/* CoW staging extent. */ +struct xrep_cow_extent { + xfs_fsblock_t fsbno; + xfs_extlen_t len; +}; + +/* + * Mark the part of the file range that corresponds to the given physical + * space. Caller must ensure that the physical range is within xc->irec. + */ +STATIC int +xrep_cow_mark_file_range( + struct xrep_cow *xc, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + xfs_fileoff_t startoff; + + startoff = xc->irec.br_startoff + + (startblock - xc->irec.br_startblock); + + trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff, + blockcount); + + return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount); +} + +/* + * Trim @src to fit within the CoW fork mapping being examined, and put the + * result in @dst. + */ +static inline void +xrep_cow_trim_refcount( + struct xrep_cow *xc, + struct xfs_refcount_irec *dst, + const struct xfs_refcount_irec *src) +{ + unsigned int adj; + + memcpy(dst, src, sizeof(*dst)); + + if (dst->rc_startblock < xc->irec_startbno) { + adj = xc->irec_startbno - dst->rc_startblock; + dst->rc_blockcount -= adj; + dst->rc_startblock += adj; + } + + if (dst->rc_startblock + dst->rc_blockcount > + xc->irec_startbno + xc->irec.br_blockcount) { + adj = (dst->rc_startblock + dst->rc_blockcount) - + (xc->irec_startbno + xc->irec.br_blockcount); + dst->rc_blockcount -= adj; + } +} + +/* Mark any shared CoW staging extents. 
*/ +STATIC int +xrep_cow_mark_shared_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + xfs_fsblock_t fsbno; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_SHARED) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + rrec.rc_startblock); + return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); +} + +/* + * Mark any portion of the CoW fork file offset range where there is not a CoW + * staging extent record in the refcountbt, and keep a record of where we did + * find correct refcountbt records. Staging records are always cleaned out at + * mount time, so any two inodes trying to map the same staging area would have + * already taken the fs down due to refcount btree verifier errors. Hence this + * inode should be the sole creator of the staging extent records ondisk. + */ +STATIC int +xrep_cow_mark_missing_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + int error; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + if (xc->next_bno >= rrec.rc_startblock) + goto next; + + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + xc->next_bno), + rrec.rc_startblock - xc->next_bno); + if (error) + return error; + +next: + xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount; + return 0; +} + +/* + * Mark any area that does not correspond to a CoW staging rmap. These are + * cross-linked areas that must be avoided. 
+ */
+STATIC int
+xrep_cow_mark_missing_staging_rmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_cow			*xc = priv;
+	xfs_fsblock_t			fsbno;
+	xfs_agblock_t			rec_bno;
+	xfs_extlen_t			rec_len;
+	unsigned int			adj;
+
+	/* Space owned by CoW staging is what we expect to find here. */
+	if (rec->rm_owner == XFS_RMAP_OWN_COW)
+		return 0;
+
+	/* Clamp the rmap record to the CoW fork mapping being examined. */
+	rec_bno = rec->rm_startblock;
+	rec_len = rec->rm_blockcount;
+	if (rec_bno < xc->irec_startbno) {
+		adj = xc->irec_startbno - rec_bno;
+		rec_len -= adj;
+		rec_bno += adj;
+	}
+
+	if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) {
+		adj = (rec_bno + rec_len) -
+		      (xc->irec_startbno + xc->irec.br_blockcount);
+		rec_len -= adj;
+	}
+
+	fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
+	return xrep_cow_mark_file_range(xc, fsbno, rec_len);
+}
+
+/*
+ * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
+ * extent and mark the corresponding part of the file range in the bitmap.
+ *
+ * Returns 0 or a negative errno.  Every exit taken after the perag reference
+ * is obtained goes through the cleanup labels at the bottom, so the AG scrub
+ * state and the perag reference are released on failure as well as success.
+ */
+STATIC int
+xrep_cow_find_bad(
+	struct xrep_cow			*xc)
+{
+	struct xfs_refcount_irec	rc_low = { 0 };
+	struct xfs_refcount_irec	rc_high = { 0 };
+	struct xfs_rmap_irec		rm_low = { 0 };
+	struct xfs_rmap_irec		rm_high = { 0 };
+	struct xfs_perag		*pag;
+	struct xfs_scrub		*sc = xc->sc;
+	xfs_agnumber_t			agno;
+	int				error;
+
+	agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock);
+	xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock);
+
+	pag = xfs_perag_get(sc->mp, agno);
+	if (!pag)
+		return -EFSCORRUPTED;
+
+	error = xrep_ag_init(sc, pag, &sc->sa);
+	if (error)
+		goto out_pag;
+
+	/* Mark any CoW fork extents that are shared. */
+	rc_low.rc_startblock = xc->irec_startbno;
+	rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
+	error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
+			xrep_cow_mark_shared_staging, xc);
+	if (error)
+		goto out_sa;
+
+	/* Make sure there are CoW staging extents for the whole mapping. */
+	rc_low.rc_startblock = xc->irec_startbno;
+	rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
+	xc->next_bno = xc->irec_startbno;
+	error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
+			xrep_cow_mark_missing_staging, xc);
+	if (error)
+		goto out_sa;
+
+	/* Anything past the last staging record we saw is also missing. */
+	if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
+		error = xrep_cow_mark_file_range(xc,
+				XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
+					       xc->next_bno),
+				xc->irec_startbno + xc->irec.br_blockcount -
+				xc->next_bno);
+		if (error)
+			goto out_sa;
+	}
+
+	/* Mark any area that has an rmap that isn't a CoW staging extent. */
+	rm_low.rm_startblock = xc->irec_startbno;
+	memset(&rm_high, 0xFF, sizeof(rm_high));
+	rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+	error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high,
+			xrep_cow_mark_missing_staging_rmap, xc);
+	if (error)
+		goto out_sa;
+
+	/*
+	 * If userspace is forcing us to rebuild the CoW fork or someone turned
+	 * on the debugging knob, replace everything in the CoW fork.  A
+	 * failure here falls through to the cleanup below with @error set; we
+	 * must not return directly while holding the AG state and the perag.
+	 */
+	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+	    XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+		error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
+				xc->irec.br_blockcount);
+
+out_sa:
+	xchk_ag_free(sc, &sc->sa);
+out_pag:
+	xfs_perag_put(pag);
+	/* Report the failure instead of unconditionally claiming success. */
+	return error;
+}
+
+/*
+ * Allocate a replacement CoW staging extent of up to the given number of
+ * blocks, and fill out the mapping.
+ */ +STATIC int +xrep_cow_alloc( + struct xfs_scrub *sc, + xfs_extlen_t maxlen, + struct xrep_cow_extent *repl) +{ + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = sc->mp, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .minlen = 1, + .maxlen = maxlen, + .prod = 1, + .resv = XFS_AG_RESV_NONE, + .datatype = XFS_ALLOC_USERDATA, + }; + int error; + + error = xfs_trans_reserve_more(sc->tp, maxlen, 0); + if (error) + return error; + + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino)); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); + + repl->fsbno = args.fsbno; + repl->len = args.len; + return 0; +} + +/* + * Look up the current CoW fork mapping so that we only allocate enough to + * replace a single mapping. If we don't find a mapping that covers the start + * of the file range, or we find a delalloc or written extent, something is + * seriously wrong, since we didn't drop the ILOCK. + */ +static inline int +xrep_cow_find_mapping( + struct xrep_cow *xc, + struct xfs_iext_cursor *icur, + xfs_fileoff_t startoff, + struct xfs_bmbt_irec *got) +{ + struct xfs_inode *ip = xc->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); + + if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got)) + goto bad; + + if (got->br_startoff > startoff) + goto bad; + + if (got->br_blockcount == 0) + goto bad; + + if (isnullstartblock(got->br_startblock)) + goto bad; + + if (xfs_bmap_is_written_extent(got)) + goto bad; + + return 0; +bad: + ASSERT(0); + return -EFSCORRUPTED; +} + +#define REPLACE_LEFT_SIDE (1U << 0) +#define REPLACE_RIGHT_SIDE (1U << 1) + +/* + * Given a CoW fork mapping @got and a replacement mapping @repl, remap the + * beginning of @got with the space described by @rep. 
+ */ +static inline void +xrep_cow_replace_mapping( + struct xfs_inode *ip, + struct xfs_iext_cursor *icur, + const struct xfs_bmbt_irec *got, + const struct xrep_cow_extent *repl) +{ + struct xfs_bmbt_irec new = *got; /* struct copy */ + + ASSERT(repl->len > 0); + ASSERT(!isnullstartblock(got->br_startblock)); + + trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len); + + if (got->br_blockcount == repl->len) { + /* + * The new extent is a complete replacement for the existing + * extent. Update the COW fork record. + */ + new.br_startblock = repl->fsbno; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + return; + } + + /* + * The new extent can replace the beginning of the COW fork record. + * Move the left side of @got upwards, then insert the new record. + */ + new.br_startoff += repl->len; + new.br_startblock += repl->len; + new.br_blockcount -= repl->len; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + + new.br_startoff = got->br_startoff; + new.br_startblock = repl->fsbno; + new.br_blockcount = repl->len; + xfs_iext_insert(ip, icur, &new, BMAP_COWFORK); +} + +/* + * Replace the unwritten CoW staging extent backing the given file range with a + * new space extent that isn't as problematic. + */ +STATIC int +xrep_cow_replace_range( + struct xrep_cow *xc, + xfs_fileoff_t startoff, + xfs_extlen_t *blockcount) +{ + struct xfs_iext_cursor icur; + struct xrep_cow_extent repl; + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = xc->sc; + xfs_fileoff_t nextoff; + xfs_extlen_t alloc_len; + int error; + + /* + * Put the existing CoW fork mapping in @got. If @got ends before + * @rep, truncate @rep so we only replace one extent mapping at a time. + */ + error = xrep_cow_find_mapping(xc, &icur, startoff, &got); + if (error) + return error; + nextoff = min(startoff + *blockcount, + got.br_startoff + got.br_blockcount); + + /* + * Allocate a replacement extent. 
If we don't fill all the blocks, + * shorten the quantity that will be deleted in this step. + */ + alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, + nextoff - startoff); + error = xrep_cow_alloc(sc, alloc_len, &repl); + if (error) + return error; + + /* + * Replace the old mapping with the new one, and commit the metadata + * changes made so far. + */ + xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl); + + xfs_inode_set_cowblocks_tag(sc->ip); + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* Note the old CoW staging extents; we'll reap them all later. */ + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, + repl.len); + if (error) + return error; + + *blockcount = repl.len; + return 0; +} + +/* + * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc + * reservation. + */ +STATIC int +xrep_cow_replace( + uint64_t startoff, + uint64_t blockcount, + void *priv) +{ + struct xrep_cow *xc = priv; + int error = 0; + + while (blockcount > 0) { + xfs_extlen_t len = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + error = xrep_cow_replace_range(xc, startoff, &len); + if (error) + break; + + blockcount -= len; + startoff += len; + } + + return error; +} + +/* + * Repair an inode's CoW fork. The CoW fork is an in-core structure, so + * there's no btree to rebuild. Instead, we replace any mappings that are + * cross-linked or lack ondisk CoW fork records in the refcount btree. + */ +int +xrep_bmap_cow( + struct xfs_scrub *sc) +{ + struct xrep_cow *xc; + struct xfs_iext_cursor icur; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK); + int error; + + if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp)) + return -EOPNOTSUPP; + + if (!ifp) + return 0; + + /* realtime files aren't supported yet */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + /* + * If we're somehow not in extents format, then reinitialize it to + * an empty extent mapping fork and exit. 
+ */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + return 0; + } + + xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS); + if (!xc) + return -ENOMEM; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + xc->sc = sc; + xoff_bitmap_init(&xc->bad_fileoffs); + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); + + for_each_xfs_iext(ifp, &icur, &xc->irec) { + if (xchk_should_terminate(sc, &error)) + goto out_bitmap; + + /* + * delalloc reservations only exist incore, so there is no + * ondisk metadata that we can examine. Hence we leave them + * alone. + */ + if (isnullstartblock(xc->irec.br_startblock)) + continue; + + /* + * COW fork extents are only in the written state if writeback + * is actively writing to disk. We cannot restart the write + * at a different disk address since we've already issued the + * IO, so we leave these alone and hope for the best. + */ + if (xfs_bmap_is_written_extent(&xc->irec)) + continue; + + error = xrep_cow_find_bad(xc); + if (error) + goto out_bitmap; + } + + /* Replace any bad unwritten mappings with fresh reservations. */ + error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc); + if (error) + goto out_bitmap; + + /* + * Reap as many of the old CoW blocks as we can. They are owned ondisk + * by the refcount btree, not the inode, so it is correct to treat them + * like inode metadata. 
+ */ + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, + &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); + xoff_bitmap_destroy(&xc->bad_fileoffs); + kfree(xc); + return error; +} diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 0b491784b759..076a310b8eb0 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -15,10 +15,12 @@ #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/readdir.h" +#include "scrub/health.h" /* Set us up to scrub directories. */ int @@ -91,11 +93,11 @@ xchk_dir_actor( return -ECANCELED; } - if (!strncmp(".", name->name, name->len)) { + if (xfs_dir2_samename(name, &xfs_name_dot)) { /* If this is "." then check that the inum matches the dir. */ if (ino != dp->i_ino) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - } else if (!strncmp("..", name->name, name->len)) { + } else if (xfs_dir2_samename(name, &xfs_name_dotdot)) { /* * If this is ".." in the root inode, check that the inum * matches this dir. @@ -760,6 +762,11 @@ xchk_directory( if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) return -ENOENT; + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + /* Plausible size? */ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -784,7 +791,36 @@ xchk_directory( /* Look up every name in this directory by hash. */ error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); - if (error == -ECANCELED) - error = 0; - return error; + if (error && error != -ECANCELED) + return error; + + /* If the dir is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED); + return 0; +} + +/* + * Decide if this directory has been zapped to satisfy the inode and ifork + * verifiers. 
Checking and repairing should be postponed until the directory + * is fixed. + */ +bool +xchk_dir_looks_zapped( + struct xfs_inode *dp) +{ + /* Repair zapped this dir's data fork a short time ago */ + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return true; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for the bmapbtd + * scrubber to reconstruct the block mappings. Directories always + * contain some content, so this is a clear sign of a zapped directory. + * The state checked by xfs_ifork_zapped is not persisted, so this is + * the secondary strategy if repairs are interrupted by a crash or an + * unmount. + */ + return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS && + dp->i_df.if_nextents == 0; } diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c new file mode 100644 index 000000000000..20c4daedd48d --- /dev/null +++ b/fs/xfs/scrub/dqiterate.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" + +/* Initialize a dquot iteration cursor. 
*/ +void +xchk_dqiter_init( + struct xchk_dqiter *cursor, + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + cursor->sc = sc; + cursor->bmap.br_startoff = NULLFILEOFF; + cursor->dqtype = dqtype & XFS_DQTYPE_REC_MASK; + cursor->quota_ip = xfs_quota_inode(sc->mp, cursor->dqtype); + cursor->id = 0; +} + +/* + * Ensure that the cached data fork mapping for the dqiter cursor is fresh and + * covers the dquot pointed to by the scan cursor. + */ +STATIC int +xchk_dquot_iter_revalidate_bmap( + struct xchk_dqiter *cursor) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + xfs_dqid_t this_id = cursor->id; + int nmaps = 1; + int error; + + fileoff = this_id / qi->qi_dqperchunk; + + /* + * If we have a mapping for cursor->id and it's still fresh, there's + * no need to reread the bmbt. + */ + if (cursor->bmap.br_startoff != NULLFILEOFF && + cursor->if_seq == ifp->if_seq && + cursor->bmap.br_startoff + cursor->bmap.br_blockcount > fileoff) + return 0; + + /* Look up the data fork mapping for the dquot id of interest. */ + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, &nmaps, 0); + if (error) + return error; + if (!nmaps) { + ASSERT(nmaps > 0); + return -EFSCORRUPTED; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_revalidate_bmap(cursor, cursor->id); + return 0; +} + +/* Advance the dqiter cursor to the next non-sparse region of the quota file. 
*/ +STATIC int +xchk_dquot_iter_advance_bmap( + struct xchk_dqiter *cursor, + uint64_t *next_ondisk_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + uint64_t next_id; + int nmaps = 1; + int error; + + /* Find the dquot id for the next non-hole mapping. */ + do { + fileoff = cursor->bmap.br_startoff + cursor->bmap.br_blockcount; + if (fileoff > XFS_DQ_ID_MAX / qi->qi_dqperchunk) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, + &nmaps, 0); + if (error) + return error; + if (!nmaps) { + /* Must have reached the end of the mappings. */ + *next_ondisk_id = -1ULL; + return 0; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + } while (!xfs_bmap_is_real_extent(&cursor->bmap)); + + next_id = cursor->bmap.br_startoff * qi->qi_dqperchunk; + if (next_id > XFS_DQ_ID_MAX) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + /* Propose jumping forward to the dquot in the next allocated block. */ + *next_ondisk_id = next_id; + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_advance_bmap(cursor, *next_ondisk_id); + return 0; +} + +/* + * Find the id of the next highest incore dquot. Normally this will correspond + * exactly with the quota file block mappings, but repair might have erased a + * mapping because it was crosslinked; in that case, we need to re-allocate the + * space so that we can reset q_blkno. 
+ */ +STATIC void +xchk_dquot_iter_advance_incore( + struct xchk_dqiter *cursor, + uint64_t *next_incore_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, cursor->dqtype); + struct xfs_dquot *dq; + unsigned int nr_found; + + *next_incore_id = -1ULL; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)&dq, cursor->id, 1); + if (nr_found) + *next_incore_id = dq->q_id; + mutex_unlock(&qi->qi_tree_lock); + + trace_xchk_dquot_iter_advance_incore(cursor, *next_incore_id); +} + +/* + * Walk all incore dquots of this filesystem. Caller must set *@cursorp to + * zero before the first call, and must not hold the quota file ILOCK. + * Returns 1 and a valid *@dqpp; 0 and *@dqpp == NULL when there are no more + * dquots to iterate; or a negative errno. + */ +int +xchk_dquot_iter( + struct xchk_dqiter *cursor, + struct xfs_dquot **dqpp) +{ + struct xfs_mount *mp = cursor->sc->mp; + struct xfs_dquot *dq = NULL; + uint64_t next_ondisk, next_incore = -1ULL; + unsigned int lock_mode; + int error = 0; + + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + next_ondisk = cursor->id; + + /* Revalidate and/or advance the cursor. */ + lock_mode = xfs_ilock_data_map_shared(cursor->quota_ip); + error = xchk_dquot_iter_revalidate_bmap(cursor); + if (!error && !xfs_bmap_is_real_extent(&cursor->bmap)) + error = xchk_dquot_iter_advance_bmap(cursor, &next_ondisk); + xfs_iunlock(cursor->quota_ip, lock_mode); + if (error) + return error; + + if (next_ondisk > cursor->id) + xchk_dquot_iter_advance_incore(cursor, &next_incore); + + /* Pick the next dquot in the sequence and return it. 
*/ + cursor->id = min(next_ondisk, next_incore); + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + + trace_xchk_dquot_iter(cursor, cursor->id); + + error = xfs_qm_dqget(mp, cursor->id, cursor->dqtype, false, &dq); + if (error) + return error; + + cursor->id = dq->q_id + 1; + *dqpp = dq; + return 1; +} diff --git a/fs/xfs/scrub/fsb_bitmap.h b/fs/xfs/scrub/fsb_bitmap.h new file mode 100644 index 000000000000..40b462c1dd0d --- /dev/null +++ b/fs/xfs/scrub/fsb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FSB_BITMAP_H__ +#define __XFS_SCRUB_FSB_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_fsblock_t */ + +struct xfsb_bitmap { + struct xbitmap64 fsbitmap; +}; + +static inline void xfsb_bitmap_init(struct xfsb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->fsbitmap); +} + +static inline void xfsb_bitmap_destroy(struct xfsb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->fsbitmap); +} + +static inline int xfsb_bitmap_set(struct xfsb_bitmap *bitmap, + xfs_fsblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->fsbitmap, start, len); +} + +static inline int xfsb_bitmap_walk(struct xfsb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->fsbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_FSB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 05be757668bb..d310737c8823 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -16,12 +16,13 @@ #include "xfs_health.h" #include "xfs_btree.h" #include "xfs_ag.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_icache.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" +#include "scrub/fscounters.h" /* * FS Summary Counters @@ -48,17 +49,6 @@ * our tolerance for mismatch between expected and actual 
counter values. */ -struct xchk_fscounters { - struct xfs_scrub *sc; - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - uint64_t frextents; - unsigned long long icount_min; - unsigned long long icount_max; - bool frozen; -}; - /* * Since the expected value computation is lockless but only browses incore * values, the percpu counters should be fairly close to each other. However, @@ -235,14 +225,19 @@ xchk_setup_fscounters( * Pause all writer activity in the filesystem while we're scrubbing to * reduce the likelihood of background perturbations to the counters * throwing off our calculations. + * + * If we're repairing, we need to prevent any other thread from + * changing the global fs summary counters while we're repairing them. + * This requires the fs to be frozen, which will disable background + * reclaim and purge all inactive inodes. */ - if (sc->flags & XCHK_TRY_HARDER) { + if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) { error = xchk_fscounters_freeze(sc); if (error) return error; } - return xfs_trans_alloc_empty(sc->mp, &sc->tp); + return xchk_trans_alloc_empty(sc); } /* @@ -254,7 +249,9 @@ xchk_setup_fscounters( * set the INCOMPLETE flag even when a negative errno is returned. This care * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, * ECANCELED) that are absorbed into a scrub state flag update by - * xchk_*_process_error. + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. */ /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ @@ -482,6 +479,10 @@ xchk_fscount_within_range( if (curr_value == expected) return true; + /* We require exact matches when repair is running. 
*/ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + return false; + min_value = min(old_value, curr_value); max_value = max(old_value, curr_value); diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h new file mode 100644 index 000000000000..461a13d25f4b --- /dev/null +++ b/fs/xfs/scrub/fscounters.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FSCOUNTERS_H__ +#define __XFS_SCRUB_FSCOUNTERS_H__ + +struct xchk_fscounters { + struct xfs_scrub *sc; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + uint64_t frextents; + unsigned long long icount_min; + unsigned long long icount_max; + bool frozen; +}; + +#endif /* __XFS_SCRUB_FSCOUNTERS_H__ */ diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c new file mode 100644 index 000000000000..94cdb852bee4 --- /dev/null +++ b/fs/xfs/scrub/fscounters_repair.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/fscounters.h" + +/* + * FS Summary Counters + * =================== + * + * We correct errors in the filesystem summary counters by setting them to the + * values computed during the obligatory scrub phase. 
However, we must be + * careful not to allow any other thread to change the counters while we're + * computing and setting new values. To achieve this, we freeze the + * filesystem for the whole operation if the REPAIR flag is set. The checking + * function is stricter when we've frozen the fs. + */ + +/* + * Reset the superblock counters. Caller is responsible for freezing the + * filesystem during the calculation and reset phases. + */ +int +xrep_fscounters( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xchk_fscounters *fsc = sc->buf; + + /* + * Reinitialize the in-core counters from what we computed. We froze + * the filesystem, so there shouldn't be anyone else trying to modify + * these counters. + */ + if (!fsc->frozen) { + ASSERT(fsc->frozen); + return -EFSCORRUPTED; + } + + trace_xrep_reset_counters(mp, fsc); + + percpu_counter_set(&mp->m_icount, fsc->icount); + percpu_counter_set(&mp->m_ifree, fsc->ifree); + percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); + percpu_counter_set(&mp->m_frextents, fsc->frextents); + mp->m_sb.sb_frextents = fsc->frextents; + + return 0; +} diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 5e2b09ed6e29..9020a6bef7f1 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -10,12 +10,11 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" #include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/health.h" +#include "scrub/common.h" /* * Scrub and In-Core Filesystem Health Assessments @@ -107,6 +106,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, + [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, + [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, 
XFS_SICK_FS_NLINKS }, }; /* Return the health status mask for this scrub type. */ @@ -118,6 +119,56 @@ xchk_health_mask_for_scrub_type( } /* + * If the scrub state is clean, add @mask to the scrub sick mask to clear + * additional sick flags from the metadata object's sick state. + */ +void +xchk_mark_healthy_if_clean( + struct xfs_scrub *sc, + unsigned int mask) +{ + if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + sc->sick_mask |= mask; +} + +/* + * If we're scrubbing a piece of file metadata for the first time, does it look + * like it has been zapped? Skip the check if we just repaired the metadata + * and are revalidating it. + */ +bool +xchk_file_looks_zapped( + struct xfs_scrub *sc, + unsigned int mask) +{ + ASSERT((mask & ~XFS_SICK_INO_ZAPPED) == 0); + + if (sc->flags & XREP_ALREADY_FIXED) + return false; + + return xfs_inode_has_sickness(sc->ip, mask); +} + +/* + * Scrub gave the filesystem a clean bill of health, so clear all the indirect + * markers of past problems (at least for the fs and ags) so that we can be + * healthy again. + */ +STATIC void +xchk_mark_all_healthy( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT); + xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT); + for_each_perag(mp, agno, pag) + xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT); +} + +/* * Update filesystem health assessments based on what we found and did. * * If the scrubber finds errors, we mark sick whatever's mentioned in @@ -134,6 +185,18 @@ xchk_update_health( struct xfs_perag *pag; bool bad; + /* + * The HEALTHY scrub type is a request from userspace to clear all the + * indirect flags after a clean scan of the entire filesystem. As such + * there's no sick flag defined for it, so we branch here ahead of the + * mask check. 
+ */ + if (sc->sm->sm_type == XFS_SCRUB_TYPE_HEALTHY && + !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xchk_mark_all_healthy(sc->mp); + return; + } + if (!sc->sick_mask) return; @@ -143,7 +206,7 @@ xchk_update_health( case XHG_AG: pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_ag_mark_sick(pag, sc->sick_mask); + xfs_ag_mark_corrupt(pag, sc->sick_mask); else xfs_ag_mark_healthy(pag, sc->sick_mask); xfs_perag_put(pag); @@ -151,20 +214,30 @@ xchk_update_health( case XHG_INO: if (!sc->ip) return; - if (bad) - xfs_inode_mark_sick(sc->ip, sc->sick_mask); - else + if (bad) { + unsigned int mask = sc->sick_mask; + + /* + * If we're coming in for repairs then we don't want + * sickness flags to propagate to the incore health + * status if the inode gets inactivated before we can + * fix it. + */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mask |= XFS_SICK_INO_FORGET; + xfs_inode_mark_corrupt(sc->ip, mask); + } else xfs_inode_mark_healthy(sc->ip, sc->sick_mask); break; case XHG_FS: if (bad) - xfs_fs_mark_sick(sc->mp, sc->sick_mask); + xfs_fs_mark_corrupt(sc->mp, sc->sick_mask); else xfs_fs_mark_healthy(sc->mp, sc->sick_mask); break; case XHG_RT: if (bad) - xfs_rt_mark_sick(sc->mp, sc->sick_mask); + xfs_rt_mark_corrupt(sc->mp, sc->sick_mask); else xfs_rt_mark_healthy(sc->mp, sc->sick_mask); break; @@ -175,13 +248,13 @@ xchk_update_health( } /* Is the given per-AG btree healthy enough for scanning? */ -bool -xchk_ag_btree_healthy_enough( +void +xchk_ag_btree_del_cursor_if_sick( struct xfs_scrub *sc, - struct xfs_perag *pag, - xfs_btnum_t btnum) + struct xfs_btree_cur **curp, + unsigned int sm_type) { - unsigned int mask = 0; + unsigned int mask = (*curp)->bc_ops->sick_mask; /* * We always want the cursor if it's the same type as whatever we're @@ -190,41 +263,8 @@ xchk_ag_btree_healthy_enough( * Otherwise, we're only interested in the btree for cross-referencing. * If we know the btree is bad then don't bother, just set XFAIL. 
*/ - switch (btnum) { - case XFS_BTNUM_BNO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) - return true; - mask = XFS_SICK_AG_BNOBT; - break; - case XFS_BTNUM_CNT: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT) - return true; - mask = XFS_SICK_AG_CNTBT; - break; - case XFS_BTNUM_INO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) - return true; - mask = XFS_SICK_AG_INOBT; - break; - case XFS_BTNUM_FINO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) - return true; - mask = XFS_SICK_AG_FINOBT; - break; - case XFS_BTNUM_RMAP: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT) - return true; - mask = XFS_SICK_AG_RMAPBT; - break; - case XFS_BTNUM_REFC: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT) - return true; - mask = XFS_SICK_AG_REFCNTBT; - break; - default: - ASSERT(0); - return true; - } + if (sc->sm->sm_type == sm_type) + return; /* * If we just repaired some AG metadata, sc->sick_mask will reflect all @@ -236,10 +276,42 @@ xchk_ag_btree_healthy_enough( type_to_health_flag[sc->sm->sm_type].group == XHG_AG) mask &= ~sc->sick_mask; - if (xfs_ag_has_sickness(pag, mask)) { + if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; - return false; + xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR); + *curp = NULL; + } +} + +/* + * Quick scan to double-check that there isn't any evidence of lingering + * primary health problems. If we're still clear, then the health update will + * take care of clearing the indirect evidence. 
+ */ +int +xchk_health_record( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + unsigned int sick; + unsigned int checked; + + xfs_fs_measure_sickness(mp, &sick, &checked); + if (sick & XFS_SICK_FS_PRIMARY) + xchk_set_corrupt(sc); + + xfs_rt_measure_sickness(mp, &sick, &checked); + if (sick & XFS_SICK_RT_PRIMARY) + xchk_set_corrupt(sc); + + for_each_perag(mp, agno, pag) { + xfs_ag_measure_sickness(pag, &sick, &checked); + if (sick & XFS_SICK_AG_PRIMARY) + xchk_set_corrupt(sc); } - return true; + return 0; } diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index 66a273f8585b..63fc426eb5ae 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -8,7 +8,10 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); void xchk_update_health(struct xfs_scrub *sc); -bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, - xfs_btnum_t btnum); +void xchk_ag_btree_del_cursor_if_sick(struct xfs_scrub *sc, + struct xfs_btree_cur **curp, unsigned int sm_type); +void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask); +bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask); +int xchk_health_record(struct xfs_scrub *sc); #endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index fb7bbf47ae5d..750d7b0cd25a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -76,7 +76,7 @@ xchk_inobt_xref_finobt( int has_record; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_FINO); + ASSERT(xfs_btree_is_fino(cur->bc_ops)); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); if (error) @@ -179,7 +179,7 @@ xchk_finobt_xref_inobt( int has_record; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + ASSERT(xfs_btree_is_ino(cur->bc_ops)); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); if (error) @@ -514,7 +514,7 @@ xchk_iallocbt_rec_alignment( * Otherwise, we expect that 
the finobt record is aligned to the * cluster alignment as told by the superblock. */ - if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + if (xfs_btree_is_fino(bs->cur->bc_ops)) { unsigned int imask; imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, @@ -585,7 +585,7 @@ xchk_iallocbt_rec( uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { + if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -649,8 +649,7 @@ out: */ STATIC void xchk_iallocbt_xref_rmap_btreeblks( - struct xfs_scrub *sc, - int which) + struct xfs_scrub *sc) { xfs_filblks_t blocks; xfs_extlen_t inobt_blocks = 0; @@ -688,7 +687,6 @@ xchk_iallocbt_xref_rmap_btreeblks( STATIC void xchk_iallocbt_xref_rmap_inodes( struct xfs_scrub *sc, - int which, unsigned long long inodes) { xfs_filblks_t blocks; @@ -708,11 +706,10 @@ xchk_iallocbt_xref_rmap_inodes( xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } -/* Scrub the inode btrees for some AG. */ -STATIC int +/* Scrub one of the inode btrees for some AG. */ +int xchk_iallocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { @@ -722,13 +719,24 @@ xchk_iallocbt( }; int error; - cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_INOBT: + cur = sc->sa.ino_cur; + break; + case XFS_SCRUB_TYPE_FINOBT: + cur = sc->sa.fino_cur; + break; + default: + ASSERT(0); + return -EIO; + } + error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT, &iabt); if (error) return error; - xchk_iallocbt_xref_rmap_btreeblks(sc, which); + xchk_iallocbt_xref_rmap_btreeblks(sc); /* * If we're scrubbing the inode btree, inode_blocks is the number of @@ -737,26 +745,11 @@ xchk_iallocbt( * knows about. We can't do this for the finobt since it only points * to inode chunks with free inodes. 
*/ - if (which == XFS_BTNUM_INO) - xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes); - + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + xchk_iallocbt_xref_rmap_inodes(sc, iabt.inodes); return error; } -int -xchk_inobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_INO); -} - -int -xchk_finobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_FINO); -} - /* See if an inode btree has (or doesn't have) an inode chunk record. */ static inline void xchk_xref_inode_check( diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c new file mode 100644 index 000000000000..a00ec7ae1792 --- /dev/null +++ b/fs/xfs/scrub/ialloc_repair.c @@ -0,0 +1,884 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Btree Repair + * ================== + * + * A quick refresher of inode btrees on a v5 filesystem: + * + * - Inode records are read into memory in units of 'inode clusters'. 
However + * many inodes fit in a cluster buffer is the smallest number of inodes that + * can be allocated or freed. Clusters are never smaller than one fs block + * though they can span multiple blocks. The size (in fs blocks) is + * computed with xfs_icluster_size_fsb(). The fs block alignment of a + * cluster is computed with xfs_ialloc_cluster_alignment(). + * + * - Each inode btree record can describe a single 'inode chunk'. The chunk + * size is defined to be 64 inodes. If sparse inodes are enabled, every + * inobt record must be aligned to the chunk size; if not, every record must + * be aligned to the start of a cluster. It is possible to construct an XFS + * geometry where one inobt record maps to multiple inode clusters; it is + * also possible to construct a geometry where multiple inobt records map to + * different parts of one inode cluster. + * + * - If sparse inodes are not enabled, the smallest unit of allocation for + * inode records is enough to contain one inode chunk's worth of inodes. + * + * - If sparse inodes are enabled, the holemask field will be active. Each + * bit of the holemask represents 4 potential inodes; if set, the + * corresponding space does *not* contain inodes and must be left alone. + * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation + * of inode records is one inode cluster. + * + * So what's the rebuild algorithm? + * + * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT + * records. The OWN_INOBT records are the old inode btree blocks and will be + * cleared out after we've rebuilt the tree. Each possible inode cluster + * within an OWN_INODES record will be read in; for each possible inobt record + * associated with that cluster, compute the freemask calculated from the + * i_mode data in the inode chunk. For sparse inodes the holemask will be + * calculated by creating the properly aligned inobt record and punching out + * any chunk that's missing. 
Inode allocations and frees grab the AGI first, + * so repair protects itself from concurrent access by locking the AGI. + * + * Once we've reconstructed all the inode records, we can create new inode + * btree roots and reload the btrees. We rebuild both inode trees at the same + * time because they have the same rmap owner and it would be more complex to + * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT + * blocks it owns. We have all the data we need to build both, so dump + * everything and start over. + * + * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once. + */ + +struct xrep_ibt { + /* Record under construction. */ + struct xfs_inobt_rec_incore rie; + + /* new inobt information */ + struct xrep_newbt new_inobt; + + /* new finobt information */ + struct xrep_newbt new_finobt; + + /* Old inode btree blocks we found in the rmap. */ + struct xagb_bitmap old_iallocbt_blocks; + + /* Reconstructed inode records. */ + struct xfarray *inode_records; + + struct xfs_scrub *sc; + + /* Number of inodes assigned disk space. */ + unsigned int icount; + + /* Number of inodes in use. */ + unsigned int iused; + + /* Number of finobt records needed. */ + unsigned int finobt_recs; + + /* get_records()'s position in the inode record array. */ + xfarray_idx_t array_cur; +}; + +/* + * Is this inode in use? If the inode is in memory we can tell from i_mode, + * otherwise we have to check di_mode in the on-disk buffer. We only care + * that the high (i.e. non-permission) bits of _mode are zero. This should be + * safe because repair keeps all AG headers locked until the end, and process + * trying to perform an inode allocation/free must lock the AGI. + * + * @cluster_ag_base is the inode offset of the cluster within the AG. + * @cluster_bp is the cluster buffer. + * @cluster_index is the inode offset within the inode cluster. 
+ */ +STATIC int +xrep_ibt_check_ifree( + struct xrep_ibt *ri, + xfs_agino_t cluster_ag_base, + struct xfs_buf *cluster_bp, + unsigned int cluster_index, + bool *inuse) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dinode *dip; + xfs_ino_t fsino; + xfs_agino_t agino; + xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno; + unsigned int cluster_buf_base; + unsigned int offset; + int error; + + agino = cluster_ag_base + cluster_index; + fsino = XFS_AGINO_TO_INO(mp, agno, agino); + + /* Inode uncached or half assembled, read disk buffer */ + cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base); + offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize; + if (offset >= BBTOB(cluster_bp->b_length)) + return -EFSCORRUPTED; + dip = xfs_buf_offset(cluster_bp, offset); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) + return -EFSCORRUPTED; + + if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino) + return -EFSCORRUPTED; + + /* Will the in-core inode tell us if it's in use? */ + error = xchk_inode_is_allocated(sc, agino, inuse); + if (!error) + return 0; + + *inuse = dip->di_mode != 0; + return 0; +} + +/* Stash the accumulated inobt record for rebuilding. 
*/ +STATIC int +xrep_ibt_stash( + struct xrep_ibt *ri) +{ + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie); + if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL) + return -EFSCORRUPTED; + + if (ri->rie.ir_freecount > 0) + ri->finobt_recs++; + + trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie); + + error = xfarray_append(ri->inode_records, &ri->rie); + if (error) + return error; + + ri->rie.ir_startino = NULLAGINO; + return 0; +} + +/* + * Given an extent of inodes and an inode cluster buffer, calculate the + * location of the corresponding inobt record (creating it if necessary), + * then update the parts of the holemask and freemask of that record that + * correspond to the inode extent we were given. + * + * @cluster_ir_startino is the AG inode number of an inobt record that we're + * proposing to create for this inode cluster. If sparse inodes are enabled, + * we must round down to a chunk boundary to find the actual sparse record. + * @cluster_bp is the buffer of the inode cluster. + * @nr_inodes is the number of inodes to check from the cluster. + */ +STATIC int +xrep_ibt_cluster_record( + struct xrep_ibt *ri, + xfs_agino_t cluster_ir_startino, + struct xfs_buf *cluster_bp, + unsigned int nr_inodes) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + xfs_agino_t ir_startino; + unsigned int cluster_base; + unsigned int cluster_index; + int error = 0; + + ir_startino = cluster_ir_startino; + if (xfs_has_sparseinodes(mp)) + ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK); + cluster_base = cluster_ir_startino - ir_startino; + + /* + * If the accumulated inobt record doesn't map this cluster, add it to + * the list and reset it. 
+ */ + if (ri->rie.ir_startino != NULLAGINO && + ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) { + error = xrep_ibt_stash(ri); + if (error) + return error; + } + + if (ri->rie.ir_startino == NULLAGINO) { + ri->rie.ir_startino = ir_startino; + ri->rie.ir_free = XFS_INOBT_ALL_FREE; + ri->rie.ir_holemask = 0xFFFF; + ri->rie.ir_count = 0; + } + + /* Record the whole cluster. */ + ri->icount += nr_inodes; + ri->rie.ir_count += nr_inodes; + ri->rie.ir_holemask &= ~xfs_inobt_maskn( + cluster_base / XFS_INODES_PER_HOLEMASK_BIT, + nr_inodes / XFS_INODES_PER_HOLEMASK_BIT); + + /* Which inodes within this cluster are free? */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + bool inuse = false; + + error = xrep_ibt_check_ifree(ri, cluster_ir_startino, + cluster_bp, cluster_index, &inuse); + if (error) + return error; + if (!inuse) + continue; + ri->iused++; + ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base + + cluster_index); + } + return 0; +} + +/* + * For each inode cluster covering the physical extent recorded by the rmapbt, + * we must calculate the properly aligned startino of that cluster, then + * iterate each cluster to fill in used and filled masks appropriately. We + * then use the (startino, used, filled) information to construct the + * appropriate inode records. + */ +STATIC int +xrep_ibt_process_cluster( + struct xrep_ibt *ri, + xfs_agblock_t cluster_bno) +{ + struct xfs_imap imap; + struct xfs_buf *cluster_bp; + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t cluster_ag_base; + xfs_agino_t irec_index; + unsigned int nr_inodes; + int error; + + nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster, + XFS_INODES_PER_CHUNK); + + /* + * Grab the inode cluster buffer. This is safe to do with a broken + * inobt because imap_to_bp directly maps the buffer without touching + * either inode btree. 
+ */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno); + imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + imap.im_boffset = 0; + error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); + if (error) + return error; + + /* + * Record the contents of each possible inobt record mapping this + * cluster. + */ + cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno); + for (irec_index = 0; + irec_index < igeo->inodes_per_cluster; + irec_index += XFS_INODES_PER_CHUNK) { + error = xrep_ibt_cluster_record(ri, + cluster_ag_base + irec_index, cluster_bp, + nr_inodes); + if (error) + break; + + } + + xfs_trans_brelse(sc->tp, cluster_bp); + return error; +} + +/* Check for any obvious conflicts in the inode chunk extent. */ +STATIC int +xrep_ibt_check_inode_ext( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t agino; + enum xbtree_recpacking outcome; + int error; + + /* Inode records must be within the AG. */ + if (!xfs_verify_agbext(sc->sa.pag, agbno, len)) + return -EFSCORRUPTED; + + /* The entire record must align to the inode cluster size. */ + if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) || + !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster)) + return -EFSCORRUPTED; + + /* + * The entire record must also adhere to the inode cluster alignment + * size if sparse inodes are not enabled. + */ + if (!xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, igeo->cluster_align) || + !IS_ALIGNED(agbno + len, igeo->cluster_align))) + return -EFSCORRUPTED; + + /* + * On a sparse inode fs, this cluster could be part of a sparse chunk. + * Sparse clusters must be aligned to sparse chunk alignment. 
+ */ + if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align && + (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) || + !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align))) + return -EFSCORRUPTED; + + /* Make sure the entire range of blocks are valid AG inodes. */ + agino = XFS_AGB_TO_AGINO(mp, agbno); + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1; + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Found a fragment of the old inode btrees; dispose of them later. */ +STATIC int +xrep_ibt_record_old_btree_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Record extents that belong to inode cluster blocks. */ +STATIC int +xrep_ibt_record_inode_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + struct xfs_mount *mp = ri->sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t cluster_base; + int error; + + error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock, + rec->rm_blockcount); + if (error) + return error; + + trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno, + rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, + rec->rm_offset, rec->rm_flags); + + /* + * Record the free/hole masks for each inode cluster that could be + * mapped by this rmap record. 
+ */ + for (cluster_base = 0; + cluster_base < rec->rm_blockcount; + cluster_base += igeo->blocks_per_cluster) { + error = xrep_ibt_process_cluster(ri, + rec->rm_startblock + cluster_base); + if (error) + return error; + } + + return 0; +} + +STATIC int +xrep_ibt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_ibt *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + switch (rec->rm_owner) { + case XFS_RMAP_OWN_INOBT: + return xrep_ibt_record_old_btree_blocks(ri, rec); + case XFS_RMAP_OWN_INODES: + return xrep_ibt_record_inode_blocks(ri, rec); + } + return 0; +} + +/* + * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode + * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct + * the inode btrees. The caller must clean up the lists if anything goes + * wrong. + */ +STATIC int +xrep_ibt_find_inodes( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + ri->rie.ir_startino = NULLAGINO; + + /* Collect all reverse mappings for inode blocks. */ + xrep_ag_btcur_init(sc, &sc->sa); + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri); + xchk_ag_btcur_free(&sc->sa); + if (error) + return error; + + /* If we have a record ready to go, add it to the array. */ + if (ri->rie.ir_startino != NULLAGINO) + return xrep_ibt_stash(ri); + + return 0; +} + +/* Update the AGI counters. */ +STATIC int +xrep_ibt_reset_counters( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + unsigned int freecount = ri->icount - ri->iused; + + /* Trigger inode count recalculation */ + xfs_force_summary_recalc(sc->mp); + + /* + * The AGI header contains extra information related to the inode + * btrees, so we must update those fields here. 
+ */ + agi->agi_count = cpu_to_be32(ri->icount); + agi->agi_freecount = cpu_to_be32(freecount); + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagi(sc); +} + +/* Retrieve finobt data for bulk load. */ +STATIC int +xrep_fibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(ri->inode_records, + ri->array_cur++, irec); + } while (error == 0 && xfs_inobt_rec_freecount(irec) == 0); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Retrieve inobt data for bulk load. */ +STATIC int +xrep_ibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(ri->inode_records, ri->array_cur++, irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new inobt blocks to the bulk loader. */ +STATIC int +xrep_ibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr); +} + +/* Feed one of the new finobt blocks to the bulk loader. 
*/ +STATIC int +xrep_fibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr); +} + +/* Make sure the records do not overlap in inumber address space. */ +STATIC int +xrep_ibt_check_overlap( + struct xrep_ibt *ri) +{ + struct xfs_inobt_rec_incore irec; + xfarray_idx_t cur; + xfs_agino_t next_agino = 0; + int error = 0; + + foreach_xfarray_idx(ri->inode_records, cur) { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + error = xfarray_load(ri->inode_records, cur, &irec); + if (error) + return error; + + if (irec.ir_startino < next_agino) + return -EFSCORRUPTED; + + next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK; + } + + return error; +} + +/* Build new inode btrees and dispose of the old one. */ +STATIC int +xrep_ibt_build_new_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_btree_cur *ino_cur; + struct xfs_btree_cur *fino_cur = NULL; + xfs_fsblock_t fsbno; + bool need_finobt; + int error; + + need_finobt = xfs_has_finobt(sc->mp); + + /* + * Create new btrees for staging all the inobt records we collected + * earlier. The records were collected in order of increasing agino, + * so we do not have to sort them. Ensure there are no overlapping + * records. + */ + error = xrep_ibt_check_overlap(ri); + if (error) + return error; + + /* + * The new inode btrees will not be rooted in the AGI until we've + * successfully rebuilt the tree. + * + * Start by setting up the inobt staging cursor. 
+ */ + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_IBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno, + XFS_AG_RESV_NONE); + ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; + ri->new_inobt.bload.get_records = xrep_ibt_get_records; + + ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL); + xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake); + error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload, + xfarray_length(ri->inode_records)); + if (error) + goto err_inocur; + + /* Set up finobt staging cursor. */ + if (need_finobt) { + enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA; + + if (sc->mp->m_finobt_nores) + resv = XFS_AG_RESV_NONE; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_FIBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT, + fsbno, resv); + ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; + ri->new_finobt.bload.get_records = xrep_fibt_get_records; + + fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL); + xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake); + error = xfs_btree_bload_compute_geometry(fino_cur, + &ri->new_finobt.bload, ri->finobt_recs); + if (error) + goto err_finocur; + } + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_finocur; + + /* Reserve all the space we need to build the new btrees. */ + error = xrep_newbt_alloc_blocks(&ri->new_inobt, + ri->new_inobt.bload.nr_blocks); + if (error) + goto err_finocur; + + if (need_finobt) { + error = xrep_newbt_alloc_blocks(&ri->new_finobt, + ri->new_finobt.bload.nr_blocks); + if (error) + goto err_finocur; + } + + /* Add all inobt records. */ + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri); + if (error) + goto err_finocur; + + /* Add all finobt records. 
*/ + if (need_finobt) { + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri); + if (error) + goto err_finocur; + } + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(ino_cur, 0); + + if (fino_cur) { + xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(fino_cur, 0); + } + + /* Reset the AGI counters now that we've changed the inode roots. */ + error = xrep_ibt_reset_counters(ri); + if (error) + goto err_finobt; + + /* Free unused blocks and bitmap. */ + if (need_finobt) { + error = xrep_newbt_commit(&ri->new_finobt); + if (error) + goto err_inobt; + } + error = xrep_newbt_commit(&ri->new_inobt); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_finocur: + if (need_finobt) + xfs_btree_del_cursor(fino_cur, error); +err_inocur: + xfs_btree_del_cursor(ino_cur, error); +err_finobt: + if (need_finobt) + xrep_newbt_cancel(&ri->new_finobt); +err_inobt: + xrep_newbt_cancel(&ri->new_inobt); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_ibt_remove_old_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + /* + * Free the old inode btree blocks if they're not in use. It's ok to + * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG + * reservation because we reset the reservation before releasing the + * AGI and AGF header buffer locks. + */ + error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks, + &XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE); + if (error) + return error; + + /* + * If the finobt is enabled and has a per-AG reservation, make sure we + * reinitialize the per-AG reservations. 
+ */ + if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores) + sc->flags |= XREP_RESET_PERAG_RESV; + + return 0; +} + +/* Repair both inode btrees. */ +int +xrep_iallocbt( + struct xfs_scrub *sc) +{ + struct xrep_ibt *ri; + struct xfs_mount *mp = sc->mp; + char *descr; + xfs_agino_t first_agino, last_agino; + int error = 0; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS); + if (!ri) + return -ENOMEM; + ri->sc = sc; + + /* We rebuild both inode btrees. */ + sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT; + + /* Set up enough storage to handle an AG with nothing but inodes. */ + xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino); + last_agino /= XFS_INODES_PER_CHUNK; + descr = xchk_xfile_ag_descr(sc, "inode index records"); + error = xfarray_create(descr, last_agino, + sizeof(struct xfs_inobt_rec_incore), + &ri->inode_records); + kfree(descr); + if (error) + goto out_ri; + + /* Collect the inode data and find the old btree blocks. */ + xagb_bitmap_init(&ri->old_iallocbt_blocks); + error = xrep_ibt_find_inodes(ri); + if (error) + goto out_bitmap; + + /* Rebuild the inode indexes. */ + error = xrep_ibt_build_new_trees(ri); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_ibt_remove_old_trees(ri); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ri->old_iallocbt_blocks); + xfarray_destroy(ri->inode_records); +out_ri: + kfree(ri); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_iallocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. 
+ */ + sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT; + error = xchk_iallocbt(sc); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT; + error = xchk_iallocbt(sc); + } + +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 59d7912fb75f..6e2fe2d6250b 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -20,10 +20,12 @@ #include "xfs_reflink.h" #include "xfs_rmap.h" #include "xfs_bmap_util.h" +#include "xfs_rtbitmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* Prepare the attached inode for scrubbing. */ static inline int @@ -38,6 +40,10 @@ xchk_prepare_iscrub( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -94,8 +100,8 @@ xchk_setup_inode( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_iscrub(sc, ip); if (error == -ENOENT) @@ -180,8 +186,11 @@ xchk_setup_inode( * saying the inode is allocated and the icache being unable to load * the inode until we can flag the corruption in xchk_inode. The * scrub function has to note the corruption, since we're not really - * supposed to do that from the setup function. + * supposed to do that from the setup function. Save the mapping to + * make repairs to the ondisk inode buffer. 
*/ + if (xchk_could_repair(sc)) + xrep_setup_inode(sc, &imap); return 0; out_cancel: @@ -225,7 +234,7 @@ xchk_inode_extsize( */ if ((flags & XFS_DIFLAG_RTINHERIT) && (flags & XFS_DIFLAG_EXTSZINHERIT) && - value % sc->mp->m_sb.sb_rextsize > 0) + xfs_extlen_to_rtxmod(sc->mp, value) > 0) xchk_ino_set_warning(sc, ino); } @@ -337,6 +346,10 @@ xchk_inode_flags2( if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) goto bad; + /* no large extent counts without the filesystem feature */ + if ((flags2 & XFS_DIFLAG2_NREXT64) && !xfs_has_large_extent_counts(mp)) + goto bad; + return; bad: xchk_ino_set_corrupt(sc, ino); @@ -547,7 +560,7 @@ xchk_dinode( } /* di_forkoff */ - if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + if (XFS_DFORK_BOFF(dip) >= mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c new file mode 100644 index 000000000000..eab380e95ef4 --- /dev/null +++ b/fs/xfs/scrub/inode_repair.c @@ -0,0 +1,1750 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_ialloc.h" +#include "xfs_da_format.h" +#include "xfs_reflink.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_attr_leaf.h" +#include "xfs_log_priv.h" +#include "xfs_health.h" +#include "xfs_symlink_remote.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/readdir.h" + +/* + * Inode Record Repair + * =================== + * + * Roughly speaking, inode problems can be classified based on whether or not + * they trip the dinode verifiers. If those trip, then we won't be able to + * xfs_iget ourselves the inode. + * + * Therefore, the xrep_dinode_* functions fix anything that will cause the + * inode buffer verifier or the dinode verifier to fail. The xrep_inode_* + * functions fix things on live incore inodes. The inode repair functions make + * decisions with security and usability implications when reviving a file: + * + * - Files with zero di_mode or a garbage di_mode are converted to a regular + * file that only root can read. This file may not actually contain user + * data, if the file was not previously a regular file. Setuid and setgid + * bits are cleared.
+ *
+ * - Zero-size directories can be truncated to look empty.  It is necessary to
+ *   run the bmapbtd and directory repair functions to fully rebuild the
+ *   directory.
+ *
+ * - Zero-size symbolic link targets can be truncated to '?'.  It is necessary
+ *   to run the bmapbtd and symlink repair functions to salvage the symlink.
+ *
+ * - Invalid extent size hints will be removed.
+ *
+ * - Quotacheck will be scheduled if we repaired an inode that was so badly
+ *   damaged that the ondisk inode had to be rebuilt.
+ *
+ * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
+ *   Setuid and setgid bits are cleared.
+ *
+ * - Data and attr forks are reset to extents format with zero extents if the
+ *   fork data is inconsistent.  It is necessary to run the bmapbtd or bmapbta
+ *   repair functions to recover the space mapping.
+ *
+ * - ACLs will not be recovered if the attr fork is zapped or the extended
+ *   attribute structure itself requires salvaging.
+ *
+ * - If the attr fork is zapped, the user and group ids are reset to root and
+ *   the setuid and setgid bits are removed.
+ */
+
+/*
+ * Everything needed to repair the ondisk inode when the incore inode cannot
+ * be loaded with iget.  This buffer is only allocated when a repair to the
+ * ondisk inode cluster buffer is going to be performed.
+ */
+struct xrep_inode {
+	/* Inode mapping that we saved from the initial lookup attempt. */
+	struct xfs_imap		imap;
+
+	/* Scrub context that owns this repair. */
+	struct xfs_scrub	*sc;
+
+	/* Blocks in use on the data device by data extents or bmbt blocks. */
+	xfs_rfsblock_t		data_blocks;
+
+	/* Blocks in use on the rt device. */
+	xfs_rfsblock_t		rt_blocks;
+
+	/* Blocks in use by the attr fork. */
+	xfs_rfsblock_t		attr_blocks;
+
+	/* Number of data device extents for the data fork. */
+	xfs_extnum_t		data_extents;
+
+	/*
+	 * Number of realtime device extents for the data fork.  If
+	 * data_extents and rt_extents indicate that the data fork has extents
+	 * on both devices, we'll just back away slowly.
+	 */
+	xfs_extnum_t		rt_extents;
+
+	/* Number of (data device) extents for the attr fork. */
+	xfs_aextnum_t		attr_extents;
+
+	/* Sick state to set after zapping parts of the inode. */
+	unsigned int		ino_sick_mask;
+
+	/* Must we remove all access from this file? */
+	bool			zap_acls;
+
+	/* Inode scanner to see if we can find the ftype from dirents */
+	struct xchk_iscan	ftype_iscan;
+
+	/* File type reported by the dirents found so far. */
+	uint8_t			alleged_ftype;
+};
+
+/*
+ * Setup function for inode repair.  Allocates the repair context, hangs it
+ * off sc->buf, and copies in @imap, the ondisk inode mapping, so that the
+ * ondisk inode cluster buffer can be corrected if that is what it takes to
+ * make iget work.  Returns 0 or -ENOMEM.
+ */
+int
+xrep_setup_inode(
+	struct xfs_scrub	*sc,
+	const struct xfs_imap	*imap)
+{
+	struct xrep_inode	*ri;
+
+	ri = kzalloc(sizeof(*ri), XCHK_GFP_FLAGS);
+	sc->buf = ri;
+	if (!ri)
+		return -ENOMEM;
+
+	ri->imap = *imap;
+	ri->sc = sc;
+	return 0;
+}
+
+/*
+ * Make sure this ondisk inode can pass the inode buffer verifier.  This is
+ * not the same as the dinode verifier.
+ */
+STATIC void
+xrep_dinode_buf_core(
+	struct xfs_scrub	*sc,
+	struct xfs_buf		*bp,
+	unsigned int		ioffset)
+{
+	struct xfs_dinode	*dip = xfs_buf_offset(bp, ioffset);
+	struct xfs_trans	*tp = sc->tp;
+	struct xfs_mount	*mp = sc->mp;
+	xfs_agino_t		next_unlinked;
+	bool			unlinked_ok;
+	bool			magic_ok;
+	bool			crc_ok;
+
+	/* Evaluate the three properties that are checked and fixed here. */
+	next_unlinked = be32_to_cpu(dip->di_next_unlinked);
+	unlinked_ok = xfs_verify_agino_or_null(bp->b_pag, next_unlinked);
+
+	magic_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+		   xfs_dinode_good_version(mp, dip->di_version);
+
+	crc_ok = xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+			XFS_DINODE_CRC_OFF);
+
+	/* Nothing to fix, so leave the buffer untouched and unlogged. */
+	if (magic_ok && unlinked_ok && crc_ok)
+		return;
+
+	if (!magic_ok) {
+		dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+		dip->di_version = 3;
+	}
+	if (!unlinked_ok)
+		dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+
+	/* Anything we changed invalidates the checksum, so recompute it. */
+	xfs_dinode_calc_crc(mp, dip);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+	xfs_trans_log_buf(tp, bp, ioffset,
+			ioffset + sizeof(struct xfs_dinode) - 1);
+}
+
+/*
+ * Make sure this inode cluster buffer can pass the inode buffer verifier by
+ * checking every inode slot in the buffer.
+ */
+STATIC void
+xrep_dinode_buf(
+	struct xfs_scrub	*sc,
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = sc->mp;
+	int			nr_inodes;
+	int			slot;
+
+	nr_inodes = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+
+	/* Each slot's byte offset is its index scaled by the inode size. */
+	for (slot = 0; slot < nr_inodes; slot++)
+		xrep_dinode_buf_core(sc, bp, slot << mp->m_sb.sb_inodelog);
+}
+
+/* Reinitialize things that never change in an inode.
*/
+STATIC void
+xrep_dinode_header(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_mount	*mp = sc->mp;
+
+	trace_xrep_dinode_header(sc, dip);
+
+	dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+
+	/* A version we recognize is kept; anything else becomes v3. */
+	if (!xfs_dinode_good_version(mp, dip->di_version))
+		dip->di_version = 3;
+
+	/* Identity fields come from the scrub request and the superblock. */
+	dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
+	uuid_copy(&dip->di_uuid, &mp->m_sb.sb_meta_uuid);
+	dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
+}
+
+/*
+ * Directory walk callback.  If this directory entry points to the scrub
+ * target inode, then the directory being scanned is a parent of the target,
+ * and the entry's file type is remembered in ri->alleged_ftype.  Returns
+ * -EFSCORRUPTED if the entry's name is garbage or if two entries disagree
+ * about the file type.
+ */
+STATIC int
+xrep_dinode_findmode_dirent(
+	struct xfs_scrub		*sc,
+	struct xfs_inode		*dp,
+	xfs_dir2_dataptr_t		dapos,
+	const struct xfs_name		*name,
+	xfs_ino_t			ino,
+	void				*priv)
+{
+	struct xrep_inode		*ri = priv;
+	int				error = 0;
+
+	if (xchk_should_terminate(ri->sc, &error))
+		return error;
+
+	/* Entries that point elsewhere tell us nothing. */
+	if (ino != sc->sm->sm_ino)
+		return 0;
+
+	/* A garbage name on an entry pointing at us means corruption. */
+	if (name->len == 0)
+		return -EFSCORRUPTED;
+	if (!xfs_dir2_namecheck(name->name, name->len))
+		return -EFSCORRUPTED;
+
+	/* Don't pick up dot or dotdot entries; we only want child dirents. */
+	if (xfs_dir2_samename(name, &xfs_name_dotdot))
+		return 0;
+	if (xfs_dir2_samename(name, &xfs_name_dot))
+		return 0;
+
+	/*
+	 * Uhoh, more than one parent for this inode and they don't agree on
+	 * the file type?
+	 */
+	if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
+	    ri->alleged_ftype != name->type) {
+		trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
+				ri->alleged_ftype);
+		return -EFSCORRUPTED;
+	}
+
+	/* We found a potential parent; remember the ftype. */
+	trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
+	ri->alleged_ftype = name->type;
+	return 0;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.
+ */ +STATIC int +xrep_dinode_findmode_walk_directory( + struct xrep_inode *ri, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = ri->sc; + unsigned int lock_mode; + int error = 0; + + /* + * Scan the directory to see if there it contains an entry pointing to + * the directory that we are repairing. + */ + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * If this directory is known to be sick, we cannot scan it reliably + * and must abort. + */ + if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | + XFS_SICK_INO_BMBTD | + XFS_SICK_INO_DIR)) { + error = -EFSCORRUPTED; + goto out_unlock; + } + + /* + * We cannot complete our parent pointer scan if a directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lock_mode); + return error; +} + +/* + * Try to find the mode of the inode being repaired by looking for directories + * that point down to this file. + */ +STATIC int +xrep_dinode_find_mode( + struct xrep_inode *ri, + uint16_t *mode) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_inode *dp; + int error; + + /* No ftype means we have no other metadata to consult. */ + if (!xfs_has_ftype(sc->mp)) { + *mode = S_IFREG; + return 0; + } + + /* + * Scan all directories for parents that might point down to this + * inode. Skip the inode being repaired during the scan since it + * cannot be its own parent. Note that we still hold the AGI locked + * so there's a real possibility that _iscan_iter can return EBUSY. 
+ */ + xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); + ri->ftype_iscan.skip_ino = sc->sm->sm_ino; + ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; + while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { + if (S_ISDIR(VFS_I(dp)->i_mode)) + error = xrep_dinode_findmode_walk_directory(ri, dp); + xchk_iscan_mark_visited(&ri->ftype_iscan, dp); + xchk_irele(sc, dp); + if (error < 0) + break; + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&ri->ftype_iscan); + xchk_iscan_teardown(&ri->ftype_iscan); + + if (error == -EBUSY) { + if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { + /* + * If we got an EBUSY after finding at least one + * dirent, that means the scan found an inode on the + * inactivation list and could not open it. Accept the + * alleged ftype and install a new mode below. + */ + error = 0; + } else if (!(sc->flags & XCHK_TRY_HARDER)) { + /* + * Otherwise, retry the operation one time to see if + * the reason for the delay is an inode from the same + * cluster buffer waiting on the inactivation list. + */ + error = -EDEADLOCK; + } + } + if (error) + return error; + + /* + * Convert the discovered ftype into the file mode. If all else fails, + * return S_IFREG. + */ + switch (ri->alleged_ftype) { + case XFS_DIR3_FT_DIR: + *mode = S_IFDIR; + break; + case XFS_DIR3_FT_WHT: + case XFS_DIR3_FT_CHRDEV: + *mode = S_IFCHR; + break; + case XFS_DIR3_FT_BLKDEV: + *mode = S_IFBLK; + break; + case XFS_DIR3_FT_FIFO: + *mode = S_IFIFO; + break; + case XFS_DIR3_FT_SOCK: + *mode = S_IFSOCK; + break; + case XFS_DIR3_FT_SYMLINK: + *mode = S_IFLNK; + break; + default: + *mode = S_IFREG; + break; + } + return 0; +} + +/* Turn di_mode into /something/ recognizable. Returns true if we succeed. 
*/ +STATIC int +xrep_dinode_mode( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint16_t mode = be16_to_cpu(dip->di_mode); + int error; + + trace_xrep_dinode_mode(sc, dip); + + if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) + return 0; + + /* Try to fix the mode. If we cannot, then leave everything alone. */ + error = xrep_dinode_find_mode(ri, &mode); + switch (error) { + case -EINTR: + case -EBUSY: + case -EDEADLOCK: + /* temporary failure or fatal signal */ + return error; + case 0: + /* found mode */ + break; + default: + /* some other error, assume S_IFREG */ + mode = S_IFREG; + break; + } + + /* bad mode, so we set it to a file that only root can read */ + dip->di_mode = cpu_to_be16(mode); + dip->di_uid = 0; + dip->di_gid = 0; + ri->zap_acls = true; + return 0; +} + +/* Fix any conflicting flags that the verifiers complain about. */ +STATIC void +xrep_dinode_flags( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + bool isrt) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_flags(sc, dip); + + if (isrt) + flags |= XFS_DIFLAG_REALTIME; + else + flags &= ~XFS_DIFLAG_REALTIME; + + /* + * For regular files on a reflink filesystem, set the REFLINK flag to + * protect shared extents. A later stage will actually check those + * extents and clear the flag if possible. 
+ */ + if (xfs_has_reflink(mp) && S_ISREG(mode)) + flags2 |= XFS_DIFLAG2_REFLINK; + else + flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); + if (flags & XFS_DIFLAG_REALTIME) + flags2 &= ~XFS_DIFLAG2_REFLINK; + if (!xfs_has_bigtime(mp)) + flags2 &= ~XFS_DIFLAG2_BIGTIME; + if (!xfs_has_large_extent_counts(mp)) + flags2 &= ~XFS_DIFLAG2_NREXT64; + if (flags2 & XFS_DIFLAG2_NREXT64) + dip->di_nrext64_pad = 0; + else if (dip->di_version >= 3) + dip->di_v3_pad = 0; + dip->di_flags = cpu_to_be16(flags); + dip->di_flags2 = cpu_to_be64(flags2); +} + +/* + * Blow out symlink; now it points nowhere. We don't have to worry about + * incore state because this inode is failing the verifiers. + */ +STATIC void +xrep_dinode_zap_symlink( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + char *p; + + trace_xrep_dinode_zap_symlink(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + dip->di_size = cpu_to_be64(1); + p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + *p = '?'; + ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; +} + +/* + * Blow out dir, make the parent point to the root. In the future repair will + * reconstruct this directory for us. Note that there's no in-core directory + * inode because the sf verifier tripped, so we don't have to worry about the + * dentry cache. + */ +STATIC void +xrep_dinode_zap_dir( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dir2_sf_hdr *sfp; + int i8count; + + trace_xrep_dinode_zap_dir(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; + sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + sfp->count = 0; + sfp->i8count = i8count; + xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); + dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); + ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; +} + +/* Make sure we don't have a garbage file size. 
*/
+STATIC void
+xrep_dinode_size(
+	struct xrep_inode	*ri,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	uint64_t		size = be64_to_cpu(dip->di_size);
+	uint16_t		mode = be16_to_cpu(dip->di_mode);
+
+	trace_xrep_dinode_size(sc, dip);
+
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
+		/* di_size can't be nonzero for special files */
+		dip->di_size = 0;
+		break;
+	case S_IFREG:
+		/* Regular files can't be larger than 2^63-1 bytes. */
+		dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
+		break;
+	case S_IFLNK:
+		/*
+		 * Truncate ridiculously oversized symlinks.  If the size is
+		 * zero, zap the symlink so that it has a minimal target.
+		 * Both of these conditions trigger dinode verifier errors,
+		 * so there is no in-core state to reset.
+		 */
+		if (size == 0)
+			xrep_dinode_zap_symlink(ri, dip);
+		else if (size > XFS_SYMLINK_MAXLEN)
+			dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
+		break;
+	case S_IFDIR:
+		/*
+		 * Directories can't have a size larger than 32G.  If the size
+		 * is zero, reset it to an empty directory.  Both of these
+		 * conditions trigger dinode verifier errors, so there is no
+		 * in-core state to reset.
+		 */
+		if (size == 0)
+			xrep_dinode_zap_dir(ri, dip);
+		else if (size > XFS_DIR2_SPACE_SIZE)
+			dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
+		break;
+	default:
+		/* Unrecognized file type; leave the size alone. */
+		break;
+	}
+}
+
+/* Fix extent size hints.
*/
+STATIC void
+xrep_dinode_extsize_hints(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_mount	*mp = sc->mp;
+	uint64_t		flags2 = be64_to_cpu(dip->di_flags2);
+	uint16_t		flags = be16_to_cpu(dip->di_flags);
+	uint16_t		mode = be16_to_cpu(dip->di_mode);
+
+	trace_xrep_dinode_extsize_hints(sc, dip);
+
+	/* An extent size hint that fails validation is dropped entirely. */
+	if (xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
+			mode, flags)) {
+		dip->di_extsize = 0;
+		dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
+					       XFS_DIFLAG_EXTSZINHERIT);
+	}
+
+	/* The CoW extent size hint is only checked on v3 and later inodes. */
+	if (dip->di_version < 3)
+		return;
+
+	if (xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
+			mode, flags, flags2)) {
+		dip->di_cowextsize = 0;
+		dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
+	}
+}
+
+/*
+ * Rmap walk callback: count extents and blocks for an inode given an rmap.
+ * Mappings owned by other inodes are ignored.  Blocks flagged as bmbt
+ * blocks add to the block count but not to the extent count.
+ */
+STATIC int
+xrep_dinode_walk_rmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_inode		*ri = priv;
+	bool				is_extent;
+	int				error = 0;
+
+	if (xchk_should_terminate(ri->sc, &error))
+		return error;
+
+	/* We only care about this inode. */
+	if (rec->rm_owner != ri->sc->sm->sm_ino)
+		return 0;
+
+	is_extent = !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK);
+
+	if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
+		ri->attr_blocks += rec->rm_blockcount;
+		if (is_extent)
+			ri->attr_extents++;
+	} else {
+		ri->data_blocks += rec->rm_blockcount;
+		if (is_extent)
+			ri->data_extents++;
+	}
+
+	return 0;
+}
+
+/* Count extents and blocks for an inode from all AG rmap data.
*/
+STATIC int
+xrep_dinode_count_ag_rmaps(
+	struct xrep_inode	*ri,
+	struct xfs_perag	*pag)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agf;
+	int			error;
+
+	error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf);
+	if (error)
+		return error;
+
+	/* Walk every rmap record in this AG and tally the ones we own. */
+	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, pag);
+	error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
+	xfs_btree_del_cursor(cur, error);
+	xfs_trans_brelse(sc->tp, agf);
+	return error;
+}
+
+/*
+ * Count extents and blocks for a given inode from all rmap data.  Returns
+ * -EOPNOTSUPP if the filesystem has no rmapbt or has a realtime device, and
+ * -EFSCORRUPTED if the counts claim extents on both the rt and the data
+ * device.
+ */
+STATIC int
+xrep_dinode_count_rmaps(
+	struct xrep_inode	*ri)
+{
+	struct xfs_mount	*mp = ri->sc->mp;
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	int			error;
+
+	if (!xfs_has_rmapbt(mp))
+		return -EOPNOTSUPP;
+	if (xfs_has_realtime(mp))
+		return -EOPNOTSUPP;
+
+	for_each_perag(mp, agno, pag) {
+		error = xrep_dinode_count_ag_rmaps(ri, pag);
+		if (error) {
+			/* Leaving the loop early, so drop the perag here. */
+			xfs_perag_rele(pag);
+			return error;
+		}
+	}
+
+	/* Can't have extents on both the rt and the data device. */
+	if (ri->data_extents && ri->rt_extents)
+		return -EFSCORRUPTED;
+
+	trace_xrep_dinode_count_rmaps(ri->sc,
+			ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
+			ri->data_extents, ri->rt_extents, ri->attr_extents);
+	return 0;
+}
+
+/* Return true if this extents-format ifork looks like garbage.
*/ +STATIC bool +xrep_dinode_bad_extents_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmbt_irec new; + struct xfs_bmbt_rec *dp; + xfs_extnum_t nex; + bool isrt; + unsigned int i; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + dp = XFS_DFORK_PTR(dip, whichfork); + + isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); + for (i = 0; i < nex; i++, dp++) { + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(dp, &new); + fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, + &new); + if (fa) + return true; + } + + return false; +} + +/* Return true if this btree-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_bmbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmdr_block *dfp; + xfs_extnum_t nex; + unsigned int i; + unsigned int dmxr; + unsigned int nrecs; + unsigned int level; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + if (dfork_size < sizeof(struct xfs_bmdr_block)) + return true; + + dfp = XFS_DFORK_PTR(dip, whichfork); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) + return true; + if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) + return true; + + dmxr = xfs_bmdr_maxrecs(dfork_size, 0); + for (i = 1; i <= nrecs; i++) { + struct xfs_bmbt_key *fkp; + xfs_bmbt_ptr_t *fpp; + xfs_fileoff_t fileoff; + xfs_fsblock_t fsbno; + + fkp = XFS_BMDR_KEY_ADDR(dfp, i); + fileoff = be64_to_cpu(fkp->br_startoff); + if (!xfs_verify_fileoff(sc->mp, fileoff)) + return true; + + fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); + fsbno = be64_to_cpu(*fpp); + if (!xfs_verify_fsbno(sc->mp, fsbno)) + return true; + } + + return false; +} + +/* + * Check the data fork for things that will 
fail the ifork verifiers or the
+ * ifork formatters.
+ */
+STATIC bool
+xrep_dinode_check_dfork(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip,
+	uint16_t		mode)
+{
+	void			*fork_data;
+	int64_t			isize = be64_to_cpu(dip->di_size);
+	unsigned int		format;
+	unsigned int		fork_bytes;
+
+	/* The verifiers take the size as a signed value; negative is bogus. */
+	if (isize < 0)
+		return true;
+
+	format = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
+
+	/* First pass: is this fork format allowed for this file type? */
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
+		if (format != XFS_DINODE_FMT_DEV)
+			return true;
+		break;
+	case S_IFREG:
+	case S_IFLNK:
+	case S_IFDIR:
+		/* Regular files may not use the local format. */
+		if (S_ISREG(mode) && format == XFS_DINODE_FMT_LOCAL)
+			return true;
+		if (format != XFS_DINODE_FMT_LOCAL &&
+		    format != XFS_DINODE_FMT_EXTENTS &&
+		    format != XFS_DINODE_FMT_BTREE)
+			return true;
+		break;
+	default:
+		return true;
+	}
+
+	fork_bytes = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
+	fork_data = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+	/* Second pass: does the fork content hold up for that format? */
+	switch (format) {
+	case XFS_DINODE_FMT_DEV:
+		return false;
+	case XFS_DINODE_FMT_LOCAL:
+		/* dir/symlink structure cannot be larger than the fork */
+		if (isize > fork_bytes)
+			return true;
+		/* directory structure must pass verification. */
+		if (S_ISDIR(mode))
+			return xfs_dir2_sf_verify(sc->mp, fork_data,
+					isize) != NULL;
+		/* symlink structure must pass verification. */
+		if (S_ISLNK(mode))
+			return xfs_symlink_shortform_verify(fork_data,
+					isize) != NULL;
+		return false;
+	case XFS_DINODE_FMT_EXTENTS:
+		return xrep_dinode_bad_extents_fork(sc, dip, fork_bytes,
+				XFS_DATA_FORK);
+	case XFS_DINODE_FMT_BTREE:
+		return xrep_dinode_bad_bmbt_fork(sc, dip, fork_bytes,
+				XFS_DATA_FORK);
+	default:
+		return true;
+	}
+}
+
+/* Store a data fork extent count in whichever field this inode uses. */
+static void
+xrep_dinode_set_data_nextents(
+	struct xfs_dinode	*dip,
+	xfs_extnum_t		nextents)
+{
+	if (!xfs_dinode_has_large_extent_counts(dip)) {
+		dip->di_nextents = cpu_to_be32(nextents);
+		return;
+	}
+
+	dip->di_big_nextents = cpu_to_be64(nextents);
+}
+
+/* Store an attr fork extent count in whichever field this inode uses. */
+static void
+xrep_dinode_set_attr_nextents(
+	struct xfs_dinode	*dip,
+	xfs_extnum_t		nextents)
+{
+	if (!xfs_dinode_has_large_extent_counts(dip)) {
+		dip->di_anextents = cpu_to_be16(nextents);
+		return;
+	}
+
+	dip->di_big_anextents = cpu_to_be32(nextents);
+}
+
+/*
+ * Reset the data fork to something sane: zero extents, with a format chosen
+ * from the file type and from what the repair context says was mapped.
+ */
+STATIC void
+xrep_dinode_zap_dfork(
+	struct xrep_inode	*ri,
+	struct xfs_dinode	*dip,
+	uint16_t		mode)
+{
+	trace_xrep_dinode_zap_dfork(ri->sc, dip);
+
+	/* Remember that we zapped this fork, and forget its block counts. */
+	ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED;
+	ri->data_blocks = 0;
+	ri->rt_blocks = 0;
+	xrep_dinode_set_data_nextents(dip, 0);
+
+	/* Special files always get reset to DEV */
+	if (S_ISFIFO(mode) || S_ISCHR(mode) || S_ISBLK(mode) ||
+	    S_ISSOCK(mode)) {
+		dip->di_format = XFS_DINODE_FMT_DEV;
+		dip->di_size = 0;
+		return;
+	}
+
+	/*
+	 * If we have data extents, reset to an empty map and hope the user
+	 * will run the bmapbtd checker next.
+	 */
+	if (S_ISREG(mode) || ri->data_extents || ri->rt_extents) {
+		dip->di_format = XFS_DINODE_FMT_EXTENTS;
+		return;
+	}
+
+	/* Otherwise, reset the local format to the minimum. */
+	if (S_ISLNK(mode))
+		xrep_dinode_zap_symlink(ri, dip);
+	else if (S_ISDIR(mode))
+		xrep_dinode_zap_dir(ri, dip);
+}
+
+/*
+ * Check the attr fork for things that will fail the ifork verifiers or the
+ * ifork formatters.  Returns true if the fork should be zapped.
+ */
+STATIC bool
+xrep_dinode_check_afork(
+	struct xfs_scrub	*sc,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_attr_sf_hdr	*sfhdr;
+	size_t			sf_bytes;
+	unsigned int		fork_bytes;
+
+	/* No attr fork area: only an empty extents-format fork is ok. */
+	if (XFS_DFORK_BOFF(dip) == 0) {
+		if (dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
+			return true;
+		return xfs_dfork_attr_extents(dip) != 0;
+	}
+
+	fork_bytes = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
+	sfhdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
+
+	switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
+	case XFS_DINODE_FMT_LOCAL:
+		/* Fork has to be large enough to extract the xattr size. */
+		if (fork_bytes < sizeof(struct xfs_attr_sf_hdr))
+			return true;
+
+		/* xattr structure cannot be larger than the fork */
+		sf_bytes = be16_to_cpu(sfhdr->totsize);
+		if (sf_bytes > fork_bytes)
+			return true;
+
+		/* xattr structure must pass verification. */
+		return xfs_attr_shortform_verify(sfhdr, sf_bytes) != NULL;
+	case XFS_DINODE_FMT_EXTENTS:
+		return xrep_dinode_bad_extents_fork(sc, dip, fork_bytes,
+				XFS_ATTR_FORK);
+	case XFS_DINODE_FMT_BTREE:
+		return xrep_dinode_bad_bmbt_fork(sc, dip, fork_bytes,
+				XFS_ATTR_FORK);
+	default:
+		return true;
+	}
+}
+
+/*
+ * Reset the attr fork to empty. Since the attr fork could have contained
+ * ACLs, make the file readable only by root. 
+ */
+STATIC void
+xrep_dinode_zap_afork(
+	struct xrep_inode	*ri,
+	struct xfs_dinode	*dip,
+	uint16_t		mode)
+{
+	struct xfs_scrub	*sc = ri->sc;
+
+	trace_xrep_dinode_zap_afork(sc, dip);
+
+	ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;
+
+	dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
+	xrep_dinode_set_attr_nextents(dip, 0);
+	ri->attr_blocks = 0;
+
+	/*
+	 * If the data fork is in btree format, removing the attr fork entirely
+	 * might cause verifier failures if the next level down in the bmbt
+	 * could now fit in the data fork area.
+	 */
+	if (dip->di_format != XFS_DINODE_FMT_BTREE)
+		dip->di_forkoff = 0;
+	dip->di_mode = cpu_to_be16(mode & ~0777);	/* drop all rwx permission bits */
+	dip->di_uid = 0;	/* owner and group both become id 0 */
+	dip->di_gid = 0;
+}
+
+/*
+ * Make sure the fork offset is a sensible value.
+ *
+ * Works out the minimum number of literal-area bytes that each fork needs,
+ * then handles the two cases where a fork is in extents format with zero
+ * ondisk extents, the repair context (@ri) recorded extents for it, and the
+ * fork area is smaller than a one-record bmbt root (bmdr_minsz).  In those
+ * cases di_forkoff is moved to make room, or the attr fork is zapped when
+ * both forks cannot fit.
+ */
+STATIC void
+xrep_dinode_ensure_forkoff(
+	struct xrep_inode	*ri,
+	struct xfs_dinode	*dip,
+	uint16_t		mode)
+{
+	struct xfs_bmdr_block	*bmdr;
+	struct xfs_scrub	*sc = ri->sc;
+	xfs_extnum_t		attr_extents, data_extents;
+	size_t			bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
+	unsigned int		lit_sz = XFS_LITINO(sc->mp);
+	unsigned int		afork_min, dfork_min;
+
+	trace_xrep_dinode_ensure_forkoff(sc, dip);
+
+	/*
+	 * Before calling this function, xrep_dinode_core ensured that both
+	 * forks actually fit inside their respective literal areas. If this
+	 * was not the case, the fork was reset to FMT_EXTENTS with zero
+	 * records. If the rmapbt scan found attr or data fork blocks, this
+	 * will be noted in the dinode_stats, and we must leave enough room
+	 * for the bmap repair code to reconstruct the mapping structure.
+	 *
+	 * First, compute the minimum space required for the attr fork.
+	 */
+	switch (dip->di_aformat) {
+	case XFS_DINODE_FMT_LOCAL:
+		/*
+		 * If we still have a shortform xattr structure at all, that
+		 * means the attr fork area was exactly large enough to fit
+		 * the sf structure.
+		 */
+		afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		attr_extents = xfs_dfork_attr_extents(dip);
+		if (attr_extents) {
+			/*
+			 * We must maintain sufficient space to hold the entire
+			 * extent map array in the attr fork. Note that we
+			 * previously zapped the fork if it had no chance of
+			 * fitting in the inode.
+			 */
+			afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
+		} else if (ri->attr_extents > 0) {
+			/*
+			 * The attr fork thinks it has zero extents, but we
+			 * found some xattr extents. We need to leave enough
+			 * empty space here so that the incore attr fork will
+			 * get created (and hence trigger the attr fork bmap
+			 * repairer).
+			 */
+			afork_min = bmdr_minsz;
+		} else {
+			/* No extents on disk or found in rmapbt. */
+			afork_min = 0;
+		}
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		/* Must have space for btree header and key/pointers. */
+		bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
+		afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+		break;
+	default:
+		/* We should never see any other formats. */
+		afork_min = 0;
+		break;
+	}
+
+	/* Compute the minimum space required for the data fork. */
+	switch (dip->di_format) {
+	case XFS_DINODE_FMT_DEV:
+		dfork_min = sizeof(__be32);
+		break;
+	case XFS_DINODE_FMT_UUID:
+		dfork_min = sizeof(uuid_t);
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+		/*
+		 * If we still have a shortform data fork at all, that means
+		 * the data fork area was large enough to fit whatever was in
+		 * there.
+		 */
+		dfork_min = be64_to_cpu(dip->di_size);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		data_extents = xfs_dfork_data_extents(dip);
+		if (data_extents) {
+			/*
+			 * We must maintain sufficient space to hold the entire
+			 * extent map array in the data fork. Note that we
+			 * previously zapped the fork if it had no chance of
+			 * fitting in the inode.
+			 */
+			dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
+		} else if (ri->data_extents > 0 || ri->rt_extents > 0) {
+			/*
+			 * The data fork thinks it has zero extents, but we
+			 * found some data extents. We need to leave enough
+			 * empty space here so that the data fork bmap repair
+			 * will recover the mappings.
+			 */
+			dfork_min = bmdr_minsz;
+		} else {
+			/* No extents on disk or found in rmapbt. */
+			dfork_min = 0;
+		}
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		/* Must have space for btree header and key/pointers. */
+		bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+		dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+		break;
+	default:
+		dfork_min = 0;
+		break;
+	}
+
+	/*
+	 * Round all values up to the nearest 8 bytes, because that is the
+	 * precision of di_forkoff.
+	 */
+	afork_min = roundup(afork_min, 8);
+	dfork_min = roundup(dfork_min, 8);
+	bmdr_minsz = roundup(bmdr_minsz, 8);
+
+	ASSERT(dfork_min <= lit_sz);
+	ASSERT(afork_min <= lit_sz);
+
+	/*
+	 * If the data fork was zapped and we don't have enough space for the
+	 * recovery fork, move the attr fork up.
+	 */
+	if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
+	    xfs_dfork_data_extents(dip) == 0 &&
+	    (ri->data_extents > 0 || ri->rt_extents > 0) &&
+	    bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
+		if (bmdr_minsz + afork_min > lit_sz) {
+			/*
+			 * The attr fork and the stub fork we need to recover
+			 * the data fork won't both fit. Zap the attr fork.
+			 */
+			xrep_dinode_zap_afork(ri, dip, mode);
+			afork_min = bmdr_minsz;
+		} else {
+			void	*before, *after;
+
+			/*
+			 * Otherwise, just slide the attr fork up.  The attr
+			 * fork address is sampled on either side of the
+			 * di_forkoff update, and memmove is used because the
+			 * old and new locations may overlap.
+			 */
+			before = XFS_DFORK_APTR(dip);
+			dip->di_forkoff = bmdr_minsz >> 3;	/* bytes to 8-byte units */
+			after = XFS_DFORK_APTR(dip);
+			memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
+		}
+	}
+
+	/*
+	 * If the attr fork was zapped and we don't have enough space for the
+	 * recovery fork, move the attr fork down.
+	 */
+	if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	    xfs_dfork_attr_extents(dip) == 0 &&
+	    ri->attr_extents > 0 &&
+	    bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
+		if (dip->di_format == XFS_DINODE_FMT_BTREE) {
+			/*
+			 * If the data fork is in btree format then we can't
+			 * adjust forkoff because that runs the risk of
+			 * violating the extents/btree format transition rules.
+			 */
+		} else if (bmdr_minsz + dfork_min > lit_sz) {
+			/*
+			 * If we can't move the attr fork, too bad, we lose the
+			 * attr fork and leak its blocks.
+			 */
+			xrep_dinode_zap_afork(ri, dip, mode);
+		} else {
+			/*
+			 * Otherwise, just slide the attr fork down. The attr
+			 * fork is empty, so we don't have any old contents to
+			 * move here.
+			 */
+			dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;	/* bytes to 8-byte units */
+		}
+	}
+}
+
+/*
+ * Zap the data/attr forks if we spot anything that isn't going to pass the
+ * ifork verifiers or the ifork formatters, because we need to get the inode
+ * into good enough shape that the higher level repair functions can run.
+ *
+ * The attr fork is zapped unconditionally when ri->zap_acls is set.  After
+ * any zapping, the fork offset is re-checked and di_nblocks is cleared if
+ * neither fork claims any extents.
+ */
+STATIC void
+xrep_dinode_zap_forks(
+	struct xrep_inode	*ri,
+	struct xfs_dinode	*dip)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	xfs_extnum_t		data_extents;
+	xfs_extnum_t		attr_extents;
+	xfs_filblks_t		nblocks;
+	uint16_t		mode;
+	bool			zap_datafork = false;
+	bool			zap_attrfork = ri->zap_acls;
+
+	trace_xrep_dinode_zap_forks(sc, dip);
+
+	mode = be16_to_cpu(dip->di_mode);
+
+	data_extents = xfs_dfork_data_extents(dip);
+	attr_extents = xfs_dfork_attr_extents(dip);
+	nblocks = be64_to_cpu(dip->di_nblocks);
+
+	/*
+	 * Inode counters don't make sense?  A fork cannot have more extents
+	 * than the inode has blocks; if only the sum is too large we cannot
+	 * tell which fork is lying, so both are zapped.
+	 */
+	if (data_extents > nblocks)
+		zap_datafork = true;
+	if (attr_extents > nblocks)
+		zap_attrfork = true;
+	if (data_extents + attr_extents > nblocks)
+		zap_datafork = zap_attrfork = true;
+
+	if (!zap_datafork)
+		zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
+	if (!zap_attrfork)
+		zap_attrfork = xrep_dinode_check_afork(sc, dip);
+
+	/* Zap whatever's bad. */
+	if (zap_attrfork)
+		xrep_dinode_zap_afork(ri, dip, mode);
+	if (zap_datafork)
+		xrep_dinode_zap_dfork(ri, dip, mode);
+	xrep_dinode_ensure_forkoff(ri, dip, mode);
+
+	/*
+	 * Zero di_nblocks if we don't have any extents at all to satisfy the
+	 * buffer verifier.
+	 */
+	data_extents = xfs_dfork_data_extents(dip);
+	attr_extents = xfs_dfork_attr_extents(dip);
+	if (data_extents + attr_extents == 0)
+		dip->di_nblocks = 0;
+}
+
+/*
+ * Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget.
+ *
+ * Returns 0 with sc->ip set, a fresh transaction allocated, and the inode's
+ * IOLOCK and ILOCK taken via xchk_ilock; otherwise returns a negative errno.
+ * The buffer changes are committed even when the iget attempt fails.
+ */
+STATIC int
+xrep_dinode_core(
+	struct xrep_inode	*ri)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	struct xfs_buf		*bp;
+	struct xfs_dinode	*dip;
+	xfs_ino_t		ino = sc->sm->sm_ino;
+	int			error;
+	int			iget_error;
+
+	/* Figure out what this inode had mapped in both forks. */
+	error = xrep_dinode_count_rmaps(ri);
+	if (error)
+		return error;
+
+	/* Read the inode cluster buffer. */
+	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
+			ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
+			NULL);
+	if (error)
+		return error;
+
+	/* Make sure we can pass the inode buffer verifier. */
+	xrep_dinode_buf(sc, bp);
+	bp->b_ops = &xfs_inode_buf_ops;
+
+	/*
+	 * Fix everything the verifier will complain about.  If the mode
+	 * repair fails, skip the remaining fixes but still write out what
+	 * has been changed so far; the error is returned after the commit.
+	 */
+	dip = xfs_buf_offset(bp, ri->imap.im_boffset);
+	xrep_dinode_header(sc, dip);
+	iget_error = xrep_dinode_mode(ri, dip);
+	if (iget_error)
+		goto write;
+	xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
+	xrep_dinode_size(ri, dip);
+	xrep_dinode_extsize_hints(sc, dip);
+	xrep_dinode_zap_forks(ri, dip);
+
+write:
+	/* Write out the inode.  Only this inode's bytes of the buffer are logged. */
+	trace_xrep_dinode_fixed(sc, dip);
+	xfs_dinode_calc_crc(sc->mp, dip);
+	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
+	xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
+			ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
+
+	/*
+	 * In theory, we've fixed the ondisk inode record enough that we should
+	 * be able to load the inode into the cache. Try to iget that inode
+	 * now while we hold the AGI and the inode cluster buffer and take the
+	 * IOLOCK so that we can continue with repairs without anyone else
+	 * accessing the inode. If iget fails, we still need to commit the
+	 * changes.
+	 */
+	if (!iget_error)
+		iget_error = xchk_iget(sc, ino, &sc->ip);
+	if (!iget_error)
+		xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+	/*
+	 * Commit the inode cluster buffer updates and drop the AGI buffer that
+	 * we've been holding since scrub setup. From here on out, repairs
+	 * deal only with the cached inode.
+	 */
+	error = xrep_trans_commit(sc);
+	if (error)
+		return error;
+
+	if (iget_error)
+		return iget_error;
+
+	error = xchk_trans_alloc(sc, 0);
+	if (error)
+		return error;
+
+	error = xrep_ino_dqattach(sc);
+	if (error)
+		return error;
+
+	xchk_ilock(sc, XFS_ILOCK_EXCL);
+	if (ri->ino_sick_mask)	/* pass along what the zap helpers recorded */
+		xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
+	return 0;
+}
+
+/*
+ * Fix everything xfs_dinode_verify cares about.  On success, a quotacheck is
+ * forced for every quota type that is enabled on this mount.
+ */
+STATIC int
+xrep_dinode_problems(
+	struct xrep_inode	*ri)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	int			error;
+
+	error = xrep_dinode_core(ri);
+	if (error)
+		return error;
+
+	/* We had to fix a totally busted inode, schedule quotacheck. */
+	if (XFS_IS_UQUOTA_ON(sc->mp))
+		xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+	if (XFS_IS_GQUOTA_ON(sc->mp))
+		xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+	if (XFS_IS_PQUOTA_ON(sc->mp))
+		xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+
+	return 0;
+}
+
+/*
+ * Fix problems that the verifiers don't care about. In general these are
+ * errors that don't cause problems elsewhere in the kernel that we can easily
+ * detect, so we don't check them all that rigorously.
+ */
+
+/* Make sure block and extent counts are ok. 
*/
+/*
+ * Recount the mappings in each fork of sc->ip and reset the incore extent
+ * counts and i_nblocks from what was found.
+ *
+ * Returns 0 on success, -EFSCORRUPTED if a fork maps at least as many blocks
+ * as the device it lives on has, or the negative errno from counting the
+ * mappings or adjusting the extent count.
+ */
+STATIC int
+xrep_inode_blockcounts(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_ifork	*ifp;
+	xfs_filblks_t		count;
+	xfs_filblks_t		acount;
+	xfs_extnum_t		nextents;
+	int			error;
+
+	trace_xrep_inode_blockcounts(sc);
+
+	/* Set data fork counters from the data fork mappings. */
+	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
+			&nextents, &count);
+	if (error)
+		return error;
+	if (xfs_is_reflink_inode(sc->ip)) {
+		/*
+		 * data fork blockcount can exceed physical storage if a user
+		 * reflinks the same block over and over again.
+		 */
+		;
+	} else if (XFS_IS_REALTIME_INODE(sc->ip)) {
+		if (count >= sc->mp->m_sb.sb_rblocks)
+			return -EFSCORRUPTED;
+	} else {
+		if (count >= sc->mp->m_sb.sb_dblocks)
+			return -EFSCORRUPTED;
+	}
+	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
+	if (error)
+		return error;
+	sc->ip->i_df.if_nextents = nextents;
+
+	/* Set attr fork counters from the attr fork mappings. */
+	ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+	if (ifp) {
+		error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
+				&nextents, &acount);
+		if (error)
+			return error;
+		/*
+		 * Check the attr fork's own block count here.  Testing the
+		 * data fork count again would leave the attr fork unchecked
+		 * and would reject a reflinked file whose data fork count is
+		 * allowed to exceed the size of the data device.
+		 */
+		if (acount >= sc->mp->m_sb.sb_dblocks)
+			return -EFSCORRUPTED;
+		error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
+				nextents);
+		if (error)
+			return error;
+		ifp->if_nextents = nextents;
+	} else {
+		acount = 0;
+	}
+
+	sc->ip->i_nblocks = count + acount;
+	return 0;
+}
+
+/* Check for invalid uid/gid/prid. 
*/
+/*
+ * Reset any invalid owner id on sc->ip to zero.  Each id that gets reset
+ * forces a quotacheck for the matching quota type if that type is enabled,
+ * and the setuid/setgid bits are stripped if anything was changed.
+ */
+STATIC void
+xrep_inode_ids(
+	struct xfs_scrub	*sc)
+{
+	bool			dirty = false;
+
+	trace_xrep_inode_ids(sc);
+
+	if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
+		i_uid_write(VFS_I(sc->ip), 0);
+		dirty = true;
+		if (XFS_IS_UQUOTA_ON(sc->mp))
+			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+	}
+
+	if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
+		i_gid_write(VFS_I(sc->ip), 0);
+		dirty = true;
+		if (XFS_IS_GQUOTA_ON(sc->mp))
+			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+	}
+
+	/* A project id of all ones is treated as invalid. */
+	if (sc->ip->i_projid == -1U) {
+		sc->ip->i_projid = 0;
+		dirty = true;
+		if (XFS_IS_PQUOTA_ON(sc->mp))
+			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+	}
+
+	/* strip setuid/setgid if we touched any of the ids */
+	if (dirty)
+		VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
+}
+
+/*
+ * Force a timestamp into a valid range.  The nanoseconds field must lie in
+ * [0, NSEC_PER_SEC - 1]; a value of exactly NSEC_PER_SEC is a whole second
+ * and is not a valid tv_nsec, so the upper clamp bound is NSEC_PER_SEC - 1.
+ * The result is then passed through timestamp_truncate() for this inode.
+ */
+static inline void
+xrep_clamp_timestamp(
+	struct xfs_inode	*ip,
+	struct timespec64	*ts)
+{
+	ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC - 1);
+	*ts = timestamp_truncate(*ts, VFS_I(ip));
+}
+
+/*
+ * Nanosecond counters must be less than 1 billion.  Clamp the access, modify,
+ * change, and creation timestamps of @ip.
+ */
+STATIC void
+xrep_inode_timestamps(
+	struct xfs_inode	*ip)
+{
+	struct timespec64	tstamp;
+	struct inode		*inode = VFS_I(ip);
+
+	tstamp = inode_get_atime(inode);
+	xrep_clamp_timestamp(ip, &tstamp);
+	inode_set_atime_to_ts(inode, tstamp);
+
+	tstamp = inode_get_mtime(inode);
+	xrep_clamp_timestamp(ip, &tstamp);
+	inode_set_mtime_to_ts(inode, tstamp);
+
+	tstamp = inode_get_ctime(inode);
+	xrep_clamp_timestamp(ip, &tstamp);
+	inode_set_ctime_to_ts(inode, tstamp);
+
+	/* The creation time lives in the xfs inode and is fixed in place. */
+	xrep_clamp_timestamp(ip, &ip->i_crtime);
+}
+
+/* Fix inode flags that don't make sense together. 
*/
+/*
+ * Drop undefined flag bits from i_diflags and i_diflags2 of sc->ip, then
+ * clear defined flags that do not apply to this file type, this filesystem's
+ * feature set, or each other.
+ */
+STATIC void
+xrep_inode_flags(
+	struct xfs_scrub	*sc)
+{
+	uint16_t		mode;
+
+	trace_xrep_inode_flags(sc);
+
+	mode = VFS_I(sc->ip)->i_mode;
+
+	/*
+	 * Clear junk flags.  Keep only the bits inside XFS_DIFLAG_ANY; masking
+	 * with the complement would keep the junk and throw away every valid
+	 * flag instead.
+	 */
+	sc->ip->i_diflags &= XFS_DIFLAG_ANY;
+
+	/* NEWRTBM only applies to realtime bitmaps */
+	if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
+		sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+	else
+		sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
+
+	/* These only make sense for directories. */
+	if (!S_ISDIR(mode))
+		sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
+					  XFS_DIFLAG_EXTSZINHERIT |
+					  XFS_DIFLAG_PROJINHERIT |
+					  XFS_DIFLAG_NOSYMLINKS);
+
+	/* These only make sense for files. */
+	if (!S_ISREG(mode))
+		sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
+					  XFS_DIFLAG_EXTSIZE);
+
+	/* These only make sense for non-rt files. */
+	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
+		sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
+
+	/* Immutable and append only? Drop the append. */
+	if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
+	    (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
+		sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
+
+	/* Clear junk flags; as above, keep only the defined bits. */
+	sc->ip->i_diflags2 &= XFS_DIFLAG2_ANY;
+
+	/* No reflink flag unless we support it and it's a file. */
+	if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
+		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+
+	/* DAX only applies to files and dirs. */
+	if (!(S_ISREG(mode) || S_ISDIR(mode)))
+		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+
+	/* No reflink files on the realtime device. */
+	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
+		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+}
+
+/*
+ * Fix size problems with block/node format directories. If we fail to find
+ * the extent list, just bail out and let the bmapbtd repair functions clean
+ * up that mess. 
+ */
+STATIC void
+xrep_inode_blockdir_size(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_iext_cursor	cursor;
+	struct xfs_bmbt_irec	last_map;
+	struct xfs_ifork	*dfork;
+	xfs_fileoff_t		fsb;
+
+	trace_xrep_inode_blockdir_size(sc);
+
+	/* Nothing can be done here if the mappings will not load. */
+	if (xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK))
+		return;
+
+	/* Find the last block before 32G; this is the dir size. */
+	dfork = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+	fsb = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
+	if (!xfs_iext_lookup_extent_before(sc->ip, dfork, &fsb, &cursor,
+			&last_map))
+		return;		/* zero-extents directory? */
+
+	/* The size is the end of that mapping, capped at the data space. */
+	fsb = last_map.br_startoff + last_map.br_blockcount;
+	sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
+			XFS_FSB_TO_B(sc->mp, fsb));
+}
+
+/*
+ * Fix size problems with short format directories by setting the size to the
+ * number of bytes held in the data fork.
+ */
+STATIC void
+xrep_inode_sfdir_size(
+	struct xfs_scrub	*sc)
+{
+	trace_xrep_inode_sfdir_size(sc);
+
+	sc->ip->i_disk_size = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK)->if_bytes;
+}
+
+/*
+ * Fix any irregularities in a directory inode's size now that we can iterate
+ * extent maps and access other regular inode data.  Data fork formats other
+ * than extents, btree, and local are left alone.
+ */
+STATIC void
+xrep_inode_dir_size(
+	struct xfs_scrub	*sc)
+{
+	trace_xrep_inode_dir_size(sc);
+
+	if (sc->ip->i_df.if_format == XFS_DINODE_FMT_EXTENTS ||
+	    sc->ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
+		xrep_inode_blockdir_size(sc);
+	else if (sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+		xrep_inode_sfdir_size(sc);
+}
+
+/* Fix extent size hint problems. */
+STATIC void
+xrep_inode_extsize(
+	struct xfs_scrub	*sc)
+{
+	/* Only inodes that propagate both hints are examined. */
+	if (!(sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT))
+		return;
+	if (!(sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT))
+		return;
+
+	/* Fix misaligned extent size hints on a directory. */
+	if (xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) {
+		sc->ip->i_extsize = 0;
+		sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
+	}
+}
+
+/* Fix any irregularities in an inode that the verifiers don't catch. 
*/
+STATIC int
+xrep_inode_problems(
+	struct xfs_scrub	*sc)
+{
+	int			ret;
+
+	/* Block counts come first; it is the only step that can fail. */
+	ret = xrep_inode_blockcounts(sc);
+	if (ret)
+		return ret;
+
+	xrep_inode_timestamps(sc->ip);
+	xrep_inode_flags(sc);
+	xrep_inode_ids(sc);
+
+	/*
+	 * We can now do a better job fixing the size of a directory now that
+	 * we can scan the data fork extents than we could in xrep_dinode_size.
+	 */
+	if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+		xrep_inode_dir_size(sc);
+
+	xrep_inode_extsize(sc);
+
+	/* Log the inode core and roll the transaction. */
+	trace_xrep_inode_fixed(sc);
+	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+	return xrep_roll_trans(sc);
+}
+
+/*
+ * Repair an inode's fields.  Returns 0 or a negative errno; a busy inode
+ * reported by the ondisk repair step also yields 0.
+ */
+int
+xrep_inode(
+	struct xfs_scrub	*sc)
+{
+	int			ret = 0;
+
+	/*
+	 * No inode? That means we failed the _iget verifiers. Repair all
+	 * the things that the inode verifiers care about, then retry _iget.
+	 */
+	if (!sc->ip) {
+		struct xrep_inode	*ri = sc->buf;
+
+		ASSERT(ri != NULL);
+
+		ret = xrep_dinode_problems(ri);
+		switch (ret) {
+		case 0:
+			break;
+		case -EBUSY:
+			/*
+			 * Directory scan to recover inode mode encountered a
+			 * busy inode, so we did not continue repairing things.
+			 */
+			return 0;
+		default:
+			return ret;
+		}
+
+		/* By this point we had better have a working incore inode. */
+		if (!sc->ip)
+			return -EFSCORRUPTED;
+	}
+
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+	/* If we found corruption of any kind, try to fix it. */
+	if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+				XFS_SCRUB_OFLAG_XCORRUPT)) {
+		ret = xrep_inode_problems(sc);
+		if (ret)
+			return ret;
+	}
+
+	/* See if we can clear the reflink flag. */
+	if (xfs_is_reflink_inode(sc->ip)) {
+		ret = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+		if (ret)
+			return ret;
+	}
+
+	return xrep_defer_finish(sc);
+}
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
new file mode 100644
index 000000000000..ec3478bc505e
--- /dev/null
+++ b/fs/xfs/scrub/iscan.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. 
All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_bit.h" +#include "xfs_icache.h" +#include "scrub/scrub.h" +#include "scrub/iscan.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * Live File Scan + * ============== + * + * Live file scans walk every inode in a live filesystem. This is more or + * less like a regular iwalk, except that when we're advancing the scan cursor, + * we must ensure that inodes cannot be added or deleted anywhere between the + * old cursor value and the new cursor value. If we're advancing the cursor + * by one inode, the caller must hold that inode; if we're finding the next + * inode to scan, we must grab the AGI and hold it until we've updated the + * scan cursor. + * + * Callers are expected to use this code to scan all files in the filesystem to + * construct a new metadata index of some kind. The scan races against other + * live updates, which means there must be a provision to update the new index + * when updates are made to inodes that have already been scanned. The iscan lock + * can be used in live update hook code to stop the scan and protect this data + * structure. + * + * To keep the new index up to date with other metadata updates being made to + * the live filesystem, it is assumed that the caller will add hooks as needed + * to be notified when a metadata update occurs. The inode scanner must tell + * the hook code when an inode has been visited with xchk_iscan_mark_visit. + * Hook functions can use xchk_iscan_want_live_update to decide if the + * scanner's observations must be updated. 
+ */
+
+/*
+ * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so
+ * that the scan ignores that inode.
+ *
+ * @pag is the AG that @rec came from and @lastrecino is the last AG inode
+ * number that @rec covers.  Only the incore copy of the record is modified.
+ */
+STATIC void
+xchk_iscan_mask_skipino(
+	struct xchk_iscan	*iscan,
+	struct xfs_perag	*pag,
+	struct xfs_inobt_rec_incore	*rec,
+	xfs_agino_t		lastrecino)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	struct xfs_mount	*mp = sc->mp;
+	xfs_agnumber_t		skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino);
+	/* An AG inode number, so use the agino type rather than the agno type. */
+	xfs_agino_t		skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino);
+
+	/* Nothing to do unless the skipped inode falls inside this record. */
+	if (pag->pag_agno != skip_agno)
+		return;
+	if (skip_agino < rec->ir_startino)
+		return;
+	if (skip_agino > lastrecino)
+		return;
+
+	rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1);
+}
+
+/*
+ * Set *cursor to the next allocated inode after whatever it's set to now.
+ * If there are no more inodes in this AG, cursor is set to NULLAGINO.
+ *
+ * When an inode is found, *allocmaskp receives the chunk's allocation bitmap
+ * shifted so that bit 0 is the inode at *cursor, and *nr_inodesp receives the
+ * number of inode slots from *cursor to the end of that chunk.  Neither is
+ * written when no inode is found.
+ *
+ * Returns 0 or a negative errno.
+ */
+STATIC int
+xchk_iscan_find_next(
+	struct xchk_iscan	*iscan,
+	struct xfs_buf		*agi_bp,
+	struct xfs_perag	*pag,
+	xfs_inofree_t		*allocmaskp,
+	xfs_agino_t		*cursor,
+	uint8_t			*nr_inodesp)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	struct xfs_inobt_rec_incore	rec;
+	struct xfs_btree_cur	*cur;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_trans	*tp = sc->tp;
+	xfs_agnumber_t		agno = pag->pag_agno;
+	xfs_agino_t		lastino = NULLAGINO;
+	xfs_agino_t		first, last;
+	xfs_agino_t		agino = *cursor;
+	int			has_rec;
+	int			error;
+
+	/* If the cursor is beyond the end of this AG, move to the next one. */
+	xfs_agino_range(mp, agno, &first, &last);
+	if (agino > last) {
+		*cursor = NULLAGINO;
+		return 0;
+	}
+
+	/*
+	 * Look up the inode chunk for the current cursor position. If there
+	 * is no chunk here, we want the next one.
+	 */
+	cur = xfs_inobt_init_cursor(pag, tp, agi_bp);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec);
+	if (!error && !has_rec)
+		error = xfs_btree_increment(cur, 0, &has_rec);
+	for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) {
+		xfs_inofree_t	allocmask;
+
+		/*
+		 * If we've run out of inobt records in this AG, move the
+		 * cursor on to the next AG and exit. The caller can try
+		 * again with the next AG.
+		 */
+		if (!has_rec) {
+			*cursor = NULLAGINO;
+			break;
+		}
+
+		error = xfs_inobt_get_rec(cur, &rec, &has_rec);
+		if (error)
+			break;
+		if (!has_rec) {
+			error = -EFSCORRUPTED;
+			break;
+		}
+
+		/* Make sure that we always move forward. */
+		if (lastino != NULLAGINO &&
+		    XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) {
+			error = -EFSCORRUPTED;
+			break;
+		}
+		lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+		/*
+		 * If this record only covers inodes that come before the
+		 * cursor, advance to the next record.
+		 */
+		if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+			continue;
+
+		if (iscan->skip_ino)
+			xchk_iscan_mask_skipino(iscan, pag, &rec, lastino);
+
+		/*
+		 * If the incoming lookup put us in the middle of an inobt
+		 * record, mark it and the previous inodes "free" so that the
+		 * search for allocated inodes will start at the cursor.
+		 * We don't care about ir_freecount here.
+		 */
+		if (agino >= rec.ir_startino)
+			rec.ir_free |= xfs_inobt_maskn(0,
+						agino + 1 - rec.ir_startino);
+
+		/*
+		 * If there are allocated inodes in this chunk, find them
+		 * and update the scan cursor.
+		 */
+		allocmask = ~rec.ir_free;
+		if (hweight64(allocmask) > 0) {
+			int	next = xfs_lowbit64(allocmask);
+
+			ASSERT(next >= 0);
+			*cursor = rec.ir_startino + next;
+			*allocmaskp = allocmask >> next;
+			*nr_inodesp = XFS_INODES_PER_CHUNK - next;
+			break;
+		}
+	}
+
+	/* The btree cursor is torn down on every path, passing along error. */
+	xfs_btree_del_cursor(cur, error);
+	return error;
+}
+
+/*
+ * Advance both the scan and the visited cursors. 
+ *
+ * The inumber address space for a given filesystem is sparse, which means that
+ * the scan cursor can jump a long ways in a single iter() call. There are no
+ * inodes in these sparse areas, so we must move the visited cursor forward at
+ * the same time so that the scan user can receive live updates for inodes that
+ * may get created once we release the AGI buffer.
+ */
+static inline void
+xchk_iscan_move_cursor(
+	struct xchk_iscan	*iscan,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino)
+{
+	xfs_ino_t		new_cursor;
+	xfs_ino_t		new_visited;
+
+	BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO);
+
+	new_cursor = XFS_AGINO_TO_INO(iscan->sc->mp, agno, agino);
+
+	/*
+	 * Special-case ino == 0 here so that we never set visited_ino to
+	 * NULLFSINO when wrapping around EOFS, for that will let through all
+	 * live updates.
+	 */
+	new_visited = new_cursor ? new_cursor - 1 : XFS_MAXINUMBER;
+
+	/* Both cursors change together under the iscan lock. */
+	mutex_lock(&iscan->lock);
+	iscan->cursor_ino = new_cursor;
+	iscan->__visited_ino = new_visited;
+	trace_xchk_iscan_move_cursor(iscan);
+	mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Mark the scan as complete by setting both the scan cursor and the visited
+ * cursor to NULLFSINO while holding the iscan lock.
+ */
+static inline void
+xchk_iscan_finish(
+	struct xchk_iscan	*iscan)
+{
+	mutex_lock(&iscan->lock);
+
+	/* All live updates will be applied from now on */
+	iscan->__visited_ino = NULLFSINO;
+	iscan->cursor_ino = NULLFSINO;
+
+	mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance ino to the next inode that the inobt thinks is allocated, being
+ * careful to jump to the next AG if we've reached the right end of this AG's
+ * inode btree. 
Advancing ino effectively means that we've pushed the inode + * scan forward, so set the iscan cursor to (ino - 1) so that our live update + * predicates will track inode allocations in that part of the inode number + * key space once we release the AGI buffer. + * + * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes, + * -ECANCELED if the live scan aborted, or the usual negative errno. + */ +STATIC int +xchk_iscan_advance( + struct xchk_iscan *iscan, + struct xfs_perag **pagp, + struct xfs_buf **agi_bpp, + xfs_inofree_t *allocmaskp, + uint8_t *nr_inodesp) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agi_bp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + xfs_agino_t agino; + int ret; + + ASSERT(iscan->cursor_ino >= iscan->__visited_ino); + + do { + if (xchk_iscan_aborted(iscan)) + return -ECANCELED; + + agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino); + pag = xfs_perag_get(mp, agno); + if (!pag) + return -ECANCELED; + + ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + if (ret) + goto out_pag; + + agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino); + ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp, + &agino, nr_inodesp); + if (ret) + goto out_buf; + + if (agino != NULLAGINO) { + /* + * Found the next inode in this AG, so return it along + * with the AGI buffer and the perag structure to + * ensure it cannot go away. + */ + xchk_iscan_move_cursor(iscan, agno, agino); + *agi_bpp = agi_bp; + *pagp = pag; + return 1; + } + + /* + * Did not find any more inodes in this AG, move on to the next + * AG. 
+ */ + agno = (agno + 1) % mp->m_sb.sb_agcount; + xchk_iscan_move_cursor(iscan, agno, 0); + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + trace_xchk_iscan_advance_ag(iscan); + } while (iscan->cursor_ino != iscan->scan_start_ino); + + xchk_iscan_finish(iscan); + return 0; + +out_buf: + xfs_trans_brelse(sc->tp, agi_bp); +out_pag: + xfs_perag_put(pag); + return ret; +} + +/* + * Grabbing the inode failed, so we need to back up the scan and ask the caller + * to try to _advance the scan again. Returns -EBUSY if we've run out of retry + * opportunities, -ECANCELED if the process has a fatal signal pending, or + * -EAGAIN if we should try again. + */ +STATIC int +xchk_iscan_iget_retry( + struct xchk_iscan *iscan, + bool wait) +{ + ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1); + + if (!iscan->iget_timeout || + time_is_before_jiffies(iscan->__iget_deadline)) + return -EBUSY; + + if (wait) { + unsigned long relax; + + /* + * Sleep for a period of time to let the rest of the system + * catch up. If we return early, someone sent a kill signal to + * the calling process. + */ + relax = msecs_to_jiffies(iscan->iget_retry_delay); + trace_xchk_iscan_iget_retry_wait(iscan); + + if (schedule_timeout_killable(relax) || + xchk_iscan_aborted(iscan)) + return -ECANCELED; + } + + iscan->cursor_ino--; + return -EAGAIN; +} + +/* + * Grab an inode as part of an inode scan. While scanning this inode, the + * caller must ensure that no other threads can modify the inode until + * xchk_iscan_mark_visited has been called for it. + * + * Returns the number of incore inodes grabbed; -EAGAIN if the caller should + * call xchk_iscan_advance again; -EBUSY if we couldn't grab an inode; + * -ECANCELED if there's a fatal signal pending; or some other negative errno.
+ */ +STATIC int +xchk_iscan_iget( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_buf *agi_bp, + xfs_inofree_t allocmask, + uint8_t nr_inodes) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t ino = iscan->cursor_ino; + unsigned int idx = 0; + unsigned int i; + int error; + + ASSERT(iscan->__inodes[0] == NULL); + + /* Fill the first slot in the inode array. */ + error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + &iscan->__inodes[idx]); + + trace_xchk_iscan_iget(iscan, error); + + if (error == -ENOENT || error == -EAGAIN) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + /* + * It's possible that this inode has lost all of its links but + * hasn't yet been inactivated. If we don't have a transaction + * or it's not writable, flush the inodegc workers and wait. + */ + xfs_inodegc_flush(mp); + return xchk_iscan_iget_retry(iscan, true); + } + + if (error == -EINVAL) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + /* + * We thought the inode was allocated, but the inode btree + * lookup failed, which means that it was freed since the last + * time we advanced the cursor. Back up and try again. This + * should never happen since we still hold the AGI buffer from the + * inobt check, but we need to be careful about infinite loops. + */ + return xchk_iscan_iget_retry(iscan, false); + } + + if (error) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + return error; + } + idx++; + ino++; + allocmask >>= 1; + + /* + * Now that we've filled the first slot in __inodes, try to fill the + * rest of the batch with consecutively ordered inodes to reduce the + * number of _iter calls. Make a bitmap of unallocated inodes from the + * zeroes in the inuse bitmap; these inodes will not be scanned, but + * the _want_live_update predicate will pass through all live updates. + * + * If we can't iget an allocated inode, stop and return what we have.
+ */ + mutex_lock(&iscan->lock); + iscan->__batch_ino = ino - 1; + iscan->__skipped_inomask = 0; + mutex_unlock(&iscan->lock); + + for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) { + if (!(allocmask & 1)) { + ASSERT(!(iscan->__skipped_inomask & (1ULL << i))); + + mutex_lock(&iscan->lock); + iscan->cursor_ino = ino; + iscan->__skipped_inomask |= (1ULL << i); + mutex_unlock(&iscan->lock); + continue; + } + + ASSERT(iscan->__inodes[idx] == NULL); + + error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + &iscan->__inodes[idx]); + if (error) + break; + + mutex_lock(&iscan->lock); + iscan->cursor_ino = ino; + mutex_unlock(&iscan->lock); + idx++; + } + + trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx); + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + return idx; +} + +/* + * Advance the visit cursor to reflect skipped inodes beyond whatever we + * scanned. + */ +STATIC void +xchk_iscan_finish_batch( + struct xchk_iscan *iscan) +{ + xfs_ino_t highest_skipped; + + mutex_lock(&iscan->lock); + + if (iscan->__batch_ino != NULLFSINO) { + highest_skipped = iscan->__batch_ino + + xfs_highbit64(iscan->__skipped_inomask); + iscan->__visited_ino = max(iscan->__visited_ino, + highest_skipped); + + trace_xchk_iscan_skip(iscan); + } + + iscan->__batch_ino = NULLFSINO; + iscan->__skipped_inomask = 0; + + mutex_unlock(&iscan->lock); +} + +/* + * Advance the inode scan cursor to the next allocated inode and return up to + * 64 consecutive allocated inodes starting with the cursor position. 
+ */ +STATIC int +xchk_iscan_iter_batch( + struct xchk_iscan *iscan) +{ + struct xfs_scrub *sc = iscan->sc; + int ret; + + xchk_iscan_finish_batch(iscan); + + if (iscan->iget_timeout) + iscan->__iget_deadline = jiffies + + msecs_to_jiffies(iscan->iget_timeout); + + do { + struct xfs_buf *agi_bp = NULL; + struct xfs_perag *pag = NULL; + xfs_inofree_t allocmask = 0; + uint8_t nr_inodes = 0; + + ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask, + &nr_inodes); + if (ret != 1) + return ret; + + if (xchk_iscan_aborted(iscan)) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + ret = -ECANCELED; + break; + } + + ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * Advance the inode scan cursor to the next allocated inode and return the + * incore inode structure associated with it. + * + * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes, + * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be + * grabbed, or the usual negative errno. + * + * If the function returns -EBUSY and the caller can handle skipping an inode, + * it may call this function again to continue the scan with the next allocated + * inode. + */ +int +xchk_iscan_iter( + struct xchk_iscan *iscan, + struct xfs_inode **ipp) +{ + unsigned int i; + int error; + + /* Find a cached inode, or go get another batch. */ + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (iscan->__inodes[i]) + goto foundit; + } + + error = xchk_iscan_iter_batch(iscan); + if (error <= 0) + return error; + + ASSERT(iscan->__inodes[0] != NULL); + i = 0; + +foundit: + /* Give the caller our reference. */ + *ipp = iscan->__inodes[i]; + iscan->__inodes[i] = NULL; + return 1; +} + +/* Clean up an xchk_iscan_iter call by dropping any inodes that we still hold.
*/ +void +xchk_iscan_iter_finish( + struct xchk_iscan *iscan) +{ + struct xfs_scrub *sc = iscan->sc; + unsigned int i; + + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (iscan->__inodes[i]) { + xchk_irele(sc, iscan->__inodes[i]); + iscan->__inodes[i] = NULL; + } + } +} + +/* Mark this inode scan finished and release resources. */ +void +xchk_iscan_teardown( + struct xchk_iscan *iscan) +{ + xchk_iscan_iter_finish(iscan); + xchk_iscan_finish(iscan); + mutex_destroy(&iscan->lock); +} + +/* Pick an AG from which to start a scan. */ +static inline xfs_ino_t +xchk_iscan_rotor( + struct xfs_mount *mp) +{ + static atomic_t agi_rotor; + unsigned int r = atomic_inc_return(&agi_rotor) - 1; + + /* + * Rotoring *backwards* through the AGs, so we add one here before + * subtracting from the agcount to arrive at an AG number. + */ + r = (r % mp->m_sb.sb_agcount) + 1; + + return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0); +} + +/* + * Set ourselves up to start an inode scan. If the @iget_timeout and + * @iget_retry_delay parameters are set, the scan will try to iget each inode + * for @iget_timeout milliseconds. If an iget call indicates that the inode is + * waiting to be inactivated, the CPU will relax for @iget_retry_delay + * milliseconds after pushing the inactivation workers. 
+ */ +void +xchk_iscan_start( + struct xfs_scrub *sc, + unsigned int iget_timeout, + unsigned int iget_retry_delay, + struct xchk_iscan *iscan) +{ + xfs_ino_t start_ino; + + start_ino = xchk_iscan_rotor(sc->mp); + + iscan->__batch_ino = NULLFSINO; + iscan->__skipped_inomask = 0; + + iscan->sc = sc; + clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); + iscan->iget_timeout = iget_timeout; + iscan->iget_retry_delay = iget_retry_delay; + iscan->__visited_ino = start_ino; + iscan->cursor_ino = start_ino; + iscan->scan_start_ino = start_ino; + mutex_init(&iscan->lock); + memset(iscan->__inodes, 0, sizeof(iscan->__inodes)); + + trace_xchk_iscan_start(iscan, start_ino); +} + +/* + * Mark this inode as having been visited. Callers must hold a sufficiently + * exclusive lock on the inode to prevent concurrent modifications. + */ +void +xchk_iscan_mark_visited( + struct xchk_iscan *iscan, + struct xfs_inode *ip) +{ + mutex_lock(&iscan->lock); + iscan->__visited_ino = ip->i_ino; + trace_xchk_iscan_visit(iscan); + mutex_unlock(&iscan->lock); +} + +/* + * Did we skip this inode because it wasn't allocated when we loaded the batch? + * If so, it is newly allocated and will not be scanned. All live updates to + * this inode must be passed to the caller to maintain scan correctness. + */ +static inline bool +xchk_iscan_skipped( + const struct xchk_iscan *iscan, + xfs_ino_t ino) +{ + if (iscan->__batch_ino == NULLFSINO) + return false; + if (ino < iscan->__batch_ino) + return false; + if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK) + return false; + + return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino)); +} + +/* + * Do we need a live update for this inode? This is true if the scanner thread + * has visited this inode and the scan hasn't been aborted due to errors. + * Callers must hold a sufficiently exclusive lock on the inode to prevent + * scanners from reading any inode metadata. 
+ */ +bool +xchk_iscan_want_live_update( + struct xchk_iscan *iscan, + xfs_ino_t ino) +{ + bool ret = false; + + if (xchk_iscan_aborted(iscan)) + return false; + + mutex_lock(&iscan->lock); + + trace_xchk_iscan_want_live_update(iscan, ino); + + /* Scan is finished, caller should receive all updates. */ + if (iscan->__visited_ino == NULLFSINO) { + ret = true; + goto unlock; + } + + /* + * No inodes have been visited yet, so the visited cursor points at the + * start of the scan range. The caller should not receive any updates. + */ + if (iscan->scan_start_ino == iscan->__visited_ino) { + ret = false; + goto unlock; + } + + /* + * This inode was not allocated at the time of the iscan batch. + * The caller should receive all updates. + */ + if (xchk_iscan_skipped(iscan, ino)) { + ret = true; + goto unlock; + } + + /* + * The visited cursor hasn't yet wrapped around the end of the FS. If + * @ino is inside the starred range, the caller should receive updates: + * + * 0 ------------ S ************ V ------------ EOFS + */ + if (iscan->scan_start_ino <= iscan->__visited_ino) { + if (ino >= iscan->scan_start_ino && + ino <= iscan->__visited_ino) + ret = true; + + goto unlock; + } + + /* + * The visited cursor wrapped around the end of the FS. If @ino is + * inside the starred range, the caller should receive updates: + * + * 0 ************ V ------------ S ************ EOFS + */ + if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino) + ret = true; + +unlock: + mutex_unlock(&iscan->lock); + return ret; +} diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h new file mode 100644 index 000000000000..71f657552dfa --- /dev/null +++ b/fs/xfs/scrub/iscan.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ISCAN_H__ +#define __XFS_SCRUB_ISCAN_H__ + +struct xchk_iscan { + struct xfs_scrub *sc; + + /* Lock to protect the scan cursor. */ + struct mutex lock; + + /* + * This is the first inode in the inumber address space that we + * examined. When the scan wraps around back to here, the scan is + * finished. + */ + xfs_ino_t scan_start_ino; + + /* This is the inode that will be examined next. */ + xfs_ino_t cursor_ino; + + /* If nonzero and non-NULL, skip this inode when scanning. */ + xfs_ino_t skip_ino; + + /* + * This is the last inode that we've successfully scanned, either + * because the caller scanned it, or we moved the cursor past an empty + * part of the inode address space. Scan callers should only use the + * xchk_iscan_mark_visited function to modify this. + */ + xfs_ino_t __visited_ino; + + /* Operational state of the livescan. */ + unsigned long __opstate; + + /* Give up on iterating @cursor_ino if we can't iget it by this time. */ + unsigned long __iget_deadline; + + /* Amount of time (in ms) that we will try to iget an inode. */ + unsigned int iget_timeout; + + /* Wait this many ms to retry an iget. */ + unsigned int iget_retry_delay; + + /* + * The scan grabs batches of inodes and stashes them here before + * handing them out with _iter. Unallocated inodes are set in the + * mask so that all updates to that inode are selected for live + * update propagation. + */ + xfs_ino_t __batch_ino; + xfs_inofree_t __skipped_inomask; + struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK]; +}; + +/* Set if the scan has been aborted due to some event in the fs.
*/ +#define XCHK_ISCAN_OPSTATE_ABORTED (1) + +static inline bool +xchk_iscan_aborted(const struct xchk_iscan *iscan) +{ + return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); +} + +static inline void +xchk_iscan_abort(struct xchk_iscan *iscan) +{ + set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); +} + +void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout, + unsigned int iget_retry_delay, struct xchk_iscan *iscan); +void xchk_iscan_teardown(struct xchk_iscan *iscan); + +int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp); +void xchk_iscan_iter_finish(struct xchk_iscan *iscan); + +void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip); +bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino); + +#endif /* __XFS_SCRUB_ISCAN_H__ */ diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c new file mode 100644 index 000000000000..4a0271123d94 --- /dev/null +++ b/fs/xfs/scrub/newbt.c @@ -0,0 +1,567 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_defer.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/newbt.h" + +/* + * Estimate proper slack values for a btree that's being reloaded. + * + * Under most circumstances, we'll take whatever default loading value the + * btree bulk loading code calculates for us. However, there are some + * exceptions to this rule: + * + * (0) If someone turned one of the debug knobs. 
+ * (1) If this is a per-AG btree and the AG has less than 10% space free. + * (2) If this is an inode btree and the FS has less than 10% space free. + + * In either case, format the new btree blocks almost completely full to + * minimize space usage. + */ +static void +xrep_newbt_estimate_slack( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_btree_bload *bload = &xnr->bload; + uint64_t free; + uint64_t sz; + + /* + * The xfs_globals values are set to -1 (i.e. take the bload defaults) + * unless someone has set them otherwise, so we just pull the values + * here. + */ + bload->leaf_slack = xfs_globals.bload_leaf_slack; + bload->node_slack = xfs_globals.bload_node_slack; + + if (sc->ops->type == ST_PERAG) { + free = sc->sa.pag->pagf_freeblks; + sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); + } else { + free = percpu_counter_sum(&sc->mp->m_fdblocks); + sz = sc->mp->m_sb.sb_dblocks; + } + + /* No further changes if there's more than 10% free space left. */ + if (free >= div_u64(sz, 10)) + return; + + /* + * We're low on space; load the btrees as tightly as possible. Leave + * a couple of open slots in each btree block so that we don't end up + * splitting the btrees like crazy after a mount. + */ + if (bload->leaf_slack < 0) + bload->leaf_slack = 2; + if (bload->node_slack < 0) + bload->node_slack = 2; +} + +/* Initialize accounting resources for staging a new AG btree. */ +void +xrep_newbt_init_ag( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, + xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv) +{ + memset(xnr, 0, sizeof(struct xrep_newbt)); + xnr->sc = sc; + xnr->oinfo = *oinfo; /* structure copy */ + xnr->alloc_hint = alloc_hint; + xnr->resv = resv; + INIT_LIST_HEAD(&xnr->resv_list); + xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */ + xrep_newbt_estimate_slack(xnr); +} + +/* Initialize accounting resources for staging a new inode fork btree. 
*/ +int +xrep_newbt_init_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + int whichfork, + const struct xfs_owner_info *oinfo) +{ + struct xfs_ifork *ifp; + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + xrep_newbt_init_ag(xnr, sc, oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); + return 0; +} + +/* + * Initialize accounting resources for staging a new btree. Callers are + * expected to add their own reservations (and clean them up) manually. + */ +void +xrep_newbt_init_bare( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, + XFS_AG_RESV_NONE); +} + +/* + * Designate specific blocks to be used to build our new btree. @pag must be + * a passive reference. + */ +STATIC int +xrep_newbt_add_blocks( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + const struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xrep_newbt_resv *resv; + int error; + + resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); + if (!resv) + return -ENOMEM; + + INIT_LIST_HEAD(&resv->list); + resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + resv->len = args->len; + resv->used = 0; + resv->pag = xfs_perag_hold(pag); + + if (args->tp) { + ASSERT(xnr->oinfo.oi_offset == 0); + + error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); + if (error) + goto out_pag; + } + + list_add_tail(&resv->list, &xnr->resv_list); + return 0; +out_pag: + xfs_perag_put(resv->pag); + kfree(resv); + return error; +} + +/* + * Add an extent to the new btree reservation pool. Callers are required to + * reap this reservation manually if the repair is cancelled. @pag must be a + * passive reference. 
+ */ +int +xrep_newbt_add_extent( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xfs_alloc_arg args = { + .tp = NULL, /* no autoreap */ + .oinfo = xnr->oinfo, + .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), + .len = len, + .resv = xnr->resv, + }; + + return xrep_newbt_add_blocks(xnr, pag, &args); +} + +/* Don't let our allocation hint take us beyond this AG */ +static inline void +xrep_newbt_validate_ag_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); + + if (agno == sc->sa.pag->pag_agno && + xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for a new per-AG btree. */ +STATIC int +xrep_newbt_alloc_ag_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + ASSERT(sc->sa.pag != NULL); + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + xfs_agnumber_t agno; + + xrep_newbt_validate_ag_alloc_hint(xnr); + + if (xnr->alloc_vextent) + error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); + else + error = xfs_alloc_vextent_near_bno(&args, + xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_ag_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + if (agno != sc->sa.pag->pag_agno) { + ASSERT(agno == sc->sa.pag->pag_agno); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); + if (error) + return error; + + nr_blocks -= args.len; + 
xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Don't let our allocation hint take us beyond EOFS */ +static inline void +xrep_newbt_validate_file_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + + if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for our new file-based btree. */ +STATIC int +xrep_newbt_alloc_file_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xrep_newbt_validate_file_alloc_hint(xnr); + + if (xnr->alloc_vextent) + error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); + else + error = xfs_alloc_vextent_start_ag(&args, + xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_file_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + pag = xfs_perag_get(mp, agno); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, pag, &args); + xfs_perag_put(pag); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Allocate disk space for our new btree. 
*/ +int +xrep_newbt_alloc_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + if (xnr->sc->ip) + return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); + return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); +} + +/* + * Free the unused part of a space extent that was reserved for a new ondisk + * structure. Returns the number of EFIs logged or a negative errno. + */ +STATIC int +xrep_newbt_free_extent( + struct xrep_newbt *xnr, + struct xrep_newbt_resv *resv, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agblock_t free_agbno = resv->agbno; + xfs_extlen_t free_aglen = resv->len; + xfs_fsblock_t fsbno; + int error; + + if (!btree_committed || resv->used == 0) { + /* + * If we're not committing a new btree or we didn't use the + * space reservation, let the existing EFI free the entire + * space extent. + */ + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, + free_agbno, free_aglen, xnr->oinfo.oi_owner); + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + return 1; + } + + /* + * We used space and committed the btree. Cancel the autoreap, remove + * the written blocks from the reservation, and possibly log a new EFI + * to free any unused reservation space. + */ + xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); + free_agbno += resv->used; + free_aglen -= resv->used; + + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, xnr->oinfo.oi_owner); + + ASSERT(xnr->resv != XFS_AG_RESV_AGFL); + ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); + + /* + * Use EFIs to free the reservations. This reduces the chance + * that we leak blocks if the system goes down. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); + error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, + xnr->resv, true); + if (error) + return error; + + return 1; +} + +/* Free all the accounting info and disk space we reserved for a new btree. 
*/ +STATIC int +xrep_newbt_free( + struct xrep_newbt *xnr, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + struct xrep_newbt_resv *resv, *n; + unsigned int freed = 0; + int error = 0; + + /* + * If the filesystem already went down, we can't free the blocks. Skip + * ahead to freeing the incore metadata because we can't fix anything. + */ + if (xfs_is_shutdown(sc->mp)) + goto junkit; + + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + int ret; + + ret = xrep_newbt_free_extent(xnr, resv, btree_committed); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + if (ret < 0) { + error = ret; + goto junkit; + } + + freed += ret; + if (freed >= XREP_MAX_ITRUNCATE_EFIS) { + error = xrep_defer_finish(sc); + if (error) + goto junkit; + freed = 0; + } + } + + if (freed) + error = xrep_defer_finish(sc); + +junkit: + /* + * If we still have reservations attached to @newbt, cleanup must have + * failed and the filesystem is about to go down. Clean up the incore + * reservations and try to commit to freeing the space we used. + */ + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + } + + if (sc->ip) { + kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork); + xnr->ifake.if_fork = NULL; + } + + return error; +} + +/* + * Free all the accounting info and unused disk space allocations after + * committing a new btree. + */ +int +xrep_newbt_commit( + struct xrep_newbt *xnr) +{ + return xrep_newbt_free(xnr, true); +} + +/* + * Free all the accounting info and all of the disk space we reserved for a new + * btree that we're not going to commit. We want to try to roll things back + * cleanly for things like ENOSPC midway through allocation. + */ +void +xrep_newbt_cancel( + struct xrep_newbt *xnr) +{ + xrep_newbt_free(xnr, false); +} + +/* Feed one of the reserved btree blocks to the bulk loader. 
*/ +int +xrep_newbt_claim_block( + struct xfs_btree_cur *cur, + struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr) +{ + struct xrep_newbt_resv *resv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + + /* + * The first item in the list should always have a free block unless + * we're completely out. + */ + resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); + if (resv->used == resv->len) + return -ENOSPC; + + /* + * Peel off a block from the start of the reservation. We allocate + * blocks in order to place blocks on disk in increasing record or key + * order. The block reservations tend to end up on the list in + * decreasing order, which hopefully results in leaf blocks ending up + * together. + */ + agbno = resv->agbno + resv->used; + resv->used++; + + /* If we used all the blocks in this reservation, move it to the end. */ + if (resv->used == resv->len) + list_move_tail(&resv->list, &xnr->resv_list); + + trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, + xnr->oinfo.oi_owner); + + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) + ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, + agbno)); + else + ptr->s = cpu_to_be32(agbno); + + /* Relog all the EFIs. */ + return xrep_defer_finish(xnr->sc); +} + +/* How many reserved blocks are unused? */ +unsigned int +xrep_newbt_unused_blocks( + struct xrep_newbt *xnr) +{ + struct xrep_newbt_resv *resv; + unsigned int unused = 0; + + list_for_each_entry(resv, &xnr->resv_list, list) + unused += resv->len - resv->used; + return unused; +} diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h new file mode 100644 index 000000000000..3d804d31af24 --- /dev/null +++ b/fs/xfs/scrub/newbt.h @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NEWBT_H__ +#define __XFS_SCRUB_NEWBT_H__ + +struct xfs_alloc_arg; + +struct xrep_newbt_resv { + /* Link to list of extents that we've reserved. */ + struct list_head list; + + struct xfs_perag *pag; + + /* Auto-freeing this reservation if we don't commit. */ + struct xfs_alloc_autoreap autoreap; + + /* AG block of the extent we reserved. */ + xfs_agblock_t agbno; + + /* Length of the reservation. */ + xfs_extlen_t len; + + /* How much of this reservation has been used. */ + xfs_extlen_t used; +}; + +struct xrep_newbt { + struct xfs_scrub *sc; + + /* Custom allocation function, or NULL for xfs_alloc_vextent */ + int (*alloc_vextent)(struct xfs_scrub *sc, + struct xfs_alloc_arg *args, + xfs_fsblock_t alloc_hint); + + /* List of extents that we've reserved. */ + struct list_head resv_list; + + /* Fake root for new btree. */ + union { + struct xbtree_afakeroot afake; + struct xbtree_ifakeroot ifake; + }; + + /* rmap owner of these blocks */ + struct xfs_owner_info oinfo; + + /* btree geometry for the bulk loader */ + struct xfs_btree_bload bload; + + /* Allocation hint */ + xfs_fsblock_t alloc_hint; + + /* per-ag reservation type */ + enum xfs_ag_resv_type resv; +}; + +void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc); +void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv); +int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, + int whichfork, const struct xfs_owner_info *oinfo); +int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); +int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len); +void xrep_newbt_cancel(struct xrep_newbt *xnr); +int xrep_newbt_commit(struct xrep_newbt *xnr); +int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr); 
+unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr); + +#endif /* __XFS_SCRUB_NEWBT_H__ */ diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c new file mode 100644 index 000000000000..8a7d9557897c --- /dev/null +++ b/fs/xfs/scrub/nlinks.c @@ -0,0 +1,930 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" +#include "scrub/readdir.h" + +/* + * Live Inode Link Count Checking + * ============================== + * + * Inode link counts are "summary" metadata, in the sense that they are + * computed as the number of directory entries referencing each file on the + * filesystem. Therefore, we compute the correct link counts by creating a + * shadow link count structure and walking every inode. + */ + +/* Set us up to scrub inode link counts. */ +int +xchk_setup_nlinks( + struct xfs_scrub *sc) +{ + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_fs(sc); +} + +/* + * Part 1: Collecting file link counts. For each file, we create a shadow link + * counting structure, then walk the entire directory tree, incrementing parent + * and child link counts for each directory entry seen. 
+ * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. + * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the link counts for an inode that we've already scanned. + * This will cause our counts to be incorrect. Therefore, we hook all + * directory entry updates because that is when link count updates occur. By + * shadowing transaction updates in this manner, live nlink check can ensure by + * locking the inode and the shadow structure that its own copies are not out + * of date. Because the hook code runs in a different process context from the + * scrub code and the scrub state flags are not accessed atomically, failures + * in the hook code must abort the iscan and the scrubber must notice the + * aborted scan and set the incomplete flag. + * + * Note that we use jump labels and srcu notifier hooks to minimize the + * overhead when live nlinks is /not/ running. Locking order for nlink + * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock. + */ + +/* + * Add a delta to an nlink counter, clamping the value to U32_MAX. Because + * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results + * even if we lose some precision. + */ +static inline void +careful_add( + xfs_nlink_t *nlinkp, + int delta) +{ + uint64_t new_value = (uint64_t)(*nlinkp) + delta; + + BUILD_BUG_ON(XFS_MAXLINK > U32_MAX); + *nlinkp = min_t(uint64_t, new_value, U32_MAX); +} + +/* Update incore link count information. Caller must hold the nlinks lock. 
*/ +STATIC int +xchk_nlinks_update_incore( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + int parents_delta, + int backrefs_delta, + int children_delta) +{ + struct xchk_nlink nl; + int error; + + if (!xnc->nlinks) + return 0; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta, + backrefs_delta, children_delta); + + careful_add(&nl.parents, parents_delta); + careful_add(&nl.backrefs, backrefs_delta); + careful_add(&nl.children, children_delta); + + nl.flags |= XCHK_NLINK_WRITTEN; + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + error = -ECANCELED; + } + return error; +} + +/* + * Apply a link count change from the regular filesystem into our shadow link + * count structure based on a directory update in progress. + */ +STATIC int +xchk_nlinks_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xchk_nlink_ctrs *xnc; + int error; + + xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb); + + trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino, + p->delta, p->name->name, p->name->len); + + /* + * If we've already scanned @dp, update the number of parents that link + * to @ip. If @ip is a subdirectory, update the number of child links + * going out of @dp. 
+ */ + if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) { + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta, + 0, 0); + if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode)) + error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, + 0, p->delta); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + } + + /* + * If @ip is a subdirectory and we've already scanned it, update the + * number of backrefs pointing to @dp. + */ + if (S_ISDIR(VFS_IC(p->ip)->i_mode) && + xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) { + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, + p->delta, 0); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + } + + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); + return NOTIFY_DONE; +} + +/* Bump the observed link count for the inode referenced by this entry. */ +STATIC int +xchk_nlinks_collect_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xchk_nlink_ctrs *xnc = priv; + bool dot = false, dotdot = false; + int error; + + /* Does this name make sense? */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) { + error = -ECANCELED; + goto out_abort; + } + + if (name->len == 1 && name->name[0] == '.') + dot = true; + else if (name->len == 2 && name->name[0] == '.' && + name->name[1] == '.') + dotdot = true; + + /* Don't accept a '.' entry that points somewhere else. */ + if (dot && ino != dp->i_ino) { + error = -ECANCELED; + goto out_abort; + } + + /* Don't accept an invalid inode number. */ + if (!xfs_verify_dir_ino(sc->mp, ino)) { + error = -ECANCELED; + goto out_abort; + } + + /* Update the shadow link counts if we haven't already failed. 
*/ + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + + trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name); + + mutex_lock(&xnc->lock); + + /* + * If this is a dotdot entry, it is a back link from dp to ino. How + * we handle this depends on whether or not dp is the root directory. + * + * The root directory is its own parent, so we pretend the dotdot entry + * establishes the "parent" of the root directory. Increment the + * number of parents of the root directory. + * + * Otherwise, increment the number of backrefs pointing back to ino. + */ + if (dotdot) { + if (dp == sc->mp->m_rootip) + error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); + else + error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); + if (error) + goto out_unlock; + } + + /* + * If this dirent is a forward link from dp to ino, increment the + * number of parents linking into ino. + */ + if (!dot && !dotdot) { + error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); + if (error) + goto out_unlock; + } + + /* + * If this dirent is a forward link to a subdirectory, increment the + * number of child links of dp. + */ + if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) { + error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1); + if (error) + goto out_unlock; + } + + mutex_unlock(&xnc->lock); + return 0; + +out_unlock: + mutex_unlock(&xnc->lock); +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(sc); + return error; +} + +/* Walk a directory to bump the observed link counts of the children. */ +STATIC int +xchk_nlinks_collect_dir( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = xnc->sc; + unsigned int lock_mode; + int error = 0; + + /* Prevent anyone from changing this directory while we walk it. 
*/ + xfs_ilock(dp, XFS_IOLOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * The dotdot entry of an unlinked directory still points to the last + * parent, but the parent no longer links to this directory. Skip the + * directory to avoid overcounting. + */ + if (VFS_I(dp)->i_nlink == 0) + goto out_unlock; + + /* + * We cannot count file links if the directory looks as though it has + * been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_abort; + } + + error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc); + if (error == -ECANCELED) { + error = 0; + goto out_unlock; + } + if (error) + goto out_abort; + + xchk_iscan_mark_visited(&xnc->collect_iscan, dp); + goto out_unlock; + +out_abort: + xchk_set_incomplete(sc); + xchk_iscan_abort(&xnc->collect_iscan); +out_unlock: + xfs_iunlock(dp, lock_mode); + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return error; +} + +/* If this looks like a valid pointer, count it. */ +static inline int +xchk_nlinks_collect_metafile( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + if (!xfs_verify_ino(xnc->sc->mp, ino)) + return 0; + + trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino); + return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); +} + +/* Bump the link counts of metadata files rooted in the superblock. 
*/ +STATIC int +xchk_nlinks_collect_metafiles( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = xnc->sc->mp; + int error = -ECANCELED; + + + if (xchk_iscan_aborted(&xnc->collect_iscan)) + goto out_incomplete; + + mutex_lock(&xnc->lock); + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino); + if (error) + goto out_abort; + mutex_unlock(&xnc->lock); + + return 0; + +out_abort: + mutex_unlock(&xnc->lock); + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(xnc->sc); + return error; +} + +/* Advance the collection scan cursor for this non-directory file. */ +static inline int +xchk_nlinks_collect_file( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xchk_iscan_mark_visited(&xnc->collect_iscan, ip); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; +} + +/* Walk all directories and count inode links. */ +STATIC int +xchk_nlinks_collect( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_scrub *sc = xnc->sc; + struct xfs_inode *ip; + int error; + + /* Count the rt and quota files that are rooted in the superblock. */ + error = xchk_nlinks_collect_metafiles(xnc); + if (error) + return error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Cancel the transaction to release the log grant space while we scan + * the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. 
+ * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) { + if (S_ISDIR(VFS_I(ip)->i_mode)) + error = xchk_nlinks_collect_dir(xnc, ip); + else + error = xchk_nlinks_collect_file(xnc, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xnc->collect_iscan); + if (error) { + xchk_set_incomplete(sc); + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Switch out for a real transaction in preparation for building a new + * tree. + */ + xchk_trans_cancel(sc); + return xchk_setup_fs(sc); +} + +/* + * Part 2: Comparing file link counters. Walk each inode and compare the link + * counts against our shadow information; and then walk each shadow link count + * structure (that wasn't covered in the first part), comparing it against the + * file. + */ + +/* Read the observed link count for comparison with the actual inode. */ +STATIC int +xchk_nlinks_comparison_read( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + struct xchk_nlink *obs) +{ + struct xchk_nlink nl; + int error; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN); + + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. 
IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. This + * shouldn't really happen outside of the collection phase. + */ + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + if (error) + return error; + + /* Copy the counters, but do not expose the internal state. */ + obs->parents = nl.parents; + obs->backrefs = nl.backrefs; + obs->children = nl.children; + obs->flags = 0; + return 0; +} + +/* Check our link count against an inode. */ +STATIC int +xchk_nlinks_compare_inode( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + uint64_t total_links; + unsigned int actual_nlink; + int error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_scanlock; + } + + error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs); + if (error) + goto out_scanlock; + + /* + * If we don't have ftype to get an accurate count of the subdirectory + * entries in this directory, take advantage of the fact that on a + * consistent ftype=0 filesystem, the number of subdirectory + * backreferences (dotdot entries) pointing towards this directory + * should be equal to the number of subdirectory entries in the + * directory. + */ + if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode)) + obs.children = obs.backrefs; + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + + trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs); + + /* + * If we found so many parents that we'd overflow i_nlink, we must flag + * this as a corruption. The VFS won't let users increase the link + * count, but it will let them decrease it. + */ + if (total_links > XFS_MAXLINK) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + /* Link counts should match. 
*/ + if (total_links != actual_nlink) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) { + /* + * The collection phase ignores directories with zero link + * count, so we ignore them here too. + * + * The number of subdirectory backreferences (dotdot entries) + * pointing towards this directory should be equal to the + * number of subdirectory entries in the directory. + */ + if (obs.children != obs.backrefs) + xchk_ino_xref_set_corrupt(sc, ip->i_ino); + } else { + /* + * Non-directories and unlinked directories should not have + * back references. + */ + if (obs.backrefs != 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + /* + * Non-directories and unlinked directories should not have + * children. + */ + if (obs.children != 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } + + if (ip == sc->mp->m_rootip) { + /* + * For the root of a directory tree, both the '.' and '..' + * entries should point to the root directory. The dotdot + * entry is counted as a parent of the root /and/ a backref of + * the root directory. + */ + if (obs.parents != 1) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } else if (actual_nlink > 0) { + /* + * Linked files that are not the root directory should have at + * least one parent. + */ + if (obs.parents == 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } + +out_corrupt: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + error = -ECANCELED; +out_scanlock: + mutex_unlock(&xnc->lock); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Check our link count against an inode that wasn't checked previously. This + * is intended to catch directories with dangling links, though we could be + * racing with inode allocation in other threads. 
+ */ +STATIC int +xchk_nlinks_compare_inum( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + struct xchk_nlink obs; + struct xfs_mount *mp = xnc->sc->mp; + struct xfs_trans *tp = xnc->sc->tp; + struct xfs_buf *agi_bp; + struct xfs_inode *ip; + int error; + + /* + * The first iget failed, so try again with the variant that returns + * either an incore inode or the AGI buffer. If the function returns + * EINVAL/ENOENT, it should have passed us the AGI buffer so that we + * can guarantee that the inode won't be allocated while we check for + * a zero link count in the observed link count data. + */ + error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip); + if (!error) { + /* Actually got an inode, so use the inode compare. */ + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_irele(xnc->sc, ip); + return error; + } + if (error == -ENOENT || error == -EINVAL) { + /* No inode was found. Check for zero link count below. */ + error = 0; + } + if (error) + goto out_agi; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_agi; + } + + mutex_lock(&xnc->lock); + error = xchk_nlinks_comparison_read(xnc, ino, &obs); + if (error) + goto out_scanlock; + + trace_xchk_nlinks_check_zero(mp, ino, &obs); + + /* + * If we can't grab the inode, the link count had better be zero. We + * still hold the AGI to prevent inode allocation/freeing. + */ + if (xchk_nlink_total(NULL, &obs) != 0) { + xchk_ino_set_corrupt(xnc->sc, ino); + error = -ECANCELED; + } + +out_scanlock: + mutex_unlock(&xnc->lock); +out_agi: + if (agi_bp) + xfs_trans_brelse(tp, agi_bp); + return error; +} + +/* + * Try to visit every inode in the filesystem to compare the link count. 
Move + * on if we can't grab an inode, since we'll revisit unchecked nlink records in + * the second part. + */ +static int +xchk_nlinks_compare_iter( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode **ipp) +{ + int error; + + do { + error = xchk_iscan_iter(&xnc->compare_iscan, ipp); + } while (error == -EBUSY); + + return error; +} + +/* Compare the link counts we observed against the live information. */ +STATIC int +xchk_nlinks_compare( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink nl; + struct xfs_scrub *sc = xnc->sc; + struct xfs_inode *ip; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Create a new empty transaction so that we can advance the iscan + * cursor without deadlocking if the inobt has a cycle and push on the + * inactivation workqueue. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + /* + * Use the inobt to walk all allocated inodes to compare the link + * counts. Inodes skipped by _compare_iter will be tried again in the + * next phase of the scan. + */ + xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan); + while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) { + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_iscan_mark_visited(&xnc->compare_iscan, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xnc->compare_iscan); + xchk_iscan_teardown(&xnc->compare_iscan); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Walk all the non-null nlink observations that weren't checked in the + * previous step. 
+ */ + mutex_lock(&xnc->lock); + while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) { + xfs_ino_t ino = cur - 1; + + if (nl.flags & XCHK_NLINK_COMPARE_SCANNED) + continue; + + mutex_unlock(&xnc->lock); + + error = xchk_nlinks_compare_inum(xnc, ino); + if (error) + return error; + + if (xchk_should_terminate(xnc->sc, &error)) + return error; + + mutex_lock(&xnc->lock); + } + mutex_unlock(&xnc->lock); + + return error; +} + +/* Tear down everything associated with a nlinks check. */ +static void +xchk_nlinks_teardown_scan( + void *priv) +{ + struct xchk_nlink_ctrs *xnc = priv; + + /* Discourage any hook functions that might be running. */ + xchk_iscan_abort(&xnc->collect_iscan); + + xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook); + + xfarray_destroy(xnc->nlinks); + xnc->nlinks = NULL; + + xchk_iscan_teardown(&xnc->collect_iscan); + mutex_destroy(&xnc->lock); + xnc->sc = NULL; +} + +/* + * Scan all inodes in the entire filesystem to generate link count data. If + * the scan is successful, the counts will be left alive for a repair. If any + * error occurs, we'll tear everything down. + */ +STATIC int +xchk_nlinks_setup_scan( + struct xfs_scrub *sc, + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = sc->mp; + char *descr; + unsigned long long max_inos; + xfs_agnumber_t last_agno = mp->m_sb.sb_agcount - 1; + xfs_agino_t first_agino, last_agino; + int error; + + ASSERT(xnc->sc == NULL); + xnc->sc = sc; + + mutex_init(&xnc->lock); + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan); + + /* + * Set up enough space to store an nlink record for the highest + * possible inode number in this system. 
+ */ + xfs_agino_range(mp, last_agno, &first_agino, &last_agino); + max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1; + descr = xchk_xfile_descr(sc, "file link counts"); + error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos), + sizeof(struct xchk_nlink), &xnc->nlinks); + kfree(descr); + if (error) + goto out_teardown; + + /* + * Hook into the directory entry code so that we can capture updates to + * file link counts. The hook only triggers for inodes that were + * already scanned, and the scanner thread takes each inode's ILOCK, + * which means that any in-progress inode updates will finish before we + * can scan the inode. + */ + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update); + error = xfs_dir_hook_add(mp, &xnc->dhook); + if (error) + goto out_teardown; + + /* Use deferred cleanup to pass the inode link count data to repair. */ + sc->buf_cleanup = xchk_nlinks_teardown_scan; + return 0; + +out_teardown: + xchk_nlinks_teardown_scan(xnc); + return error; +} + +/* Scrub the link count of all inodes on the filesystem. */ +int +xchk_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error = 0; + + /* Set ourselves up to check link counts on the live filesystem. */ + error = xchk_nlinks_setup_scan(sc, xnc); + if (error) + return error; + + /* Walk all inodes, picking up link count information. */ + error = xchk_nlinks_collect(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Fail fast if we're not playing with a full dataset. */ + if (xchk_iscan_aborted(&xnc->collect_iscan)) + xchk_set_incomplete(sc); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare link counts. */ + error = xchk_nlinks_compare(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Check one last time for an incomplete dataset. 
*/ + if (xchk_iscan_aborted(&xnc->collect_iscan)) + xchk_set_incomplete(sc); + + return 0; +} diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h new file mode 100644 index 000000000000..a950f3daf204 --- /dev/null +++ b/fs/xfs/scrub/nlinks.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NLINKS_H__ +#define __XFS_SCRUB_NLINKS_H__ + +/* Live link count control structure. */ +struct xchk_nlink_ctrs { + struct xfs_scrub *sc; + + /* Shadow link count data and its mutex. */ + struct xfarray *nlinks; + struct mutex lock; + + /* + * The collection step uses a separate iscan context from the compare + * step because the collection iscan coordinates live updates to the + * observation data while this scanner is running. The compare iscan + * is secondary and can be reinitialized as needed. + */ + struct xchk_iscan collect_iscan; + struct xchk_iscan compare_iscan; + + /* + * Hook into directory updates so that we can receive live updates + * from other writer threads. + */ + struct xfs_dir_hook dhook; +}; + +/* + * In-core link counts for a given inode in the filesystem. + * + * For an empty rootdir, the directory entries and the field to which they are + * accounted are as follows: + * + * Root directory: + * + * . points to self (root.child) + * .. points to self (root.parent) + * f1 points to a child file (f1.parent) + * d1 points to a child dir (d1.parent, root.child) + * + * Subdirectory d1: + * + * . points to self (d1.child) + * .. points to root dir (root.backref) + * f2 points to child file (f2.parent) + * f3 points to root.f1 (f1.parent) + * + * root.nlink == 3 (root.dot, root.dotdot, root.d1) + * d1.nlink == 2 (root.d1, d1.dot) + * f1.nlink == 2 (root.f1, d1.f3) + * f2.nlink == 1 (d1.f2) + */ +struct xchk_nlink { + /* Count of forward links from parent directories to this file. 
*/ + xfs_nlink_t parents; + + /* + * Count of back links to this parent directory from child + * subdirectories. + */ + xfs_nlink_t backrefs; + + /* + * Count of forward links from this directory to all child files and + * the number of dot entries. Should be zero for non-directories. + */ + xfs_nlink_t children; + + /* Record state flags */ + unsigned int flags; +}; + +/* + * This incore link count has been written at least once. We never want to + * store an xchk_nlink that looks uninitialized. + */ +#define XCHK_NLINK_WRITTEN (1U << 0) + +/* Already checked this link count record. */ +#define XCHK_NLINK_COMPARE_SCANNED (1U << 1) + +/* Already made a repair with this link count record. */ +#define XREP_NLINK_DIRTY (1U << 2) + +/* Compute total link count, using large enough variables to detect overflow. */ +static inline uint64_t +xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live) +{ + uint64_t ret = live->parents; + + /* Add one link count for the dot entry of any linked directory. */ + if (ip && S_ISDIR(VFS_I(ip)->i_mode) && VFS_I(ip)->i_nlink) + ret++; + return ret + live->children; +} + +#endif /* __XFS_SCRUB_NLINKS_H__ */ diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c new file mode 100644 index 000000000000..b87618322f55 --- /dev/null +++ b/fs/xfs/scrub/nlinks_repair.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_sb.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" + +/* + * Live Inode Link Count Repair + * ============================ + * + * Use the live inode link count information that we collected to replace the + * nlink values of the incore inodes. A scrub->repair cycle should have left + * the live data and hooks active, so this is safe so long as we make sure the + * inode is locked. + */ + +/* + * Correct the link count of the given inode. Because we have to grab locks + * and resources in a certain order, it's possible that this will be a no-op. + */ +STATIC int +xrep_nlinks_repair_inode( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + uint64_t total_links; + uint64_t actual_nlink; + bool dirty = false; + int error; + + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_trans_ijoin(sc->tp, ip, 0); + + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_scanlock; + } + + error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs); + if (error) + goto out_scanlock; + + /* + * We're done accessing the shared scan data, so we can drop the lock. + * We still hold @ip's ILOCK, so its link count cannot change. 
+ */ + mutex_unlock(&xnc->lock); + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + + /* + * Non-directories cannot have directories pointing up to them. + * + * We previously set error to zero, but set it again because one static + * checker author fears that programmers will fail to maintain this + * invariant and built their tool to flag this as a security risk. A + * different tool author made their bot complain about the redundant + * store. This is a never-ending and stupid battle; both tools missed + * *actual bugs* elsewhere; and I no longer care. + */ + if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.children != 0) { + trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + error = 0; + goto out_trans; + } + + /* + * We did not find any links to this inode. If the inode agrees, we + * have nothing further to do. If not, the inode has a nonzero link + * count and we don't have anywhere to graft the child onto. Dropping + * a live inode's link count to zero can cause unexpected shutdowns in + * inactivation, so leave it alone. + */ + if (total_links == 0) { + if (actual_nlink != 0) + trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + goto out_trans; + } + + /* Commit the new link count if it changed. */ + if (total_links != actual_nlink) { + if (total_links > XFS_MAXLINK) { + trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + goto out_trans; + } + + trace_xrep_nlinks_update_inode(mp, ip, &obs); + + set_nlink(VFS_I(ip), total_links); + dirty = true; + } + + if (!dirty) { + error = 0; + goto out_trans; + } + + xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); + + error = xrep_trans_commit(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + return error; + +out_scanlock: + mutex_unlock(&xnc->lock); +out_trans: + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + return error; +} + +/* + * Try to visit every inode in the filesystem for repairs. 
Move on if we can't + * grab an inode, since we're still making forward progress. + */ +static int +xrep_nlinks_iter( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode **ipp) +{ + int error; + + do { + error = xchk_iscan_iter(&xnc->compare_iscan, ipp); + } while (error == -EBUSY); + + return error; +} + +/* Commit the new inode link counters. */ +int +xrep_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error; + + /* + * We need ftype for an accurate count of the number of child + * subdirectory links. Child subdirectories with a back link (dotdot + * entry) but no forward link are unfixable, so we cannot repair the + * link count of the parent directory based on the back link count + * alone. Filesystems without ftype support are rare (old V4) so we + * just skip out here. + */ + if (!xfs_has_ftype(sc->mp)) + return -EOPNOTSUPP; + + /* + * Use the inobt to walk all allocated inodes to compare and fix the + * link counts. Retry iget every tenth of a second for up to 30 + * seconds -- even if repair misses a few inodes, we still try to fix + * as many of them as we can. + */ + xchk_iscan_start(sc, 30000, 100, &xnc->compare_iscan); + ASSERT(sc->ip == NULL); + + while ((error = xrep_nlinks_iter(xnc, &sc->ip)) == 1) { + /* + * Commit the scrub transaction so that we can create repair + * transactions with the correct reservations. + */ + xchk_trans_cancel(sc); + + error = xrep_nlinks_repair_inode(xnc); + xchk_iscan_mark_visited(&xnc->compare_iscan, sc->ip); + xchk_irele(sc, sc->ip); + sc->ip = NULL; + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + + /* + * Create a new empty transaction so that we can advance the + * iscan cursor without deadlocking if the inobt has a cycle. + * We can only push the inactivation workqueues with an empty + * transaction. 
+ */ + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + xchk_iscan_iter_finish(&xnc->compare_iscan); + xchk_iscan_teardown(&xnc->compare_iscan); + + return error; +} diff --git a/fs/xfs/scrub/off_bitmap.h b/fs/xfs/scrub/off_bitmap.h new file mode 100644 index 000000000000..0d3f9e6c1aad --- /dev/null +++ b/fs/xfs/scrub/off_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_OFF_BITMAP_H__ +#define __XFS_SCRUB_OFF_BITMAP_H__ + +/* Bitmaps, but type-checked for xfs_fileoff_t */ + +struct xoff_bitmap { + struct xbitmap64 offbitmap; +}; + +static inline void xoff_bitmap_init(struct xoff_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->offbitmap); +} + +static inline void xoff_bitmap_destroy(struct xoff_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->offbitmap); +} + +static inline int xoff_bitmap_set(struct xoff_bitmap *bitmap, + xfs_fileoff_t off, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->offbitmap, off, len); +} + +static inline int xoff_bitmap_walk(struct xoff_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->offbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_OFF_BITMAP_H__ */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index e6155d86f791..7db873672146 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -156,6 +156,16 @@ xchk_parent_validate( goto out_rele; } + /* + * We cannot yet validate this parent pointer if the directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + xchk_set_incomplete(sc); + goto out_unlock; + } + /* Look for a directory entry in the parent pointing to the child.
*/ error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) @@ -217,6 +227,13 @@ xchk_parent( */ error = xchk_parent_validate(sc, parent_ino); } while (error == -EAGAIN); + if (error == -EBUSY) { + /* + * We could not scan a directory, so we marked the check + * incomplete. No further error return is necessary. + */ + return 0; + } return error; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 5671c8153433..183d531875ea 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" @@ -17,9 +18,10 @@ #include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/quota.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ -static inline xfs_dqtype_t +xfs_dqtype_t xchk_quota_to_dqtype( struct xfs_scrub *sc) { @@ -75,14 +77,70 @@ struct xchk_quota_info { xfs_dqid_t last_id; }; +/* There's a written block backing this dquot, right? 
*/ +STATIC int +xchk_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + xfs_fileoff_t offset) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + if (!xfs_verify_fileoff(mp, offset)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (dq->q_fileoffset != offset) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps != 1) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (!xfs_verify_fsbno(mp, irec.br_startblock)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (XFS_FSB_TO_DADDR(mp, irec.br_startblock) != dq->q_blkno) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (!xfs_bmap_is_written_extent(&irec)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + return 0; +} + +/* Complain if a quota timer is incorrectly set. */ +static inline void +xchk_quota_item_timer( + struct xfs_scrub *sc, + xfs_fileoff_t offset, + const struct xfs_dquot_res *res) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else { + if (res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } +} + /* Scrub the fields in an individual quota item. */ STATIC int xchk_quota_item( - struct xfs_dquot *dq, - xfs_dqtype_t dqtype, - void *priv) + struct xchk_quota_info *sqi, + struct xfs_dquot *dq) { - struct xchk_quota_info *sqi = priv; struct xfs_scrub *sc = sqi->sc; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -94,6 +152,17 @@ xchk_quota_item( return error; /* + * We want to validate the bmap record for the storage backing this + * dquot, so we need to lock the dquot and the quota file. 
For quota + * operations, the locking order is first the ILOCK and then the dquot. + * However, dqiterate gave us a locked dquot, so drop the dquot lock to + * get the ILOCK. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_SHARED); + xfs_dqlock(dq); + + /* * Except for the root dquot, the actual dquot we got must either have * the same or higher id as we saw before. */ @@ -103,6 +172,11 @@ xchk_quota_item( sqi->last_id = dq->q_id; + error = xchk_quota_item_bmap(sc, dq, offset); + xchk_iunlock(sc, XFS_ILOCK_SHARED); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) + return error; + /* * Warn if the hard limits are larger than the fs. * Administrators can do this, though in production this seems @@ -166,6 +240,10 @@ xchk_quota_item( dq->q_rtb.count > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + xchk_quota_item_timer(sc, offset, &dq->q_blk); + xchk_quota_item_timer(sc, offset, &dq->q_ino); + xchk_quota_item_timer(sc, offset, &dq->q_rtb); + out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -ECANCELED; @@ -191,7 +269,7 @@ xchk_quota_data_fork( return error; /* Check for data fork problems that apply only to quota files. */ - max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) @@ -218,9 +296,11 @@ int xchk_quota( struct xfs_scrub *sc) { - struct xchk_quota_info sqi; + struct xchk_dqiter cursor = { }; + struct xchk_quota_info sqi = { .sc = sc }; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *dq; xfs_dqtype_t dqtype; int error = 0; @@ -239,10 +319,15 @@ xchk_quota( * functions. 
*/ xchk_iunlock(sc, sc->ilock_flags); - sqi.sc = sc; - sqi.last_id = 0; - error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); - xchk_ilock(sc, XFS_ILOCK_EXCL); + + /* Now look for things that the quota verifiers won't complain about. */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xchk_quota_item(&sqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } if (error == -ECANCELED) error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h new file mode 100644 index 000000000000..6c7134ce2385 --- /dev/null +++ b/fs/xfs/scrub/quota.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_QUOTA_H__ +#define __XFS_SCRUB_QUOTA_H__ + +xfs_dqtype_t xchk_quota_to_dqtype(struct xfs_scrub *sc); + +/* dquot iteration code */ + +struct xchk_dqiter { + struct xfs_scrub *sc; + + /* Quota file that we're walking. */ + struct xfs_inode *quota_ip; + + /* Cached data fork mapping for the dquot. */ + struct xfs_bmbt_irec bmap; + + /* The next dquot to scan. */ + uint64_t id; + + /* Quota type (user/group/project). */ + xfs_dqtype_t dqtype; + + /* Data fork sequence number to detect stale mappings. */ + unsigned int if_seq; +}; + +void xchk_dqiter_init(struct xchk_dqiter *cursor, struct xfs_scrub *sc, + xfs_dqtype_t dqtype); +int xchk_dquot_iter(struct xchk_dqiter *cursor, struct xfs_dquot **dqpp); + +#endif /* __XFS_SCRUB_QUOTA_H__ */ diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c new file mode 100644 index 000000000000..0bab4c30cb85 --- /dev/null +++ b/fs/xfs/scrub/quota_repair.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "xfs_reflink.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Quota Repair + * ============ + * + * Quota repairs are fairly simplistic; we fix everything that the dquot + * verifiers complain about, cap any counters or limits that make no sense, + * and schedule a quotacheck if we had to fix anything. We also repair any + * data fork extent records that don't apply to metadata files. + */ + +struct xrep_quota_info { + struct xfs_scrub *sc; + bool need_quotacheck; +}; + +/* + * Allocate a new block into a sparse hole in the quota file backing this + * dquot, initialize the block, and commit the whole mess. + */ +STATIC int +xrep_quota_item_fill_bmap_hole( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + struct xfs_bmbt_irec *irec) +{ + struct xfs_buf *bp; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Map a block into the file. 
*/ + error = xfs_trans_reserve_more(sc->tp, XFS_QM_DQALLOC_SPACE_RES(mp), + 0); + if (error) + return error; + + error = xfs_bmapi_write(sc->tp, sc->ip, dq->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, + irec, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -ENOSPC; + + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock); + + trace_xrep_dquot_item_fill_bmap_hole(sc->mp, dq->q_type, dq->q_id); + + /* Initialize the new block. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, dq->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + return error; + bp->b_ops = &xfs_dquot_buf_ops; + + xfs_qm_init_dquot_blk(sc->tp, dq->q_id, dq->q_type, bp); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + + /* + * Finish the mapping transactions and roll one more time to + * disconnect sc->ip from sc->tp. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + return xfs_trans_roll(&sc->tp); +} + +/* Make sure there's a written block backing this dquot */ +STATIC int +xrep_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + bool *dirty) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_fileoff_t offset = dq->q_id / qi->qi_dqperchunk; + int nmaps = 1; + int error; + + /* The computed file offset should always be valid. */ + if (!xfs_verify_fileoff(mp, offset)) { + ASSERT(xfs_verify_fileoff(mp, offset)); + return -EFSCORRUPTED; + } + dq->q_fileoffset = offset; + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps < 1 || !xfs_bmap_is_real_extent(&irec)) { + /* Hole/delalloc extent; allocate a real block. */ + error = xrep_quota_item_fill_bmap_hole(sc, dq, &irec); + if (error) + return error; + } else if (irec.br_state != XFS_EXT_NORM) { + /* Unwritten extent, which we already took care of? 
*/ + ASSERT(irec.br_state == XFS_EXT_NORM); + return -EFSCORRUPTED; + } else if (dq->q_blkno != XFS_FSB_TO_DADDR(mp, irec.br_startblock)) { + /* + * If the cached daddr is incorrect, repair probably punched a + * hole out of the quota file and filled it back in with a new + * block. Update the block mapping in the dquot. + */ + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec.br_startblock); + } + + *dirty = true; + return 0; +} + +/* Reset quota timers if incorrectly set. */ +static inline void +xrep_quota_item_timer( + struct xfs_scrub *sc, + const struct xfs_dquot_res *res, + bool *dirty) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + *dirty = true; + } else { + if (res->timer) + *dirty = true; + } +} + +/* Repair the fields in an individual quota item. */ +STATIC int +xrep_quota_item( + struct xrep_quota_info *rqi, + struct xfs_dquot *dq) +{ + struct xfs_scrub *sc = rqi->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t fs_icount; + bool dirty = false; + int error = 0; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * We might need to fix holes in the bmap record for the storage + * backing this dquot, so we need to lock the dquot and the quota file. + * dqiterate gave us a locked dquot, so drop the dquot lock to get the + * ILOCK_EXCL. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_dqlock(dq); + + error = xrep_quota_item_bmap(sc, dq, &dirty); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (error) + return error; + + /* Check the limits.
*/ + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) { + dq->q_blk.softlimit = dq->q_blk.hardlimit; + dirty = true; + } + + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) { + dq->q_ino.softlimit = dq->q_ino.hardlimit; + dirty = true; + } + + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) { + dq->q_rtb.softlimit = dq->q_rtb.hardlimit; + dirty = true; + } + + /* + * Check that usage doesn't exceed physical limits. However, on + * a reflink filesystem we're allowed to exceed physical space + * if there are no quota limits. We don't know what the real number + * is, but we can make quotacheck find out for us. + */ + if (!xfs_has_reflink(mp) && dq->q_blk.count > mp->m_sb.sb_dblocks) { + dq->q_blk.reserved -= dq->q_blk.count; + dq->q_blk.reserved += mp->m_sb.sb_dblocks; + dq->q_blk.count = mp->m_sb.sb_dblocks; + rqi->need_quotacheck = true; + dirty = true; + } + fs_icount = percpu_counter_sum(&mp->m_icount); + if (dq->q_ino.count > fs_icount) { + dq->q_ino.reserved -= dq->q_ino.count; + dq->q_ino.reserved += fs_icount; + dq->q_ino.count = fs_icount; + rqi->need_quotacheck = true; + dirty = true; + } + if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { + dq->q_rtb.reserved -= dq->q_rtb.count; + dq->q_rtb.reserved += mp->m_sb.sb_rblocks; + dq->q_rtb.count = mp->m_sb.sb_rblocks; + rqi->need_quotacheck = true; + dirty = true; + } + + xrep_quota_item_timer(sc, &dq->q_blk, &dirty); + xrep_quota_item_timer(sc, &dq->q_ino, &dirty); + xrep_quota_item_timer(sc, &dq->q_rtb, &dirty); + + if (!dirty) + return 0; + + trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id); + + dq->q_flags |= XFS_DQFLAG_DIRTY; + xfs_trans_dqjoin(sc->tp, dq); + if (dq->q_id) { + xfs_qm_adjust_dqlimits(dq); + xfs_qm_adjust_dqtimers(dq); + } + xfs_trans_log_dquot(sc->tp, dq); + error = xfs_trans_roll(&sc->tp); + xfs_dqlock(dq); + return error; +} + +/* Fix a quota timer so that we can pass the verifier. 
*/ +STATIC void +xrep_quota_fix_timer( + struct xfs_mount *mp, + const struct xfs_disk_dquot *ddq, + __be64 softlimit, + __be64 countnow, + __be32 *timer, + time64_t timelimit) +{ + uint64_t soft = be64_to_cpu(softlimit); + uint64_t count = be64_to_cpu(countnow); + time64_t new_timer; + uint32_t t; + + if (!soft || count <= soft || *timer != 0) + return; + + new_timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + timelimit); + if (ddq->d_type & XFS_DQTYPE_BIGTIME) + t = xfs_dq_unix_to_bigtime(new_timer); + else + t = new_timer; + + *timer = cpu_to_be32(t); +} + +/* Fix anything the verifiers complain about. */ +STATIC int +xrep_quota_block( + struct xfs_scrub *sc, + xfs_daddr_t daddr, + xfs_dqtype_t dqtype, + xfs_dqid_t id) +{ + struct xfs_dqblk *dqblk; + struct xfs_disk_dquot *ddq; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_def_quota *defq = xfs_get_defquota(qi, dqtype); + struct xfs_buf *bp = NULL; + enum xfs_blft buftype = 0; + int i; + int error; + + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr, + qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Failed verifier, retry read with no ops. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, + sc->mp->m_ddev_targp, daddr, qi->qi_dqchunklen, + 0, &bp, NULL); + if (error) + return error; + break; + case 0: + dqblk = bp->b_addr; + ddq = &dqblk[0].dd_diskdq; + + /* + * If there's nothing that would impede a dqiterate, we're + * done. + */ + if ((ddq->d_type & XFS_DQTYPE_REC_MASK) != dqtype || + id == be32_to_cpu(ddq->d_id)) { + xfs_trans_brelse(sc->tp, bp); + return 0; + } + break; + default: + return error; + } + + /* Something's wrong with the block, fix the whole thing. 
*/ + dqblk = bp->b_addr; + bp->b_ops = &xfs_dquot_buf_ops; + for (i = 0; i < qi->qi_dqperchunk; i++, dqblk++) { + ddq = &dqblk->dd_diskdq; + + trace_xrep_disk_dquot(sc->mp, dqtype, id + i); + + ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + ddq->d_version = XFS_DQUOT_VERSION; + ddq->d_type = dqtype; + ddq->d_id = cpu_to_be32(id + i); + + if (xfs_has_bigtime(sc->mp) && ddq->d_id) + ddq->d_type |= XFS_DQTYPE_BIGTIME; + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_blk_softlimit, + ddq->d_bcount, &ddq->d_btimer, + defq->blk.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_ino_softlimit, + ddq->d_icount, &ddq->d_itimer, + defq->ino.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_rtb_softlimit, + ddq->d_rtbcount, &ddq->d_rtbtimer, + defq->rtb.time); + + /* We only support v5 filesystems so always set these. */ + uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid); + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + dqblk->dd_lsn = 0; + } + switch (dqtype) { + case XFS_DQTYPE_USER: + buftype = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_DQTYPE_GROUP: + buftype = XFS_BLFT_GDQUOT_BUF; + break; + case XFS_DQTYPE_PROJ: + buftype = XFS_BLFT_PDQUOT_BUF; + break; + } + xfs_trans_buf_set_type(sc->tp, bp, buftype); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return xrep_roll_trans(sc); +} + +/* + * Repair a quota file's data fork. The function returns with the inode + * joined. + */ +STATIC int +xrep_quota_data_fork( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xfs_bmbt_irec irec = { 0 }; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; + xfs_fileoff_t max_dqid_off; + xfs_fileoff_t off; + xfs_fsblock_t fsbno; + bool truncate = false; + bool joined = false; + int error = 0; + + error = xrep_metadata_inode_forks(sc); + if (error) + goto out; + + /* Check for data fork problems that apply only to quota files. 
*/ + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { + if (isnullstartblock(irec.br_startblock)) { + error = -EFSCORRUPTED; + goto out; + } + + if (irec.br_startoff > max_dqid_off || + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + truncate = true; + break; + } + + /* Convert unwritten extents to real ones. */ + if (irec.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec nrec; + int nmap = 1; + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + error = xfs_bmapi_write(sc->tp, sc->ip, + irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_CONVERT, 0, &nrec, &nmap); + if (error) + goto out; + if (nmap != 1) { + error = -ENOSPC; + goto out; + } + ASSERT(nrec.br_startoff == irec.br_startoff); + ASSERT(nrec.br_blockcount == irec.br_blockcount); + + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + } + } + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + if (truncate) { + /* Erase everything after the block containing the max dquot */ + error = xfs_bunmapi_range(&sc->tp, sc->ip, 0, + max_dqid_off * sc->mp->m_sb.sb_blocksize, + XFS_MAX_FILEOFF); + if (error) + goto out; + + /* Remove all CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(sc->ip, &sc->tp, 0, + XFS_MAX_FILEOFF, true); + if (error) + goto out; + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* + * Always re-log the inode so that our permanent transaction + * can keep on rolling it forward in the log. + */ + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Now go fix anything that fails the verifiers. 
*/ + for_each_xfs_iext(ifp, &icur, &irec) { + for (fsbno = irec.br_startblock, off = irec.br_startoff; + fsbno < irec.br_startblock + irec.br_blockcount; + fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB, + off += XFS_DQUOT_CLUSTER_SIZE_FSB) { + error = xrep_quota_block(sc, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + dqtype, off * qi->qi_dqperchunk); + if (error) + goto out; + } + } + +out: + return error; +} + +/* + * Go fix anything in the quota items that we could have been mad about. Now + * that we've checked the quota inode data fork we have to drop ILOCK_EXCL to + * use the regular dquot functions. + */ +STATIC int +xrep_quota_problems( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xrep_quota_info rqi = { .sc = sc }; + struct xfs_dquot *dq; + int error; + + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xrep_quota_item(&rqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* Make a quotacheck happen. */ + if (rqi.need_quotacheck) + xrep_force_quotacheck(sc, dqtype); + return 0; +} + +/* Repair all of a quota type's items. */ +int +xrep_quota( + struct xfs_scrub *sc) +{ + xfs_dqtype_t dqtype; + int error; + + dqtype = xchk_quota_to_dqtype(sc); + + /* + * Re-take the ILOCK so that we can fix any problems that we found + * with the data fork mappings, or with the dquot bufs themselves. + */ + if (!(sc->ilock_flags & XFS_ILOCK_EXCL)) + xchk_ilock(sc, XFS_ILOCK_EXCL); + error = xrep_quota_data_fork(sc, dqtype); + if (error) + return error; + + /* + * Finish deferred items and roll the transaction to unjoin the quota + * inode from transaction so that we can unlock the quota inode; we + * play only with dquots from now on. 
+ */ + error = xrep_defer_finish(sc); + if (error) + return error; + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + xchk_iunlock(sc, sc->ilock_flags); + + /* Fix anything the dquot verifiers don't complain about. */ + error = xrep_quota_problems(sc, dqtype); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c new file mode 100644 index 000000000000..c77eb2de8df7 --- /dev/null +++ b/fs/xfs/scrub/quotacheck.c @@ -0,0 +1,867 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/quota.h" +#include "scrub/quotacheck.h" +#include "scrub/trace.h" + +/* + * Live Quotacheck + * =============== + * + * Quota counters are "summary" metadata, in the sense that they are computed + * as the summation of the block usage counts for every file on the filesystem. + * Therefore, we compute the correct icount, bcount, and rtbcount values by + * creating a shadow quota counter structure and walking every inode. + */ + +/* Track the quota deltas for a dquot in a transaction. 
*/ +struct xqcheck_dqtrx { + xfs_dqtype_t q_type; + xfs_dqid_t q_id; + + int64_t icount_delta; + + int64_t bcount_delta; + int64_t delbcnt_delta; + + int64_t rtbcount_delta; + int64_t delrtb_delta; +}; + +#define XQCHECK_MAX_NR_DQTRXS (XFS_QM_TRANS_DQTYPES * XFS_QM_TRANS_MAXDQS) + +/* + * Track the quota deltas for all dquots attached to a transaction if the + * quota deltas are being applied to an inode that we already scanned. + */ +struct xqcheck_dqacct { + struct rhash_head hash; + uintptr_t tx_id; + struct xqcheck_dqtrx dqtrx[XQCHECK_MAX_NR_DQTRXS]; + unsigned int refcount; +}; + +/* Free a shadow dquot accounting structure. */ +static void +xqcheck_dqacct_free( + void *ptr, + void *arg) +{ + struct xqcheck_dqacct *dqa = ptr; + + kfree(dqa); +} + +/* Set us up to scrub quota counters. */ +int +xchk_setup_quotacheck( + struct xfs_scrub *sc) +{ + if (!XFS_IS_QUOTA_ON(sc->mp)) + return -ENOENT; + + xchk_fsgates_enable(sc, XCHK_FSGATES_QUOTA); + + sc->buf = kzalloc(sizeof(struct xqcheck), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_fs(sc); +} + +/* + * Part 1: Collecting dquot resource usage counts. For each xfs_dquot attached + * to each inode, we create a shadow dquot, and compute the inode count and add + * the data/rt block usage from what we see. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. + * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the quota counters for an inode that we've already + * scanned. 
This will cause our counts to be incorrect. Therefore, we hook + * the live transaction code in two places: (1) when the callers update the + * per-transaction dqtrx structure to log quota counter updates; and (2) when + * transaction commit actually logs those updates to the incore dquot. By + * shadowing transaction updates in this manner, live quotacheck can ensure + * by locking the dquot and the shadow structure that its own copies are not + * out of date. Because the hook code runs in a different process context from + * the scrub code and the scrub state flags are not accessed atomically, + * failures in the hook code must abort the iscan and the scrubber must notice + * the aborted scan and set the incomplete flag. + * + * Note that we use srcu notifier hooks to minimize the overhead when live + * quotacheck is /not/ running. + */ + +/* Update an incore dquot counter information from a live update. */ +static int +xqcheck_update_incore_counts( + struct xqcheck *xqc, + struct xfarray *counts, + xfs_dqid_t id, + int64_t inodes, + int64_t nblks, + int64_t rtblks) +{ + struct xqcheck_dquot xcdq; + int error; + + error = xfarray_load_sparse(counts, id, &xcdq); + if (error) + return error; + + xcdq.flags |= XQCHECK_DQUOT_WRITTEN; + xcdq.icount += inodes; + xcdq.bcount += nblks; + xcdq.rtbcount += rtblks; + + error = xfarray_store(counts, id, &xcdq); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + error = -ECANCELED; + } + return error; +} + +/* Decide if this is the shadow dquot accounting structure for a transaction. 
*/ +static int +xqcheck_dqacct_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const uintptr_t *tx_idp = arg->key; + const struct xqcheck_dqacct *dqa = obj; + + if (dqa->tx_id != *tx_idp) + return 1; + return 0; +} + +static const struct rhashtable_params xqcheck_dqacct_hash_params = { + .min_size = 32, + .key_len = sizeof(uintptr_t), + .key_offset = offsetof(struct xqcheck_dqacct, tx_id), + .head_offset = offsetof(struct xqcheck_dqacct, hash), + .automatic_shrinking = true, + .obj_cmpfn = xqcheck_dqacct_obj_cmpfn, +}; + +/* Find a shadow dqtrx slot for the given dquot. */ +STATIC struct xqcheck_dqtrx * +xqcheck_get_dqtrx( + struct xqcheck_dqacct *dqa, + xfs_dqtype_t q_type, + xfs_dqid_t q_id) +{ + int i; + + for (i = 0; i < XQCHECK_MAX_NR_DQTRXS; i++) { + if (dqa->dqtrx[i].q_type == 0 || + (dqa->dqtrx[i].q_type == q_type && + dqa->dqtrx[i].q_id == q_id)) + return &dqa->dqtrx[i]; + } + + return NULL; +} + +/* + * Create and fill out a quota delta tracking structure to shadow the updates + * going on in the regular quota code. + */ +static int +xqcheck_mod_live_ino_dqtrx( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_mod_ino_dqtrx_params *p = data; + struct xqcheck *xqc; + struct xqcheck_dqacct *dqa; + struct xqcheck_dqtrx *dqtrx; + int error; + + xqc = container_of(nb, struct xqcheck, qhook.mod_hook.nb); + + /* Skip quota reservation fields. */ + switch (action) { + case XFS_TRANS_DQ_BCOUNT: + case XFS_TRANS_DQ_DELBCOUNT: + case XFS_TRANS_DQ_ICOUNT: + case XFS_TRANS_DQ_RTBCOUNT: + case XFS_TRANS_DQ_DELRTBCOUNT: + break; + default: + return NOTIFY_DONE; + } + + /* Ignore dqtrx updates for quota types we don't care about. 
*/ + switch (p->q_type) { + case XFS_DQTYPE_USER: + if (!xqc->ucounts) + return NOTIFY_DONE; + break; + case XFS_DQTYPE_GROUP: + if (!xqc->gcounts) + return NOTIFY_DONE; + break; + case XFS_DQTYPE_PROJ: + if (!xqc->pcounts) + return NOTIFY_DONE; + break; + default: + return NOTIFY_DONE; + } + + /* Skip inodes that haven't been scanned yet. */ + if (!xchk_iscan_want_live_update(&xqc->iscan, p->ino)) + return NOTIFY_DONE; + + /* Make a shadow quota accounting tracker for this transaction. */ + mutex_lock(&xqc->lock); + dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id, + xqcheck_dqacct_hash_params); + if (!dqa) { + dqa = kzalloc(sizeof(struct xqcheck_dqacct), XCHK_GFP_FLAGS); + if (!dqa) + goto out_abort; + + dqa->tx_id = p->tx_id; + error = rhashtable_insert_fast(&xqc->shadow_dquot_acct, + &dqa->hash, xqcheck_dqacct_hash_params); + if (error) + goto out_abort; + } + + /* Find the shadow dqtrx (or an empty slot) here. */ + dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id); + if (!dqtrx) + goto out_abort; + if (dqtrx->q_type == 0) { + dqtrx->q_type = p->q_type; + dqtrx->q_id = p->q_id; + dqa->refcount++; + } + + /* Update counter */ + switch (action) { + case XFS_TRANS_DQ_BCOUNT: + dqtrx->bcount_delta += p->delta; + break; + case XFS_TRANS_DQ_DELBCOUNT: + dqtrx->delbcnt_delta += p->delta; + break; + case XFS_TRANS_DQ_ICOUNT: + dqtrx->icount_delta += p->delta; + break; + case XFS_TRANS_DQ_RTBCOUNT: + dqtrx->rtbcount_delta += p->delta; + break; + case XFS_TRANS_DQ_DELRTBCOUNT: + dqtrx->delrtb_delta += p->delta; + break; + } + + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xqc->iscan); + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; +} + +/* + * Apply the transaction quota deltas to our shadow quota accounting info when + * the regular quota code is doing the same.
+ */ +static int +xqcheck_apply_live_dqtrx( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_apply_dqtrx_params *p = data; + struct xqcheck *xqc; + struct xqcheck_dqacct *dqa; + struct xqcheck_dqtrx *dqtrx; + struct xfarray *counts; + int error; + + xqc = container_of(nb, struct xqcheck, qhook.apply_hook.nb); + + /* Map the dquot type to an incore counter object. */ + switch (p->q_type) { + case XFS_DQTYPE_USER: + counts = xqc->ucounts; + break; + case XFS_DQTYPE_GROUP: + counts = xqc->gcounts; + break; + case XFS_DQTYPE_PROJ: + counts = xqc->pcounts; + break; + default: + return NOTIFY_DONE; + } + + if (xchk_iscan_aborted(&xqc->iscan) || counts == NULL) + return NOTIFY_DONE; + + /* + * Find the shadow dqtrx for this transaction and dquot, if any deltas + * need to be applied here. If not, we're finished early. + */ + mutex_lock(&xqc->lock); + dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id, + xqcheck_dqacct_hash_params); + if (!dqa) + goto out_unlock; + dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id); + if (!dqtrx || dqtrx->q_type == 0) + goto out_unlock; + + /* Update our shadow dquot if we're committing. */ + if (action == XFS_APPLY_DQTRX_COMMIT) { + error = xqcheck_update_incore_counts(xqc, counts, p->q_id, + dqtrx->icount_delta, + dqtrx->bcount_delta + dqtrx->delbcnt_delta, + dqtrx->rtbcount_delta + dqtrx->delrtb_delta); + if (error) + goto out_abort; + } + + /* Free the shadow accounting structure if that was the last user. */ + dqa->refcount--; + if (dqa->refcount == 0) { + error = rhashtable_remove_fast(&xqc->shadow_dquot_acct, + &dqa->hash, xqcheck_dqacct_hash_params); + if (error) + goto out_abort; + xqcheck_dqacct_free(dqa, NULL); + } + + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xqc->iscan); +out_unlock: + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; +} + +/* Record this inode's quota usage in our shadow quota counter data. 
*/
+STATIC int
+xqcheck_collect_inode(
+	struct xqcheck	*xqc,
+	struct xfs_inode	*ip)
+{
+	struct xfs_trans	*tp = xqc->sc->tp;
+	xfs_filblks_t	nblks, rtblks;
+	uint	ilock_flags = 0;
+	xfs_dqid_t	id;
+	bool	isreg = S_ISREG(VFS_I(ip)->i_mode);
+	int	error = 0;
+
+	if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) {
+		/*
+		 * Quota files are never counted towards quota, so we do not
+		 * need to take the lock.
+		 */
+		xchk_iscan_mark_visited(&xqc->iscan, ip);
+		return 0;
+	}
+
+	/*
+	 * Figure out the data / rt device block counts. The IOLOCK is taken
+	 * first, then the MMAPLOCK for regular files, then the ILOCK. All
+	 * of them are dropped in the reverse order at out_ilock.
+	 */
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	if (isreg)
+		xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		/*
+		 * Read in the data fork for rt files so that _count_blocks
+		 * can count the number of blocks allocated from the rt volume.
+		 * Inodes do not track that separately.
+		 */
+		ilock_flags = xfs_ilock_data_map_shared(ip);
+		error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+		if (error)
+			goto out_abort;
+	} else {
+		ilock_flags = XFS_ILOCK_SHARED;
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+	}
+	xfs_inode_count_blocks(tp, ip, &nblks, &rtblks);
+
+	if (xchk_iscan_aborted(&xqc->iscan)) {
+		error = -ECANCELED;
+		goto out_incomplete;
+	}
+
+	/*
+	 * Update the shadow dquot counters. Each quota type that has a
+	 * counter array is charged one inode plus the block counts computed
+	 * above. The inode is only marked visited if every update succeeds.
+	 */
+	mutex_lock(&xqc->lock);
+	if (xqc->ucounts) {
+		id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_USER);
+		error = xqcheck_update_incore_counts(xqc, xqc->ucounts, id, 1,
+				nblks, rtblks);
+		if (error)
+			goto out_mutex;
+	}
+
+	if (xqc->gcounts) {
+		id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_GROUP);
+		error = xqcheck_update_incore_counts(xqc, xqc->gcounts, id, 1,
+				nblks, rtblks);
+		if (error)
+			goto out_mutex;
+	}
+
+	if (xqc->pcounts) {
+		id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_PROJ);
+		error = xqcheck_update_incore_counts(xqc, xqc->pcounts, id, 1,
+				nblks, rtblks);
+		if (error)
+			goto out_mutex;
+	}
+	mutex_unlock(&xqc->lock);
+
+	xchk_iscan_mark_visited(&xqc->iscan, ip);
+	goto out_ilock;
+
+out_mutex:
+	mutex_unlock(&xqc->lock);
+out_abort:
+	xchk_iscan_abort(&xqc->iscan);
+out_incomplete:
+	xchk_set_incomplete(xqc->sc);
+out_ilock:
+	xfs_iunlock(ip, ilock_flags);
+	if (isreg)
+		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	return error;
+}
+
+/*
+ * Walk all the allocated inodes and run a quota scan on them.
+ *
+ * The scan runs under an empty transaction. On success this function
+ * finishes by cancelling that transaction and calling xchk_setup_fs. On
+ * failure the scrub is marked incomplete and the error is returned, except
+ * that -EBUSY is converted to -ECANCELED.
+ */
+STATIC int
+xqcheck_collect_counts(
+	struct xqcheck	*xqc)
+{
+	struct xfs_scrub	*sc = xqc->sc;
+	struct xfs_inode	*ip;
+	int	error;
+
+	/*
+	 * Set up for a potentially lengthy filesystem scan by reducing our
+	 * transaction resource usage for the duration. Specifically:
+	 *
+	 * Cancel the transaction to release the log grant space while we scan
+	 * the filesystem.
+	 *
+	 * Create a new empty transaction to eliminate the possibility of the
+	 * inode scan deadlocking on cyclical metadata.
+	 *
+	 * We pass the empty transaction to the file scanning function to avoid
+	 * repeatedly cycling empty transactions. This can be done without
+	 * risk of deadlock between sb_internal and the IOLOCK (we take the
+	 * IOLOCK to quiesce the file before scanning) because empty
+	 * transactions do not take sb_internal.
+	 */
+	xchk_trans_cancel(sc);
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+
+	while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) {
+		error = xqcheck_collect_inode(xqc, ip);
+		xchk_irele(sc, ip);
+		if (error)
+			break;
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&xqc->iscan);
+	if (error) {
+		xchk_set_incomplete(sc);
+		/*
+		 * If we couldn't grab an inode that was busy with a state
+		 * change, change the error code so that we exit to userspace
+		 * as quickly as possible.
+		 */
+		if (error == -EBUSY)
+			return -ECANCELED;
+		return error;
+	}
+
+	/*
+	 * Switch out for a real transaction in preparation for building a new
+	 * tree.
+	 */
+	xchk_trans_cancel(sc);
+	return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 2: Comparing dquot resource counters. Walk each xfs_dquot, comparing
+ * the resource usage counters against our shadow dquots; and then walk each
+ * shadow dquot (that wasn't covered in the first part), comparing it against
+ * the xfs_dquot.
+ */
+
+/*
+ * Check the dquot data against what we observed. Caller must hold the dquot
+ * lock.
+ */
+STATIC int
+xqcheck_compare_dquot(
+	struct xqcheck	*xqc,
+	xfs_dqtype_t	dqtype,
+	struct xfs_dquot	*dq)
+{
+	struct xqcheck_dquot	xcdq;
+	struct xfarray	*counts = xqcheck_counters_for(xqc, dqtype);
+	int	error;
+
+	if (xchk_iscan_aborted(&xqc->iscan)) {
+		xchk_set_incomplete(xqc->sc);
+		return -ECANCELED;
+	}
+
+	mutex_lock(&xqc->lock);
+	error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+	if (error)
+		goto out_unlock;
+
+	if (xcdq.icount != dq->q_ino.count)
+		xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+	if (xcdq.bcount != dq->q_blk.count)
+		xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+	if (xcdq.rtbcount != dq->q_rtb.count)
+		xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+	xcdq.flags |= (XQCHECK_DQUOT_COMPARE_SCANNED | XQCHECK_DQUOT_WRITTEN);
+	error = xfarray_store(counts, dq->q_id, &xcdq);
+	if (error == -EFBIG) {
+		/*
+		 * EFBIG means we tried to store data at too high a byte offset
+		 * in the sparse array. IOWs, we cannot complete the check and
+		 * must notify userspace that the check was incomplete. This
+		 * should never happen outside of the collection phase.
+		 */
+		xchk_set_incomplete(xqc->sc);
+		error = -ECANCELED;
+	}
+	mutex_unlock(&xqc->lock);
+	if (error)
+		return error;
+
+	if (xqc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return -ECANCELED;	/* a mismatch was recorded; stop comparing */
+
+	return 0;
+
+out_unlock:
+	mutex_unlock(&xqc->lock);
+	return error;
+}
+
+/*
+ * Walk all the observed dquots, and make sure there's a matching incore
+ * dquot and that its counts match ours.
+ */
+STATIC int
+xqcheck_walk_observations(
+	struct xqcheck	*xqc,
+	xfs_dqtype_t	dqtype)
+{
+	struct xqcheck_dquot	xcdq;
+	struct xfs_dquot	*dq;
+	struct xfarray	*counts = xqcheck_counters_for(xqc, dqtype);
+	xfarray_idx_t	cur = XFARRAY_CURSOR_INIT;
+	int	error;
+
+	mutex_lock(&xqc->lock);
+	while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
+		xfs_dqid_t	id = cur - 1;
+
+		if (xcdq.flags & XQCHECK_DQUOT_COMPARE_SCANNED)
+			continue;	/* already compared via the dquot iterator */
+
+		mutex_unlock(&xqc->lock);	/* not held across dqget/compare */
+
+		error = xfs_qm_dqget(xqc->sc->mp, id, dqtype, false, &dq);
+		if (error == -ENOENT) {
+			xchk_qcheck_set_corrupt(xqc->sc, dqtype, id);
+			return 0;
+		}
+		if (error)
+			return error;
+
+		error = xqcheck_compare_dquot(xqc, dqtype, dq);
+		xfs_qm_dqput(dq);
+		if (error)
+			return error;
+
+		if (xchk_should_terminate(xqc->sc, &error))
+			return error;
+
+		mutex_lock(&xqc->lock);
+	}
+	mutex_unlock(&xqc->lock);
+
+	return error;
+}
+
+/*
+ * Compare the quota counters we observed against the live dquots.
+ *
+ * Nothing is compared if the scrub has already been marked corrupt. If the
+ * CHKD flag for this quota type is not set in m_qflags, the quota type is
+ * flagged corrupt without looking at any dquot. Otherwise every dquot
+ * returned by the dquot iterator is compared first, and then the shadow
+ * records that the first pass did not touch are checked.
+ */
+STATIC int
+xqcheck_compare_dqtype(
+	struct xqcheck	*xqc,
+	xfs_dqtype_t	dqtype)
+{
+	struct xchk_dqiter	cursor = { };
+	struct xfs_scrub	*sc = xqc->sc;
+	struct xfs_dquot	*dq;
+	int	error;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
+	/* If the quota CHKD flag is cleared, we need to repair this quota. */
+	if (!(xfs_quota_chkd_flag(dqtype) & sc->mp->m_qflags)) {
+		xchk_qcheck_set_corrupt(xqc->sc, dqtype, 0);
+		return 0;
+	}
+
+	/* Compare what we observed against the actual dquots. */
+	xchk_dqiter_init(&cursor, sc, dqtype);
+	while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+		error = xqcheck_compare_dquot(xqc, dqtype, dq);
+		xfs_qm_dqput(dq);
+		if (error)
+			break;
+	}
+	if (error)
+		return error;
+
+	/* Walk all the observed dquots and compare to the incore ones. */
+	return xqcheck_walk_observations(xqc, dqtype);
+}
+
+/* Tear down everything associated with a quotacheck.
*/
+static void
+xqcheck_teardown_scan(
+	void	*priv)
+{
+	struct xqcheck	*xqc = priv;
+	struct xfs_quotainfo	*qi = xqc->sc->mp->m_quotainfo;
+
+	/* Discourage any hook functions that might be running. */
+	xchk_iscan_abort(&xqc->iscan);
+
+	/*
+	 * As noted above, the apply hook is responsible for cleaning up the
+	 * shadow dquot accounting data when a transaction completes. The mod
+	 * hook must be removed before the apply hook so that we don't
+	 * mistakenly leave an active shadow account for the mod hook to get
+	 * its hands on. No hooks should be running after these functions
+	 * return.
+	 */
+	xfs_dqtrx_hook_del(qi, &xqc->qhook);
+
+	if (xqc->shadow_dquot_acct.key_len) {
+		rhashtable_free_and_destroy(&xqc->shadow_dquot_acct,
+				xqcheck_dqacct_free, NULL);
+		xqc->shadow_dquot_acct.key_len = 0;
+	}
+
+	if (xqc->pcounts) {
+		xfarray_destroy(xqc->pcounts);
+		xqc->pcounts = NULL;
+	}
+
+	if (xqc->gcounts) {
+		xfarray_destroy(xqc->gcounts);
+		xqc->gcounts = NULL;
+	}
+
+	if (xqc->ucounts) {
+		xfarray_destroy(xqc->ucounts);
+		xqc->ucounts = NULL;
+	}
+
+	xchk_iscan_teardown(&xqc->iscan);
+	mutex_destroy(&xqc->lock);
+	xqc->sc = NULL;
+}
+
+/*
+ * Set up to scan all inodes in the entire filesystem to generate quota
+ * counter data: start the inode scan, create a shadow counter array for each
+ * quota type that is turned on, create the shadow dquot accounting hash
+ * table, and install the live quota hooks.
+ * If setup is successful, teardown is deferred to sc->buf_cleanup so that
+ * the quota data will be left alive for a repair.
+ * If any error occurs, we'll tear everything down.
+ */
+STATIC int
+xqcheck_setup_scan(
+	struct xfs_scrub	*sc,
+	struct xqcheck	*xqc)
+{
+	char	*descr;
+	struct xfs_quotainfo	*qi = sc->mp->m_quotainfo;
+	unsigned long long	max_dquots = XFS_DQ_ID_MAX + 1ULL;
+	int	error;
+
+	ASSERT(xqc->sc == NULL);
+	xqc->sc = sc;
+
+	mutex_init(&xqc->lock);
+
+	/* Retry iget every tenth of a second for up to 30 seconds. */
+	xchk_iscan_start(sc, 30000, 100, &xqc->iscan);
+
+	error = -ENOMEM;
+	if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_USER)) {
+		descr = xchk_xfile_descr(sc, "user dquot records");
+		error = xfarray_create(descr, max_dquots,
+				sizeof(struct xqcheck_dquot), &xqc->ucounts);
+		kfree(descr);
+		if (error)
+			goto out_teardown;
+	}
+
+	if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_GROUP)) {
+		descr = xchk_xfile_descr(sc, "group dquot records");
+		error = xfarray_create(descr, max_dquots,
+				sizeof(struct xqcheck_dquot), &xqc->gcounts);
+		kfree(descr);
+		if (error)
+			goto out_teardown;
+	}
+
+	if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_PROJ)) {
+		descr = xchk_xfile_descr(sc, "project dquot records");
+		error = xfarray_create(descr, max_dquots,
+				sizeof(struct xqcheck_dquot), &xqc->pcounts);
+		kfree(descr);
+		if (error)
+			goto out_teardown;
+	}
+
+	/*
+	 * Set up hash table to map transactions to our internal shadow dqtrx
+	 * structures.
+	 */
+	error = rhashtable_init(&xqc->shadow_dquot_acct,
+			&xqcheck_dqacct_hash_params);
+	if (error)
+		goto out_teardown;
+
+	/*
+	 * Hook into the quota code. The hook only triggers for inodes that
+	 * were already scanned, and the scanner thread takes each inode's
+	 * ILOCK, which means that any in-progress inode updates will finish
+	 * before we can scan the inode.
+	 *
+	 * The apply hook (which removes the shadow dquot accounting struct)
+	 * must be installed before the mod hook so that we never fail to catch
+	 * the end of a quota update sequence and leave stale shadow data.
+	 */
+	ASSERT(sc->flags & XCHK_FSGATES_QUOTA);
+	xfs_dqtrx_hook_setup(&xqc->qhook, xqcheck_mod_live_ino_dqtrx,
+			xqcheck_apply_live_dqtrx);
+
+	error = xfs_dqtrx_hook_add(qi, &xqc->qhook);
+	if (error)
+		goto out_teardown;
+
+	/* Use deferred cleanup to pass the quota count data to repair. */
+	sc->buf_cleanup = xqcheck_teardown_scan;
+	return 0;
+
+out_teardown:
+	xqcheck_teardown_scan(xqc);
+	return error;
+}
+
+/* Scrub the usage counters of every quota type that is being tracked.
*/
+int
+xchk_quotacheck(
+	struct xfs_scrub	*sc)
+{
+	struct xqcheck	*xqc = sc->buf;
+	int	error = 0;
+
+	/*
+	 * Check quota counters on the live filesystem. This sets up the
+	 * shadow counters and the live quota hooks.
+	 */
+	error = xqcheck_setup_scan(sc, xqc);
+	if (error)
+		return error;
+
+	/* Walk all inodes, picking up quota information. */
+	error = xqcheck_collect_counts(xqc);
+	if (!xchk_xref_process_error(sc, 0, 0, &error))
+		return error;
+
+	/* Fail fast if we're not playing with a full dataset. */
+	if (xchk_iscan_aborted(&xqc->iscan))
+		xchk_set_incomplete(sc);
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+		return 0;
+
+	/*
+	 * Compare quota counters for each quota type that has a shadow
+	 * counter array.
+	 */
+	if (xqc->ucounts) {
+		error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_USER);
+		if (!xchk_xref_process_error(sc, 0, 0, &error))
+			return error;
+	}
+	if (xqc->gcounts) {
+		error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_GROUP);
+		if (!xchk_xref_process_error(sc, 0, 0, &error))
+			return error;
+	}
+	if (xqc->pcounts) {
+		error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_PROJ);
+		if (!xchk_xref_process_error(sc, 0, 0, &error))
+			return error;
+	}
+
+	/* Check one last time for an incomplete dataset. */
+	if (xchk_iscan_aborted(&xqc->iscan))
+		xchk_set_incomplete(sc);
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h
new file mode 100644
index 000000000000..4ea5f249c978
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_QUOTACHECK_H__
+#define __XFS_SCRUB_QUOTACHECK_H__
+
+/*
+ * Quota counters for live quotacheck. One of these shadow records is kept
+ * per dquot id in the per-type counter arrays of struct xqcheck.
+ */
+struct xqcheck_dquot {
+	/* block usage count */
+	int64_t	bcount;
+
+	/* inode usage count */
+	int64_t	icount;
+
+	/* realtime block usage count */
+	int64_t	rtbcount;
+
+	/* Record state (XQCHECK_DQUOT_* flags) */
+	unsigned int	flags;
+};
+
+/*
+ * This incore dquot record has been written at least once.
We never want to
+ * store an xqcheck_dquot that looks uninitialized.
+ */
+#define XQCHECK_DQUOT_WRITTEN	(1U << 0)
+
+/* Already checked this dquot against the shadow counters. */
+#define XQCHECK_DQUOT_COMPARE_SCANNED	(1U << 1)
+
+/* Already repaired this dquot from the shadow counters. */
+#define XQCHECK_DQUOT_REPAIR_SCANNED	(1U << 2)
+
+/* Live quotacheck control structure. */
+struct xqcheck {
+	struct xfs_scrub	*sc;
+
+	/*
+	 * Shadow dquot counter data. An array is only created for a quota
+	 * type that is turned on; otherwise the pointer stays NULL.
+	 */
+	struct xfarray	*ucounts;
+	struct xfarray	*gcounts;
+	struct xfarray	*pcounts;
+
+	/* Lock protecting quotacheck count observations */
+	struct mutex	lock;
+
+	struct xchk_iscan	iscan;	/* state of the live inode scan */
+
+	/* Hooks into the quota code. */
+	struct xfs_dqtrx_hook	qhook;
+
+	/* Shadow quota delta tracking structure. */
+	struct rhashtable	shadow_dquot_acct;
+};
+
+/*
+ * Return the incore counter array for a given quota type. An unrecognized
+ * type trips an assertion and returns NULL.
+ */
+static inline struct xfarray *
+xqcheck_counters_for(
+	struct xqcheck	*xqc,
+	xfs_dqtype_t	dqtype)
+{
+	switch (dqtype) {
+	case XFS_DQTYPE_USER:
+		return xqc->ucounts;
+	case XFS_DQTYPE_GROUP:
+		return xqc->gcounts;
+	case XFS_DQTYPE_PROJ:
+		return xqc->pcounts;
+	}
+
+	ASSERT(0);
+	return NULL;
+}
+
+#endif /* __XFS_SCRUB_QUOTACHECK_H__ */
diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c
new file mode 100644
index 000000000000..dd8554c755b5
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck_repair.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J.
Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck Repair
+ * ======================
+ *
+ * Use the live quota counter information that we collected to replace the
+ * counter values in the incore dquots. A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * dquot is locked.
+ */
+
+/*
+ * Commit new counters to a dquot.
+ *
+ * The dquot is unlocked on entry only long enough to allocate a transaction,
+ * and it is locked again on every return path so that the caller can put
+ * its reference. For each resource, both the count and the reservation are
+ * adjusted by the difference between the shadow counter and the dquot's
+ * current count. The transaction is only committed if something changed;
+ * otherwise it is cancelled.
+ */
+static int
+xqcheck_commit_dquot(
+	struct xqcheck	*xqc,
+	xfs_dqtype_t	dqtype,
+	struct xfs_dquot	*dq)
+{
+	struct xqcheck_dquot	xcdq;
+	struct xfarray	*counts = xqcheck_counters_for(xqc, dqtype);
+	int64_t	delta;
+	bool	dirty = false;
+	int	error = 0;
+
+	/* Unlock the dquot just long enough to allocate a transaction. */
+	xfs_dqunlock(dq);
+	error = xchk_trans_alloc(xqc->sc, 0);
+	xfs_dqlock(dq);
+	if (error)
+		return error;
+
+	xfs_trans_dqjoin(xqc->sc->tp, dq);
+
+	if (xchk_iscan_aborted(&xqc->iscan)) {
+		error = -ECANCELED;
+		goto out_cancel;
+	}
+
+	mutex_lock(&xqc->lock);
+	error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * Adjust counters as needed. The reservation moves by the same delta
+	 * as the count.
+	 */
+	delta = (int64_t)xcdq.icount - dq->q_ino.count;
+	if (delta) {
+		dq->q_ino.reserved += delta;
+		dq->q_ino.count += delta;
+		dirty = true;
+	}
+
+	delta = (int64_t)xcdq.bcount - dq->q_blk.count;
+	if (delta) {
+		dq->q_blk.reserved += delta;
+		dq->q_blk.count += delta;
+		dirty = true;
+	}
+
+	delta = (int64_t)xcdq.rtbcount - dq->q_rtb.count;
+	if (delta) {
+		dq->q_rtb.reserved += delta;
+		dq->q_rtb.count += delta;
+		dirty = true;
+	}
+
+	xcdq.flags |= (XQCHECK_DQUOT_REPAIR_SCANNED | XQCHECK_DQUOT_WRITTEN);
+	error = xfarray_store(counts, dq->q_id, &xcdq);
+	if (error == -EFBIG) {
+		/*
+		 * EFBIG means we tried to store data at too high a byte offset
+		 * in the sparse array. IOWs, we cannot complete the repair
+		 * and must cancel the whole operation. This should never
+		 * happen, but we need to catch it anyway.
+		 */
+		error = -ECANCELED;
+	}
+	mutex_unlock(&xqc->lock);
+	if (error || !dirty)
+		goto out_cancel;
+
+	trace_xrep_quotacheck_dquot(xqc->sc->mp, dq->q_type, dq->q_id);
+
+	/* Commit the dirty dquot to disk. */
+	dq->q_flags |= XFS_DQFLAG_DIRTY;
+	if (dq->q_id)
+		xfs_qm_adjust_dqtimers(dq);
+	xfs_trans_log_dquot(xqc->sc->tp, dq);
+
+	/*
+	 * Transaction commit unlocks the dquot, so we must re-lock it so that
+	 * the caller can put the reference (which apparently requires a locked
+	 * dquot).
+	 */
+	error = xrep_trans_commit(xqc->sc);
+	xfs_dqlock(dq);
+	return error;
+
+out_unlock:
+	mutex_unlock(&xqc->lock);
+out_cancel:
+	xchk_trans_cancel(xqc->sc);
+
+	/* Re-lock the dquot so the caller can put the reference. */
+	xfs_dqlock(dq);
+	return error;
+}
+
+/*
+ * Commit new quota counters for a particular quota type. Every dquot found
+ * by the dquot iterator is updated first; a second pass then handles the
+ * shadow records that are not yet marked XQCHECK_DQUOT_REPAIR_SCANNED.
*/
+STATIC int
+xqcheck_commit_dqtype(
+	struct xqcheck	*xqc,
+	unsigned int	dqtype)
+{
+	struct xchk_dqiter	cursor = { };
+	struct xqcheck_dquot	xcdq;
+	struct xfs_scrub	*sc = xqc->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfarray	*counts = xqcheck_counters_for(xqc, dqtype);
+	struct xfs_dquot	*dq;
+	xfarray_idx_t	cur = XFARRAY_CURSOR_INIT;
+	int	error;
+
+	/*
+	 * Update the counters of every dquot that the quota file knows about.
+	 */
+	xchk_dqiter_init(&cursor, sc, dqtype);
+	while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+		error = xqcheck_commit_dquot(xqc, dqtype, dq);
+		xfs_qm_dqput(dq);
+		if (error)
+			break;
+	}
+	if (error)
+		return error;
+
+	/*
+	 * Make a second pass to deal with the dquots that we know about but
+	 * the quota file previously did not know about. The xqc lock is
+	 * dropped while each dquot is fetched and committed, and retaken
+	 * before the array iteration continues.
+	 */
+	mutex_lock(&xqc->lock);
+	while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
+		xfs_dqid_t	id = cur - 1;
+
+		if (xcdq.flags & XQCHECK_DQUOT_REPAIR_SCANNED)
+			continue;
+
+		mutex_unlock(&xqc->lock);
+
+		/*
+		 * Grab the dquot, allowing for dquot block allocation in a
+		 * separate transaction. We committed the scrub transaction
+		 * in a previous step, so we will not be creating nested
+		 * transactions here.
+		 */
+		error = xfs_qm_dqget(mp, id, dqtype, true, &dq);
+		if (error)
+			return error;
+
+		error = xqcheck_commit_dquot(xqc, dqtype, dq);
+		xfs_qm_dqput(dq);
+		if (error)
+			return error;
+
+		mutex_lock(&xqc->lock);
+	}
+	mutex_unlock(&xqc->lock);
+
+	return error;
+}
+
+/*
+ * Figure out quota CHKD flags for the running quota types. The result is
+ * the OR of the CHKD flag of every quota type that is turned on.
+ */
+static inline unsigned int
+xqcheck_chkd_flags(
+	struct xfs_mount	*mp)
+{
+	unsigned int	ret = 0;
+
+	if (XFS_IS_UQUOTA_ON(mp))
+		ret |= XFS_UQUOTA_CHKD;
+	if (XFS_IS_GQUOTA_ON(mp))
+		ret |= XFS_GQUOTA_CHKD;
+	if (XFS_IS_PQUOTA_ON(mp))
+		ret |= XFS_PQUOTA_CHKD;
+	return ret;
+}
+
+/*
+ * Commit the new dquot counters. The CHKD flags of the running quota types
+ * are cleared first, the counters of each tracked quota type are committed,
+ * and the CHKD flags are set again at the end.
*/
+int
+xrep_quotacheck(
+	struct xfs_scrub	*sc)
+{
+	struct xqcheck	*xqc = sc->buf;
+	unsigned int	qflags = xqcheck_chkd_flags(sc->mp);
+	int	error;
+
+	/*
+	 * Clear the CHKD flag for the running quota types and commit the scrub
+	 * transaction so that we can allocate new quota block mappings if we
+	 * have to. If we crash after this point, the sb still has the CHKD
+	 * flags cleared, so mount quotacheck will fix all of this up.
+	 */
+	xrep_update_qflags(sc, qflags, 0);
+	error = xrep_trans_commit(sc);
+	if (error)
+		return error;
+
+	/*
+	 * Commit the new counters to the dquots. Only quota types that have
+	 * a shadow counter array are processed.
+	 */
+	if (xqc->ucounts) {
+		error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_USER);
+		if (error)
+			return error;
+	}
+	if (xqc->gcounts) {
+		error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_GROUP);
+		if (error)
+			return error;
+	}
+	if (xqc->pcounts) {
+		error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_PROJ);
+		if (error)
+			return error;
+	}
+
+	/* Set the CHKD flags now that we've fixed quota counts. */
+	error = xchk_trans_alloc(sc, 0);
+	if (error)
+		return error;
+
+	xrep_update_qflags(sc, 0, qflags);
+	return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/rcbag.c b/fs/xfs/scrub/rcbag.c
new file mode 100644
index 000000000000..e1e52bc20713
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J.
Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/scrub.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/rcbag.h"
+#include "scrub/trace.h"
+
+struct rcbag {
+	struct xfs_mount	*mp;
+	struct xfbtree	xfbtree;	/* in-memory btree holding the bag records */
+	uint64_t	nr_items;	/* +1 per rcbag_add; less rbg_refcount per record removed */
+};
+
+int
+rcbag_init(
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*btp,
+	struct rcbag	**bagp)
+{
+	struct rcbag	*bag;
+	int	error;
+
+	bag = kzalloc(sizeof(struct rcbag), XCHK_GFP_FLAGS);
+	if (!bag)
+		return -ENOMEM;
+
+	bag->nr_items = 0;
+	bag->mp = mp;
+
+	error = rcbagbt_mem_init(mp, &bag->xfbtree, btp);
+	if (error)
+		goto out_bag;
+
+	*bagp = bag;	/* only set on success */
+	return 0;
+
+out_bag:
+	kfree(bag);
+	return error;
+}
+
+void
+rcbag_free(
+	struct rcbag	**bagp)
+{
+	struct rcbag	*bag = *bagp;	/* must not be NULL; dereferenced below */
+
+	xfbtree_destroy(&bag->xfbtree);
+	kfree(bag);
+	*bagp = NULL;
+}
+
+/*
+ * Track an rmap in the refcount bag. If the lookup finds a matching record,
+ * its refcount is bumped; otherwise a new record with a refcount of one is
+ * inserted. The in-memory btree update is committed on success and
+ * cancelled on failure.
*/
+int
+rcbag_add(
+	struct rcbag	*bag,
+	struct xfs_trans	*tp,
+	const struct xfs_rmap_irec	*rmap)
+{
+	struct rcbag_rec	bagrec;
+	struct xfs_mount	*mp = bag->mp;
+	struct xfs_btree_cur	*cur;
+	int	has;
+	int	error;
+
+	cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+	error = rcbagbt_lookup_eq(cur, rmap, &has);
+	if (error)
+		goto out_cur;
+
+	if (has) {
+		error = rcbagbt_get_rec(cur, &bagrec, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+
+		bagrec.rbg_refcount++;
+		error = rcbagbt_update(cur, &bagrec);
+		if (error)
+			goto out_cur;
+	} else {
+		bagrec.rbg_startblock = rmap->rm_startblock;
+		bagrec.rbg_blockcount = rmap->rm_blockcount;
+		bagrec.rbg_refcount = 1;
+
+		error = rcbagbt_insert(cur, &bagrec, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+	}
+
+	xfs_btree_del_cursor(cur, 0);
+
+	error = xfbtree_trans_commit(&bag->xfbtree, tp);
+	if (error)
+		return error;
+
+	bag->nr_items++;	/* counted once per add, even for a duplicate */
+	return 0;
+
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+	xfbtree_trans_cancel(&bag->xfbtree, tp);
+	return error;
+}
+
+/*
+ * Return the number of rmaps tracked in the bag. This is the running total
+ * of successful rcbag_add calls less the refcounts of removed records, so
+ * duplicates are counted individually.
+ */
+uint64_t
+rcbag_count(
+	const struct rcbag	*rcbag)
+{
+	return rcbag->nr_items;
+}
+
+static inline uint32_t rcbag_rec_next_bno(const struct rcbag_rec *r)
+{
+	return r->rbg_startblock + r->rbg_blockcount;	/* first block past the record */
+}
+
+/*
+ * Find the next block where the refcount changes, given the next rmap we
+ * looked at and the ones we're already tracking.
+ */
+int
+rcbag_next_edge(
+	struct rcbag	*bag,
+	struct xfs_trans	*tp,
+	const struct xfs_rmap_irec	*next_rmap,
+	bool	next_valid,
+	uint32_t	*next_bnop)
+{
+	struct rcbag_rec	bagrec;
+	struct xfs_mount	*mp = bag->mp;
+	struct xfs_btree_cur	*cur;
+	uint32_t	next_bno = NULLAGBLOCK;
+	int	has;
+	int	error;
+
+	if (next_valid)
+		next_bno = next_rmap->rm_startblock;
+
+	cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+	error = xfs_btree_goto_left_edge(cur);
+	if (error)
+		goto out_cur;
+
+	while (true) {
+		error = xfs_btree_increment(cur, 0, &has);
+		if (error)
+			goto out_cur;
+		if (!has)
+			break;
+
+		error = rcbagbt_get_rec(cur, &bagrec, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+
+		next_bno = min(next_bno, rcbag_rec_next_bno(&bagrec));
+	}
+
+	/*
+	 * We should have found /something/ because either next_rmap is the
+	 * next interesting rmap to look at after emitting this refcount
+	 * extent, or there are other rmaps in the bag contributing to the
+	 * current sharing count. But if something is seriously wrong, bail
+	 * out.
+	 */
+	if (next_bno == NULLAGBLOCK) {
+		error = -EFSCORRUPTED;
+		goto out_cur;
+	}
+
+	xfs_btree_del_cursor(cur, 0);
+
+	*next_bnop = next_bno;
+	return 0;
+
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+	return error;
+}
+
+/*
+ * Pop all refcount bag records that end at next_bno. The item count is
+ * reduced by the refcount of each record that is deleted.
+ */
+int
+rcbag_remove_ending_at(
+	struct rcbag	*bag,
+	struct xfs_trans	*tp,
+	uint32_t	next_bno)
+{
+	struct rcbag_rec	bagrec;
+	struct xfs_mount	*mp = bag->mp;
+	struct xfs_btree_cur	*cur;
+	int	has;
+	int	error;
+
+	/*
+	 * go to the right edge of the tree by looking up a key whose bytes
+	 * are all 0xFF, then walk leftwards one record at a time
+	 */
+	cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+	memset(&cur->bc_rec, 0xFF, sizeof(cur->bc_rec));
+	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &has);
+	if (error)
+		goto out_cur;
+
+	while (true) {
+		error = xfs_btree_decrement(cur, 0, &has);
+		if (error)
+			goto out_cur;
+		if (!has)
+			break;
+
+		error = rcbagbt_get_rec(cur, &bagrec, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+
+		if (rcbag_rec_next_bno(&bagrec) != next_bno)
+			continue;
+
+		error = xfs_btree_delete(cur, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+
+		bag->nr_items -= bagrec.rbg_refcount;
+	}
+
+	xfs_btree_del_cursor(cur, 0);
+	return xfbtree_trans_commit(&bag->xfbtree, tp);
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+	xfbtree_trans_cancel(&bag->xfbtree, tp);
+	return error;
+}
+
+/*
+ * Dump the rcbag. Each record is reported through xfs_err with its index,
+ * start block, block count, and refcount.
*/
+void
+rcbag_dump(
+	struct rcbag	*bag,
+	struct xfs_trans	*tp)
+{
+	struct rcbag_rec	bagrec;
+	struct xfs_mount	*mp = bag->mp;
+	struct xfs_btree_cur	*cur;
+	unsigned long long	nr = 0;
+	int	has;
+	int	error;
+
+	cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+	error = xfs_btree_goto_left_edge(cur);
+	if (error)
+		goto out_cur;
+
+	while (true) {
+		error = xfs_btree_increment(cur, 0, &has);
+		if (error)
+			goto out_cur;
+		if (!has)
+			break;
+
+		error = rcbagbt_get_rec(cur, &bagrec, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;	/* only reaches the cursor; function is void */
+			goto out_cur;
+		}
+
+		xfs_err(bag->mp, "[%llu]: bno 0x%x fsbcount 0x%x refcount 0x%llx\n",
+				nr++,
+				(unsigned int)bagrec.rbg_startblock,
+				(unsigned int)bagrec.rbg_blockcount,
+				(unsigned long long)bagrec.rbg_refcount);
+	}
+
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+}
diff --git a/fs/xfs/scrub/rcbag.h b/fs/xfs/scrub/rcbag.h
new file mode 100644
index 000000000000..e29ef788ba72
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J.
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RCBAG_H__ +#define __XFS_SCRUB_RCBAG_H__ + +struct xfs_mount; +struct rcbag; +struct xfs_buftarg; + +int rcbag_init(struct xfs_mount *mp, struct xfs_buftarg *btp, + struct rcbag **bagp); +void rcbag_free(struct rcbag **bagp); +int rcbag_add(struct rcbag *bag, struct xfs_trans *tp, + const struct xfs_rmap_irec *rmap); +uint64_t rcbag_count(const struct rcbag *bag); + +int rcbag_next_edge(struct rcbag *bag, struct xfs_trans *tp, + const struct xfs_rmap_irec *next_rmap, bool next_valid, + uint32_t *next_bnop); +int rcbag_remove_ending_at(struct rcbag *bag, struct xfs_trans *tp, + uint32_t next_bno); + +void rcbag_dump(struct rcbag *bag, struct xfs_trans *tp); + +#endif /* __XFS_SCRUB_RCBAG_H__ */ diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c new file mode 100644 index 000000000000..709356dc6256 --- /dev/null +++ b/fs/xfs/scrub/rcbag_btree.c @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/trace.h"
+
+static struct kmem_cache	*rcbagbt_cur_cache;
+
+STATIC void
+rcbagbt_init_key_from_rec(
+	union xfs_btree_key	*key,
+	const union xfs_btree_rec	*rec)
+{
+	struct rcbag_key	*bag_key = (struct rcbag_key *)key;
+	const struct rcbag_rec	*bag_rec = (const struct rcbag_rec *)rec;
+
+	BUILD_BUG_ON(sizeof(struct rcbag_key) > sizeof(union xfs_btree_key));
+	BUILD_BUG_ON(sizeof(struct rcbag_rec) > sizeof(union xfs_btree_rec));
+
+	bag_key->rbg_startblock = bag_rec->rbg_startblock;
+	bag_key->rbg_blockcount = bag_rec->rbg_blockcount;	/* refcount is not part of the key */
+}
+
+STATIC void
+rcbagbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	struct rcbag_rec	*bag_rec = (struct rcbag_rec *)rec;
+	struct rcbag_rec	*bag_irec = (struct rcbag_rec *)&cur->bc_rec;
+
+	bag_rec->rbg_startblock = bag_irec->rbg_startblock;
+	bag_rec->rbg_blockcount = bag_irec->rbg_blockcount;
+	bag_rec->rbg_refcount = bag_irec->rbg_refcount;
+}
+
+STATIC int64_t
+rcbagbt_key_diff(
+	struct xfs_btree_cur	*cur,
+	const union xfs_btree_key	*key)
+{
+	struct rcbag_rec	*rec = (struct rcbag_rec *)&cur->bc_rec;
+	const struct rcbag_key	*kp = (const struct rcbag_key *)key;
+
+	if (kp->rbg_startblock > rec->rbg_startblock)
+		return 1;
+	if (kp->rbg_startblock < rec->rbg_startblock)
+		return -1;
+
+	if (kp->rbg_blockcount > rec->rbg_blockcount)
+		return 1;
+	if (kp->rbg_blockcount < rec->rbg_blockcount)
+		return -1;
+
+	return 0;	/* same start block and block count */
+}
+
+STATIC int64_t
+rcbagbt_diff_two_keys(
+	struct xfs_btree_cur	*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
+{
+	const struct rcbag_key	*kp1 = (const struct rcbag_key *)k1;
+	const struct rcbag_key	*kp2 = (const struct rcbag_key *)k2;
+
+	ASSERT(mask == NULL);
+
+	if (kp1->rbg_startblock > kp2->rbg_startblock)
+		return 1;
+	if (kp1->rbg_startblock < kp2->rbg_startblock)
+		return -1;
+
+	if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+		return 1;
+	if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+		return -1;
+
+	return 0;
+}
+
+STATIC int
+rcbagbt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2)
+{
+	const struct rcbag_key	*kp1 = (const struct rcbag_key *)k1;
+	const struct rcbag_key	*kp2 = (const struct rcbag_key *)k2;
+
+	if (kp1->rbg_startblock > kp2->rbg_startblock)
+		return 0;
+	if (kp1->rbg_startblock < kp2->rbg_startblock)
+		return 1;
+
+	if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+		return 0;
+	if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+		return 1;
+
+	return 0;	/* identical keys are reported as not in order */
+}
+
+STATIC int
+rcbagbt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	const union xfs_btree_rec	*r1,
+	const union xfs_btree_rec	*r2)
+{
+	const struct rcbag_rec	*rp1 = (const struct rcbag_rec *)r1;
+	const struct rcbag_rec	*rp2 = (const struct rcbag_rec *)r2;
+
+	if (rp1->rbg_startblock > rp2->rbg_startblock)
+		return 0;
+	if (rp1->rbg_startblock < rp2->rbg_startblock)
+		return 1;
+
+	if (rp1->rbg_blockcount > rp2->rbg_blockcount)
+		return 0;
+	if (rp1->rbg_blockcount < rp2->rbg_blockcount)
+		return 1;
+
+	return 0;	/* identical records are reported as not in order */
+}
+
+static xfs_failaddr_t
+rcbagbt_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	xfs_failaddr_t	fa;
+	unsigned int	level;
+	unsigned int	maxrecs;
+
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+	if (fa)
+		return fa;
+
+	level = be16_to_cpu(block->bb_level);
+	if (level >= rcbagbt_maxlevels_possible())
+		return __this_address;
+
+	maxrecs = rcbagbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0);
+	return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+rcbagbt_rw_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa = rcbagbt_verify(bp);
+
+	if (fa)
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/*
+ * skip crc checks on in-memory btrees to save time; the read and write
+ * verifiers below only run the structure checks in rcbagbt_verify
+ */
+static const struct xfs_buf_ops rcbagbt_mem_buf_ops = {
+	.name	= "rcbagbt_mem",
+	.magic	= { 0, cpu_to_be32(RCBAG_MAGIC) },
+	.verify_read	= rcbagbt_rw_verify,
+	.verify_write	= rcbagbt_rw_verify,
+	.verify_struct	= rcbagbt_verify,
+};
+
+static const struct xfs_btree_ops rcbagbt_mem_ops = {
+	.name	= "rcbag",
+	.type	= XFS_BTREE_TYPE_MEM,
+
+	.rec_len	= sizeof(struct rcbag_rec),
+	.key_len	= sizeof(struct rcbag_key),
+	.ptr_len	= XFS_BTREE_LONG_PTR_LEN,
+
+	.lru_refs	= 1,
+	.statoff	= XFS_STATS_CALC_INDEX(xs_rcbag_2),
+
+	.dup_cursor	= xfbtree_dup_cursor,
+	.set_root	= xfbtree_set_root,
+	.alloc_block	= xfbtree_alloc_block,
+	.free_block	= xfbtree_free_block,
+	.get_minrecs	= xfbtree_get_minrecs,
+	.get_maxrecs	= xfbtree_get_maxrecs,
+	.init_key_from_rec	= rcbagbt_init_key_from_rec,
+	.init_rec_from_cur	= rcbagbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
+	.key_diff	= rcbagbt_key_diff,
+	.buf_ops	= &rcbagbt_mem_buf_ops,
+	.diff_two_keys	= rcbagbt_diff_two_keys,
+	.keys_inorder	= rcbagbt_keys_inorder,
+	.recs_inorder	= rcbagbt_recs_inorder,
+};
+
+/*
+ * Create a cursor for an in-memory btree. The cursor is allocated from the
+ * rcbagbt cursor cache and takes its level count from the xfbtree.
+ */
+struct xfs_btree_cur *
+rcbagbt_mem_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfbtree	*xfbtree)
+{
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_btree_alloc_cursor(mp, tp, &rcbagbt_mem_ops,
+			rcbagbt_maxlevels_possible(), rcbagbt_cur_cache);
+
+	cur->bc_mem.xfbtree = xfbtree;
+	cur->bc_nlevels = xfbtree->nlevels;
+	return cur;
+}
+
+/* Create an in-memory refcount bag btree.
*/ +int +rcbagbt_mem_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp) +{ + xfbt->owner = 0; + return xfbtree_init(mp, xfbt, btp, &rcbagbt_mem_ops); +} + +/* Calculate number of records in a refcount bag btree block. */ +static inline unsigned int +rcbagbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct rcbag_rec); + return blocklen / + (sizeof(struct rcbag_key) + sizeof(rcbag_ptr_t)); +} + +/* + * Calculate the number of records that fit in a refcount bag btree block of + * @blocklen bytes once the RCBAG_BLOCK_LEN header has been subtracted. + */ +unsigned int +rcbagbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= RCBAG_BLOCK_LEN; + return rcbagbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for refcount bag btrees. */ +unsigned int +rcbagbt_maxlevels_possible(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_space_to_height(minrecs, ULLONG_MAX); +} + +/* Calculate the refcount bag btree size for some records. */ +unsigned long long +rcbagbt_calc_size( + unsigned long long nr_records) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_calc_size(minrecs, nr_records); +} + +int __init +rcbagbt_init_cur_cache(void) +{ + rcbagbt_cur_cache = kmem_cache_create("xfs_rcbagbt_cur", + xfs_btree_cur_sizeof(rcbagbt_maxlevels_possible()), + 0, 0, NULL); + + if (!rcbagbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +rcbagbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(rcbagbt_cur_cache); + rcbagbt_cur_cache = NULL; +} + +/* Look up the refcount bag record corresponding to this reverse mapping. 
*/ +int +rcbagbt_lookup_eq( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rmap, + int *success) +{ + struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec; + + rec->rbg_startblock = rmap->rm_startblock; + rec->rbg_blockcount = rmap->rm_blockcount; + + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, success); +} + +/* Get the data from the pointed-to record. */ +int +rcbagbt_get_rec( + struct xfs_btree_cur *cur, + struct rcbag_rec *rec, + int *has) +{ + union xfs_btree_rec *btrec; + int error; + + error = xfs_btree_get_rec(cur, &btrec, has); + if (error || !(*has)) + return error; + + memcpy(rec, btrec, sizeof(struct rcbag_rec)); + return 0; +} + +/* Update the record referred to by cur to the value given. */ +int +rcbagbt_update( + struct xfs_btree_cur *cur, + const struct rcbag_rec *rec) +{ + union xfs_btree_rec btrec; + + memcpy(&btrec, rec, sizeof(struct rcbag_rec)); + return xfs_btree_update(cur, &btrec); +} + +/* Insert the given record into the btree via cur. */ +int +rcbagbt_insert( + struct xfs_btree_cur *cur, + const struct rcbag_rec *rec, + int *success) +{ + struct rcbag_rec *btrec = (struct rcbag_rec *)&cur->bc_rec; + + memcpy(btrec, rec, sizeof(struct rcbag_rec)); + return xfs_btree_insert(cur, success); +} diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h new file mode 100644 index 000000000000..03cadb032552 --- /dev/null +++ b/fs/xfs/scrub/rcbag_btree.h @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RCBAG_BTREE_H__ +#define __XFS_SCRUB_RCBAG_BTREE_H__ + +#ifdef CONFIG_XFS_BTREE_IN_MEM + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +#define RCBAG_MAGIC 0x74826671 /* decimal ASCII codes of 'JRBG' (74 82 66 71) written as hex digits */ + +struct rcbag_key { + uint32_t rbg_startblock; + uint32_t rbg_blockcount; +}; + +struct rcbag_rec { + uint32_t rbg_startblock; + uint32_t rbg_blockcount; + uint64_t rbg_refcount; +}; + +typedef __be64 rcbag_ptr_t; + +/* reflinks only exist on crc enabled filesystems */ +#define RCBAG_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define RCBAG_REC_ADDR(block, index) \ + ((struct rcbag_rec *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct rcbag_rec)))) + +#define RCBAG_KEY_ADDR(block, index) \ + ((struct rcbag_key *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + ((index) - 1) * sizeof(struct rcbag_key))) + +#define RCBAG_PTR_ADDR(block, index, maxrecs) \ + ((rcbag_ptr_t *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + (maxrecs) * sizeof(struct rcbag_key) + \ + ((index) - 1) * sizeof(rcbag_ptr_t))) + +unsigned int rcbagbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); + +unsigned long long rcbagbt_calc_size(unsigned long long nr_records); + +unsigned int rcbagbt_maxlevels_possible(void); + +int __init rcbagbt_init_cur_cache(void); +void rcbagbt_destroy_cur_cache(void); + +struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp); + +int rcbagbt_lookup_eq(struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rmap, int *success); +int rcbagbt_get_rec(struct xfs_btree_cur *cur, struct rcbag_rec *rec, int *has); +int rcbagbt_update(struct xfs_btree_cur *cur, const struct rcbag_rec *rec); 
+int rcbagbt_insert(struct xfs_btree_cur *cur, const struct rcbag_rec *rec, + int *success); + +#else +# define rcbagbt_init_cur_cache() 0 +# define rcbagbt_destroy_cur_cache() ((void)0) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +#endif /* __XFS_SCRUB_RCBAG_BTREE_H__ */ diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index e51c1544be63..dfdcb96b6c16 100644 --- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -36,16 +36,14 @@ xchk_dir_walk_sf( struct xfs_mount *mp = dp->i_mount; struct xfs_da_geometry *geo = mp->m_dir_geo; struct xfs_dir2_sf_entry *sfep; - struct xfs_dir2_sf_hdr *sfp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_ino_t ino; xfs_dir2_dataptr_t dapos; unsigned int i; int error; ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* dot entry */ dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, @@ -283,7 +281,7 @@ xchk_dir_walk( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); @@ -334,7 +332,7 @@ xchk_dir_lookup( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { error = xfs_dir2_sf_lookup(&args); diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 86a62420e02c..0252a3b5b65a 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -20,6 +20,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" @@ -31,11 +32,14 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" 
#include "xfs_attr_remote.h" +#include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/fsb_bitmap.h" #include "scrub/reap.h" /* @@ -73,10 +77,10 @@ * with only the same rmap owner but the block is not owned by something with * the same rmap owner, the block will be freed. * - * The caller is responsible for locking the AG headers for the entire rebuild - * operation so that nothing else can sneak in and change the AG state while - * we're not looking. We must also invalidate any buffers associated with - * @bitmap. + * The caller is responsible for locking the AG headers/inode for the entire + * rebuild operation so that nothing else can sneak in and change the incore + * state while we're not looking. We must also invalidate any buffers + * associated with @bitmap. */ /* Information about reaping extents after a repair. */ @@ -110,7 +114,7 @@ xreap_put_freelist( int error; /* Make sure there's space on the freelist. */ - error = xrep_fix_freelist(sc, true); + error = xrep_fix_freelist(sc, 0); if (error) return error; @@ -247,7 +251,7 @@ xreap_agextent_binval( max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); - for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) { + for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) { struct xfs_buf *bp = NULL; xfs_daddr_t daddr; int error; @@ -377,6 +381,17 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); rs->force_roll = true; + + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + /* + * If we're unmapping CoW staging extents, remove the + * records from the refcountbt, which will remove the + * rmap record as well. 
+ */ + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + return 0; + } + return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, *aglenp, rs->oinfo); } @@ -395,6 +410,26 @@ xreap_agextent_iter( return 0; } + /* + * If we're getting rid of CoW staging extents, use deferred work items + * to remove the refcountbt records (which removes the rmap records) + * and free the extent. We're not worried about the system going down + * here because log recovery walks the refcount btree to clean out the + * CoW staging extents. + */ + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + ASSERT(rs->resv == XFS_AG_RESV_NONE); + + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, + rs->resv, true); + if (error) + return error; + + rs->force_roll = true; + return 0; + } + /* Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); @@ -409,13 +444,17 @@ xreap_agextent_iter( /* * Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. + * Add a defer ops barrier every other extent to avoid stressing the + * system with large EFIs. */ - error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, rs->resv, true); if (error) return error; rs->deferred++; + if (rs->deferred % 2 == 0) + xfs_defer_add_barrier(sc->tp); return 0; } @@ -425,13 +464,12 @@ xreap_agextent_iter( */ STATIC int xreap_agmeta_extent( - uint64_t fsbno, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xreap_state *rs = priv; struct xfs_scrub *sc = rs->sc; - xfs_agblock_t agbno = fsbno; xfs_agblock_t agbno_next = agbno + len; int error = 0; @@ -496,3 +534,115 @@ xrep_reap_agblocks( return 0; } + +/* + * Break a file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. 
The extent must + * not cross an AG boundary. + */ +STATIC int +xreap_fsmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip != NULL); + ASSERT(!sc->sa.pag); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to init the xchk_ag structure ourselves. + */ + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + goto out_agf; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + goto out_agf; + + if (xreap_want_defer_finish(rs)) { + /* + * Holds the AGF buffer across the deferred chain + * processing. + */ + error = xrep_defer_finish(sc); + if (error) + goto out_agf; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + /* + * Hold the AGF buffer across the transaction roll so + * that we don't have to reattach it to the scrub + * context. + */ + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); + if (error) + goto out_agf; + xreap_reset(rs); + } + + agbno += aglen; + } + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of every block of every fs metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. 
+ */ +int +xrep_reap_fsblocks( + struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index fe24626af164..0b69f16dd98f 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -6,7 +6,12 @@ #ifndef __XFS_SCRUB_REAP_H__ #define __XFS_SCRUB_REAP_H__ +struct xagb_bitmap; +struct xfsb_bitmap; + int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); +int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); #endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 304ea1e1bfb0..d0c7d4a29c0f 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -7,8 +7,10 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_rmap.h" @@ -17,6 +19,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* * Set us up to scrub reference count btrees. 
@@ -27,6 +30,15 @@ xchk_setup_ag_refcountbt( { if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + if (xchk_could_repair(sc)) { + int error; + + error = xrep_setup_ag_refcountbt(sc); + if (error) + return error; + } + return xchk_setup_ag_btree(sc, false); } @@ -441,7 +453,7 @@ xchk_refcountbt_rec( struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { + if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c new file mode 100644 index 000000000000..a00d7ce7ae5b --- /dev/null +++ b/fs/xfs/scrub/refcount_repair.c @@ -0,0 +1,751 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_error.h" +#include "xfs_ag.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" +#include "scrub/rcbag.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is 
"borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. 
+ * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). + * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the refcount btree root and + * insert all the records. + */ + +struct xrep_refc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xagb_bitmap old_refcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_extlen_t btblocks; +}; + +/* Set us up to repair refcount btrees. */ +int +xrep_setup_ag_refcountbt( + struct xfs_scrub *sc) +{ + char *descr; + int error; + + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + return error; +} + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_refc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock, + rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. 
*/ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rc_startblock, rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record a reference count extent. */ +STATIC int +xrep_refc_stash( + struct xrep_refc *rr, + enum xfs_refc_domain domain, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = agbno, + .rc_blockcount = len, + .rc_domain = domain, + }; + struct xfs_scrub *sc = rr->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + + error = xrep_refc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(sc->sa.pag, &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_refc_stash_cow( + struct xrep_refc *rr, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_refc_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + /* AG metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Metadata in files are never shareable */ + if (xfs_internal_inum(mp, rmap->rm_owner)) + return false; + + /* Metadata and unwritten file blocks are not shareable. */ + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return false; + + return true; +} + +/* + * Walk along the reverse mapping records until we find one that could describe + * a shared extent. 
+ */ +STATIC int +xrep_refc_walk_rmaps( + struct xrep_refc *rr, + struct xfs_rmap_irec *rmap, + bool *have_rec) +{ + struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. We can only share written data fork extents, so + * keep looping until we find an rmap for one. + */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + + if (rmap->rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_refc_stash_cow(rr, rmap->rm_startblock, + rmap->rm_blockcount); + if (error) + return error; + } else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) { + /* refcountbt block, dump it when we're done. */ + rr->btblocks += rmap->rm_blockcount; + error = xagb_bitmap_set(&rr->old_refcountbt_blocks, + rmap->rm_startblock, + rmap->rm_blockcount); + if (error) + return error; + } + } while (!xrep_refc_rmap_shareable(mp, rmap)); + + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_refc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* Sort in the same order as the ondisk records. 
*/ +static int +xrep_refc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_refc_encode_startblock(ap); + sb = xrep_refc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. + */ +STATIC int +xrep_refc_sort_records( + struct xrep_refc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_agbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rcstack. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. 
+ */ +static int +xrep_refc_push_rmaps_at( + struct xrep_refc *rr, + struct rcbag *rcstack, + xfs_agblock_t bno, + struct xfs_rmap_irec *rmap, + bool *have) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rmap->rm_startblock == bno) { + error = rcbag_add(rcstack, rr->sc->tp, rmap); + if (error) + return error; + + error = xrep_refc_walk_rmaps(rr, rmap, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) { + xfs_btree_mark_sick(sc->sa.rmap_cur); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_refc_find_refcounts( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct rcbag *rcstack; + uint64_t old_stack_height; + xfs_agblock_t sbno; + xfs_agblock_t cbno; + xfs_agblock_t nbno; + bool have; + int error; + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Set up a bag to store all the rmap records that we're tracking to + * generate a reference count record. If the size of the bag exceeds + * MAXREFCOUNT, we clamp rc_refcount. + */ + error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); + if (error) + goto out_cur; + + /* Start the rmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sa.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. 
*/ + while (xfs_btree_has_more_records(sc->sa.rmap_cur)) { + struct xfs_rmap_irec rmap; + + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_refc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rmap.rm_startblock; + error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap, + &have); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_height = rcbag_count(rcstack); + + /* While stack isn't empty... */ + while (rcbag_count(rcstack) > 0) { + /* Pop all rmaps that end at nbno */ + error = rcbag_remove_ending_at(rcstack, sc->tp, nbno); + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_refc_walk_rmaps(rr, &rmap, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_refc_push_rmaps_at(rr, rcstack, + nbno, &rmap, &have); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (rcbag_count(rcstack) != old_stack_height) { + if (old_stack_height > 1) { + error = xrep_refc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_height); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (rcbag_count(rcstack) == 0) + break; + old_stack_height = rcbag_count(rcstack); + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(rcbag_count(rcstack) == 0); +out_bag: + rcbag_free(&rcstack); +out_cur: + xchk_ag_btcur_free(&sc->sa); + return error; +} + +/* Retrieve refcountbt data for bulk load. 
*/ +STATIC int +xrep_refc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + struct xrep_refc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_refc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_refc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Update the AGF counters. */ +STATIC int +xrep_refc_reset_counters( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the refcountbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_refcount_level = pag->pagf_refcount_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected refcount information to stage a new refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. 
+ */ +STATIC int +xrep_refc_build_new_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *refc_cur; + struct xfs_perag *pag = sc->sa.pag; + xfs_fsblock_t fsbno; + int error; + + error = xrep_refc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + XFS_AG_RESV_METADATA); + rr->new_btree.bload.get_records = xrep_refc_get_records; + rr->new_btree.bload.claim_block = xrep_refc_claim_block; + + /* Compute how many blocks we'll need. */ + refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake); + error = xfs_btree_bload_compute_geometry(refc_cur, + &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height; + + /* Add all observed refcount records. */ + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. 
After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(refc_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_refc_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_refcount_level = 0; +err_cur: + xfs_btree_del_cursor(refc_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_refc_remove_old_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + int error; + + /* Free the old refcountbt blocks if they're not in use. */ + error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + if (error) + return error; + + /* + * Now that we've zapped all the old refcountbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservations. + */ + pag->pagf_repair_refcount_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; + return 0; +} + +/* Rebuild the refcount btree. */ +int +xrep_refcountbt( + struct xfs_scrub *sc) +{ + struct xrep_refc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per block. 
*/ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xagb_bitmap_init(&rr->old_refcountbt_blocks); + error = xrep_refc_find_refcounts(rr); + if (error) + goto out_bitmap; + + /* Rebuild the refcount information. */ + error = xrep_refc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_refc_remove_old_tree(rr); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&rr->old_refcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1b8b5439f2d7..f43dce771cdd 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -27,12 +27,18 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_defer.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_reflink.h" +#include "xfs_health.h" +#include "xfs_buf_mem.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/stats.h" +#include "scrub/xfile.h" /* * Attempt to repair some metadata, if the metadata is corrupt and userspace @@ -176,6 +182,16 @@ xrep_roll_ag_trans( return 0; } +/* Roll the scrub transaction, holding the primary metadata locked. */ +int +xrep_roll_trans( + struct xfs_scrub *sc) +{ + if (!sc->ip) + return xrep_roll_ag_trans(sc); + return xfs_trans_roll_inode(&sc->tp, sc->ip); +} + /* Finish all deferred work attached to the repair transaction. 
*/ int xrep_defer_finish( @@ -387,7 +403,7 @@ xrep_calc_ag_resblks( int xrep_fix_freelist( struct xfs_scrub *sc, - bool can_shrink) + int alloc_flags) { struct xfs_alloc_arg args = {0}; @@ -397,8 +413,7 @@ xrep_fix_freelist( args.alignment = 1; args.pag = sc->sa.pag; - return xfs_alloc_fix_freelist(&args, - can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); + return xfs_alloc_fix_freelist(&args, alloc_flags); } /* @@ -673,6 +688,45 @@ xrep_find_ag_btree_roots( return error; } +#ifdef CONFIG_XFS_QUOTA +/* Update some quota flags in the superblock. */ +void +xrep_update_qflags( + struct xfs_scrub *sc, + unsigned int clear_flags, + unsigned int set_flags) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + if ((mp->m_qflags & clear_flags) == 0 && + (mp->m_qflags & set_flags) == set_flags) + goto no_update; + + mp->m_qflags &= ~clear_flags; + mp->m_qflags |= set_flags; + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags &= ~clear_flags; + mp->m_sb.sb_qflags |= set_flags; + spin_unlock(&mp->m_sb_lock); + + /* + * Update the quota flags in the ondisk superblock without touching + * the summary counters. We have not quiesced inode chunk allocation, + * so we cannot coordinate with updates to the icount and ifree percpu + * counters. + */ + bp = xfs_trans_getsb(sc->tp); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1); + +no_update: + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); +} + /* Force a quotacheck the next time we mount. 
*/ void xrep_force_quotacheck( @@ -685,13 +739,7 @@ xrep_force_quotacheck( if (!(flag & sc->mp->m_qflags)) return; - mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); - sc->mp->m_qflags &= ~flag; - spin_lock(&sc->mp->m_sb_lock); - sc->mp->m_sb.sb_qflags &= ~flag; - spin_unlock(&sc->mp->m_sb_lock); - xfs_log_sb(sc->tp); - mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + xrep_update_qflags(sc, flag, 0); } /* @@ -699,10 +747,10 @@ xrep_force_quotacheck( * * This function ensures that the appropriate dquots are attached to an inode. * We cannot allow the dquot code to allocate an on-disk dquot block here - * because we're already in transaction context with the inode locked. The - * on-disk dquot should already exist anyway. If the quota code signals - * corruption or missing quota information, schedule quotacheck, which will - * repair corruptions in the quota metadata. + * because we're already in transaction context. The on-disk dquot should + * already exist anyway. If the quota code signals corruption or missing quota + * information, schedule quotacheck, which will repair corruptions in the quota + * metadata. */ int xrep_ino_dqattach( @@ -710,7 +758,10 @@ xrep_ino_dqattach( { int error; - error = xfs_qm_dqattach_locked(sc->ip, false); + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + error = xfs_qm_dqattach(sc->ip); switch (error) { case -EFSBADCRC: case -EFSCORRUPTED: @@ -734,3 +785,419 @@ xrep_ino_dqattach( return error; } +#endif /* CONFIG_XFS_QUOTA */ + +/* + * Ensure that the inode being repaired is ready to handle a certain number of + * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode + * being repaired and have joined it to the scrub transaction. 
+ */ +int +xrep_ino_ensure_extent_count( + struct xfs_scrub *sc, + int whichfork, + xfs_extnum_t nextents) +{ + xfs_extnum_t max_extents; + bool inode_has_nrext64; + + inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip); + max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork); + if (nextents <= max_extents) + return 0; + if (inode_has_nrext64) + return -EFSCORRUPTED; + if (!xfs_has_large_extent_counts(sc->mp)) + return -EFSCORRUPTED; + + max_extents = xfs_iext_max_nextents(true, whichfork); + if (nextents > max_extents) + return -EFSCORRUPTED; + + sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return 0; +} + +/* + * Initialize all the btree cursors for an AG repair except for the btree that + * we're rebuilding. + */ +void +xrep_ag_btcur_init( + struct xfs_scrub *sc, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + /* Set up a bnobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) { + sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + } + + /* Set up a inobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) { + sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, + sa->agi_bp); + if (xfs_has_finobt(mp)) + sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag, + sc->tp, sa->agi_bp); + } + + /* Set up a rmapbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT && + xfs_has_rmapbt(mp)) + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + + /* Set up a refcountbt cursor for cross-referencing. 
*/ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT && + xfs_has_reflink(mp)) + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sc->sa.pag); +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGF + * buffer. We had better get the same AGF buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagf( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agf(pag)); + + clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp); + if (error) + return error; + + if (bp != sc->sa.agf_bp) { + ASSERT(bp == sc->sa.agf_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGI + * buffer. We had better get the same AGI buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagi( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agi(pag)); + + clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); + error = xfs_ialloc_read_agi(pag, sc->tp, &bp); + if (error) + return error; + + if (bp != sc->sa.agi_bp) { + ASSERT(bp == sc->sa.agi_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Given an active reference to a perag structure, load AG headers and cursors. + * This should only be called to scan an AG while repairing file-based metadata. + */ +int +xrep_ag_init( + struct xfs_scrub *sc, + struct xfs_perag *pag, + struct xchk_ag *sa) +{ + int error; + + ASSERT(!sa->pag); + + error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp); + if (error) + return error; + + error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp); + if (error) + return error; + + /* Grab our own passive reference from the caller's ref. 
*/ + sa->pag = xfs_perag_hold(pag); + xrep_ag_btcur_init(sc, sa); + return 0; +} + +/* Reinitialize the per-AG block reservation for the AG we just fixed. */ +int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + int error; + + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(sc->sa.pag != NULL); + ASSERT(sc->ops->type == ST_PERAG); + ASSERT(sc->tp); + + sc->flags &= ~XREP_RESET_PERAG_RESV; + error = xfs_ag_resv_free(sc->sa.pag); + if (error) + goto out; + error = xfs_ag_resv_init(sc->sa.pag, sc->tp); + if (error == -ENOSPC) { + xfs_err(sc->mp, +"Insufficient free space to reset per-AG reservation for AG %u after repair.", + sc->sa.pag->pag_agno); + error = 0; + } + +out: + return error; +} + +/* Decide if we are going to call the repair function for a scrub type. */ +bool +xrep_will_attempt( + struct xfs_scrub *sc) +{ + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + return true; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + return true; + + /* Metadata is corrupt or failed cross-referencing. */ + if (xchk_needs_repair(sc->sm)) + return true; + + return false; +} + +/* Try to fix some part of a metadata inode by calling another scrubber. */ +STATIC int +xrep_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + __u32 smflags = sc->sm->sm_flags; + unsigned int sick_mask = sc->sick_mask; + int error; + + /* + * Let's see if the inode needs repair. We're going to open-code calls + * to the scrub and repair functions so that we can hang on to the + * resources that we already acquired instead of using the standard + * setup/teardown routines. 
+ */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + if (error) + goto out; + + if (!xrep_will_attempt(sc)) + goto out; + + /* + * Repair some part of the inode. This will potentially join the inode + * to the transaction. + */ + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xrep_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xrep_bmap(sc, XFS_DATA_FORK, false); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xrep_bmap(sc, XFS_ATTR_FORK, false); + break; + } + if (error) + goto out; + + /* + * Finish all deferred intent items and then roll the transaction so + * that the inode will not be joined to the transaction when we exit + * the function. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + error = xfs_trans_roll(&sc->tp); + if (error) + goto out; + + /* + * Clear the corruption flags and re-check the metadata that we just + * repaired. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + } + if (error) + goto out; + + /* If corruption persists, the repair has failed. */ + if (xchk_needs_repair(sc->sm)) { + error = -EFSCORRUPTED; + goto out; + } +out: + sc->sick_mask = sick_mask; + sc->sm->sm_type = smtype; + sc->sm->sm_flags = smflags; + return error; +} + +/* + * Repair the ondisk forks of a metadata inode. The caller must ensure that + * sc->ip points to the metadata inode and the ILOCK is held on that inode. 
+ * The inode must not be joined to the transaction before the call, and will + * not be afterwards. + */ +int +xrep_metadata_inode_forks( + struct xfs_scrub *sc) +{ + bool dirty = false; + int error; + + /* Repair the inode record and the data fork. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error) + return error; + + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); + if (error) + return error; + + /* Make sure the attr fork looks ok before we delete it. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error) + return error; + + /* Clear the reflink flag since metadata never shares. */ + if (xfs_is_reflink_inode(sc->ip)) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + /* + * If we modified the inode, roll the transaction but don't rejoin the + * inode to the new transaction because xrep_bmap_data can do that. + */ + if (dirty) { + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + dirty = false; + } + + return 0; +} + +/* + * Set up an in-memory buffer cache so that we can use the xfbtree. Allocating + * a shmem file might take locks, so we cannot be in transaction context. Park + * our resources in the scrub context and let the teardown function take care + * of them at the right time. + */ +int +xrep_setup_xfbtree( + struct xfs_scrub *sc, + const char *descr) +{ + ASSERT(sc->tp == NULL); + + return xmbuf_alloc(sc->mp, descr, &sc->xmbtp); +} + +/* + * Create a dummy transaction for use in a live update hook function. This + * function MUST NOT be called from regular repair code because the current + * process' transaction is saved via the cookie. 
+ */ +int +xrep_trans_alloc_hook_dummy( + struct xfs_mount *mp, + void **cookiep, + struct xfs_trans **tpp) +{ + int error; + + *cookiep = current->journal_info; + current->journal_info = NULL; + + error = xfs_trans_alloc_empty(mp, tpp); + if (!error) + return 0; + + current->journal_info = *cookiep; + *cookiep = NULL; + return error; +} + +/* Cancel a dummy transaction used by a live update hook function. */ +void +xrep_trans_cancel_hook_dummy( + void **cookiep, + struct xfs_trans *tp) +{ + xfs_trans_cancel(tp); + current->journal_info = *cookiep; + *cookiep = NULL; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60d2a9ae5f2e..ce082d941459 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -28,17 +28,30 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); +bool xrep_will_attempt(struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_roll_trans(struct xfs_scrub *sc); int xrep_defer_finish(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); +static inline int +xrep_trans_commit( + struct xfs_scrub *sc) +{ + int error = xfs_trans_commit(sc->tp); + + sc->tp = NULL; + return error; +} + struct xbitmap; struct xagb_bitmap; +struct xfsb_bitmap; -int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); +int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags); struct xrep_find_ag_btree { /* in: rmap owner of the btree we're looking for */ @@ -57,8 +70,41 @@ struct xrep_find_ag_btree { int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); + +#ifdef CONFIG_XFS_QUOTA +void xrep_update_qflags(struct xfs_scrub *sc, unsigned int clear_flags, + unsigned int 
set_flags); void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +#else +# define xrep_force_quotacheck(sc, type) ((void)0) +# define xrep_ino_dqattach(sc) (0) +#endif /* CONFIG_XFS_QUOTA */ + +int xrep_setup_xfbtree(struct xfs_scrub *sc, const char *descr); + +int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, + xfs_extnum_t nextents); +int xrep_reset_perag_resv(struct xfs_scrub *sc); +int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); +int xrep_metadata_inode_forks(struct xfs_scrub *sc); +int xrep_setup_ag_rmapbt(struct xfs_scrub *sc); +int xrep_setup_ag_refcountbt(struct xfs_scrub *sc); + +/* Repair setup functions */ +int xrep_setup_ag_allocbt(struct xfs_scrub *sc); + +struct xfs_imap; +int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); + +void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); +int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, + struct xchk_ag *sa); + +/* Metadata revalidators */ + +int xrep_revalidate_allocbt(struct xfs_scrub *sc); +int xrep_revalidate_iallocbt(struct xfs_scrub *sc); /* Metadata repairers */ @@ -67,9 +113,43 @@ int xrep_superblock(struct xfs_scrub *sc); int xrep_agf(struct xfs_scrub *sc); int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); +int xrep_allocbt(struct xfs_scrub *sc); +int xrep_iallocbt(struct xfs_scrub *sc); +int xrep_rmapbt(struct xfs_scrub *sc); +int xrep_refcountbt(struct xfs_scrub *sc); +int xrep_inode(struct xfs_scrub *sc); +int xrep_bmap_data(struct xfs_scrub *sc); +int xrep_bmap_attr(struct xfs_scrub *sc); +int xrep_bmap_cow(struct xfs_scrub *sc); +int xrep_nlinks(struct xfs_scrub *sc); +int xrep_fscounters(struct xfs_scrub *sc); + +#ifdef CONFIG_XFS_RT +int xrep_rtbitmap(struct xfs_scrub *sc); +#else +# define xrep_rtbitmap xrep_notsupported +#endif /* CONFIG_XFS_RT */ + +#ifdef CONFIG_XFS_QUOTA +int xrep_quota(struct xfs_scrub *sc); +int 
xrep_quotacheck(struct xfs_scrub *sc); +#else +# define xrep_quota xrep_notsupported +# define xrep_quotacheck xrep_notsupported +#endif /* CONFIG_XFS_QUOTA */ + +int xrep_reinit_pagf(struct xfs_scrub *sc); +int xrep_reinit_pagi(struct xfs_scrub *sc); + +int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, + struct xfs_trans **tpp); +void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); #else +#define xrep_ino_dqattach(sc) (0) +#define xrep_will_attempt(sc) (false) + static inline int xrep_attempt( struct xfs_scrub *sc, @@ -87,11 +167,51 @@ xrep_calc_ag_resblks( return 0; } +static inline int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(0); + return -EOPNOTSUPP; +} + +/* repair setup functions for no-repair */ +static inline int +xrep_setup_nothing( + struct xfs_scrub *sc) +{ + return 0; +} +#define xrep_setup_ag_allocbt xrep_setup_nothing +#define xrep_setup_ag_rmapbt xrep_setup_nothing +#define xrep_setup_ag_refcountbt xrep_setup_nothing + +#define xrep_setup_inode(sc, imap) ((void)0) + +#define xrep_revalidate_allocbt (NULL) +#define xrep_revalidate_iallocbt (NULL) + #define xrep_probe xrep_notsupported #define xrep_superblock xrep_notsupported #define xrep_agf xrep_notsupported #define xrep_agfl xrep_notsupported #define xrep_agi xrep_notsupported +#define xrep_allocbt xrep_notsupported +#define xrep_iallocbt xrep_notsupported +#define xrep_rmapbt xrep_notsupported +#define xrep_refcountbt xrep_notsupported +#define xrep_inode xrep_notsupported +#define xrep_bmap_data xrep_notsupported +#define xrep_bmap_attr xrep_notsupported +#define xrep_bmap_cow xrep_notsupported +#define xrep_rtbitmap xrep_notsupported +#define xrep_quota xrep_notsupported +#define xrep_quotacheck xrep_notsupported +#define xrep_nlinks xrep_notsupported +#define xrep_fscounters xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap.c 
b/fs/xfs/scrub/rmap.c index d29a26ecddd6..ba5bbc3fb754 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -24,6 +24,8 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/repair.h" /* * Set us up to scrub reverse mapping btrees. @@ -35,6 +37,14 @@ xchk_setup_ag_rmapbt( if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + if (xchk_could_repair(sc)) { + int error; + + error = xrep_setup_ag_rmapbt(sc); + if (error) + return error; + } + return xchk_setup_ag_btree(sc, false); } @@ -348,7 +358,7 @@ xchk_rmapbt_rec( struct xfs_rmap_irec irec; if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || - xfs_rmap_check_irec(bs->cur, &irec) != NULL) { + xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -411,8 +421,8 @@ xchk_rmapbt_walk_ag_metadata( /* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */ cur = sc->sa.bno_cur; if (!cur) - cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); if (cur != sc->sa.bno_cur) xfs_btree_del_cursor(cur, error); @@ -421,8 +431,8 @@ xchk_rmapbt_walk_ag_metadata( cur = sc->sa.cnt_cur; if (!cur) - cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); if (cur != sc->sa.cnt_cur) xfs_btree_del_cursor(cur, error); @@ -446,8 +456,7 @@ xchk_rmapbt_walk_ag_metadata( /* OWN_INOBT: inobt, finobt */ cur = sc->sa.ino_cur; if (!cur) - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp, - XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp); error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); if (cur != sc->sa.ino_cur) 
xfs_btree_del_cursor(cur, error); @@ -457,8 +466,8 @@ xchk_rmapbt_walk_ag_metadata( if (xfs_has_finobt(sc->mp)) { cur = sc->sa.fino_cur; if (!cur) - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, - sc->sa.agi_bp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, + sc->sa.agi_bp); error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); if (cur != sc->sa.fino_cur) xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c new file mode 100644 index 000000000000..e8e07b683eab --- /dev/null +++ b/fs/xfs/scrub/rmap_repair.c @@ -0,0 +1,1697 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Reverse Mapping Btree Repair + * ============================ + * + * This is the most involved of all the AG space btree rebuilds. 
Everywhere + * else in XFS we lock inodes and then AG data structures, but generating the + * list of rmap records requires that we be able to scan both block mapping + * btrees of every inode in the filesystem to see if it owns any extents in + * this AG. We can't tolerate any inode updates while we do this, so we + * freeze the filesystem to lock everyone else out, and grant ourselves + * special privileges to run transactions with regular background reclamation + * turned off. + * + * We also have to be very careful not to allow inode reclaim to start a + * transaction because all transactions (other than our own) will block. + * Deferred inode inactivation helps us out there. + * + * I) Reverse mappings for all non-space metadata and file data are collected + * according to the following algorithm: + * + * 1. For each fork of each inode: + * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary. + * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate + * bmaps into rmap records (see 1.4). Set bits in BMBIT for each btree + * block. + * 1.3. If the incore extent map is loaded but the fork is in btree format, + * just visit the bmbt blocks to set the corresponding BMBIT areas. + * 1.4. From the incore extent map, accumulate each bmap that falls into our + * target AG. Remember, multiple bmap records can map to a single rmap + * record, so we cannot simply emit rmap records 1:1. + * 1.5. Emit rmap records for each extent in BMBIT and free it. + * 2. Create bitmaps INOBIT and ICHUNKBIT. + * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT, + * and set bits in INOBIT for each btree block. If the inobt has no records + * at all, we must be careful to record its root in INOBIT. + * 4. For each block in the finobt, set the corresponding INOBIT area. + * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them. + * 6. Create bitmaps REFCBIT and COWBIT. + * 7. 
For each CoW staging extent in the refcountbt, set the corresponding + * areas in COWBIT. + * 8. For each block in the refcountbt, set the corresponding REFCBIT area. + * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them. + * A. Emit rmap for the AG headers. + * B. Emit rmap for the log, if there is one. + * + * II) The rmapbt shape and space metadata rmaps are computed as follows: + * + * 1. Count the rmaps collected in the previous step. (= NR) + * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB) + * 3. Reserve RMB blocks through the newbt using the allocator in normap mode. + * 4. Create bitmap AGBIT. + * 5. For each reservation in the newbt, set the corresponding areas in AGBIT. + * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT. + * 7. Count the extents in AGBIT. (= AGNR) + * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB') + * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB', + * and clear AGBIT. Go to step 5. + * A. Emit rmaps for each extent in AGBIT. + * + * III) The rmapbt is constructed and set in place as follows: + * + * 1. Sort the rmap records. + * 2. Bulk load the rmaps. + * + * IV) Reap the old btree blocks. + * + * 1. Create a bitmap OLDRMBIT. + * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT. + * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT. + * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are + * the parts of the AG that the rmap didn't find during its scan of the + * primary metadata and aren't known to be in the free space, which implies + * that they were the old rmapbt blocks. + * 5. Commit. + * + * We use the 'xrep_rmap' prefix for all the rmap functions. 
+ */ + +/* Context for collecting rmaps */ +struct xrep_rmap { + /* new rmapbt information */ + struct xrep_newbt new_btree; + + /* lock for the xfbtree and xfile */ + struct mutex lock; + + /* rmap records generated from primary metadata */ + struct xfbtree rmap_btree; + + struct xfs_scrub *sc; + + /* in-memory btree cursor for the xfs_btree_bload iteration */ + struct xfs_btree_cur *mcur; + + /* Hooks into rmap update code. */ + struct xfs_rmap_hook rhook; + + /* inode scan cursor */ + struct xchk_iscan iscan; + + /* Number of non-freespace records found. */ + unsigned long long nr_records; + + /* bnobt/cntbt contribution to btreeblks */ + xfs_agblock_t freesp_btblocks; + + /* old agf_rmap_blocks counter */ + unsigned int old_rmapbt_fsbcount; +}; + +/* Set us up to repair reverse mapping btrees. */ +int +xrep_setup_ag_rmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rmap *rr; + char *descr; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP); + + descr = xchk_xfile_ag_descr(sc, "reverse mapping records"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + if (error) + return error; + + rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + + rr->sc = sc; + sc->buf = rr; + return 0; +} + +/* Make sure there's nothing funny about this mapping. */ +STATIC int +xrep_rmap_check_mapping( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Store a reverse-mapping record. 
*/
+/*
+ * The record is built from the arguments and inserted into the in-memory rmap
+ * btree (rr->rmap_btree) with rr->lock held, then the xfbtree update is
+ * committed. If either the insertion or the commit fails, the inode scan is
+ * marked aborted before the error is returned; a failed insertion also
+ * cancels the xfbtree update first.
+ *
+ * Returns early with the termination error if the scrub should terminate, or
+ * -EFSCORRUPTED if the inode scan has already been aborted.
+ */
+static inline int
+xrep_rmap_stash(
+	struct xrep_rmap	*rr,
+	xfs_agblock_t		startblock,
+	xfs_extlen_t		blockcount,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags)
+{
+	struct xfs_rmap_irec	rmap = {
+		.rm_startblock	= startblock,
+		.rm_blockcount	= blockcount,
+		.rm_owner	= owner,
+		.rm_offset	= offset,
+		.rm_flags	= flags,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_btree_cur	*mcur;
+	int			error = 0;
+
+	if (xchk_should_terminate(sc, &error))
+		return error;
+
+	if (xchk_iscan_aborted(&rr->iscan))
+		return -EFSCORRUPTED;
+
+	trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);
+
+	mutex_lock(&rr->lock);
+	mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
+	error = xfs_rmap_map_raw(mcur, &rmap);
+	xfs_btree_del_cursor(mcur, error);
+	if (error)
+		goto out_cancel;
+
+	error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
+	if (error)
+		goto out_abort;
+
+	mutex_unlock(&rr->lock);
+	return 0;
+
+out_cancel:
+	xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
+out_abort:
+	xchk_iscan_abort(&rr->iscan);
+	mutex_unlock(&rr->lock);
+	return error;
+}
+
+/* Arguments passed through xagb_bitmap_walk to xrep_rmap_stash_run. */
+struct xrep_rmap_stash_run {
+	struct xrep_rmap	*rr;
+	uint64_t		owner;
+	unsigned int		rmap_flags;
+};
+
+/*
+ * Bitmap walk callback: stash one rmap for the extent [start, start + len)
+ * using the owner and flags from the walk context. The offset is always 0.
+ */
+static int
+xrep_rmap_stash_run(
+	uint32_t		start,
+	uint32_t		len,
+	void			*priv)
+{
+	struct xrep_rmap_stash_run *rsr = priv;
+	struct xrep_rmap	*rr = rsr->rr;
+
+	return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
+}
+
+/*
+ * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
+ * that the ranges are in units of FS blocks.
+ */
+/*
+ * The owner comes from @oinfo; the ATTR_FORK and BMBT_BLOCK owner-info flags
+ * are translated to the corresponding XFS_RMAP_* record flags.
+ */
+STATIC int
+xrep_rmap_stash_bitmap(
+	struct xrep_rmap	*rr,
+	struct xagb_bitmap	*bitmap,
+	const struct xfs_owner_info *oinfo)
+{
+	struct xrep_rmap_stash_run rsr = {
+		.rr		= rr,
+		.owner		= oinfo->oi_owner,
+		.rmap_flags	= 0,
+	};
+
+	if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+		rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
+	if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+		rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
+
+	return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
+}
+
+/* Section (I): Finding all file and bmbt extents. */
+
+/* Context for accumulating rmaps for an inode fork. */
+struct xrep_rmap_ifork {
+	/*
+	 * Accumulate rmap data here to turn multiple adjacent bmaps into a
+	 * single rmap.
+	 */
+	struct xfs_rmap_irec	accum;
+
+	/* Bitmap of bmbt blocks in this AG. */
+	struct xagb_bitmap	bmbt_blocks;
+
+	struct xrep_rmap	*rr;
+
+	/* Which inode fork? */
+	int			whichfork;
+};
+
+/*
+ * Stash an rmap that we accumulated while walking an inode fork. Does nothing
+ * if the accumulator is empty (rm_blockcount == 0).
+ */
+STATIC int
+xrep_rmap_stash_accumulated(
+	struct xrep_rmap_ifork	*rf)
+{
+	if (rf->accum.rm_blockcount == 0)
+		return 0;
+
+	return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
+			rf->accum.rm_blockcount, rf->accum.rm_owner,
+			rf->accum.rm_offset, rf->accum.rm_flags);
+}
+
+/*
+ * Accumulate a bmbt record.
+ *
+ * Mappings that start outside the AG being repaired are ignored. A mapping
+ * that is contiguous with the accumulator in both file offset and AG block,
+ * and has the same rmap flags, is merged into it. Otherwise the accumulator
+ * is stashed and restarted from this mapping. @cur is not used here.
+ */
+STATIC int
+xrep_rmap_visit_bmbt(
+	struct xfs_btree_cur	*cur,
+	struct xfs_bmbt_irec	*rec,
+	void			*priv)
+{
+	struct xrep_rmap_ifork	*rf = priv;
+	struct xfs_mount	*mp = rf->rr->sc->mp;
+	struct xfs_rmap_irec	*accum = &rf->accum;
+	xfs_agblock_t		agbno;
+	unsigned int		rmap_flags = 0;
+	int			error;
+
+	if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
+			rf->rr->sc->sa.pag->pag_agno)
+		return 0;
+
+	agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
+	if (rf->whichfork == XFS_ATTR_FORK)
+		rmap_flags |= XFS_RMAP_ATTR_FORK;
+	if (rec->br_state == XFS_EXT_UNWRITTEN)
+		rmap_flags |= XFS_RMAP_UNWRITTEN;
+
+	/* If this bmap is adjacent to the previous one, just add it. */
+	if (accum->rm_blockcount > 0 &&
+	    rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
+	    agbno == accum->rm_startblock + accum->rm_blockcount &&
+	    rmap_flags == accum->rm_flags) {
+		accum->rm_blockcount += rec->br_blockcount;
+		return 0;
+	}
+
+	/* Otherwise stash the old rmap and start accumulating a new one. */
+	error = xrep_rmap_stash_accumulated(rf);
+	if (error)
+		return error;
+
+	accum->rm_startblock = agbno;
+	accum->rm_blockcount = rec->br_blockcount;
+	accum->rm_offset = rec->br_startoff;
+	accum->rm_flags = rmap_flags;
+	return 0;
+}
+
+/*
+ * Add a btree block to the bitmap. Levels with no buffer attached to the
+ * cursor and blocks that live outside the AG being repaired are skipped.
+ */
+STATIC int
+xrep_rmap_visit_iroot_btree_block(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	void			*priv)
+{
+	struct xrep_rmap_ifork	*rf = priv;
+	struct xfs_buf		*bp;
+	xfs_fsblock_t		fsbno;
+	xfs_agblock_t		agbno;
+
+	xfs_btree_get_block(cur, level, &bp);
+	if (!bp)
+		return 0;
+
+	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+	if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno)
+		return 0;
+
+	agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+	return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
+}
+
+/*
+ * Iterate a metadata btree rooted in an inode to collect rmap records for
+ * anything in this fork that matches the AG.
+ *
+ * rf->bmbt_blocks is initialized and destroyed within this function. Any
+ * mapping still held in rf->accum is stashed before returning successfully.
+ */
+STATIC int
+xrep_rmap_scan_iroot_btree(
+	struct xrep_rmap_ifork	*rf,
+	struct xfs_btree_cur	*cur)
+{
+	struct xfs_owner_info	oinfo;
+	struct xrep_rmap	*rr = rf->rr;
+	int			error;
+
+	xagb_bitmap_init(&rf->bmbt_blocks);
+
+	/* Record all the blocks in the btree itself. */
+	error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
+			XFS_BTREE_VISIT_ALL, rf);
+	if (error)
+		goto out;
+
+	/* Emit rmaps for the btree blocks. */
+	xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
+	error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
+	if (error)
+		goto out;
+
+	/* Stash any remaining accumulated rmaps. */
+	error = xrep_rmap_stash_accumulated(rf);
+out:
+	xagb_bitmap_destroy(&rf->bmbt_blocks);
+	return error;
+}
+
+/*
+ * Is this the data fork of a realtime inode?
+ * NOTE(review): no caller is visible in this part of the file.
+ */
+static inline bool
+is_rt_data_fork(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK;
+}
+
+/*
+ * Iterate the block mapping btree to collect rmap records for anything in this
+ * fork that matches the AG. Sets @mappings_done to true if we've scanned the
+ * block mappings in this fork.
+ */
+STATIC int
+xrep_rmap_scan_bmbt(
+	struct xrep_rmap_ifork	*rf,
+	struct xfs_inode	*ip,
+	bool			*mappings_done)
+{
+	struct xrep_rmap	*rr = rf->rr;
+	struct xfs_btree_cur	*cur;
+	struct xfs_ifork	*ifp;
+	int			error;
+
+	*mappings_done = false;
+	ifp = xfs_ifork_ptr(ip, rf->whichfork);
+	cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
+
+	if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
+	    xfs_need_iread_extents(ifp)) {
+		/*
+		 * If the incore extent cache isn't loaded, scan the bmbt for
+		 * mapping records. This avoids loading the incore extent
+		 * tree, which will increase memory pressure at a time when
+		 * we're trying to run as quickly as we possibly can. Ignore
+		 * realtime extents.
+		 */
+		error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
+		if (error)
+			goto out_cur;
+
+		*mappings_done = true;
+	}
+
+	/* Scan for the bmbt blocks, which always live on the data device. */
+	error = xrep_rmap_scan_iroot_btree(rf, cur);
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+	return error;
+}
+
+/*
+ * Iterate the in-core extent cache to collect rmap records for anything in
+ * this fork that matches the AG.
+ */
+/*
+ * Mappings whose start block satisfies isnullstartblock() are skipped. The
+ * final accumulated rmap is stashed once the walk completes.
+ */
+STATIC int
+xrep_rmap_scan_iext(
+	struct xrep_rmap_ifork	*rf,
+	struct xfs_ifork	*ifp)
+{
+	struct xfs_bmbt_irec	rec;
+	struct xfs_iext_cursor	icur;
+	int			error;
+
+	for_each_xfs_iext(ifp, &icur, &rec) {
+		if (isnullstartblock(rec.br_startblock))
+			continue;
+		error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
+		if (error)
+			return error;
+	}
+
+	return xrep_rmap_stash_accumulated(rf);
+}
+
+/*
+ * Find all the extents from a given AG in an inode fork.
+ *
+ * Forks that do not exist, and forks that are in neither BTREE nor EXTENTS
+ * format, contribute nothing. For BTREE format forks the bmbt is scanned
+ * first; the incore extent cache is only walked if that scan did not already
+ * cover the mappings and the fork is not a realtime fork.
+ */
+STATIC int
+xrep_rmap_scan_ifork(
+	struct xrep_rmap	*rr,
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	struct xrep_rmap_ifork	rf = {
+		.accum		= { .rm_owner = ip->i_ino, },
+		.rr		= rr,
+		.whichfork	= whichfork,
+	};
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
+	int			error = 0;
+
+	if (!ifp)
+		return 0;
+
+	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+		bool		mappings_done;
+
+		/*
+		 * Scan the bmap btree for data device mappings. This includes
+		 * the btree blocks themselves, even if this is a realtime
+		 * file.
+		 */
+		error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
+		if (error || mappings_done)
+			return error;
+	} else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+		return 0;
+	}
+
+	/* Scan incore extent cache if this isn't a realtime file. */
+	if (xfs_ifork_is_realtime(ip, whichfork))
+		return 0;
+
+	return xrep_rmap_scan_iext(&rf, ifp);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
+ * attr bmbt. Otherwise, take ILOCK_SHARED.
+ *
+ * Returns the lock mode that was taken so that the caller can unlock it.
+ */
+static inline unsigned int
+xrep_rmap_scan_ilock(
+	struct xfs_inode	*ip)
+{
+	uint			lock_mode = XFS_ILOCK_SHARED;
+
+	if (xfs_need_iread_extents(&ip->i_df)) {
+		lock_mode = XFS_ILOCK_EXCL;
+		goto lock;
+	}
+
+	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
+		lock_mode = XFS_ILOCK_EXCL;
+
+lock:
+	xfs_ilock(ip, lock_mode);
+	return lock_mode;
+}
+
+/* Record reverse mappings for a file.
*/
+/*
+ * Scans the data and attr forks with the inode locked, and marks the inode as
+ * visited by the scan only if both forks were scanned without error. All
+ * locks taken here are dropped before returning.
+ */
+STATIC int
+xrep_rmap_scan_inode(
+	struct xrep_rmap	*rr,
+	struct xfs_inode	*ip)
+{
+	unsigned int		lock_mode = 0;
+	int			error;
+
+	/*
+	 * Directory updates (create/link/unlink/rename) drop the directory's
+	 * ILOCK before finishing any rmapbt updates associated with directory
+	 * shape changes. For this scan to coordinate correctly with the live
+	 * update hook, we must take the only lock (i_rwsem) that is held all
+	 * the way to dir op completion. This will get fixed by the parent
+	 * pointer patchset.
+	 */
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
+		lock_mode = XFS_IOLOCK_SHARED;
+		xfs_ilock(ip, lock_mode);
+	}
+	lock_mode |= xrep_rmap_scan_ilock(ip);
+
+	/* Check the data fork. */
+	error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
+	if (error)
+		goto out_unlock;
+
+	/* Check the attr fork. */
+	error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
+	if (error)
+		goto out_unlock;
+
+	/* COW fork extents are "owned" by the refcount btree. */
+
+	xchk_iscan_mark_visited(&rr->iscan, ip);
+out_unlock:
+	xfs_iunlock(ip, lock_mode);
+	return error;
+}
+
+/* Section (I): Find all AG metadata extents except for free space metadata. */
+
+/* Context for walking the inode btree. */
+struct xrep_rmap_inodes {
+	struct xrep_rmap	*rr;
+	struct xagb_bitmap	inobt_blocks;	/* INOBIT */
+	struct xagb_bitmap	ichunk_blocks;	/* ICHUNKBIT */
+};
+
+/*
+ * Record inode btree rmaps.
+ *
+ * Called for each inobt record: marks the btree blocks on the cursor path in
+ * inobt_blocks and the blocks backing the inode chunk in ichunk_blocks.
+ * Returns -EFSCORRUPTED if the record fails xfs_inobt_check_irec.
+ */
+STATIC int
+xrep_rmap_walk_inobt(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_rec	*rec,
+	void				*priv)
+{
+	struct xfs_inobt_rec_incore	irec;
+	struct xrep_rmap_inodes		*ri = priv;
+	struct xfs_mount		*mp = cur->bc_mp;
+	xfs_agblock_t			agbno;
+	xfs_extlen_t			aglen;
+	xfs_agino_t			agino;
+	xfs_agino_t			iperhole;
+	unsigned int			i;
+	int				error;
+
+	/* Record the inobt blocks. */
+	error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
+	if (error)
+		return error;
+
+	xfs_inobt_btrec_to_irec(mp, rec, &irec);
+	if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
+		return -EFSCORRUPTED;
+
+	agino = irec.ir_startino;
+
+	/* Record a non-sparse inode chunk. */
+	if (!xfs_inobt_issparse(irec.ir_holemask)) {
+		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+		aglen = max_t(xfs_extlen_t, 1,
+				XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);
+
+		return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+	}
+
+	/*
+	 * Iterate each chunk. Each step covers iperhole inodes, which is the
+	 * larger of one block's worth of inodes and one holemask bit's worth,
+	 * and advances i by the matching number of holemask bits.
+	 */
+	iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
+			XFS_INODES_PER_HOLEMASK_BIT);
+	aglen = iperhole / mp->m_sb.sb_inopblock;
+	for (i = 0, agino = irec.ir_startino;
+	     i < XFS_INOBT_HOLEMASK_BITS;
+	     i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
+		/* Skip holes. */
+		if (irec.ir_holemask & (1 << i))
+			continue;
+
+		/* Record the inode chunk otherwise. */
+		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+		error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Collect rmaps for the blocks containing inode btrees and the inode chunks.
+ * Both bitmaps are local to this function and are destroyed before returning.
+ */
+STATIC int
+xrep_rmap_find_inode_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xrep_rmap_inodes	ri = {
+		.rr		= rr,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	int			error;
+
+	xagb_bitmap_init(&ri.inobt_blocks);
+	xagb_bitmap_init(&ri.ichunk_blocks);
+
+	/*
+	 * Iterate every record in the inobt so we can capture all the inode
+	 * chunks and the blocks in the inobt itself.
+	 */
+	error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
+	if (error)
+		goto out_bitmap;
+
+	/*
+	 * Note that if there are zero records in the inobt then query_all does
+	 * nothing and we have to account the empty inobt root manually.
+	 */
+	if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
+		struct xfs_agi	*agi = sc->sa.agi_bp->b_addr;
+
+		error = xagb_bitmap_set(&ri.inobt_blocks,
+				be32_to_cpu(agi->agi_root), 1);
+		if (error)
+			goto out_bitmap;
+	}
+
+	/* Scan the finobt too. */
+	if (xfs_has_finobt(sc->mp)) {
+		error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
+				sc->sa.fino_cur);
+		if (error)
+			goto out_bitmap;
+	}
+
+	/* Generate rmaps for everything. */
+	error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
+			&XFS_RMAP_OINFO_INOBT);
+	if (error)
+		goto out_bitmap;
+	error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
+			&XFS_RMAP_OINFO_INODES);
+
+out_bitmap:
+	xagb_bitmap_destroy(&ri.inobt_blocks);
+	xagb_bitmap_destroy(&ri.ichunk_blocks);
+	return error;
+}
+
+/*
+ * Record a CoW staging extent. Returns -EFSCORRUPTED if the record fails
+ * xfs_refcount_check_domain or is not in the CoW domain.
+ */
+STATIC int
+xrep_rmap_walk_cowblocks(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_refcount_irec	*irec,
+	void				*priv)
+{
+	struct xagb_bitmap		*bitmap = priv;
+
+	if (!xfs_refcount_check_domain(irec) ||
+	    irec->rc_domain != XFS_REFC_DOMAIN_COW)
+		return -EFSCORRUPTED;
+
+	return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
+}
+
+/*
+ * Collect rmaps for the blocks containing the refcount btree, and all CoW
+ * staging extents. Does nothing if the filesystem does not have reflink.
+ */
+STATIC int
+xrep_rmap_find_refcount_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xagb_bitmap	refcountbt_blocks;	/* REFCBIT */
+	struct xagb_bitmap	cow_blocks;		/* COWBIT */
+	struct xfs_refcount_irec low = {
+		.rc_startblock	= 0,
+		.rc_domain	= XFS_REFC_DOMAIN_COW,
+	};
+	struct xfs_refcount_irec high = {
+		.rc_startblock	= -1U,
+		.rc_domain	= XFS_REFC_DOMAIN_COW,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	int			error;
+
+	if (!xfs_has_reflink(sc->mp))
+		return 0;
+
+	xagb_bitmap_init(&refcountbt_blocks);
+	xagb_bitmap_init(&cow_blocks);
+
+	/* refcountbt */
+	error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
+	if (error)
+		goto out_bitmap;
+
+	/* Collect rmaps for CoW staging extents. */
+	error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
+			xrep_rmap_walk_cowblocks, &cow_blocks);
+	if (error)
+		goto out_bitmap;
+
+	/* Generate rmaps for everything. */
+	error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
+	if (error)
+		goto out_bitmap;
+	error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
+			&XFS_RMAP_OINFO_REFC);
+
+out_bitmap:
+	xagb_bitmap_destroy(&cow_blocks);
+	xagb_bitmap_destroy(&refcountbt_blocks);
+	return error;
+}
+
+/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
+STATIC int
+xrep_rmap_find_agheader_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+
+	/* Create a record for the AG sb->agfl. */
+	return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
+			XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
+			XFS_RMAP_OWN_FS, 0, 0);
+}
+
+/* Generate rmaps for the log, if it's in this AG. */
+STATIC int
+xrep_rmap_find_log_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+
+	if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno))
+		return 0;
+
+	return xrep_rmap_stash(rr,
+			XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
+			sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
+}
+
+/*
+ * Check and count all the records that we gathered. Each record that passes
+ * xrep_rmap_check_mapping bumps rr->nr_records.
+ */
+STATIC int
+xrep_rmap_check_record(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_rmap		*rr = priv;
+	int				error;
+
+	error = xrep_rmap_check_mapping(rr->sc, rec);
+	if (error)
+		return error;
+
+	rr->nr_records++;
+	return 0;
+}
+
+/*
+ * Generate all the reverse-mappings for this AG and stash them in the
+ * in-memory rmap btree, then check and count the records that were stashed.
+ * This implements section (I) above.
+ */
+/*
+ * The AG headers are unlocked for the duration of the inode scan and locked
+ * again before the stashed records are checked; the scrub transaction is
+ * replaced twice along the way (empty for the scan, real afterwards).
+ * Returns -EFSCORRUPTED if the inode scan was aborted while the AG headers
+ * were unlocked.
+ */
+STATIC int
+xrep_rmap_find_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	struct xchk_ag		*sa = &sc->sa;
+	struct xfs_inode	*ip;
+	struct xfs_btree_cur	*mcur;
+	int			error;
+
+	/* Find all the per-AG metadata. */
+	xrep_ag_btcur_init(sc, &sc->sa);
+
+	error = xrep_rmap_find_inode_rmaps(rr);
+	if (error)
+		goto end_agscan;
+
+	error = xrep_rmap_find_refcount_rmaps(rr);
+	if (error)
+		goto end_agscan;
+
+	error = xrep_rmap_find_agheader_rmaps(rr);
+	if (error)
+		goto end_agscan;
+
+	error = xrep_rmap_find_log_rmaps(rr);
+end_agscan:
+	xchk_ag_btcur_free(&sc->sa);
+	if (error)
+		return error;
+
+	/*
+	 * Set up for a potentially lengthy filesystem scan by reducing our
+	 * transaction resource usage for the duration. Specifically:
+	 *
+	 * Unlock the AG header buffers and cancel the transaction to release
+	 * the log grant space while we scan the filesystem.
+	 *
+	 * Create a new empty transaction to eliminate the possibility of the
+	 * inode scan deadlocking on cyclical metadata.
+	 *
+	 * We pass the empty transaction to the file scanning function to avoid
+	 * repeatedly cycling empty transactions. This can be done even though
+	 * we take the IOLOCK to quiesce the file because empty transactions
+	 * do not take sb_internal.
+	 */
+	sa->agf_bp = NULL;
+	sa->agi_bp = NULL;
+	xchk_trans_cancel(sc);
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+
+	/* Iterate all AGs for inodes rmaps. */
+	while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
+		error = xrep_rmap_scan_inode(rr, ip);
+		xchk_irele(sc, ip);
+		if (error)
+			break;
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&rr->iscan);
+	if (error)
+		return error;
+
+	/*
+	 * Switch out for a real transaction and lock the AG headers in
+	 * preparation for building a new tree.
+	 */
+	xchk_trans_cancel(sc);
+	error = xchk_setup_fs(sc);
+	if (error)
+		return error;
+	error = xchk_perag_drain_and_lock(sc);
+	if (error)
+		return error;
+
+	/*
+	 * If a hook failed to update the in-memory btree, we lack the data to
+	 * continue the repair.
+	 */
+	if (xchk_iscan_aborted(&rr->iscan))
+		return -EFSCORRUPTED;
+
+	/*
+	 * Now that we have everything locked again, we need to count the
+	 * number of rmap records stashed in the btree. This should reflect
+	 * all actively-owned space in the filesystem. At the same time, check
+	 * all our records before we start building a new btree, which requires
+	 * a bnobt cursor.
+	 */
+	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+			sc->sa.pag);
+
+	rr->nr_records = 0;
+	error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);
+
+	xfs_btree_del_cursor(sc->sa.bno_cur, error);
+	sc->sa.bno_cur = NULL;
+	xfs_btree_del_cursor(mcur, error);
+
+	return error;
+}
+
+/* Section (II): Reserving space for new rmapbt and setting free space bitmap */
+
+/*
+ * Context for walking the AGFL.
+ * NOTE(review): agno is set by xrep_rmap_try_reserve but not read by
+ * xrep_rmap_walk_agfl.
+ */
+struct xrep_rmap_agfl {
+	struct xagb_bitmap	*bitmap;
+	xfs_agnumber_t		agno;
+};
+
+/* Add an AGFL block to the bitmap in the walk context. */
+STATIC int
+xrep_rmap_walk_agfl(
+	struct xfs_mount	*mp,
+	xfs_agblock_t		agbno,
+	void			*priv)
+{
+	struct xrep_rmap_agfl	*ra = priv;
+
+	return xagb_bitmap_set(ra->bitmap, agbno, 1);
+}
+
+/*
+ * Run one round of reserving space for the new rmapbt and recomputing the
+ * number of blocks needed to store the previously observed rmapbt records and
+ * the ones we'll create for the free space metadata. When we don't need more
+ * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
+ * true.
+ */
+/*
+ * @blocks_reserved tracks how many blocks earlier rounds have already asked
+ * the newbt to reserve, so that each round only allocates the difference.
+ */
+STATIC int
+xrep_rmap_try_reserve(
+	struct xrep_rmap	*rr,
+	struct xfs_btree_cur	*rmap_cur,
+	struct xagb_bitmap	*freesp_blocks,
+	uint64_t		*blocks_reserved,
+	bool			*done)
+{
+	struct xrep_rmap_agfl	ra = {
+		.bitmap		= freesp_blocks,
+		.agno		= rr->sc->sa.pag->pag_agno,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	struct xrep_newbt_resv	*resv, *n;
+	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
+	struct xfs_buf		*agfl_bp;
+	uint64_t		nr_blocks; /* RMB */
+	uint64_t		freesp_records;
+	int			error;
+
+	/*
+	 * We're going to recompute new_btree.bload.nr_blocks at the end of
+	 * this function to reflect however many btree blocks we need to store
+	 * all the rmap records (including the ones that reflect the changes we
+	 * made to support the new rmapbt blocks), so we save the old value
+	 * here so we can decide if we've reserved enough blocks.
+	 */
+	nr_blocks = rr->new_btree.bload.nr_blocks;
+
+	/*
+	 * Make sure we've reserved enough space for the new btree. This can
+	 * change the shape of the free space btrees, which can cause secondary
+	 * interactions with the rmap records because all three space btrees
+	 * have the same rmap owner. We'll account for all that below.
+	 */
+	error = xrep_newbt_alloc_blocks(&rr->new_btree,
+			nr_blocks - *blocks_reserved);
+	if (error)
+		return error;
+
+	*blocks_reserved = rr->new_btree.bload.nr_blocks;
+
+	/* Clear everything in the bitmap. */
+	xagb_bitmap_destroy(freesp_blocks);
+
+	/* Set all the bnobt blocks in the bitmap. */
+	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+			sc->sa.pag);
+	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
+	xfs_btree_del_cursor(sc->sa.bno_cur, error);
+	sc->sa.bno_cur = NULL;
+	if (error)
+		return error;
+
+	/* Set all the cntbt blocks in the bitmap. */
+	sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+			sc->sa.pag);
+	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
+	xfs_btree_del_cursor(sc->sa.cnt_cur, error);
+	sc->sa.cnt_cur = NULL;
+	if (error)
+		return error;
+
+	/*
+	 * Record our new btreeblks value. At this point the bitmap holds only
+	 * the bnobt and cntbt blocks; two blocks are subtracted from that
+	 * count (presumably the two btree roots -- TODO confirm).
+	 */
+	rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
+
+	/* Set all the new rmapbt blocks in the bitmap. */
+	list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
+		error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
+		if (error)
+			return error;
+	}
+
+	/* Set all the AGFL blocks in the bitmap. */
+	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+	if (error)
+		return error;
+
+	error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
+	if (error)
+		return error;
+
+	/* Count the extents in the bitmap. */
+	freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
+
+	/* Compute how many blocks we'll need for all the rmaps. */
+	error = xfs_btree_bload_compute_geometry(rmap_cur,
+			&rr->new_btree.bload, rr->nr_records + freesp_records);
+	if (error)
+		return error;
+
+	/* We're done when we don't need more blocks. */
+	*done = nr_blocks >= rr->new_btree.bload.nr_blocks;
+	return 0;
+}
+
+/*
+ * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
+ * the free space metadata. This implements section (II) above.
+ */
+STATIC int
+xrep_rmap_reserve_space(
+	struct xrep_rmap	*rr,
+	struct xfs_btree_cur	*rmap_cur)
+{
+	struct xagb_bitmap	freesp_blocks;	/* AGBIT */
+	uint64_t		blocks_reserved = 0;
+	bool			done = false;
+	int			error;
+
+	/* Compute how many blocks we'll need for the rmaps collected so far. */
+	error = xfs_btree_bload_compute_geometry(rmap_cur,
+			&rr->new_btree.bload, rr->nr_records);
+	if (error)
+		return error;
+
+	/* Last chance to abort before we start committing fixes. */
+	if (xchk_should_terminate(rr->sc, &error))
+		return error;
+
+	xagb_bitmap_init(&freesp_blocks);
+
+	/*
+	 * Iteratively reserve space for the new rmapbt and recompute the
+	 * number of blocks needed to store the previously observed rmapbt
+	 * records and the ones we'll create for the free space metadata.
+	 * Finish when we don't need more blocks.
+	 */
+	do {
+		error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
+				&blocks_reserved, &done);
+		if (error)
+			goto out_bitmap;
+	} while (!done);
+
+	/* Emit rmaps for everything in the free space bitmap. */
+	xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
+	error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
+	xchk_ag_btcur_free(&rr->sc->sa);
+
+out_bitmap:
+	xagb_bitmap_destroy(&freesp_blocks);
+	return error;
+}
+
+/* Section (III): Building the new rmap btree. */
+
+/*
+ * Update the AGF counters. Logs the new agf_btreeblks value, saves the
+ * current incore rmapbt height as the repair ("alt") height, and then
+ * reinitializes the perag from the AGF.
+ */
+STATIC int
+xrep_rmap_reset_counters(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_perag	*pag = sc->sa.pag;
+	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
+	xfs_agblock_t		rmap_btblocks;
+
+	/*
+	 * The AGF header contains extra information related to the reverse
+	 * mapping btree, so we must update those fields here.
+	 */
+	rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
+	agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
+	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
+
+	/*
+	 * After we commit the new btree to disk, it is possible that the
+	 * process to reap the old btree blocks will race with the AIL trying
+	 * to checkpoint the old btree blocks into the filesystem. If the new
+	 * tree is shorter than the old one, the rmapbt write verifier will
+	 * fail and the AIL will shut down the filesystem.
+	 *
+	 * To avoid this, save the old incore btree height values as the alt
+	 * height values before re-initializing the perag info from the updated
+	 * AGF to capture all the new values.
+	 */
+	pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
+
+	/* Reinitialize with the values we just logged. */
+	return xrep_reinit_pagf(sc);
+}
+
+/*
+ * Retrieve rmapbt data for bulk load.
+ *
+ * Advances the in-memory btree cursor (rr->mcur) once per record wanted and
+ * formats each record into @block starting at index @idx. Returns the number
+ * of records loaded, a negative errno from the btree code, or -EFSCORRUPTED
+ * if the in-memory btree runs out of records early.
+ */
+STATIC int
+xrep_rmap_get_records(
+	struct xfs_btree_cur	*cur,
+	unsigned int		idx,
+	struct xfs_btree_block	*block,
+	unsigned int		nr_wanted,
+	void			*priv)
+{
+	struct xrep_rmap	*rr = priv;
+	union xfs_btree_rec	*block_rec;
+	unsigned int		loaded;
+	int			error;
+
+	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+		int		stat = 0;
+
+		error = xfs_btree_increment(rr->mcur, 0, &stat);
+		if (error)
+			return error;
+		if (!stat)
+			return -EFSCORRUPTED;
+
+		error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
+		if (error)
+			return error;
+		if (!stat)
+			return -EFSCORRUPTED;
+
+		block_rec = xfs_btree_rec_addr(cur, idx, block);
+		cur->bc_ops->init_rec_from_cur(cur, block_rec);
+	}
+
+	return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rmap_claim_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	void			*priv)
+{
+	struct xrep_rmap	*rr = priv;
+
+	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Custom allocation function for new rmap btrees. */
+STATIC int
+xrep_rmap_alloc_vextent(
+	struct xfs_scrub	*sc,
+	struct xfs_alloc_arg	*args,
+	xfs_fsblock_t		alloc_hint)
+{
+	int			error;
+
+	/*
+	 * We don't want an rmap update on the allocation, since we iteratively
+	 * compute the OWN_AG records /after/ allocating blocks for the records
+	 * that we already know we need to store. Therefore, fix the freelist
+	 * with the NORMAP flag set so that we don't also try to create an rmap
+	 * for new AGFL blocks.
+	 */
+	error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
+	if (error)
+		return error;
+
+	/*
+	 * If xrep_fix_freelist fixed the freelist by moving blocks from the
+	 * free space btrees or by removing blocks from the AGFL and queueing
+	 * an EFI to free the block, the transaction will be dirty. This
+	 * second case is of interest to us.
+	 *
+	 * Later on, we will need to compare gaps in the new recordset against
+	 * the block usage of all OWN_AG owners in order to free the old
+	 * btree's blocks, which means that we can't have EFIs for former AGFL
+	 * blocks attached to the repair transaction when we commit the new
+	 * btree.
+	 *
+	 * xrep_newbt_alloc_blocks guarantees this for us by calling
+	 * xrep_defer_finish to commit anything that fix_freelist may have
+	 * added to the transaction.
+	 */
+	return xfs_alloc_vextent_near_bno(args, alloc_hint);
+}
+
+
+/*
+ * Count the records in this btree. Moves the cursor to the left edge and
+ * increments it until the btree code reports no more records, storing the
+ * number of successful increments in @nr.
+ */
+STATIC int
+xrep_rmap_count_records(
+	struct xfs_btree_cur	*cur,
+	unsigned long long	*nr)
+{
+	int			running = 1;
+	int			error;
+
+	*nr = 0;
+
+	error = xfs_btree_goto_left_edge(cur);
+	if (error)
+		return error;
+
+	while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
+		if (running)
+			(*nr)++;
+	}
+
+	return error;
+}
+/*
+ * Use the collected rmap information to stage a new rmap btree. If this is
+ * successful we'll return with the new btree root information logged to the
+ * repair transaction but not yet committed. This implements section (III)
+ * above.
+ */
+STATIC int
+xrep_rmap_build_new_tree(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_perag	*pag = sc->sa.pag;
+	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
+	struct xfs_btree_cur	*rmap_cur;
+	xfs_fsblock_t		fsbno;
+	int			error;
+
+	/*
+	 * Preserve the old rmapbt block count so that we can adjust the
+	 * per-AG rmapbt reservation after we commit the new btree root and
+	 * want to dispose of the old btree blocks.
+	 */
+	rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
+
+	/*
+	 * Prepare to construct the new btree by reserving disk space for the
+	 * new btree and setting up all the accounting information we'll need
+	 * to root the new btree while it's under construction and before we
+	 * attach it to the AG header. The new blocks are accounted to the
+	 * rmapbt per-AG reservation, which we will adjust further after
+	 * committing the new btree.
+	 */
+	fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp));
+	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
+			fsbno, XFS_AG_RESV_RMAPBT);
+	rr->new_btree.bload.get_records = xrep_rmap_get_records;
+	rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
+	rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
+	rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
+	xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
+
+	/*
+	 * Initialize @rr->new_btree, reserve space for the new rmapbt,
+	 * and compute OWN_AG rmaps.
+	 */
+	error = xrep_rmap_reserve_space(rr, rmap_cur);
+	if (error)
+		goto err_cur;
+
+	/*
+	 * Count the rmapbt records again, because the space reservation
+	 * for the rmapbt itself probably added more records to the btree.
+	 */
+	rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
+			&rr->rmap_btree);
+
+	error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
+	if (error)
+		goto err_mcur;
+
+	/*
+	 * Due to btree slack factors, it's possible for a new btree to be one
+	 * level taller than the old btree. Update the incore btree height so
+	 * that we don't trip the verifiers when writing the new btree blocks
+	 * to disk.
+	 */
+	pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
+
+	/*
+	 * Move the cursor to the left edge of the tree so that the first
+	 * increment in ->get_records positions us at the first record.
+	 */
+	error = xfs_btree_goto_left_edge(rr->mcur);
+	if (error)
+		goto err_level;
+
+	/* Add all observed rmap records. */
+	error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
+	if (error)
+		goto err_level;
+
+	/*
+	 * Install the new btree in the AG header. After this point the old
+	 * btree is no longer accessible and the new tree is live.
+	 */
+	xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
+	xfs_btree_del_cursor(rmap_cur, 0);
+	xfs_btree_del_cursor(rr->mcur, 0);
+	rr->mcur = NULL;
+
+	/*
+	 * Now that we've written the new btree to disk, we don't need to keep
+	 * updating the in-memory btree. Abort the scan to stop live updates.
+	 */
+	xchk_iscan_abort(&rr->iscan);
+
+	/*
+	 * The newly committed rmap recordset includes mappings for the blocks
+	 * that we reserved to build the new btree. If there is excess space
+	 * reservation to be freed, the corresponding rmap records must also be
+	 * removed.
+	 */
+	rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
+
+	/* Reset the AGF counters now that we've changed the btree shape. */
+	error = xrep_rmap_reset_counters(rr);
+	if (error)
+		goto err_newbt;
+
+	/* Dispose of any unused blocks and the accounting information. */
+	error = xrep_newbt_commit(&rr->new_btree);
+	if (error)
+		return error;
+
+	return xrep_roll_ag_trans(sc);
+
+	/*
+	 * Error unwinding: each label undoes one more setup step than the
+	 * label below it. Both cursors were already deleted by the time
+	 * err_newbt can be reached directly.
+	 */
+err_level:
+	pag->pagf_repair_rmap_level = 0;
+err_mcur:
+	xfs_btree_del_cursor(rr->mcur, error);
+err_cur:
+	xfs_btree_del_cursor(rmap_cur, error);
+err_newbt:
+	xrep_newbt_cancel(&rr->new_btree);
+	return error;
+}
+
+/* Section (IV): Reaping the old btree. */
+
+/* Context for finding the gaps between rmap records. */
+struct xrep_rmap_find_gaps {
+	/* Blocks not covered by any rmap record seen so far */
+	struct xagb_bitmap	rmap_gaps;
+
+	/* First block past the end of all rmap records seen so far */
+	xfs_agblock_t		next_agbno;
+};
+
+/* Subtract each free extent in the bnobt from the rmap gaps. */
+STATIC int
+xrep_rmap_find_freesp(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_alloc_rec_incore *rec,
+	void				*priv)
+{
+	struct xrep_rmap_find_gaps	*rfg = priv;
+
+	return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
+			rec->ar_blockcount);
+}
+
+/* Record the free space we find, as part of cleaning out the btree.
*/ +STATIC int +xrep_rmap_find_gaps( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rmap_find_gaps *rfg = priv; + int error; + + if (rec->rm_startblock > rfg->next_agbno) { + error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno, + rec->rm_startblock - rfg->next_agbno); + if (error) + return error; + } + + rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* + * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make + * a list of gaps in the rmap records and a list of the extents mentioned in + * the bnobt. Any block that's in the new rmapbt gap list but not mentioned + * in the bnobt is a block from the old rmapbt and can be removed. + */ +STATIC int +xrep_rmap_remove_old_tree( + struct xrep_rmap *rr) +{ + struct xrep_rmap_find_gaps rfg = { + .next_agbno = 0, + }; + struct xfs_scrub *sc = rr->sc; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_btree_cur *mcur; + xfs_agblock_t agend; + int error; + + xagb_bitmap_init(&rfg.rmap_gaps); + + /* Compute free space from the new rmapbt. */ + mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree); + + error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_bitmap; + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(agf->agf_length); + if (rfg.next_agbno < agend) { + error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno, + agend - rfg.next_agbno); + if (error) + goto out_bitmap; + } + + /* Compute free space from the existing bnobt. 
*/ + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp, + &rfg); + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + if (error) + goto out_bitmap; + + /* + * Free the "free" blocks that the new rmapbt knows about but the bnobt + * doesn't--these are the old rmapbt blocks. Credit the old rmapbt + * block usage count back to the per-AG rmapbt reservation (and not + * fdblocks, since the rmap btree lives in free space) to keep the + * reservation and free space accounting correct. + */ + error = xrep_reap_agblocks(sc, &rfg.rmap_gaps, + &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT); + if (error) + goto out_bitmap; + + /* + * Now that we've zapped all the old rmapbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservation. + */ + pag->pagf_repair_rmap_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; +out_bitmap: + xagb_bitmap_destroy(&rfg.rmap_gaps); + return error; +} + +static inline bool +xrep_rmapbt_want_live_update( + struct xchk_iscan *iscan, + const struct xfs_owner_info *oi) +{ + if (xchk_iscan_aborted(iscan)) + return false; + + /* + * Before unlocking the AG header to perform the inode scan, we + * recorded reverse mappings for all AG metadata except for the OWN_AG + * metadata. IOWs, the in-memory btree knows about the AG headers, the + * two inode btrees, the CoW staging extents, and the refcount btrees. + * For these types of metadata, we need to record the live updates in + * the in-memory rmap btree. + * + * However, we do not scan the free space btrees or the AGFL until we + * have re-locked the AGF and are ready to reserve space for the new + * rmap btree, so we do not want live updates for OWN_AG metadata. + */ + if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner)) + return oi->oi_owner != XFS_RMAP_OWN_AG; + + /* Ignore updates to files that the scanner hasn't visited yet. 
*/
+	return xchk_iscan_want_live_update(iscan, oi->oi_owner);
+}
+
+/*
+ * Apply a rmapbt update from the regular filesystem into our shadow btree.
+ * We're running from the thread that owns the AGF buffer and is generating
+ * the update, so we must be careful about which parts of the struct xrep_rmap
+ * that we change.
+ *
+ * @nb is the notifier block embedded in the repair context's rmap hook,
+ * @action is the rmap operation that is passed on to
+ * __xfs_rmap_finish_intent, and @data points to the struct
+ * xfs_rmap_update_params describing the update.
+ *
+ * This function always returns NOTIFY_DONE.  If the update cannot be applied
+ * to the shadow btree, the inode scan is aborted instead of reporting an
+ * error to the caller.
+ */
+static int
+xrep_rmapbt_live_update(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_rmap_update_params	*p = data;
+	struct xrep_rmap		*rr;
+	struct xfs_mount		*mp;
+	struct xfs_btree_cur		*mcur;
+	struct xfs_trans		*tp;
+	void				*txcookie;
+	int				error;
+
+	rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
+	mp = rr->sc->mp;
+
+	/* Nothing to do if the scan does not want this update. */
+	if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
+		goto out_unlock;
+
+	trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p);
+
+	/* rr->lock has not been taken yet, so bail out without unlocking. */
+	error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
+	if (error)
+		goto out_abort;
+
+	/* Apply the update to the in-memory rmap btree under rr->lock. */
+	mutex_lock(&rr->lock);
+	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
+	error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
+			p->blockcount, &p->oinfo, p->unwritten);
+	xfs_btree_del_cursor(mcur, error);
+	if (error)
+		goto out_cancel;
+
+	error = xfbtree_trans_commit(&rr->rmap_btree, tp);
+	if (error)
+		goto out_cancel;
+
+	xrep_trans_cancel_hook_dummy(&txcookie, tp);
+	mutex_unlock(&rr->lock);
+	return NOTIFY_DONE;
+
+out_cancel:
+	xfbtree_trans_cancel(&rr->rmap_btree, tp);
+	xrep_trans_cancel_hook_dummy(&txcookie, tp);
+	mutex_unlock(&rr->lock);
+out_abort:
+	/*
+	 * We can get here without ever having taken rr->lock (dummy
+	 * transaction allocation failed), so the unlock must happen above
+	 * this label and not below it.
+	 */
+	xchk_iscan_abort(&rr->iscan);
+out_unlock:
+	return NOTIFY_DONE;
+}
+
+/* Set up the filesystem scan components. 
*/ +STATIC int +xrep_rmap_setup_scan( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + mutex_init(&rr->lock); + + /* Set up in-memory rmap btree */ + error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp, + sc->sa.pag->pag_agno); + if (error) + goto out_mutex; + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &rr->iscan); + + /* + * Hook into live rmap operations so that we can update our in-memory + * btree to reflect live changes on the filesystem. Since we drop the + * AGF buffer to scan all the inodes, we need this piece to avoid + * installing a stale btree. + */ + ASSERT(sc->flags & XCHK_FSGATES_RMAP); + xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update); + error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook); + if (error) + goto out_iscan; + return 0; + +out_iscan: + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rmap_btree); +out_mutex: + mutex_destroy(&rr->lock); + return error; +} + +/* Tear down scan components. */ +STATIC void +xrep_rmap_teardown( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + xchk_iscan_abort(&rr->iscan); + xfs_rmap_hook_del(sc->sa.pag, &rr->rhook); + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rmap_btree); + mutex_destroy(&rr->lock); +} + +/* Repair the rmap btree for some AG. */ +int +xrep_rmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rmap *rr = sc->buf; + int error; + + error = xrep_rmap_setup_scan(rr); + if (error) + return error; + + /* + * Collect rmaps for everything in this AG that isn't space metadata. + * These rmaps won't change even as we try to allocate blocks. + */ + error = xrep_rmap_find_rmaps(rr); + if (error) + goto out_records; + + /* Rebuild the rmap information. */ + error = xrep_rmap_build_new_tree(rr); + if (error) + goto out_records; + + /* Kill the old tree. 
*/ + error = xrep_rmap_remove_old_tree(rr); + if (error) + goto out_records; + +out_records: + xrep_rmap_teardown(rr); + return error; +} diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 008ddb599e13..46583517377f 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -11,20 +11,37 @@ #include "xfs_mount.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_sb.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/rtbitmap.h" /* Set us up with the realtime metadata locked. */ int xchk_setup_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb; int error; - error = xchk_trans_alloc(sc, 0); + rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); + if (!rtb) + return -ENOMEM; + sc->buf = rtb; + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtbitmap(sc, rtb); + if (error) + return error; + } + + error = xchk_trans_alloc(sc, rtb->resblks); if (error) return error; @@ -32,7 +49,22 @@ xchk_setup_rtbitmap( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + + /* + * Now that we've locked the rtbitmap, we can't race with growfsrt + * trying to expand the bitmap or change the size of the rt volume. + * Hence it is safe to compute and check the geometry values. 
+ */ + if (mp->m_sb.sb_rblocks) { + rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rtb->rextslog = xfs_compute_rextslog(rtb->rextents); + rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents); + } return 0; } @@ -48,12 +80,12 @@ xchk_rtbitmap_rec( { struct xfs_scrub *sc = priv; xfs_rtblock_t startblock; - xfs_rtblock_t blockcount; + xfs_filblks_t blockcount; - startblock = rec->ar_startext * mp->m_sb.sb_rextsize; - blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; + startblock = xfs_rtx_to_rtb(mp, rec->ar_startext); + blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount); - if (!xfs_verify_rtext(mp, startblock, blockcount)) + if (!xfs_verify_rtbext(mp, startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -63,21 +95,30 @@ STATIC int xchk_rtbitmap_check_extents( struct xfs_scrub *sc) { - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_rtblock_t off; - int nmap; + struct xfs_iext_cursor icur; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; int error = 0; - for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } + + while (off < endoff) { + int nmap = 1; + if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; /* Make sure we have a written extent. */ - nmap = 1; - error = xfs_bmapi_read(mp->m_rbmip, off, - mp->m_sb.sb_rbmblocks - off, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) break; @@ -98,12 +139,48 @@ int xchk_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb = sc->buf; int error; - /* Is the size of the rtbitmap correct? 
*/ - if (sc->mp->m_rbmip->i_disk_size != - XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { - xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rtb->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* Is sb_rextslog correct? */ + if (mp->m_sb.sb_rextslog != rtb->rextslog) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is sb_rbmblocks large enough to handle the current rt volume? In no + * case can we exceed 4bn bitmap blocks since the super field is a u32. + */ + if (rtb->rbmblocks > U32_MAX) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* The bitmap file length must be aligned to an fsblock. */ + if (mp->m_rbmip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is the bitmap file itself large enough to handle the rt volume? + * growfsrt expands the bitmap file before updating sb_rextents, so the + * file can be larger than sb_rbmblocks. 
+ */ + if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); return 0; } @@ -116,38 +193,33 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; -out: - return error; + return 0; } /* xref check that the extent is not free in the rtbitmap */ void xchk_xref_is_used_rt_space( struct xfs_scrub *sc, - xfs_rtblock_t fsbno, + xfs_rtblock_t rtbno, xfs_extlen_t len) { - xfs_rtblock_t startext; - xfs_rtblock_t endext; - xfs_rtblock_t extcount; + xfs_rtxnum_t startext; + xfs_rtxnum_t endext; bool is_free; int error; if (xchk_skip_xref(sc->sm)) return; - startext = fsbno; - endext = fsbno + len - 1; - do_div(startext, sc->mp->m_sb.sb_rextsize); - do_div(endext, sc->mp->m_sb.sb_rextsize); - extcount = endext - startext + 1; + startext = xfs_rtb_to_rtx(sc->mp, rtbno); + endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, - &is_free); + error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, + endext - startext + 1, &is_free); if (!xchk_should_check_xref(sc, &error, NULL)) goto out_unlock; if (is_free) diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h new file mode 100644 index 000000000000..85304ff019e1 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTBITMAP_H__ +#define __XFS_SCRUB_RTBITMAP_H__ + +struct xchk_rtbitmap { + uint64_t rextents; + uint64_t rbmblocks; + unsigned int rextslog; + unsigned int resblks; +}; + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); +#else +# define xrep_setup_rtbitmap(sc, rtb) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTBITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c new file mode 100644 index 000000000000..46f5d5f605c9 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/rtbitmap.h" + +/* Set up to repair the realtime bitmap file metadata. */ +int +xrep_setup_rtbitmap( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + + /* + * Reserve enough blocks to write out a completely new bmbt for a + * maximally fragmented bitmap file. We do not hold the rtbitmap + * ILOCK yet, so this is entirely speculative. + */ + blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rtb->resblks += blocks; + return 0; +} + +/* + * Make sure that the given range of the data fork of the realtime file is + * mapped to written blocks. 
The caller must ensure that the inode is joined + * to the transaction. + */ +STATIC int +xrep_rtbitmap_data_mappings( + struct xfs_scrub *sc, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t off = 0; + int error; + + ASSERT(sc->ip != NULL); + + while (off < len) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->ip, off, len - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + /* + * Written extents are ok. Holes are not filled because we + * do not know the freespace information. + */ + if (xfs_bmap_is_written_extent(&map) || + map.br_startblock == HOLESTARTBLOCK) { + off = map.br_startoff + map.br_blockcount; + continue; + } + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* Make sure we're really converting an unwritten extent. */ + if (map.br_state != XFS_EXT_UNWRITTEN) { + ASSERT(map.br_state == XFS_EXT_UNWRITTEN); + return -EFSCORRUPTED; + } + + /* Make sure this block has a real zeroed extent mapped. */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->ip, map.br_startoff, + map.br_blockcount, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, + 0, &map, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -EFSCORRUPTED; + + /* Commit new extent and all deferred work. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + off = map.br_startoff + map.br_blockcount; + } + + return 0; +} + +/* Fix broken rt volume geometry. 
*/ +STATIC int +xrep_rtbitmap_geometry( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + + /* Superblock fields */ + if (mp->m_sb.sb_rextents != rtb->rextents) + xfs_trans_mod_sb(sc->tp, XFS_TRANS_SB_REXTENTS, + rtb->rextents - mp->m_sb.sb_rextents); + + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + rtb->rbmblocks - mp->m_sb.sb_rbmblocks); + + if (mp->m_sb.sb_rextslog != rtb->rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + rtb->rextslog - mp->m_sb.sb_rextslog); + + /* Fix broken isize */ + sc->ip->i_disk_size = roundup_64(sc->ip->i_disk_size, + mp->m_sb.sb_blocksize); + + if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) + sc->ip->i_disk_size = XFS_FSB_TO_B(mp, rtb->rbmblocks); + + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair the realtime bitmap file metadata. */ +int +xrep_rtbitmap( + struct xfs_scrub *sc) +{ + struct xchk_rtbitmap *rtb = sc->buf; + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + int error; + + /* Impossibly large rtbitmap means we can't touch the filesystem. */ + if (rtb->rbmblocks > U32_MAX) + return 0; + + /* + * If the size of the rt bitmap file is larger than what we reserved, + * figure out if we need to adjust the block reservation in the + * transaction. + */ + blocks = xfs_bmbt_calc_size(mp, rtb->rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + if (blocks > rtb->resblks) { + error = xfs_trans_reserve_more(sc->tp, blocks, 0); + if (error) + return error; + + rtb->resblks += blocks; + } + + /* Fix inode core and forks. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Ensure no unwritten extents. 
*/ + error = xrep_rtbitmap_data_mappings(sc, rtb->rbmblocks); + if (error) + return error; + + /* Fix inconsistent bitmap geometry */ + return xrep_rtbitmap_geometry(sc, rtb); +} diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 437ed9acbb27..5055092bd9e8 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -13,9 +13,10 @@ #include "xfs_inode.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_bit.h" #include "xfs_bmap.h" +#include "xfs_sb.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -31,6 +32,18 @@ * (potentially large) amount of data in pageable memory. */ +struct xchk_rtsummary { + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + uint64_t rsumsize; + unsigned int rsumlevels; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + /* Set us up to check the rtsummary file. */ int xchk_setup_rtsummary( @@ -38,8 +51,15 @@ xchk_setup_rtsummary( { struct xfs_mount *mp = sc->mp; char *descr; + struct xchk_rtsummary *rts; int error; + rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize), + XCHK_GFP_FLAGS); + if (!rts) + return -ENOMEM; + sc->buf = rts; + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. @@ -54,15 +74,14 @@ xchk_setup_rtsummary( if (error) return error; - /* Allocate a memory buffer for the summary comparison. */ - sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); - if (!sc->buf) - return -ENOMEM; - error = xchk_install_live_inode(sc, mp->m_rsumip); if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + /* * Locking order requires us to take the rtbitmap first. 
We must be * careful to unlock it ourselves when we are done with the rtbitmap @@ -71,44 +90,71 @@ xchk_setup_rtsummary( */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + + /* + * Now that we've locked the rtbitmap and rtsummary, we can't race with + * growfsrt trying to expand the summary or change the size of the rt + * volume. Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + xfs_filblks_t rsumblocks; + int rextslog; + + rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rextslog = xfs_compute_rextslog(rts->rextents); + rts->rsumlevels = rextslog + 1; + rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); + rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, + rts->rbmblocks); + rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks); + } return 0; } /* Helper functions to record suminfo words in an xfile. */ -typedef unsigned int xchk_rtsumoff_t; - static inline int xfsum_load( struct xfs_scrub *sc, - xchk_rtsumoff_t sumoff, - xfs_suminfo_t *info) + xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo) { - return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t), + return xfile_load(sc->xfile, rawinfo, + sizeof(union xfs_suminfo_raw), sumoff << XFS_WORDLOG); } static inline int xfsum_store( struct xfs_scrub *sc, - xchk_rtsumoff_t sumoff, - const xfs_suminfo_t info) + xfs_rtsumoff_t sumoff, + const union xfs_suminfo_raw rawinfo) { - return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t), + return xfile_store(sc->xfile, &rawinfo, + sizeof(union xfs_suminfo_raw), sumoff << XFS_WORDLOG); } static inline int xfsum_copyout( struct xfs_scrub *sc, - xchk_rtsumoff_t sumoff, - xfs_suminfo_t *info, + xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo, unsigned int nr_words) { - return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG, + return xfile_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG, sumoff << XFS_WORDLOG); } 
+static inline xfs_suminfo_t +xchk_rtsum_inc( + struct xfs_mount *mp, + union xfs_suminfo_raw *v) +{ + v->old += 1; + return v->old; +} + /* Update the summary file to reflect the free extent that we've accumulated. */ STATIC int xchk_rtsum_record_free( @@ -121,23 +167,24 @@ xchk_rtsum_record_free( xfs_fileoff_t rbmoff; xfs_rtblock_t rtbno; xfs_filblks_t rtlen; - xchk_rtsumoff_t offs; + xfs_rtsumoff_t offs; unsigned int lenlog; - xfs_suminfo_t v = 0; + union xfs_suminfo_raw v; + xfs_suminfo_t value; int error = 0; if (xchk_should_terminate(sc, &error)) return error; /* Compute the relevant location in the rtsum file. */ - rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext); - lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); - offs = XFS_SUMOFFS(mp, lenlog, rbmoff); + rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext); + lenlog = xfs_highbit64(rec->ar_extcount); + offs = xfs_rtsumoffs(mp, lenlog, rbmoff); - rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; - rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize; + rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); + rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); - if (!xfs_verify_rtext(mp, rtbno, rtlen)) { + if (!xfs_verify_rtbext(mp, rtbno, rtlen)) { xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); return -EFSCORRUPTED; } @@ -147,9 +194,9 @@ xchk_rtsum_record_free( if (error) return error; - v++; + value = xchk_rtsum_inc(sc->mp, &v); trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount, - lenlog, offs, v); + lenlog, offs, value); return xfsum_store(sc, offs, v); } @@ -160,12 +207,11 @@ xchk_rtsum_compute( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - unsigned long long rtbmp_bytes; + unsigned long long rtbmp_blocks; /* If the bitmap size doesn't match the computed size, bail. 
*/ - rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY); - if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) != - mp->m_rbmip->i_disk_size) + rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents); + if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size) return -EFSCORRUPTED; return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free, @@ -177,15 +223,29 @@ STATIC int xchk_rtsum_compare( struct xfs_scrub *sc) { - struct xfs_mount *mp = sc->mp; - struct xfs_buf *bp; struct xfs_bmbt_irec map; - xfs_fileoff_t off; - xchk_rtsumoff_t sumoff = 0; - int nmap; + struct xfs_iext_cursor icur; + + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xchk_rtsummary *rts = sc->buf; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; + xfs_rtsumoff_t sumoff = 0; + int error = 0; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; - for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { - int error = 0; + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } + + while (off < endoff) { + int nmap = 1; if (xchk_should_terminate(sc, &error)) return error; @@ -193,8 +253,7 @@ xchk_rtsum_compare( return 0; /* Make sure we have a written extent. */ - nmap = 1; - error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; @@ -204,23 +263,33 @@ xchk_rtsum_compare( return 0; } + off += map.br_blockcount; + } + + for (off = 0; off < endoff; off++) { + union xfs_suminfo_raw *ondisk_info; + /* Read a block's worth of ondisk rtsummary file. 
*/ - error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp); + error = xfs_rtsummary_read_buf(&rts->args, off); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; /* Read a block's worth of computed rtsummary file. */ - error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + error = xfsum_copyout(sc, sumoff, rts->words, mp->m_blockwsize); if (error) { - xfs_trans_brelse(sc->tp, bp); + xfs_rtbuf_cache_relse(&rts->args); return error; } - if (memcmp(bp->b_addr, sc->buf, - mp->m_blockwsize << XFS_WORDLOG) != 0) + ondisk_info = xfs_rsumblock_infoptr(&rts->args, 0); + if (memcmp(ondisk_info, rts->words, + mp->m_blockwsize << XFS_WORDLOG) != 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + xfs_rtbuf_cache_relse(&rts->args); + return error; + } - xfs_trans_brelse(sc->tp, bp); + xfs_rtbuf_cache_relse(&rts->args); sumoff += mp->m_blockwsize; } @@ -233,8 +302,43 @@ xchk_rtsummary( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xchk_rtsummary *rts = sc->buf; int error = 0; + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rts->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + goto out_rbm; + } + + /* Is m_rsumlevels correct? */ + if (mp->m_rsumlevels != rts->rsumlevels) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* Is m_rsumsize correct? */ + if (mp->m_rsumsize != rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* The summary file length must be aligned to an fsblock. */ + if (mp->m_rsumip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* + * Is the summary file itself large enough to handle the rt volume? + * growfsrt expands the summary file before updating sb_rextents, so + * the file can be larger than rsumsize. + */ + if (mp->m_rsumip->i_disk_size < rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + /* Invoke the fork scrubber. 
*/ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4849efcaa33a..20fac9723c08 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -14,9 +14,9 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" -#include "xfs_errortag.h" -#include "xfs_error.h" #include "xfs_scrub.h" +#include "xfs_buf_mem.h" +#include "xfs_rmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -159,6 +159,15 @@ xchk_fsgates_disable( if (sc->flags & XCHK_FSGATES_DRAIN) xfs_drain_wait_disable(); + if (sc->flags & XCHK_FSGATES_QUOTA) + xfs_dqtrx_hook_disable(); + + if (sc->flags & XCHK_FSGATES_DIRENTS) + xfs_dir_hook_disable(); + + if (sc->flags & XCHK_FSGATES_RMAP) + xfs_rmap_hook_disable(); + sc->flags &= ~XCHK_FSGATES_ALL; } @@ -186,6 +195,10 @@ xchk_teardown( sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); } + if (sc->xmbtp) { + xmbuf_free(sc->xmbtp); + sc->xmbtp = NULL; + } if (sc->xfile) { xfile_destroy(sc->xfile); sc->xfile = NULL; @@ -238,65 +251,69 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_bnobt, - .repair = xrep_notsupported, + .scrub = xchk_allocbt, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_cntbt, - .repair = xrep_notsupported, + .scrub = xchk_allocbt, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_inobt, - .repair = xrep_notsupported, + .scrub = xchk_iallocbt, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = 
xchk_setup_ag_iallocbt, - .scrub = xchk_finobt, + .scrub = xchk_iallocbt, .has = xfs_has_finobt, - .repair = xrep_notsupported, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, .setup = xchk_setup_ag_rmapbt, .scrub = xchk_rmapbt, .has = xfs_has_rmapbt, - .repair = xrep_notsupported, + .repair = xrep_rmapbt, }, [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ .type = ST_PERAG, .setup = xchk_setup_ag_refcountbt, .scrub = xchk_refcountbt, .has = xfs_has_reflink, - .repair = xrep_notsupported, + .repair = xrep_refcountbt, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ .type = ST_INODE, .setup = xchk_setup_inode, .scrub = xchk_inode, - .repair = xrep_notsupported, + .repair = xrep_inode, }, [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_data, - .repair = xrep_notsupported, + .repair = xrep_bmap_data, }, [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_attr, - .repair = xrep_notsupported, + .repair = xrep_bmap_attr, }, [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_cow, - .repair = xrep_notsupported, + .repair = xrep_bmap_cow, }, [XFS_SCRUB_TYPE_DIR] = { /* directory */ .type = ST_INODE, @@ -326,39 +343,55 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, - .has = xfs_has_realtime, - .repair = xrep_notsupported, + .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_GQUOTA] = { 
/* group quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */ .type = ST_FS, .setup = xchk_setup_fscounters, .scrub = xchk_fscounters, - .repair = xrep_notsupported, + .repair = xrep_fscounters, + }, + [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */ + .type = ST_FS, + .setup = xchk_setup_quotacheck, + .scrub = xchk_quotacheck, + .repair = xrep_quotacheck, + }, + [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */ + .type = ST_FS, + .setup = xchk_setup_nlinks, + .scrub = xchk_nlinks, + .repair = xrep_nlinks, + }, + [XFS_SCRUB_TYPE_HEALTHY] = { /* fs healthy; clean all reminders */ + .type = ST_FS, + .setup = xchk_setup_fs, + .scrub = xchk_health_record, + .repair = xrep_notsupported, }, }; @@ -531,7 +564,10 @@ retry_op: /* Scrub for errors. */ check_start = xchk_stats_now(); - error = sc->ops->scrub(sc); + if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL) + error = sc->ops->repair_eval(sc); + else + error = sc->ops->scrub(sc); run.scrub_ns += xchk_stats_elapsed_ns(check_start); if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) goto try_harder; @@ -542,23 +578,12 @@ retry_op: xchk_update_health(sc); - if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && - !(sc->flags & XREP_ALREADY_FIXED)) { - bool needs_fix = xchk_needs_repair(sc->sm); - - /* Userspace asked us to rebuild the structure regardless. */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) - needs_fix = true; - - /* Let debug users force us into the repair routines. 
*/ - if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - needs_fix = true; - + if (xchk_could_repair(sc)) { /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. */ - if (!needs_fix) { + if (!xrep_will_attempt(sc)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; goto out_nofix; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 1ef9c6b4842a..9ad65b604fe1 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -35,6 +35,14 @@ struct xchk_meta_ops { /* Repair or optimize the metadata. */ int (*repair)(struct xfs_scrub *); + /* + * Re-scrub the metadata we repaired, in case there's extra work that + * we need to do to check our repair work. If this is NULL, we'll use + * the ->scrub function pointer, assuming that the regular scrub is + * sufficient. + */ + int (*repair_eval)(struct xfs_scrub *sc); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_mount *); @@ -91,6 +99,9 @@ struct xfs_scrub { /* xfile used by the scrubbers; freed at teardown. */ struct xfile *xfile; + /* buffer target for in-memory btrees; also freed at teardown. */ + struct xfs_buftarg *xmbtp; + /* Lock flags for @ip. */ uint ilock_flags; @@ -113,6 +124,10 @@ struct xfs_scrub { #define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? 
*/ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */ +#define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */ +#define XCHK_FSGATES_RMAP (1U << 6) /* rmapbt live update enabled */ +#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ /* @@ -121,7 +136,10 @@ struct xfs_scrub { * features are gated off via dynamic code patching, which is why the state * must be enabled during scrub setup and can only be torn down afterwards. */ -#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN) +#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \ + XCHK_FSGATES_QUOTA | \ + XCHK_FSGATES_DIRENTS | \ + XCHK_FSGATES_RMAP) /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); @@ -129,10 +147,8 @@ int xchk_superblock(struct xfs_scrub *sc); int xchk_agf(struct xfs_scrub *sc); int xchk_agfl(struct xfs_scrub *sc); int xchk_agi(struct xfs_scrub *sc); -int xchk_bnobt(struct xfs_scrub *sc); -int xchk_cntbt(struct xfs_scrub *sc); -int xchk_inobt(struct xfs_scrub *sc); -int xchk_finobt(struct xfs_scrub *sc); +int xchk_allocbt(struct xfs_scrub *sc); +int xchk_iallocbt(struct xfs_scrub *sc); int xchk_rmapbt(struct xfs_scrub *sc); int xchk_refcountbt(struct xfs_scrub *sc); int xchk_inode(struct xfs_scrub *sc); @@ -160,14 +176,21 @@ xchk_rtsummary(struct xfs_scrub *sc) #endif #ifdef CONFIG_XFS_QUOTA int xchk_quota(struct xfs_scrub *sc); +int xchk_quotacheck(struct xfs_scrub *sc); #else static inline int xchk_quota(struct xfs_scrub *sc) { return -ENOENT; } +static inline int +xchk_quotacheck(struct xfs_scrub *sc) +{ + return -ENOENT; +} #endif int xchk_fscounters(struct xfs_scrub *sc); +int xchk_nlinks(struct xfs_scrub *sc); /* cross-referencing helpers */ void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, 
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index cd91db4a5548..42cafbed94ac 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -77,6 +77,8 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_GQUOTA] = "grpquota", [XFS_SCRUB_TYPE_PQUOTA] = "prjquota", [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", + [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", + [XFS_SCRUB_TYPE_NLINKS] = "nlinks", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ @@ -329,9 +331,9 @@ xchk_stats_register( if (!cs->cs_debugfs) return; - debugfs_create_file("stats", 0644, cs->cs_debugfs, cs, + debugfs_create_file("stats", 0444, cs->cs_debugfs, cs, &scrub_stats_fops); - debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs, + debugfs_create_file("clear_stats", 0200, cs->cs_debugfs, cs, &clear_scrub_stats_fops); } diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 38708fb9a5d7..d77d8a9598f6 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -12,8 +12,11 @@ #include "xfs_log_format.h" #include "xfs_inode.h" #include "xfs_symlink.h" +#include "xfs_health.h" +#include "xfs_symlink_remote.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/health.h" /* Set us up to scrub a symbolic link. */ int @@ -41,29 +44,37 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_SYMLINK_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); len = ip->i_disk_size; /* Plausible size? */ if (len > XFS_SYMLINK_MAXLEN || len <= 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Inline symlink? 
*/ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { if (len > xfs_inode_data_fork_size(ip) || - len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) + len > strnlen(ifp->if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Remote symlink; must read the contents. */ - error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); + error = xfs_symlink_remote_read(sc->ip, sc->buf); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); -out: - return error; + + /* If a remote symlink is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); + return 0; } diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 46249e7b17e0..3dd281d6d185 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -13,9 +13,19 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_quota.h" +#include "xfs_quota_defs.h" +#include "xfs_da_format.h" +#include "xfs_dir2.h" +#include "xfs_rmap.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" +#include "scrub/quota.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/fscounters.h" /* Figure out which block the btree cursor was pointing to. 
*/ static inline xfs_fsblock_t @@ -28,7 +38,7 @@ xchk_btree_cur_fsbno( xfs_buf_daddr(cur->bc_levels[level].bp)); if (level == cur->bc_nlevels - 1 && - (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + cur->bc_ops->type == XFS_BTREE_TYPE_INODE) return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); return NULLFSBLOCK; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index cbd4d01e253c..5b294be52c55 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -15,10 +15,17 @@ #include <linux/tracepoint.h> #include "xfs_bit.h" +#include "xfs_quota_defs.h" +struct xfs_scrub; struct xfile; struct xfarray; struct xfarray_sortinfo; +struct xchk_dqiter; +struct xchk_iscan; +struct xchk_nlink; +struct xchk_fscounters; +struct xfs_rmap_update_params; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -26,14 +33,6 @@ struct xfarray_sortinfo; * ring buffer. Somehow this was only worth mentioning in the ftrace sample * code. */ -TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); -TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); - TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); @@ -62,6 +61,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -88,7 +90,10 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \ { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ - { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" } + { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \ + { 
XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \ + { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ + { XFS_SCRUB_TYPE_HEALTHY, "healthy" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -106,8 +111,21 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ + { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \ + { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \ + { XCHK_FSGATES_RMAP, "fsgates_rmap" }, \ + { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } +TRACE_DEFINE_ENUM(XFS_RMAP_MAP); +TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP); +TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT); +TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC); +TRACE_DEFINE_ENUM(XFS_RMAP_FREE); + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -347,6 +365,77 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \ DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xchk_dqiter_class, + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), + TP_ARGS(cursor, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_ino_t, ino) + __field(unsigned long long, cur_id) + __field(unsigned long long, id) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + ), + TP_fast_assign( + __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev; + __entry->dqtype = cursor->dqtype; + __entry->ino = cursor->quota_ip->i_ino; + __entry->cur_id = cursor->id; + __entry->startoff = cursor->bmap.br_startoff; + __entry->startblock = cursor->bmap.br_startblock; + __entry->blockcount = cursor->bmap.br_blockcount; 
+ __entry->state = cursor->bmap.br_state; + __entry->id = id; + ), + TP_printk("dev %d:%d dquot type %s ino 0x%llx cursor_id 0x%llx startoff 0x%llx startblock 0x%llx blockcount 0x%llx state %u id 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->ino, + __entry->cur_id, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->id) +); + +#define DEFINE_SCRUB_DQITER_EVENT(name) \ +DEFINE_EVENT(xchk_dqiter_class, name, \ + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), \ + TP_ARGS(cursor, id)) +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter); + +TRACE_EVENT(xchk_qcheck_error, + TP_PROTO(struct xfs_scrub *sc, xfs_dqtype_t dqtype, xfs_dqid_t id, + void *ret_ip), + TP_ARGS(sc, dqtype, id, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_dqid_t, id) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->dqtype = dqtype; + __entry->id = id; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d dquot type %s id 0x%x ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->id, + __entry->ret_ip) +); +#endif /* CONFIG_XFS_QUOTA */ + TRACE_EVENT(xchk_incomplete, TP_PROTO(struct xfs_scrub *sc, void *ret_ip), TP_ARGS(sc, ret_ip), @@ -373,7 +462,7 @@ TRACE_EVENT(xchk_btree_op_error, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -386,7 +475,7 @@ TRACE_EVENT(xchk_btree_op_error, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = 
cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -394,10 +483,10 @@ TRACE_EVENT(xchk_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", + TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -415,7 +504,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __field(xfs_ino_t, ino) __field(int, whichfork) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(int, ptr) __field(xfs_agnumber_t, agno) @@ -429,7 +518,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->ptr = cur->bc_levels[level].ptr; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); @@ -437,12 +526,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -458,7 +547,7 @@ 
TRACE_EVENT(xchk_btree_error, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -469,17 +558,17 @@ TRACE_EVENT(xchk_btree_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->ptr = cur->bc_levels[level].ptr; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", + TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -496,7 +585,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __field(xfs_ino_t, ino) __field(int, whichfork) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -509,19 +598,19 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->ptr = cur->bc_levels[level].ptr; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 
0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -536,7 +625,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, TP_STRUCT__entry( __field(dev_t, dev) __field(int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) __field(int, level) @@ -548,17 +637,17 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_levels[level].ptr; ), - TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", + TP_printk("dev %d:%d type %s %sbt agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->agno, __entry->bno, __entry->level, @@ -811,18 +900,11 @@ TRACE_EVENT(xfile_destroy, __field(loff_t, size) ), TP_fast_assign( - struct xfile_stat statbuf; - int ret; + struct inode *inode = file_inode(xf->file); - ret = xfile_stat(xf, &statbuf); - if (!ret) { - __entry->bytes = statbuf.bytes; - __entry->size = statbuf.size; - } else { - __entry->bytes = -1; - __entry->size = -1; - } - __entry->ino = file_inode(xf->file)->i_ino; + __entry->ino = inode->i_ino; + __entry->bytes = inode->i_blocks << SECTOR_SHIFT; + __entry->size = i_size_read(inode); ), TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx", __entry->ino, @@ -841,19 +923,12 @@ 
DECLARE_EVENT_CLASS(xfile_class, __field(unsigned long long, bytecount) ), TP_fast_assign( - struct xfile_stat statbuf; - int ret; + struct inode *inode = file_inode(xf->file); - ret = xfile_stat(xf, &statbuf); - if (!ret) { - __entry->bytes_used = statbuf.bytes; - __entry->size = statbuf.size; - } else { - __entry->bytes_used = -1; - __entry->size = -1; - } - __entry->ino = file_inode(xf->file)->i_ino; + __entry->ino = inode->i_ino; + __entry->bytes_used = inode->i_blocks << SECTOR_SHIFT; __entry->pos = pos; + __entry->size = i_size_read(inode); __entry->bytecount = bytecount; ), TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx", @@ -867,11 +942,11 @@ DECLARE_EVENT_CLASS(xfile_class, DEFINE_EVENT(xfile_class, name, \ TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \ TP_ARGS(xf, pos, bytecount)) -DEFINE_XFILE_EVENT(xfile_pread); -DEFINE_XFILE_EVENT(xfile_pwrite); +DEFINE_XFILE_EVENT(xfile_load); +DEFINE_XFILE_EVENT(xfile_store); DEFINE_XFILE_EVENT(xfile_seek_data); -DEFINE_XFILE_EVENT(xfile_get_page); -DEFINE_XFILE_EVENT(xfile_put_page); +DEFINE_XFILE_EVENT(xfile_get_folio); +DEFINE_XFILE_EVENT(xfile_put_folio); TRACE_EVENT(xfarray_create, TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), @@ -918,7 +993,7 @@ TRACE_EVENT(xfarray_isort, __entry->hi - __entry->lo) ); -TRACE_EVENT(xfarray_pagesort, +TRACE_EVENT(xfarray_foliosort, TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), TP_ARGS(si, lo, hi), TP_STRUCT__entry( @@ -989,6 +1064,47 @@ TRACE_EVENT(xfarray_sort, __entry->bytes) ); +TRACE_EVENT(xfarray_sort_scan, + TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx), + TP_ARGS(si, idx), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, nr) + __field(size_t, obj_size) + __field(unsigned long long, idx) + __field(unsigned long long, folio_pos) + __field(unsigned long, folio_bytes) + __field(unsigned long long, first_idx) + __field(unsigned long 
long, last_idx) + ), + TP_fast_assign( + __entry->nr = si->array->nr; + __entry->obj_size = si->array->obj_size; + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->idx = idx; + if (si->folio) { + __entry->folio_pos = folio_pos(si->folio); + __entry->folio_bytes = folio_size(si->folio); + __entry->first_idx = si->first_folio_idx; + __entry->last_idx = si->last_folio_idx; + } else { + __entry->folio_pos = 0; + __entry->folio_bytes = 0; + __entry->first_idx = 0; + __entry->last_idx = 0; + } + ), + TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu", + __entry->ino, + __entry->nr, + __entry->obj_size, + __entry->idx, + __entry->folio_pos, + __entry->folio_bytes, + __entry->first_idx, + __entry->last_idx) +); + TRACE_EVENT(xfarray_sort_stats, TP_PROTO(struct xfarray_sortinfo *si, int error), TP_ARGS(si, error), @@ -1036,17 +1152,18 @@ TRACE_EVENT(xfarray_sort_stats, #ifdef CONFIG_XFS_RT TRACE_EVENT(xchk_rtsum_record_free, - TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start, - uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v), - TP_ARGS(mp, start, len, log, pos, v), + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, + xfs_rtbxlen_t len, unsigned int log, loff_t pos, + xfs_suminfo_t value), + TP_ARGS(mp, start, len, log, pos, value), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, rtdev) - __field(xfs_rtblock_t, start) + __field(xfs_rtxnum_t, start) __field(unsigned long long, len) __field(unsigned int, log) __field(loff_t, pos) - __field(xfs_suminfo_t, v) + __field(xfs_suminfo_t, value) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; @@ -1055,7 +1172,7 @@ TRACE_EVENT(xchk_rtsum_record_free, __entry->len = len; __entry->log = log; __entry->pos = pos; - __entry->v = v; + __entry->value = value; ), TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1064,10 +1181,327 @@ 
TRACE_EVENT(xchk_rtsum_record_free, __entry->len, __entry->log, __entry->pos, - __entry->v) + __entry->value) ); #endif /* CONFIG_XFS_RT */ +DECLARE_EVENT_CLASS(xchk_iscan_class, + TP_PROTO(struct xchk_iscan *iscan), + TP_ARGS(iscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited) +) +#define DEFINE_ISCAN_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_class, name, \ + TP_PROTO(struct xchk_iscan *iscan), \ + TP_ARGS(iscan)) +DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor); +DEFINE_ISCAN_EVENT(xchk_iscan_visit); +DEFINE_ISCAN_EVENT(xchk_iscan_skip); +DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag); + +DECLARE_EVENT_CLASS(xchk_iscan_ino_class, + TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), + TP_ARGS(iscan, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, startino) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->startino = iscan->scan_start_ino; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->ino = ino; + ), + TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->startino, + __entry->cursor, + __entry->visited, + __entry->ino) +) +#define DEFINE_ISCAN_INO_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_ino_class, name, \ + TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \ + TP_ARGS(iscan, ino)) +DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update); +DEFINE_ISCAN_INO_EVENT(xchk_iscan_start); + +TRACE_EVENT(xchk_iscan_iget, + TP_PROTO(struct xchk_iscan *iscan, int error), + 
TP_ARGS(iscan, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->error = error; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->error) +); + +TRACE_EVENT(xchk_iscan_iget_batch, + TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan, + unsigned int nr, unsigned int avail), + TP_ARGS(mp, iscan, nr, avail), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(unsigned int, nr) + __field(unsigned int, avail) + __field(unsigned int, unavail) + __field(xfs_ino_t, batch_ino) + __field(unsigned long long, skipmask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->nr = nr; + __entry->avail = avail; + __entry->unavail = hweight64(iscan->__skipped_inomask); + __entry->batch_ino = iscan->__batch_ino; + __entry->skipmask = iscan->__skipped_inomask; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->batch_ino, + __entry->skipmask, + __entry->nr, + __entry->avail, + __entry->unavail) +); + +TRACE_EVENT(xchk_iscan_iget_retry_wait, + TP_PROTO(struct xchk_iscan *iscan), + TP_ARGS(iscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(unsigned int, retry_delay) + __field(unsigned long, remaining) + __field(unsigned int, iget_timeout) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = 
iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->retry_delay = iscan->iget_retry_delay; + __entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies); + __entry->iget_timeout = iscan->iget_timeout; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->remaining, + __entry->iget_timeout, + __entry->retry_delay) +); + +TRACE_EVENT(xchk_nlinks_collect_dirent, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + xfs_ino_t ino, const struct xfs_name *name), + TP_ARGS(mp, dp, ino, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp->i_ino; + __entry->ino = ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_collect_metafile, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino), + TP_ARGS(mp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +); + +TRACE_EVENT(xchk_nlinks_live_update, + TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp, + int action, xfs_ino_t ino, int delta, + const char *name, unsigned int namelen), + TP_ARGS(mp, dp, action, ino, delta, name, namelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(int, action) + __field(xfs_ino_t, ino) + __field(int, delta) + __field(unsigned int, namelen) + 
__dynamic_array(char, name, namelen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp ? dp->i_ino : NULLFSINO; + __entry->action = action; + __entry->ino = ino; + __entry->delta = delta; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->delta, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_check_zero, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live), + TP_ARGS(mp, ino, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + ), + TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents, + __entry->backrefs, + __entry->children) +); + +TRACE_EVENT(xchk_nlinks_update_incore, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live, int parents_delta, + int backrefs_delta, int children_delta), + TP_ARGS(mp, ino, live, parents_delta, backrefs_delta, children_delta), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + __field(int, parents_delta) + __field(int, backrefs_delta) + __field(int, children_delta) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + __entry->parents_delta = parents_delta; + __entry->backrefs_delta = 
backrefs_delta; + __entry->children_delta = children_delta; + ), + TP_printk("dev %d:%d ino 0x%llx parents %d:%u backrefs %d:%u children %d:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents_delta, + __entry->parents, + __entry->backrefs_delta, + __entry->backrefs, + __entry->children_delta, + __entry->children) +); + +DECLARE_EVENT_CLASS(xchk_nlinks_diff_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, + const struct xchk_nlink *live), + TP_ARGS(mp, ip, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint8_t, ftype) + __field(xfs_nlink_t, nlink) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + __entry->nlink = VFS_I(ip)->i_nlink; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + ), + TP_printk("dev %d:%d ino 0x%llx ftype %s nlink %u parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __entry->nlink, + __entry->parents, + __entry->backrefs, + __entry->children) +); +#define DEFINE_SCRUB_NLINKS_DIFF_EVENT(name) \ +DEFINE_EVENT(xchk_nlinks_diff_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \ + const struct xchk_nlink *live), \ + TP_ARGS(mp, ip, live)) +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) @@ -1171,37 +1605,156 @@ DEFINE_EVENT(xrep_rmap_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); 
-DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); +DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); -TRACE_EVENT(xrep_refcount_extent_fn, +TRACE_EVENT(xrep_abt_found, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *irec), - TP_ARGS(mp, agno, irec), + const struct xfs_alloc_rec_incore *rec), + TP_ARGS(mp, agno, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) - __field(xfs_nlink_t, refcount) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->startblock = irec->rc_startblock; - __entry->blockcount = irec->rc_blockcount; - __entry->refcount = irec->rc_refcount; + __entry->startblock = rec->ar_startblock; + __entry->blockcount = rec->ar_blockcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, + __entry->blockcount) +) + +TRACE_EVENT(xrep_ibt_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_inobt_rec_incore *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = rec->ir_startino; + __entry->holemask = rec->ir_holemask; + __entry->count = rec->ir_count; + __entry->freecount = rec->ir_freecount; + __entry->freemask = rec->ir_free; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + 
__entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + +TRACE_EVENT(xrep_refc_found, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec), + TP_ARGS(pag, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = rec->rc_domain; + __entry->startblock = rec->rc_startblock; + __entry->blockcount = rec->rc_blockcount; + __entry->refcount = rec->rc_refcount; + ), + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __entry->startblock, __entry->blockcount, __entry->refcount) ) +TRACE_EVENT(xrep_bmap_found, + TP_PROTO(struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(ip, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); + +TRACE_EVENT(xrep_rmap_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const 
struct xfs_rmap_irec *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = rec->rm_startblock; + __entry->len = rec->rm_blockcount; + __entry->owner = rec->rm_owner; + __entry->offset = rec->rm_offset; + __entry->flags = rec->rm_flags; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); + TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), @@ -1286,51 +1839,440 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __entry->refcbt_sz) ) TRACE_EVENT(xrep_reset_counters, - TP_PROTO(struct xfs_mount *mp), - TP_ARGS(mp), + TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc), + TP_ARGS(mp, fsc), TP_STRUCT__entry( __field(dev_t, dev) + __field(uint64_t, icount) + __field(uint64_t, ifree) + __field(uint64_t, fdblocks) + __field(uint64_t, frextents) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; + __entry->icount = fsc->icount; + __entry->ifree = fsc->ifree; + __entry->fdblocks = fsc->fdblocks; + __entry->frextents = fsc->frextents; ), - TP_printk("dev %d:%d", - MAJOR(__entry->dev), MINOR(__entry->dev)) + TP_printk("dev %d:%d icount %llu ifree %llu fdblocks %llu frextents %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->icount, + __entry->ifree, + __entry->fdblocks, + __entry->frextents) ) -TRACE_EVENT(xrep_ialloc_insert, +DECLARE_EVENT_CLASS(xrep_newbt_extent_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino, uint16_t holemask, uint8_t 
count, - uint8_t freecount, uint64_t freemask), - TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), + xfs_agblock_t agbno, xfs_extlen_t len, + int64_t owner), + TP_ARGS(mp, agno, agbno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(xfs_agino_t, startino) - __field(uint16_t, holemask) - __field(uint8_t, count) - __field(uint8_t, freecount) - __field(uint64_t, freemask) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int64_t, owner) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->startino = startino; - __entry->holemask = holemask; - __entry->count = count; - __entry->freecount = freecount; - __entry->freemask = freemask; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; ), - TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->startino, - __entry->holemask, - __entry->count, - __entry->freecount, - __entry->freemask) + __entry->agbno, + __entry->len, + __entry->owner) +); +#define DEFINE_NEWBT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xrep_newbt_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + int64_t owner), \ + TP_ARGS(mp, agno, agbno, len, owner)) +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block); + +DECLARE_EVENT_CLASS(xrep_dinode_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), + TP_ARGS(sc, dip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint16_t, mode) + __field(uint8_t, version) + __field(uint8_t, format) + __field(uint32_t, uid) + __field(uint32_t, 
gid) + __field(uint64_t, size) + __field(uint64_t, nblocks) + __field(uint32_t, extsize) + __field(uint32_t, nextents) + __field(uint16_t, anextents) + __field(uint8_t, forkoff) + __field(uint8_t, aformat) + __field(uint16_t, flags) + __field(uint32_t, gen) + __field(uint64_t, flags2) + __field(uint32_t, cowextsize) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->mode = be16_to_cpu(dip->di_mode); + __entry->version = dip->di_version; + __entry->format = dip->di_format; + __entry->uid = be32_to_cpu(dip->di_uid); + __entry->gid = be32_to_cpu(dip->di_gid); + __entry->size = be64_to_cpu(dip->di_size); + __entry->nblocks = be64_to_cpu(dip->di_nblocks); + __entry->extsize = be32_to_cpu(dip->di_extsize); + __entry->nextents = be32_to_cpu(dip->di_nextents); + __entry->anextents = be16_to_cpu(dip->di_anextents); + __entry->forkoff = dip->di_forkoff; + __entry->aformat = dip->di_aformat; + __entry->flags = be16_to_cpu(dip->di_flags); + __entry->gen = be32_to_cpu(dip->di_gen); + __entry->flags2 = be64_to_cpu(dip->di_flags2); + __entry->cowextsize = be32_to_cpu(dip->di_cowextsize); + ), + TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->mode, + __entry->version, + __entry->format, + __entry->uid, + __entry->gid, + __entry->size, + __entry->nblocks, + __entry->extsize, + __entry->nextents, + __entry->anextents, + __entry->forkoff, + __entry->aformat, + __entry->flags, + __entry->gen, + __entry->flags2, + __entry->cowextsize) +) + +#define DEFINE_REPAIR_DINODE_EVENT(name) \ +DEFINE_EVENT(xrep_dinode_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \ + TP_ARGS(sc, dip)) +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode); 
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_forks); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dfork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_afork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_ensure_forkoff); + +DECLARE_EVENT_CLASS(xrep_inode_class, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_rfsblock_t, nblocks) + __field(uint16_t, flags) + __field(uint64_t, flags2) + __field(uint32_t, nextents) + __field(uint8_t, format) + __field(uint32_t, anextents) + __field(uint8_t, aformat) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->size = sc->ip->i_disk_size; + __entry->nblocks = sc->ip->i_nblocks; + __entry->flags = sc->ip->i_diflags; + __entry->flags2 = sc->ip->i_diflags2; + __entry->nextents = sc->ip->i_df.if_nextents; + __entry->format = sc->ip->i_df.if_format; + __entry->anextents = sc->ip->i_af.if_nextents; + __entry->aformat = sc->ip->i_af.if_format; + ), + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->nblocks, + __entry->flags, + __entry->flags2, + __entry->nextents, + __entry->format, + __entry->anextents, + __entry->aformat) ) +#define DEFINE_REPAIR_INODE_EVENT(name) \ +DEFINE_EVENT(xrep_inode_class, name, \ + TP_PROTO(struct xfs_scrub *sc), \ + TP_ARGS(sc)) +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags); 
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed); + +TRACE_EVENT(xrep_dinode_count_rmaps, + TP_PROTO(struct xfs_scrub *sc, xfs_rfsblock_t data_blocks, + xfs_rfsblock_t rt_blocks, xfs_rfsblock_t attr_blocks, + xfs_extnum_t data_extents, xfs_extnum_t rt_extents, + xfs_aextnum_t attr_extents), + TP_ARGS(sc, data_blocks, rt_blocks, attr_blocks, data_extents, + rt_extents, attr_extents), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_rfsblock_t, data_blocks) + __field(xfs_rfsblock_t, rt_blocks) + __field(xfs_rfsblock_t, attr_blocks) + __field(xfs_extnum_t, data_extents) + __field(xfs_extnum_t, rt_extents) + __field(xfs_aextnum_t, attr_extents) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->data_blocks = data_blocks; + __entry->rt_blocks = rt_blocks; + __entry->attr_blocks = attr_blocks; + __entry->data_extents = data_extents; + __entry->rt_extents = rt_extents; + __entry->attr_extents = attr_extents; + ), + TP_printk("dev %d:%d ino 0x%llx dblocks 0x%llx rtblocks 0x%llx ablocks 0x%llx dextents %llu rtextents %llu aextents %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->data_blocks, + __entry->rt_blocks, + __entry->attr_blocks, + __entry->data_extents, + __entry->rt_extents, + __entry->attr_extents) +); + +TRACE_EVENT(xrep_dinode_findmode_dirent, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp, + unsigned int ftype), + TP_ARGS(sc, dp, ftype), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, ftype) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->parent_ino = dp->i_ino; + __entry->ftype = ftype; + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'", + 
MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR)) +); + +TRACE_EVENT(xrep_dinode_findmode_dirent_inval, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp, + unsigned int ftype, unsigned int found_ftype), + TP_ARGS(sc, dp, ftype, found_ftype), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, ftype) + __field(unsigned int, found_ftype) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->parent_ino = dp->i_ino; + __entry->ftype = ftype; + __entry->found_ftype = found_ftype; + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR)) +); + +TRACE_EVENT(xrep_cow_mark_file_range, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock, + xfs_fileoff_t startoff, xfs_filblks_t blockcount), + TP_ARGS(ip, startblock, startoff, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = startoff; + __entry->startblock = startblock; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount) +); + +TRACE_EVENT(xrep_cow_replace_mapping, + TP_PROTO(struct xfs_inode *ip, const struct xfs_bmbt_irec *irec, + xfs_fsblock_t new_startblock, xfs_extlen_t new_blockcount), + TP_ARGS(ip, irec, new_startblock, 
new_blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(xfs_fsblock_t, new_startblock) + __field(xfs_extlen_t, new_blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = irec->br_startoff; + __entry->startblock = irec->br_startblock; + __entry->blockcount = irec->br_blockcount; + __entry->state = irec->br_state; + __entry->new_startblock = new_startblock; + __entry->new_blockcount = new_blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx startoff 0x%llx startblock 0x%llx fsbcount 0x%llx state 0x%x new_startblock 0x%llx new_fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->new_startblock, + __entry->new_blockcount) +); + +TRACE_EVENT(xrep_cow_free_staging, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t blockcount), + TP_ARGS(pag, agbno, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, blockcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->blockcount) +); + +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xrep_dquot_class, + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), + TP_ARGS(mp, type, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint8_t, type) + __field(uint32_t, id) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->id = id; + __entry->type = type; + ), + 
TP_printk("dev %d:%d type %s id 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __entry->id) +); + +#define DEFINE_XREP_DQUOT_EVENT(name) \ +DEFINE_EVENT(xrep_dquot_class, name, \ + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), \ + TP_ARGS(mp, type, id)) +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item); +DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot); +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole); +DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot); +#endif /* CONFIG_XFS_QUOTA */ + +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode); +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); + +TRACE_EVENT(xrep_rmap_live_update, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op, + const struct xfs_rmap_update_params *p), + TP_ARGS(mp, agno, op, p), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, op) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->op = op; + __entry->agbno = p->startblock; + __entry->len = p->blockcount; + xfs_owner_info_unpack(&p->oinfo, &__entry->owner, + &__entry->offset, &__entry->flags); + if (p->unwritten) + __entry->flags |= XFS_RMAP_UNWRITTEN; + ), + TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->op, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index f0f532c10a5a..17c982a4821d 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -16,7 +16,7 @@ * Large Arrays of 
Fixed-Size Records * ================================== * - * This memory array uses an xfile (which itself is a memfd "file") to store + * This memory array uses an xfile (which itself is a shmem file) to store * large numbers of fixed-size records in memory that can be paged out. This * puts less stress on the memory reclaim algorithms during an online repair * because we don't have to pin so much memory. However, array access is less @@ -136,7 +136,7 @@ xfarray_load( if (idx >= array->nr) return -ENODATA; - return xfile_obj_load(array->xfile, ptr, array->obj_size, + return xfile_load(array->xfile, ptr, array->obj_size, xfarray_pos(array, idx)); } @@ -152,7 +152,7 @@ xfarray_is_unset( if (array->unset_slots == 0) return false; - error = xfile_obj_load(array->xfile, temp, array->obj_size, pos); + error = xfile_load(array->xfile, temp, array->obj_size, pos); if (!error && xfarray_element_is_null(array, temp)) return true; @@ -184,7 +184,7 @@ xfarray_unset( return 0; memset(temp, 0, array->obj_size); - error = xfile_obj_store(array->xfile, temp, array->obj_size, pos); + error = xfile_store(array->xfile, temp, array->obj_size, pos); if (error) return error; @@ -209,7 +209,7 @@ xfarray_store( ASSERT(!xfarray_element_is_null(array, ptr)); - ret = xfile_obj_store(array->xfile, ptr, array->obj_size, + ret = xfile_store(array->xfile, ptr, array->obj_size, xfarray_pos(array, idx)); if (ret) return ret; @@ -245,12 +245,12 @@ xfarray_store_anywhere( for (pos = 0; pos < endpos && array->unset_slots > 0; pos += array->obj_size) { - error = xfile_obj_load(array->xfile, temp, array->obj_size, + error = xfile_load(array->xfile, temp, array->obj_size, pos); if (error || !xfarray_element_is_null(array, temp)) continue; - error = xfile_obj_store(array->xfile, ptr, array->obj_size, + error = xfile_store(array->xfile, ptr, array->obj_size, pos); if (error) return error; @@ -552,7 +552,7 @@ xfarray_isort( trace_xfarray_isort(si, lo, hi); xfarray_sort_bump_loads(si); - error = 
xfile_obj_load(si->array->xfile, scratch, len, lo_pos); + error = xfile_load(si->array->xfile, scratch, len, lo_pos); if (error) return error; @@ -560,88 +560,45 @@ xfarray_isort( sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); xfarray_sort_bump_stores(si); - return xfile_obj_store(si->array->xfile, scratch, len, lo_pos); + return xfile_store(si->array->xfile, scratch, len, lo_pos); } -/* Grab a page for sorting records. */ -static inline int -xfarray_sort_get_page( - struct xfarray_sortinfo *si, - loff_t pos, - uint64_t len) -{ - int error; - - error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage); - if (error) - return error; - - /* - * xfile pages must never be mapped into userspace, so we skip the - * dcache flush when mapping the page. - */ - si->page_kaddr = kmap_local_page(si->xfpage.page); - return 0; -} - -/* Release a page we grabbed for sorting records. */ -static inline int -xfarray_sort_put_page( - struct xfarray_sortinfo *si) -{ - if (!si->page_kaddr) - return 0; - - kunmap_local(si->page_kaddr); - si->page_kaddr = NULL; - - return xfile_put_page(si->array->xfile, &si->xfpage); -} - -/* Decide if these records are eligible for in-page sorting. */ -static inline bool -xfarray_want_pagesort( - struct xfarray_sortinfo *si, - xfarray_idx_t lo, - xfarray_idx_t hi) -{ - pgoff_t lo_page; - pgoff_t hi_page; - loff_t end_pos; - - /* We can only map one page at a time. */ - lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT; - end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1; - hi_page = end_pos >> PAGE_SHIFT; - - return lo_page == hi_page; -} - -/* Sort a bunch of records that all live in the same memory page. */ +/* + * Sort the records from lo to hi (inclusive) if they are all backed by the + * same memory folio. Returns 1 if it sorted, 0 if it did not, or a negative + * errno. 
+ */ STATIC int -xfarray_pagesort( +xfarray_foliosort( struct xfarray_sortinfo *si, xfarray_idx_t lo, xfarray_idx_t hi) { + struct folio *folio; void *startp; loff_t lo_pos = xfarray_pos(si->array, lo); - uint64_t len = xfarray_pos(si->array, hi - lo); - int error = 0; + uint64_t len = xfarray_pos(si->array, hi - lo + 1); - trace_xfarray_pagesort(si, lo, hi); + /* No single folio could back this many records. */ + if (len > XFILE_MAX_FOLIO_SIZE) + return 0; xfarray_sort_bump_loads(si); - error = xfarray_sort_get_page(si, lo_pos, len); - if (error) - return error; + folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC); + if (IS_ERR(folio)) + return PTR_ERR(folio); + if (!folio) + return 0; + + trace_xfarray_foliosort(si, lo, hi); xfarray_sort_bump_heapsorts(si); - startp = si->page_kaddr + offset_in_page(lo_pos); + startp = folio_address(folio) + offset_in_folio(folio, lo_pos); sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); xfarray_sort_bump_stores(si); - return xfarray_sort_put_page(si); + xfile_put_folio(si->array->xfile, folio); + return 1; } /* Return a pointer to the xfarray pivot record within the sortinfo struct. */ @@ -829,63 +786,78 @@ xfarray_qsort_push( return 0; } +static inline void +xfarray_sort_scan_done( + struct xfarray_sortinfo *si) +{ + if (si->folio) + xfile_put_folio(si->array->xfile, si->folio); + si->folio = NULL; +} + /* - * Load an element from the array into the first scratchpad and cache the page, - * if possible. + * Cache the folio backing the start of the given array element. If the array + * element is contained entirely within the folio, return a pointer to the + * cached folio. Otherwise, load the element into the scratchpad and return a + * pointer to the scratchpad. 
*/ static inline int -xfarray_sort_load_cached( +xfarray_sort_scan( struct xfarray_sortinfo *si, xfarray_idx_t idx, - void *ptr) + void **ptrp) { loff_t idx_pos = xfarray_pos(si->array, idx); - pgoff_t startpage; - pgoff_t endpage; int error = 0; - /* - * If this load would split a page, release the cached page, if any, - * and perform a traditional read. - */ - startpage = idx_pos >> PAGE_SHIFT; - endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT; - if (startpage != endpage) { - error = xfarray_sort_put_page(si); - if (error) - return error; + if (xfarray_sort_terminated(si, &error)) + return error; - if (xfarray_sort_terminated(si, &error)) - return error; + trace_xfarray_sort_scan(si, idx); - return xfile_obj_load(si->array->xfile, ptr, - si->array->obj_size, idx_pos); - } + /* If the cached folio doesn't cover this index, release it. */ + if (si->folio && + (idx < si->first_folio_idx || idx > si->last_folio_idx)) + xfarray_sort_scan_done(si); - /* If the cached page is not the one we want, release it. */ - if (xfile_page_cached(&si->xfpage) && - xfile_page_index(&si->xfpage) != startpage) { - error = xfarray_sort_put_page(si); - if (error) - return error; + /* Grab the first folio that backs this array element. */ + if (!si->folio) { + loff_t next_pos; + + si->folio = xfile_get_folio(si->array->xfile, idx_pos, + si->array->obj_size, XFILE_ALLOC); + if (IS_ERR(si->folio)) + return PTR_ERR(si->folio); + + si->first_folio_idx = xfarray_idx(si->array, + folio_pos(si->folio) + si->array->obj_size - 1); + + next_pos = folio_pos(si->folio) + folio_size(si->folio); + si->last_folio_idx = xfarray_idx(si->array, next_pos - 1); + if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos) + si->last_folio_idx--; + + trace_xfarray_sort_scan(si, idx); } /* - * If we don't have a cached page (and we know the load is contained - * in a single page) then grab it. + * If this folio still doesn't cover the desired element, it must cross + * a folio boundary. 
Read into the scratchpad and we're done. */ - if (!xfile_page_cached(&si->xfpage)) { - if (xfarray_sort_terminated(si, &error)) - return error; + if (idx < si->first_folio_idx || idx > si->last_folio_idx) { + void *temp = xfarray_scratch(si->array); - error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT, - PAGE_SIZE); + error = xfile_load(si->array->xfile, temp, si->array->obj_size, + idx_pos); if (error) return error; + + *ptrp = temp; + return 0; } - memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos), - si->array->obj_size); + /* Otherwise return a pointer to the array element in the folio. */ + *ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos); return 0; } @@ -952,6 +924,8 @@ xfarray_sort( pivot = xfarray_sortinfo_pivot(si); while (si->stack_depth >= 0) { + int ret; + lo = si_lo[si->stack_depth]; hi = si_hi[si->stack_depth]; @@ -964,13 +938,13 @@ xfarray_sort( } /* - * If directly mapping the page and sorting can solve our + * If directly mapping the folio and sorting can solve our * problems, we're done. */ - if (xfarray_want_pagesort(si, lo, hi)) { - error = xfarray_pagesort(si, lo, hi); - if (error) - goto out_free; + ret = xfarray_foliosort(si, lo, hi); + if (ret < 0) + goto out_free; + if (ret == 1) { si->stack_depth--; continue; } @@ -995,25 +969,24 @@ xfarray_sort( * than the pivot is on the right side of the range. */ while (lo < hi) { + void *p; + /* * Decrement hi until it finds an a[hi] less than the * pivot value. 
*/ - error = xfarray_sort_load_cached(si, hi, scratch); + error = xfarray_sort_scan(si, hi, &p); if (error) goto out_free; - while (xfarray_sort_cmp(si, scratch, pivot) >= 0 && - lo < hi) { + while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) { hi--; - error = xfarray_sort_load_cached(si, hi, - scratch); + error = xfarray_sort_scan(si, hi, &p); if (error) goto out_free; } - error = xfarray_sort_put_page(si); - if (error) - goto out_free; - + if (p != scratch) + memcpy(scratch, p, si->array->obj_size); + xfarray_sort_scan_done(si); if (xfarray_sort_terminated(si, &error)) goto out_free; @@ -1028,21 +1001,18 @@ xfarray_sort( * Increment lo until it finds an a[lo] greater than * the pivot value. */ - error = xfarray_sort_load_cached(si, lo, scratch); + error = xfarray_sort_scan(si, lo, &p); if (error) goto out_free; - while (xfarray_sort_cmp(si, scratch, pivot) <= 0 && - lo < hi) { + while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) { lo++; - error = xfarray_sort_load_cached(si, lo, - scratch); + error = xfarray_sort_scan(si, lo, &p); if (error) goto out_free; } - error = xfarray_sort_put_page(si); - if (error) - goto out_free; - + if (p != scratch) + memcpy(scratch, p, si->array->obj_size); + xfarray_sort_scan_done(si); if (xfarray_sort_terminated(si, &error)) goto out_free; diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index 4ecac01363d9..acb2f94c56c1 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -45,6 +45,25 @@ int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); int xfarray_store_anywhere(struct xfarray *array, const void *ptr); bool xfarray_element_is_null(struct xfarray *array, const void *ptr); +/* + * Load an array element, but zero the buffer if there's no data because we + * haven't stored to that array element yet. 
+ */ +static inline int +xfarray_load_sparse( + struct xfarray *array, + uint64_t idx, + void *rec) +{ + int error = xfarray_load(array, idx, rec); + + if (error == -ENODATA) { + memset(rec, 0, array->obj_size); + return 0; + } + return error; +} + /* Append an element to the array. */ static inline int xfarray_append(struct xfarray *array, const void *ptr) { @@ -54,6 +73,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr) uint64_t xfarray_length(struct xfarray *array); int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); +/* + * Iterate the non-null elements in a sparse xfarray. Callers should + * initialize *idx to XFARRAY_CURSOR_INIT before the first call; on return, it + * will be set to one more than the index of the record that was retrieved. + * Returns 1 if a record was retrieved, 0 if there weren't any more records, or + * a negative errno. + */ +static inline int +xfarray_iter( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + int ret = xfarray_load_next(array, idx, rec); + + if (ret == -ENODATA) + return 0; + if (ret == 0) + return 1; + return ret; +} + /* Declarations for xfile array sort functionality. */ typedef cmp_func_t xfarray_cmp_fn; @@ -83,9 +124,14 @@ struct xfarray_sortinfo { /* XFARRAY_SORT_* flags; see below. */ unsigned int flags; - /* Cache a page here for faster access. */ - struct xfile_page xfpage; - void *page_kaddr; + /* Cache a folio here for faster scanning for pivots */ + struct folio *folio; + + /* First array index in folio that is completely readable */ + xfarray_idx_t first_folio_idx; + + /* Last array index in folio that is completely readable */ + xfarray_idx_t last_folio_idx; #ifdef DEBUG /* Performance statistics. 
*/ diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 090c3ead43fd..8cdd863db585 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -34,13 +34,6 @@ * xfiles assume that the caller will handle all required concurrency * management; standard vfs locks (freezer and inode) are not taken. Reads * and writes are satisfied directly from the page cache. - * - * NOTE: The current shmemfs implementation has a quirk that in-kernel reads - * of a hole cause a page to be mapped into the file. If you are going to - * create a sparse xfile, please be careful about reading from uninitialized - * parts of the file. These pages are !Uptodate and will eventually be - * reclaimed if not written, but in the short term this boosts memory - * consumption. */ /* @@ -62,38 +55,27 @@ xfile_create( { struct inode *inode; struct xfile *xf; - int error = -ENOMEM; + int error; xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); if (!xf) return -ENOMEM; - xf->file = shmem_file_setup(description, isize, 0); - if (!xf->file) - goto out_xfile; + xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE); if (IS_ERR(xf->file)) { error = PTR_ERR(xf->file); goto out_xfile; } - /* - * We want a large sparse file that we can pread, pwrite, and seek. - * xfile users are responsible for keeping the xfile hidden away from - * all other callers, so we skip timestamp updates and security checks. - * Make the inode only accessible by root, just in case the xfile ever - * escapes. - */ - xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME | - FMODE_LSEEK; - xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME; inode = file_inode(xf->file); - inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME; - inode->i_mode &= ~0177; - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); + /* + * We don't want to bother with kmapping data during repair, so don't + * allow highmem pages to back this mapping. 
+ */ + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); + trace_xfile_create(xf); *xfilep = xf; @@ -118,164 +100,128 @@ xfile_destroy( } /* - * Read a memory object directly from the xfile's page cache. Unlike regular - * pread, we return -E2BIG and -EFBIG for reads that are too large or at too - * high an offset, instead of truncating the read. Otherwise, we return - * bytes read or an error code, like regular pread. + * Load an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. */ -ssize_t -xfile_pread( +int +xfile_load( struct xfile *xf, void *buf, size_t count, loff_t pos) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - struct page *page = NULL; - ssize_t read = 0; unsigned int pflags; - int error = 0; if (count > MAX_RW_COUNT) - return -E2BIG; + return -ENOMEM; if (inode->i_sb->s_maxbytes - pos < count) - return -EFBIG; + return -ENOMEM; - trace_xfile_pread(xf, pos, count); + trace_xfile_load(xf, pos, count); pflags = memalloc_nofs_save(); while (count > 0) { - void *p, *kaddr; + struct folio *folio; unsigned int len; + unsigned int offset; - len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - - /* - * In-kernel reads of a shmem file cause it to allocate a page - * if the mapping shows a hole. Therefore, if we hit ENOMEM - * we can continue by zeroing the caller's buffer. - */ - page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT, - __GFP_NOWARN); - if (IS_ERR(page)) { - error = PTR_ERR(page); - if (error != -ENOMEM) - break; - - memset(buf, 0, len); - goto advance; - } - - if (PageUptodate(page)) { + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + SGP_READ) < 0) + break; + if (!folio) { /* - * xfile pages must never be mapped into userspace, so - * we skip the dcache flush. + * No data stored at this offset, just zero the output + * buffer until the next page boundary. 
*/ - kaddr = kmap_local_page(page); - p = kaddr + offset_in_page(pos); - memcpy(buf, p, len); - kunmap_local(kaddr); - } else { + len = min_t(ssize_t, count, + PAGE_SIZE - offset_in_page(pos)); memset(buf, 0, len); - } - put_page(page); + } else { + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); + break; + } + + offset = offset_in_folio(folio, pos); + len = min_t(ssize_t, count, folio_size(folio) - offset); + memcpy(buf, folio_address(folio) + offset, len); -advance: + folio_unlock(folio); + folio_put(folio); + } count -= len; pos += len; buf += len; - read += len; } memalloc_nofs_restore(pflags); - if (read > 0) - return read; - return error; + if (count) + return -ENOMEM; + return 0; } /* - * Write a memory object directly to the xfile's page cache. Unlike regular - * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too - * high an offset, instead of truncating the write. Otherwise, we return - * bytes written or an error code, like regular pwrite. + * Store an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. */ -ssize_t -xfile_pwrite( +int +xfile_store( struct xfile *xf, const void *buf, size_t count, loff_t pos) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - struct page *page = NULL; - ssize_t written = 0; unsigned int pflags; - int error = 0; if (count > MAX_RW_COUNT) - return -E2BIG; + return -ENOMEM; if (inode->i_sb->s_maxbytes - pos < count) - return -EFBIG; + return -ENOMEM; - trace_xfile_pwrite(xf, pos, count); + trace_xfile_store(xf, pos, count); + + /* + * Increase the file size first so that shmem_get_folio(..., SGP_CACHE), + * actually allocates a folio instead of erroring out. 
+ */ + if (pos + count > i_size_read(inode)) + i_size_write(inode, pos + count); pflags = memalloc_nofs_save(); while (count > 0) { - void *fsdata = NULL; - void *p, *kaddr; + struct folio *folio; unsigned int len; - int ret; - - len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - - /* - * We call write_begin directly here to avoid all the freezer - * protection lock-taking that happens in the normal path. - * shmem doesn't support fs freeze, but lockdep doesn't know - * that and will trip over that. - */ - error = aops->write_begin(NULL, mapping, pos, len, &page, - &fsdata); - if (error) - break; + unsigned int offset; - /* - * xfile pages must never be mapped into userspace, so we skip - * the dcache flush. If the page is not uptodate, zero it - * before writing data. - */ - kaddr = kmap_local_page(page); - if (!PageUptodate(page)) { - memset(kaddr, 0, PAGE_SIZE); - SetPageUptodate(page); - } - p = kaddr + offset_in_page(pos); - memcpy(p, buf, len); - kunmap_local(kaddr); - - ret = aops->write_end(NULL, mapping, pos, len, len, page, - fsdata); - if (ret < 0) { - error = ret; + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + SGP_CACHE) < 0) + break; + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); break; } - written += ret; - if (ret != len) - break; + offset = offset_in_folio(folio, pos); + len = min_t(ssize_t, count, folio_size(folio) - offset); + memcpy(folio_address(folio) + offset, buf, len); + + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); - count -= ret; - pos += ret; - buf += ret; + count -= len; + pos += len; + buf += len; } memalloc_nofs_restore(pflags); - if (written > 0) - return written; - return error; + if (count) + return -ENOMEM; + return 0; } /* Find the next written area in the xfile data for a given offset. */ @@ -291,129 +237,76 @@ xfile_seek_data( return ret; } -/* Query stat information for an xfile. 
*/ -int -xfile_stat( - struct xfile *xf, - struct xfile_stat *statbuf) -{ - struct kstat ks; - int error; - - error = vfs_getattr_nosec(&xf->file->f_path, &ks, - STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC); - if (error) - return error; - - statbuf->size = ks.size; - statbuf->bytes = ks.blocks << SECTOR_SHIFT; - return 0; -} - /* - * Grab the (locked) page for a memory object. The object cannot span a page - * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we - * cannot grab the page, or the usual negative errno. + * Grab the (locked) folio for a memory object. The object cannot span a folio + * boundary. Returns the locked folio if successful, NULL if there was no + * folio or it didn't cover the range requested, or an ERR_PTR on failure. */ -int -xfile_get_page( +struct folio * +xfile_get_folio( struct xfile *xf, loff_t pos, - unsigned int len, - struct xfile_page *xfpage) + size_t len, + unsigned int flags) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - struct page *page = NULL; - void *fsdata = NULL; - loff_t key = round_down(pos, PAGE_SIZE); + struct folio *folio = NULL; unsigned int pflags; int error; if (inode->i_sb->s_maxbytes - pos < len) - return -ENOMEM; - if (len > PAGE_SIZE - offset_in_page(pos)) - return -ENOTBLK; - - trace_xfile_get_page(xf, pos, len); + return ERR_PTR(-ENOMEM); - pflags = memalloc_nofs_save(); + trace_xfile_get_folio(xf, pos, len); /* - * We call write_begin directly here to avoid all the freezer - * protection lock-taking that happens in the normal path. shmem - * doesn't support fs freeze, but lockdep doesn't know that and will - * trip over that. + * Increase the file size first so that shmem_get_folio(..., SGP_CACHE), + * actually allocates a folio instead of erroring out. 
*/ - error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page, - &fsdata); + if ((flags & XFILE_ALLOC) && pos + len > i_size_read(inode)) + i_size_write(inode, pos + len); + + pflags = memalloc_nofs_save(); + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + (flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ); + memalloc_nofs_restore(pflags); if (error) - goto out_pflags; + return ERR_PTR(error); - /* We got the page, so make sure we push out EOF. */ - if (i_size_read(inode) < pos + len) - i_size_write(inode, pos + len); + if (!folio) + return NULL; - /* - * If the page isn't up to date, fill it with zeroes before we hand it - * to the caller and make sure the backing store will hold on to them. - */ - if (!PageUptodate(page)) { - void *kaddr; + if (len > folio_size(folio) - offset_in_folio(folio, pos)) { + folio_unlock(folio); + folio_put(folio); + return NULL; + } - kaddr = kmap_local_page(page); - memset(kaddr, 0, PAGE_SIZE); - kunmap_local(kaddr); - SetPageUptodate(page); + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(-EIO); } /* - * Mark each page dirty so that the contents are written to some - * backing store when we drop this buffer, and take an extra reference - * to prevent the xfile page from being swapped or removed from the - * page cache by reclaim if the caller unlocks the page. + * Mark the folio dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xfile_put_folio. */ - set_page_dirty(page); - get_page(page); - - xfpage->page = page; - xfpage->fsdata = fsdata; - xfpage->pos = key; -out_pflags: - memalloc_nofs_restore(pflags); - return error; + if (flags & XFILE_ALLOC) + folio_set_dirty(folio); + return folio; } /* - * Release the (locked) page for a memory object. Returns 0 or a negative - * errno. + * Release the (locked) folio for a memory object. 
*/ -int -xfile_put_page( +void +xfile_put_folio( struct xfile *xf, - struct xfile_page *xfpage) + struct folio *folio) { - struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - unsigned int pflags; - int ret; - - trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE); - - /* Give back the reference that we took in xfile_get_page. */ - put_page(xfpage->page); + trace_xfile_put_folio(xf, folio_pos(folio), folio_size(folio)); - pflags = memalloc_nofs_save(); - ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE, - xfpage->page, xfpage->fsdata); - memalloc_nofs_restore(pflags); - memset(xfpage, 0, sizeof(struct xfile_page)); - - if (ret < 0) - return ret; - if (ret != PAGE_SIZE) - return -EIO; - return 0; + folio_unlock(folio); + folio_put(folio); } diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index d56643b0f429..76d78dba7e34 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -6,22 +6,6 @@ #ifndef __XFS_SCRUB_XFILE_H__ #define __XFS_SCRUB_XFILE_H__ -struct xfile_page { - struct page *page; - void *fsdata; - loff_t pos; -}; - -static inline bool xfile_page_cached(const struct xfile_page *xfpage) -{ - return xfpage->page != NULL; -} - -static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage) -{ - return xfpage->page->index; -} - struct xfile { struct file *file; }; @@ -29,49 +13,17 @@ struct xfile { int xfile_create(const char *description, loff_t isize, struct xfile **xfilep); void xfile_destroy(struct xfile *xf); -ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos); -ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count, +int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos); +int xfile_store(struct xfile *xf, const void *buf, size_t count, loff_t pos); -/* - * Load an object. 
Since we're treating this file as "memory", any error or - * short IO is treated as a failure to allocate memory. - */ -static inline int -xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos) -{ - ssize_t ret = xfile_pread(xf, buf, count, pos); - - if (ret < 0 || ret != count) - return -ENOMEM; - return 0; -} - -/* - * Store an object. Since we're treating this file as "memory", any error or - * short IO is treated as a failure to allocate memory. - */ -static inline int -xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos) -{ - ssize_t ret = xfile_pwrite(xf, buf, count, pos); - - if (ret < 0 || ret != count) - return -ENOMEM; - return 0; -} - loff_t xfile_seek_data(struct xfile *xf, loff_t pos); -struct xfile_stat { - loff_t size; - unsigned long long bytes; -}; - -int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf); +#define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER) -int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, - struct xfile_page *xbuf); -int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf); +#define XFILE_ALLOC (1 << 0) /* allocate folio if not present */ +struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len, + unsigned int flags); +void xfile_put_folio(struct xfile *xf, struct folio *folio); #endif /* __XFS_SCRUB_XFILE_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 6b840301817a..4bf69c9c088e 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -167,7 +167,7 @@ xfs_get_acl(struct inode *inode, int type, bool rcu) acl = ERR_PTR(error); } - kmem_free(args.value); + kvfree(args.value); return acl; } @@ -204,7 +204,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) } error = xfs_attr_change(&args); - kmem_free(args.value); + kvfree(args.value); /* * If the attribute didn't exist to start with that's fine. 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 465d7630bb21..3f428620ebf2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -112,7 +112,7 @@ xfs_end_ioend( * longer dirty. If we don't remove delalloc blocks here, they become * stale and can corrupt free space accounting on unmount. */ - error = blk_status_to_errno(ioend->io_bio->bi_status); + error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); @@ -179,7 +179,7 @@ STATIC void xfs_end_bio( struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); struct xfs_inode *ip = XFS_I(ioend->io_inode); unsigned long flags; @@ -276,7 +276,8 @@ static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, + unsigned int len) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -444,7 +445,7 @@ xfs_prepare_ioend( /* send ioends that might require a transaction to the completion wq */ if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || (ioend->io_flags & IOMAP_F_SHARED)) - ioend->io_bio->bi_end_io = xfs_end_bio; + ioend->io_bio.bi_end_io = xfs_end_bio; return status; } @@ -502,13 +503,6 @@ xfs_vm_writepages( { struct xfs_writepage_ctx wpc = { }; - /* - * Writing back data in a transaction context can result in recursive - * transactions. This is bad, so issue a warning and get out of here. 
- */ - if (WARN_ON_ONCE(current->journal_info)) - return 0; - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } @@ -584,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = { .bmap = xfs_vm_bmap, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 89c7a9f4f930..24fb12986a56 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -23,6 +23,7 @@ #include "xfs_quota.h" #include "xfs_dir2.h" #include "xfs_error.h" +#include "xfs_health.h" /* * Invalidate any incore buffers associated with this remote attribute value @@ -147,6 +148,7 @@ xfs_attr3_node_inactive( if (level > XFS_DA_NODE_MAXDEPTH) { xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -197,6 +199,7 @@ xfs_attr3_node_inactive( default: xfs_buf_mark_corrupt(child_bp); xfs_trans_brelse(*trans, child_bp); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); error = -EFSCORRUPTED; break; } @@ -286,6 +289,7 @@ xfs_attr3_root_inactive( error = xfs_attr3_leaf_inactive(trans, dp, bp); break; default: + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); error = -EFSCORRUPTED; xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 36fe2abb16e6..9b4c61e1c22e 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -33,8 +33,6 @@ struct kmem_cache *xfs_attrd_cache; static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; -static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, - struct 
xfs_attri_log_item *attrip); static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) { @@ -110,7 +108,7 @@ STATIC void xfs_attri_item_free( struct xfs_attri_log_item *attrip) { - kmem_free(attrip->attri_item.li_lv_shadow); + kvfree(attrip->attri_item.li_lv_shadow); xfs_attri_log_nameval_put(attrip->attri_nameval); kmem_cache_free(xfs_attri_cache, attrip); } @@ -228,7 +226,7 @@ xfs_attri_init( { struct xfs_attri_log_item *attrip; - attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL); + attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_KERNEL | __GFP_NOFAIL); /* * Grab an extra reference to the name/value buffer for this log item. @@ -253,7 +251,7 @@ static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip) STATIC void xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp) { - kmem_free(attrdp->attrd_item.li_lv_shadow); + kvfree(attrdp->attrd_item.li_lv_shadow); kmem_cache_free(xfs_attrd_cache, attrdp); } @@ -310,47 +308,6 @@ xfs_attrd_item_intent( return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; } -/* - * Performs one step of an attribute update intent and marks the attrd item - * dirty.. An attr operation may be a set or a remove. Note that the - * transaction is marked dirty regardless of whether the operation succeeds or - * fails to support the ATTRI/ATTRD lifecycle rules. - */ -STATIC int -xfs_xattri_finish_update( - struct xfs_attr_intent *attr, - struct xfs_attrd_log_item *attrdp) -{ - struct xfs_da_args *args = attr->xattri_da_args; - int error; - - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { - error = -EIO; - goto out; - } - - error = xfs_attr_set_iter(attr); - if (!error && attr->xattri_dela_state != XFS_DAS_DONE) - error = -EAGAIN; -out: - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the ATTRI and frees the ATTRD - * 2.) 
shuts down the filesystem - */ - args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - - /* - * attr intent/done items are null when logged attributes are disabled - */ - if (attrdp) - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - - return error; -} - /* Log an attr to the intent item. */ STATIC void xfs_attr_log_item( @@ -360,9 +317,6 @@ xfs_attr_log_item( { struct xfs_attri_log_format *attrp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); - /* * At this point the xfs_attr_intent has been constructed, and we've * created the log intent. Fill in the attri log item and log format @@ -419,7 +373,6 @@ xfs_attr_create_intent( } attrip = xfs_attri_init(mp, attr->xattri_nameval); - xfs_trans_add_item(tp, &attrip->attri_item); xfs_attr_log_item(tp, attrip, attr); return &attrip->attri_item; @@ -433,11 +386,16 @@ xfs_attr_free_item( xfs_da_state_free(attr->xattri_da_state); xfs_attri_log_nameval_put(attr->xattri_nameval); if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY) - kmem_free(attr); + kfree(attr); else kmem_cache_free(xfs_attr_intent_cache, attr); } +static inline struct xfs_attr_intent *attri_entry(const struct list_head *e) +{ + return list_entry(e, struct xfs_attr_intent, xattri_list); +} + /* Process an attr. 
*/ STATIC int xfs_attr_finish_item( @@ -446,24 +404,33 @@ xfs_attr_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_attr_intent *attr; - struct xfs_attrd_log_item *done_item = NULL; + struct xfs_attr_intent *attr = attri_entry(item); + struct xfs_da_args *args; int error; - attr = container_of(item, struct xfs_attr_intent, xattri_list); - if (done) - done_item = ATTRD_ITEM(done); + args = attr->xattri_da_args; - /* - * Always reset trans after EAGAIN cycle - * since the transaction is new - */ - attr->xattri_da_args->trans = tp; + /* Reset trans after EAGAIN cycle since the transaction is new */ + args->trans = tp; - error = xfs_xattri_finish_update(attr, done_item); - if (error != -EAGAIN) - xfs_attr_free_item(attr); + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + error = -EIO; + goto out; + } + + /* If an attr removal is trivially complete, we're done. */ + if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && + !xfs_inode_hasattr(args->dp)) { + error = 0; + goto out; + } + error = xfs_attr_set_iter(attr); + if (!error && attr->xattri_dela_state != XFS_DAS_DONE) + return -EAGAIN; + +out: + xfs_attr_free_item(attr); return error; } @@ -480,9 +447,8 @@ STATIC void xfs_attr_cancel_item( struct list_head *item) { - struct xfs_attr_intent *attr; + struct xfs_attr_intent *attr = attri_entry(item); - attr = container_of(item, struct xfs_attr_intent, xattri_list); xfs_attr_free_item(attr); } @@ -532,44 +498,25 @@ xfs_attri_validate( return xfs_verify_ino(mp, attrp->alfi_ino); } -/* - * Process an attr intent item that was recovered from the log. We need to - * delete the attr that it describes. 
- */ -STATIC int -xfs_attri_item_recover( - struct xfs_log_item *lip, - struct list_head *capture_list) +static inline struct xfs_attr_intent * +xfs_attri_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_attri_log_format *attrp, + struct xfs_inode **ipp, + struct xfs_attri_log_nameval *nv) { - struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attr_intent *attr; - struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_inode *ip; struct xfs_da_args *args; - struct xfs_trans *tp; - struct xfs_trans_res resv; - struct xfs_attri_log_format *attrp; - struct xfs_attri_log_nameval *nv = attrip->attri_nameval; - int error; - int total; int local; - struct xfs_attrd_log_item *done_item = NULL; - - /* - * First check the validity of the attr described by the ATTRI. If any - * are bad, then assume that all are bad and just toss the ATTRI. - */ - attrp = &attrip->attri_format; - if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) - return -EFSCORRUPTED; + int error; - error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + error = xlog_recover_iget(mp, attrp->alfi_ino, ipp); if (error) - return error; + return ERR_PTR(error); - attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + - sizeof(struct xfs_da_args), KM_NOFS); + attr = kzalloc(sizeof(struct xfs_attr_intent) + + sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL); args = (struct xfs_da_args *)(attr + 1); attr->xattri_da_args = args; @@ -584,7 +531,7 @@ xfs_attri_item_recover( attr->xattri_nameval = xfs_attri_log_nameval_get(nv); ASSERT(attr->xattri_nameval); - args->dp = ip; + args->dp = *ipp; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; args->name = nv->name.i_addr; @@ -608,43 +555,65 @@ xfs_attri_item_recover( attr->xattri_dela_state = xfs_attr_init_add_state(args); break; case XFS_ATTRI_OP_FLAGS_REMOVE: - if (!xfs_inode_hasattr(args->dp)) - goto out; attr->xattri_dela_state = xfs_attr_init_remove_state(args); 
break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - goto out; } + xfs_defer_add_item(dfp, &attr->xattri_list); + return attr; +} + +/* + * Process an attr intent item that was recovered from the log. We need to + * delete the attr that it describes. + */ +STATIC int +xfs_attr_recover_work( + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + struct xfs_log_item *lip = dfp->dfp_intent; + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attr_intent *attr; + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_inode *ip; + struct xfs_da_args *args; + struct xfs_trans *tp; + struct xfs_trans_res resv; + struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + int error; + int total; + + /* + * First check the validity of the attr described by the ATTRI. If any + * are bad, then assume that all are bad and just toss the ATTRI. + */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) + return -EFSCORRUPTED; + + attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); + if (IS_ERR(attr)) + return PTR_ERR(attr); + args = attr->xattri_da_args; + xfs_init_attr_trans(args, &resv, &total); resv = xlog_recover_resv(&resv); error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) - goto out; - + return error; args->trans = tp; - done_item = xfs_trans_get_attrd(tp, attrip); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_xattri_finish_update(attr, done_item); - if (error == -EAGAIN) { - /* - * There's more work to do, so add the intent item to this - * transaction so that we can continue it later. 
- */ - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); - error = xfs_defer_ops_capture_and_commit(tp, capture_list); - if (error) - goto out_unlock; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - return 0; - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &attrip->attri_format, + sizeof(attrip->attri_format)); if (error) { xfs_trans_cancel(tp); goto out_unlock; @@ -654,18 +623,16 @@ xfs_attri_item_recover( out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); -out: - xfs_attr_free_item(attr); return error; } /* Re-log an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_attri_item_relog( +xfs_attr_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_attrd_log_item *attrdp; struct xfs_attri_log_item *old_attrip; struct xfs_attri_log_item *new_attrip; struct xfs_attri_log_format *new_attrp; @@ -674,10 +641,6 @@ xfs_attri_item_relog( old_attrip = ATTRI_ITEM(intent); old_attrp = &old_attrip->attri_format; - tp->t_flags |= XFS_TRANS_DIRTY; - attrdp = xfs_trans_get_attrd(tp, old_attrip); - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - /* * Create a new log item that shares the same name/value buffer as the * old log item. @@ -691,12 +654,43 @@ xfs_attri_item_relog( new_attrp->alfi_name_len = old_attrp->alfi_name_len; new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; - xfs_trans_add_item(tp, &new_attrip->attri_item); - set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); - return &new_attrip->attri_item; } +/* Get an ATTRD so we can process all the attrs. 
*/ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + struct xfs_attri_log_item *attrip; + struct xfs_attrd_log_item *attrdp; + + attrip = ATTRI_ITEM(intent); + + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_KERNEL | __GFP_NOFAIL); + + xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, + &xfs_attrd_item_ops); + attrdp->attrd_attrip = attrip; + attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; + + return &attrdp->attrd_item; +} + +const struct xfs_defer_op_type xfs_attr_defer_type = { + .name = "attr", + .max_items = 1, + .create_intent = xfs_attr_create_intent, + .abort_intent = xfs_attr_abort_intent, + .create_done = xfs_attr_create_done, + .finish_item = xfs_attr_finish_item, + .cancel_item = xfs_attr_cancel_item, + .recover_work = xfs_attr_recover_work, + .relog_intent = xfs_attr_relog_intent, +}; + STATIC int xlog_recover_attri_commit_pass2( struct xlog *log, @@ -767,63 +761,13 @@ xlog_recover_attri_commit_pass2( attrip = xfs_attri_init(mp, nv); memcpy(&attrip->attri_format, attri_formatp, len); - /* - * The ATTRI has two references. One for the ATTRD and one for ATTRI to - * ensure it makes it into the AIL. Insert the ATTRI into the AIL - * directly and drop the ATTRI reference. Note that - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); - xfs_attri_release(attrip); + xlog_recover_intent_item(log, &attrip->attri_item, lsn, + &xfs_attr_defer_type); xfs_attri_log_nameval_put(nv); return 0; } /* - * This routine is called to allocate an "attr free done" log item. 
- */ -static struct xfs_attrd_log_item * -xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip) -{ - struct xfs_attrd_log_item *attrdp; - - ASSERT(tp != NULL); - - attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); - - xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, - &xfs_attrd_item_ops); - attrdp->attrd_attrip = attrip; - attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; - - xfs_trans_add_item(tp, &attrdp->attrd_item); - return attrdp; -} - -/* Get an ATTRD so we can process all the attrs. */ -static struct xfs_log_item * -xfs_attr_create_done( - struct xfs_trans *tp, - struct xfs_log_item *intent, - unsigned int count) -{ - if (!intent) - return NULL; - - return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item; -} - -const struct xfs_defer_op_type xfs_attr_defer_type = { - .max_items = 1, - .create_intent = xfs_attr_create_intent, - .abort_intent = xfs_attr_abort_intent, - .create_done = xfs_attr_create_done, - .finish_item = xfs_attr_finish_item, - .cancel_item = xfs_attr_cancel_item, -}; - -/* * This routine is called when an ATTRD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding ATTRI if * it was still in the log. 
To do this it searches the AIL for the ATTRI with @@ -857,9 +801,7 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_format = xfs_attri_item_format, .iop_unpin = xfs_attri_item_unpin, .iop_release = xfs_attri_item_release, - .iop_recover = xfs_attri_item_recover, .iop_match = xfs_attri_item_match, - .iop_relog = xfs_attri_item_relog, }; const struct xlog_recover_item_ops xlog_attri_item_ops = { diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 99bbbe1a0e44..a6819a642cc0 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -22,6 +22,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_dir2.h" +#include "xfs_health.h" STATIC int xfs_attr_shortform_compare(const void *a, const void *b) @@ -56,14 +57,13 @@ xfs_attr_shortform_list( struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_inode *dp = context->dp; struct xfs_attr_sf_sort *sbuf, *sbp; - struct xfs_attr_shortform *sf; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; int sbsize, nsbuf, count, i; int error = 0; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; ASSERT(sf != NULL); - if (!sf->hdr.count) + if (!sf->count) return 0; trace_xfs_attr_list_sf(context); @@ -79,12 +79,14 @@ xfs_attr_shortform_list( */ if (context->bufsize == 0 || (XFS_ISRESET_CURSOR(cursor) && - (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) { - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + (dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) { + for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, !xfs_attr_namecheck(sfe->nameval, - sfe->namelen))) + sfe->namelen))) { + xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; + } context->put_listent(context, sfe->flags, sfe->nameval, @@ -109,15 +111,15 @@ xfs_attr_shortform_list( /* * It didn't all fit, so we have to sort everything on hashval. 
*/ - sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_NOFS); + sbsize = sf->count * sizeof(*sbuf); + sbp = sbuf = kmalloc(sbsize, GFP_KERNEL | __GFP_NOFAIL); /* * Scan the attribute list for the rest of the entries, storing * the relevant info from only those that match into a buffer. */ nsbuf = 0; - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (unlikely( ((char *)sfe < (char *)sf) || ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) { @@ -125,7 +127,8 @@ xfs_attr_shortform_list( XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe, sizeof(*sfe)); - kmem_free(sbuf); + kfree(sbuf); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -176,6 +179,7 @@ xfs_attr_shortform_list( if (XFS_IS_CORRUPT(context->dp->i_mount, !xfs_attr_namecheck(sbp->name, sbp->namelen))) { + xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); error = -EFSCORRUPTED; goto out; } @@ -189,7 +193,7 @@ xfs_attr_shortform_list( cursor->offset++; } out: - kmem_free(sbuf); + kfree(sbuf); return error; } @@ -263,8 +267,10 @@ xfs_attr_node_list_lookup( return 0; /* We can't point back to the root. 
*/ - if (XFS_IS_CORRUPT(mp, cursor->blkno == 0)) + if (XFS_IS_CORRUPT(mp, cursor->blkno == 0)) { + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; + } } if (expected_level != 0) @@ -276,6 +282,7 @@ xfs_attr_node_list_lookup( out_corruptbuf: xfs_buf_mark_corrupt(bp); xfs_trans_brelse(tp, bp); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -305,6 +312,8 @@ xfs_attr_node_list( if (cursor->blkno > 0) { error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp, XFS_ATTR_FORK); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); if ((error != 0) && (error != -EFSCORRUPTED)) return error; if (bp) { @@ -465,8 +474,10 @@ xfs_attr3_leaf_list_int( } if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(name, namelen))) + !xfs_attr_namecheck(name, namelen))) { + xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; + } context->put_listent(context, entry->flags, name, namelen, valuelen); if (context->seen_enough) @@ -505,7 +516,7 @@ xfs_attr_list_ilocked( { struct xfs_inode *dp = context->dp; - ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); /* * Decide on what work routines to call based on the inode size. 
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e736a0844c89..d27859a684aa 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -25,6 +25,7 @@ #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_ag.h" +#include "xfs_trace.h" struct kmem_cache *xfs_bui_cache; struct kmem_cache *xfs_bud_cache; @@ -40,7 +41,7 @@ STATIC void xfs_bui_item_free( struct xfs_bui_log_item *buip) { - kmem_free(buip->bui_item.li_lv_shadow); + kvfree(buip->bui_item.li_lv_shadow); kmem_cache_free(xfs_bui_cache, buip); } @@ -201,7 +202,7 @@ xfs_bud_item_release( struct xfs_bud_log_item *budp = BUD_ITEM(lip); xfs_bui_release(budp->bud_buip); - kmem_free(budp->bud_item.li_lv_shadow); + kvfree(budp->bud_item.li_lv_shadow); kmem_cache_free(xfs_bud_cache, budp); } @@ -221,49 +222,9 @@ static const struct xfs_item_ops xfs_bud_item_ops = { .iop_intent = xfs_bud_item_intent, }; -static struct xfs_bud_log_item * -xfs_trans_get_bud( - struct xfs_trans *tp, - struct xfs_bui_log_item *buip) +static inline struct xfs_bmap_intent *bi_entry(const struct list_head *e) { - struct xfs_bud_log_item *budp; - - budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, - &xfs_bud_item_ops); - budp->bud_buip = buip; - budp->bud_format.bud_bui_id = buip->bui_format.bui_id; - - xfs_trans_add_item(tp, &budp->bud_item); - return budp; -} - -/* - * Finish an bmap update and log it to the BUD. Note that the - * transaction is marked dirty regardless of whether the bmap update - * succeeds or fails to support the BUI/BUD lifecycle rules. - */ -static int -xfs_trans_log_finish_bmap_update( - struct xfs_trans *tp, - struct xfs_bud_log_item *budp, - struct xfs_bmap_intent *bi) -{ - int error; - - error = xfs_bmap_finish_one(tp, bi); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the BUI and frees the BUD - * 2.) 
shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - - return error; + return list_entry(e, struct xfs_bmap_intent, bi_list); } /* Sort bmap intents by inode. */ @@ -273,37 +234,12 @@ xfs_bmap_update_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_bmap_intent *ba; - struct xfs_bmap_intent *bb; + struct xfs_bmap_intent *ba = bi_entry(a); + struct xfs_bmap_intent *bb = bi_entry(b); - ba = container_of(a, struct xfs_bmap_intent, bi_list); - bb = container_of(b, struct xfs_bmap_intent, bi_list); return ba->bi_owner->i_ino - bb->bi_owner->i_ino; } -/* Set the map extent flags for this mapping. */ -static void -xfs_trans_set_bmap_flags( - struct xfs_map_extent *map, - enum xfs_bmap_intent_type type, - int whichfork, - xfs_exntst_t state) -{ - map->me_flags = 0; - switch (type) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: - map->me_flags = type; - break; - default: - ASSERT(0); - } - if (state == XFS_EXT_UNWRITTEN) - map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; - if (whichfork == XFS_ATTR_FORK) - map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; -} - /* Log bmap updates in the intent item. 
*/ STATIC void xfs_bmap_update_log_item( @@ -314,9 +250,6 @@ xfs_bmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -329,8 +262,21 @@ xfs_bmap_update_log_item( map->me_startblock = bi->bi_bmap.br_startblock; map->me_startoff = bi->bi_bmap.br_startoff; map->me_len = bi->bi_bmap.br_blockcount; - xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork, - bi->bi_bmap.br_state); + + switch (bi->bi_type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + map->me_flags = bi->bi_type; + break; + default: + ASSERT(0); + } + if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN) + map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; + if (bi->bi_whichfork == XFS_ATTR_FORK) + map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; + if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) + map->me_flags |= XFS_BMAP_EXTENT_REALTIME; } static struct xfs_log_item * @@ -346,7 +292,6 @@ xfs_bmap_update_create_intent( ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - xfs_trans_add_item(tp, &buip->bui_item); if (sort) list_sort(mp, items, xfs_bmap_update_diff_items); list_for_each_entry(bi, items, bi_list) @@ -354,24 +299,36 @@ xfs_bmap_update_create_intent( return &buip->bui_item; } -/* Get an BUD so we can process all the deferred rmap updates. */ +/* Get an BUD so we can process all the deferred bmap updates. 
*/ static struct xfs_log_item * xfs_bmap_update_create_done( struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; + struct xfs_bui_log_item *buip = BUI_ITEM(intent); + struct xfs_bud_log_item *budp; + + budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, + &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + return &budp->bud_item; } /* Take a passive ref to the AG containing the space we're mapping. */ -void +static inline void xfs_bmap_update_get_group( struct xfs_mount *mp, struct xfs_bmap_intent *bi) { xfs_agnumber_t agno; + if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) + return; + agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock); /* @@ -384,15 +341,41 @@ xfs_bmap_update_get_group( bi->bi_pag = xfs_perag_intent_get(mp, agno); } +/* Add this deferred BUI to the transaction. */ +void +xfs_bmap_defer_add( + struct xfs_trans *tp, + struct xfs_bmap_intent *bi) +{ + trace_xfs_bmap_defer(bi); + + xfs_bmap_update_get_group(tp->t_mountp, bi); + xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); +} + /* Release a passive AG ref after finishing mapping work. */ static inline void xfs_bmap_update_put_group( struct xfs_bmap_intent *bi) { + if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) + return; + xfs_perag_intent_put(bi->bi_pag); } -/* Process a deferred rmap update. */ +/* Cancel a deferred bmap update. */ +STATIC void +xfs_bmap_update_cancel_item( + struct list_head *item) +{ + struct xfs_bmap_intent *bi = bi_entry(item); + + xfs_bmap_update_put_group(bi); + kmem_cache_free(xfs_bmap_intent_cache, bi); +} + +/* Process a deferred bmap update. 
*/ STATIC int xfs_bmap_update_finish_item( struct xfs_trans *tp, @@ -400,19 +383,16 @@ xfs_bmap_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_bmap_intent *bi; + struct xfs_bmap_intent *bi = bi_entry(item); int error; - bi = container_of(item, struct xfs_bmap_intent, bi_list); - - error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); + error = xfs_bmap_finish_one(tp, bi); if (!error && bi->bi_bmap.br_blockcount > 0) { ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; } - xfs_bmap_update_put_group(bi); - kmem_cache_free(xfs_bmap_intent_cache, bi); + xfs_bmap_update_cancel_item(item); return error; } @@ -424,28 +404,6 @@ xfs_bmap_update_abort_intent( xfs_bui_release(BUI_ITEM(intent)); } -/* Cancel a deferred bmap update. */ -STATIC void -xfs_bmap_update_cancel_item( - struct list_head *item) -{ - struct xfs_bmap_intent *bi; - - bi = container_of(item, struct xfs_bmap_intent, bi_list); - - xfs_bmap_update_put_group(bi); - kmem_cache_free(xfs_bmap_intent_cache, bi); -} - -const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .create_intent = xfs_bmap_update_create_intent, - .abort_intent = xfs_bmap_update_abort_intent, - .create_done = xfs_bmap_update_create_done, - .finish_item = xfs_bmap_update_finish_item, - .cancel_item = xfs_bmap_update_cancel_item, -}; - /* Is this recovered BUI ok? 
*/ static inline bool xfs_bui_validate( @@ -477,26 +435,60 @@ xfs_bui_validate( if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; + if (map->me_flags & XFS_BMAP_EXTENT_REALTIME) + return xfs_verify_rtbext(mp, map->me_startblock, map->me_len); + return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline struct xfs_bmap_intent * +xfs_bui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_inode **ipp, + struct xfs_map_extent *map) +{ + struct xfs_bmap_intent *bi; + int error; + + error = xlog_recover_iget(mp, map->me_owner, ipp); + if (error) + return ERR_PTR(error); + + bi = kmem_cache_zalloc(xfs_bmap_intent_cache, + GFP_KERNEL | __GFP_NOFAIL); + bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + bi->bi_bmap.br_startblock = map->me_startblock; + bi->bi_bmap.br_startoff = map->me_startoff; + bi->bi_bmap.br_blockcount = map->me_len; + bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + bi->bi_owner = *ipp; + xfs_bmap_update_get_group(mp, bi); + + xfs_defer_add_item(dfp, &bi->bi_list); + return bi; +} + /* * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. 
*/ STATIC int -xfs_bui_item_recover( - struct xfs_log_item *lip, +xfs_bmap_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { - struct xfs_bmap_intent fake = { }; struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *map; - struct xfs_bud_log_item *budp; + struct xfs_bmap_intent *work; int iext_delta; int error = 0; @@ -507,13 +499,9 @@ xfs_bui_item_recover( } map = &buip->bui_format.bui_extents[0]; - fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - - error = xlog_recover_iget(mp, map->me_owner, &ip); - if (error) - return error; + work = xfs_bui_recover_work(mp, dfp, &ip, map); + if (IS_ERR(work)) + return PTR_ERR(work); /* Allocate transaction and do the work. */ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -522,42 +510,33 @@ xfs_bui_item_recover( if (error) goto err_rele; - budp = xfs_trans_get_bud(tp, buip); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (fake.bi_type == XFS_BMAP_MAP) + if (!!(map->me_flags & XFS_BMAP_EXTENT_REALTIME) != + xfs_ifork_is_realtime(ip, work->bi_whichfork)) { + error = -EFSCORRUPTED; + goto err_cancel; + } + + if (work->bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - fake.bi_owner = ip; - fake.bi_bmap.br_startblock = map->me_startblock; - fake.bi_bmap.br_startoff = map->me_startoff; - fake.bi_bmap.br_blockcount = map->me_len; - fake.bi_bmap.br_state = (map->me_flags & 
XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - - xfs_bmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); + error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, - sizeof(*map)); - xfs_bmap_update_put_group(&fake); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &buip->bui_format, sizeof(buip->bui_format)); if (error) goto err_cancel; - if (fake.bi_bmap.br_blockcount > 0) { - ASSERT(fake.bi_type == XFS_BMAP_UNMAP); - xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); - } - /* * Commit transaction, which frees the transaction and saves the inode * for later replay activities. @@ -579,21 +558,13 @@ err_rele: return error; } -STATIC bool -xfs_bui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return BUI_ITEM(lip)->bui_format.bui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_bui_item_relog( +xfs_bmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_bud_log_item *budp; struct xfs_bui_log_item *buip; struct xfs_map_extent *map; unsigned int count; @@ -601,27 +572,40 @@ xfs_bui_item_relog( count = BUI_ITEM(intent)->bui_format.bui_nextents; map = BUI_ITEM(intent)->bui_format.bui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - buip = xfs_bui_init(tp->t_mountp); memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); - xfs_trans_add_item(tp, &buip->bui_item); - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; } +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .name = "bmap", + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .create_intent = xfs_bmap_update_create_intent, + 
.abort_intent = xfs_bmap_update_abort_intent, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, + .recover_work = xfs_bmap_recover_work, + .relog_intent = xfs_bmap_relog_intent, +}; + +STATIC bool +xfs_bui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return BUI_ITEM(lip)->bui_format.bui_id == intent_id; +} + static const struct xfs_item_ops xfs_bui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, .iop_unpin = xfs_bui_item_unpin, .iop_release = xfs_bui_item_release, - .iop_recover = xfs_bui_item_recover, .iop_match = xfs_bui_item_match, - .iop_relog = xfs_bui_item_relog, }; static inline void @@ -681,12 +665,9 @@ xlog_recover_bui_commit_pass2( buip = xfs_bui_init(mp); xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn); - xfs_bui_release(buip); + + xlog_recover_intent_item(log, &buip->bui_item, lsn, + &xfs_bmap_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index 3fafd3881a0b..6fee6a508343 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -68,4 +68,8 @@ struct xfs_bud_log_item { extern struct kmem_cache *xfs_bui_cache; extern struct kmem_cache *xfs_bud_cache; +struct xfs_bmap_intent; + +void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi); + #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 40e0a1f1f753..19e11d1da660 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -28,6 +28,7 @@ #include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_reflink.h" +#include "xfs_rtbitmap.h" /* Kernel only BMAP related definitions and functions */ @@ -65,157 +66,9 @@ xfs_zero_extent( return blkdev_issue_zeroout(target->bt_bdev, block << (mp->m_super->s_blocksize_bits - 9), count_fsb << (mp->m_super->s_blocksize_bits - 9), - GFP_NOFS, 0); + GFP_KERNEL, 0); } -#ifdef CONFIG_XFS_RT -int -xfs_bmap_rtalloc( - struct xfs_bmalloca *ap) -{ - struct xfs_mount *mp = ap->ip->i_mount; - xfs_fileoff_t orig_offset = ap->offset; - xfs_rtblock_t rtb; - xfs_extlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t mod = 0; /* product factor for allocators */ - xfs_extlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_extlen_t orig_length = ap->length; - xfs_extlen_t minlen = mp->m_sb.sb_rextsize; - xfs_extlen_t raminlen; - bool rtlocked = false; - bool ignore_locality = false; - int error; - - align = xfs_get_extsz_hint(ap->ip); -retry: - prod = align / mp->m_sb.sb_rextsize; - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 1, ap->eof, 0, - ap->conv, &ap->offset, &ap->length); - if (error) - return error; - 
ASSERT(ap->length); - ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); - - /* - * If we shifted the file offset downward to satisfy an extent size - * hint, increase minlen by that amount so that the allocator won't - * give us an allocation that's too short to cover at least one of the - * blocks that the caller asked for. - */ - if (ap->offset != orig_offset) - minlen += orig_offset - ap->offset; - - /* - * If the offset & length are not perfectly aligned - * then kill prod, it will just get us in trouble. - */ - div_u64_rem(ap->offset, align, &mod); - if (mod || ap->length % align) - prod = 1; - /* - * Set ralen to be the actual requested length in rtextents. - */ - ralen = ap->length / mp->m_sb.sb_rextsize; - /* - * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that - * we rounded up to it, cut it back so it's valid again. - * Note that if it's a really large request (bigger than - * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't - * adjust the starting point to match it. - */ - if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN) - ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize; - - /* - * Lock out modifications to both the RT bitmap and summary inodes - */ - if (!rtlocked) { - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); - rtlocked = true; - } - - /* - * If it's an allocation to an empty file at offset 0, - * pick an extent that will space things out in the rt area. - */ - if (ap->eof && ap->offset == 0) { - xfs_rtblock_t rtx; /* realtime extent no */ - - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); - if (error) - return error; - ap->blkno = rtx * mp->m_sb.sb_rextsize; - } else { - ap->blkno = 0; - } - - xfs_bmap_adjacent(ap); - - /* - * Realtime allocation, done through xfs_rtallocate_extent. 
- */ - if (ignore_locality) - ap->blkno = 0; - else - do_div(ap->blkno, mp->m_sb.sb_rextsize); - rtb = ap->blkno; - ap->length = ralen; - raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize); - error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length, - &ralen, ap->wasdel, prod, &rtb); - if (error) - return error; - - if (rtb != NULLRTBLOCK) { - ap->blkno = rtb * mp->m_sb.sb_rextsize; - ap->length = ralen * mp->m_sb.sb_rextsize; - ap->ip->i_nblocks += ap->length; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) - ap->ip->i_delayed_blks -= ap->length; - /* - * Adjust the disk quota also. This was reserved - * earlier. - */ - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : - XFS_TRANS_DQ_RTBCOUNT, ap->length); - return 0; - } - - if (align > mp->m_sb.sb_rextsize) { - /* - * We previously enlarged the request length to try to satisfy - * an extent size hint. The allocator didn't return anything, - * so reset the parameters to the original values and try again - * without alignment criteria. - */ - ap->offset = orig_offset; - ap->length = orig_length; - minlen = align = mp->m_sb.sb_rextsize; - goto retry; - } - - if (!ignore_locality && ap->blkno != 0) { - /* - * If we can't allocate near a specific rt extent, try again - * without locality criteria. - */ - ignore_locality = true; - goto retry; - } - - ap->blkno = NULLFSBLOCK; - ap->length = 0; - return 0; -} -#endif /* CONFIG_XFS_RT */ - /* * Extent tree block counting routines. */ @@ -655,8 +508,8 @@ xfs_can_free_eofblocks( * Caller must either hold the exclusive io lock; or be inactivating * the inode, which guarantees there are no other users of the inode. 
*/ - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) || - (VFS_I(ip)->i_state & I_FREEING)); + if (!(VFS_I(ip)->i_state & I_FREEING)) + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); /* prealloc/delalloc exists only on regular files */ if (!S_ISREG(VFS_I(ip)->i_mode)) @@ -690,7 +543,7 @@ xfs_can_free_eofblocks( */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) - end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize); + end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return false; @@ -780,12 +633,10 @@ xfs_alloc_file_space( { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; - xfs_filblks_t allocated_fsb; xfs_filblks_t allocatesize_fsb; xfs_extlen_t extsz, temp; xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; - int nimaps; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; @@ -808,7 +659,6 @@ xfs_alloc_file_space( count = len; imapp = &imaps[0]; - nimaps = 1; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); endoffset_fsb = XFS_B_TO_FSB(mp, offset + count); allocatesize_fsb = endoffset_fsb - startoffset_fsb; @@ -819,6 +669,7 @@ xfs_alloc_file_space( while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; unsigned int dblocks, rblocks, resblks; + int nimaps = 1; /* * Determine space reservations for data/realtime. @@ -884,15 +735,19 @@ xfs_alloc_file_space( if (error) break; - allocated_fsb = imapp->br_blockcount; - - if (nimaps == 0) { - error = -ENOSPC; - break; + /* + * If the allocator cannot find a single free extent large + * enough to cover the start block of the requested range, + * xfs_bmapi_write will return 0 but leave *nimaps set to 0. + * + * In that case we simply need to keep looping with the same + * startoffset_fsb so that one of the following allocations + * will eventually reach the requested range. 
+ */ + if (nimaps) { + startoffset_fsb += imapp->br_blockcount; + allocatesize_fsb -= imapp->br_blockcount; } - - startoffset_fsb += allocated_fsb; - allocatesize_fsb -= allocated_fsb; } return error; @@ -989,10 +844,8 @@ xfs_free_file_space( /* We can only free complete realtime extents. */ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { - startoffset_fsb = roundup_64(startoffset_fsb, - mp->m_sb.sb_rextsize); - endoffset_fsb = rounddown_64(endoffset_fsb, - mp->m_sb.sb_rextsize); + startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb); + endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb); } /* @@ -1112,8 +965,7 @@ xfs_collapse_file_space( xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); bool done = false; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); trace_xfs_collapse_file_space(ip); @@ -1182,8 +1034,7 @@ xfs_insert_file_space( xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); bool done = false; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); trace_xfs_insert_file_space(ip); @@ -1454,16 +1305,16 @@ xfs_swap_extent_rmap( } /* Remove the mapping from the donor file. */ - xfs_bmap_unmap_extent(tp, tip, &uirec); + xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec); /* Remove the mapping from the source file. */ - xfs_bmap_unmap_extent(tp, ip, &irec); + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec); /* Map the donor file's blocks into the source file. */ - xfs_bmap_map_extent(tp, ip, &uirec); + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec); /* Map the source file's blocks into the donor file. 
*/ - xfs_bmap_map_extent(tp, tip, &irec); + xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec); error = xfs_defer_finish(tpp); tp = *tpp; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 6888078f5c31..77ecbb753ef2 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -47,7 +47,7 @@ int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, int rt, int eof, int delay, int convert, xfs_fileoff_t *offp, xfs_extlen_t *lenp); -void xfs_bmap_adjacent(struct xfs_bmalloca *ap); +bool xfs_bmap_adjacent(struct xfs_bmalloca *ap); int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *rec, int *is_empty); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 545c7991b9b5..f0fa02264eda 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -21,6 +21,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_ag.h" +#include "xfs_buf_mem.h" struct kmem_cache *xfs_buf_cache; @@ -60,6 +61,11 @@ xfs_buf_submit( return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); } +static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) +{ + return bp->b_rhash_key == XFS_BUF_DADDR_NULL; +} + static inline int xfs_buf_is_vmapped( struct xfs_buf *bp) @@ -169,7 +175,7 @@ xfs_buf_stale( atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && - (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) atomic_dec(&bp->b_hold); ASSERT(atomic_read(&bp->b_hold) >= 1); @@ -189,8 +195,8 @@ xfs_buf_get_maps( return 0; } - bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), - KM_NOFS); + bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); if (!bp->b_maps) return -ENOMEM; return 0; @@ -204,7 +210,7 @@ xfs_buf_free_maps( struct xfs_buf *bp) { if (bp->b_maps != &bp->__b_map) { - kmem_free(bp->b_maps); + 
kfree(bp->b_maps); bp->b_maps = NULL; } } @@ -222,7 +228,8 @@ _xfs_buf_alloc( int i; *bpp = NULL; - bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL); + bp = kmem_cache_zalloc(xfs_buf_cache, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); /* * We don't want certain flags to appear in b_flags unless they are @@ -289,7 +296,7 @@ xfs_buf_free_pages( mm_account_reclaimed_pages(bp->b_page_count); if (bp->b_pages != bp->b_page_array) - kmem_free(bp->b_pages); + kfree(bp->b_pages); bp->b_pages = NULL; bp->b_flags &= ~_XBF_PAGES; } @@ -312,10 +319,12 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & _XBF_PAGES) + if (xfs_buftarg_is_mem(bp->b_target)) + xmbuf_unmap_page(bp); + else if (bp->b_flags & _XBF_PAGES) xfs_buf_free_pages(bp); else if (bp->b_flags & _XBF_KMEM) - kmem_free(bp->b_addr); + kfree(bp->b_addr); call_rcu(&bp->b_rcu, xfs_buf_free_callback); } @@ -325,21 +334,21 @@ xfs_buf_alloc_kmem( struct xfs_buf *bp, xfs_buf_flags_t flags) { - xfs_km_flags_t kmflag_mask = KM_NOFS; + gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL; size_t size = BBTOB(bp->b_length); /* Assure zeroed buffer for non-read cases. 
*/ if (!(flags & XBF_READ)) - kmflag_mask |= KM_ZERO; + gfp_mask |= __GFP_ZERO; - bp->b_addr = kmem_alloc(size, kmflag_mask); + bp->b_addr = kmalloc(size, gfp_mask); if (!bp->b_addr) return -ENOMEM; if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != ((unsigned long)bp->b_addr & PAGE_MASK)) { /* b_addr spans two pages - use alloc_page instead */ - kmem_free(bp->b_addr); + kfree(bp->b_addr); bp->b_addr = NULL; return -ENOMEM; } @@ -356,13 +365,11 @@ xfs_buf_alloc_pages( struct xfs_buf *bp, xfs_buf_flags_t flags) { - gfp_t gfp_mask = __GFP_NOWARN; + gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; long filled = 0; if (flags & XBF_READ_AHEAD) gfp_mask |= __GFP_NORETRY; - else - gfp_mask |= GFP_NOFS; /* Make sure that we have a page list */ bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); @@ -429,11 +436,18 @@ _xfs_buf_map_pages( /* * vm_map_ram() will allocate auxiliary structures (e.g. - * pagetables) with GFP_KERNEL, yet we are likely to be under - * GFP_NOFS context here. Hence we need to tell memory reclaim - * that we are in such a context via PF_MEMALLOC_NOFS to prevent - * memory reclaim re-entering the filesystem here and - * potentially deadlocking. + * pagetables) with GFP_KERNEL, yet we often under a scoped nofs + * context here. Mixing GFP_KERNEL with GFP_NOFS allocations + * from the same call site that can be run from both above and + * below memory reclaim causes lockdep false positives. Hence we + * always need to force this allocation to nofs context because + * we can't pass __GFP_NOLOCKDEP down to auxillary structures to + * prevent false positive lockdep reports. + * + * XXX(dgc): I think dquot reclaim is the only place we can get + * to this function from memory reclaim context now. If we fix + * that like we've fixed inode reclaim to avoid writeback from + * reclaim, this nofs wrapping can go away. 
*/ nofs_flag = memalloc_nofs_save(); do { @@ -499,18 +513,18 @@ static const struct rhashtable_params xfs_buf_hash_params = { }; int -xfs_buf_hash_init( - struct xfs_perag *pag) +xfs_buf_cache_init( + struct xfs_buf_cache *bch) { - spin_lock_init(&pag->pag_buf_lock); - return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); + spin_lock_init(&bch->bc_lock); + return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); } void -xfs_buf_hash_destroy( - struct xfs_perag *pag) +xfs_buf_cache_destroy( + struct xfs_buf_cache *bch) { - rhashtable_destroy(&pag->pag_buf_hash); + rhashtable_destroy(&bch->bc_hash); } static int @@ -573,7 +587,7 @@ xfs_buf_find_lock( static inline int xfs_buf_lookup( - struct xfs_perag *pag, + struct xfs_buf_cache *bch, struct xfs_buf_map *map, xfs_buf_flags_t flags, struct xfs_buf **bpp) @@ -582,7 +596,7 @@ xfs_buf_lookup( int error; rcu_read_lock(); - bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); + bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { rcu_read_unlock(); return -ENOENT; @@ -607,6 +621,7 @@ xfs_buf_lookup( static int xfs_buf_find_insert( struct xfs_buftarg *btp, + struct xfs_buf_cache *bch, struct xfs_perag *pag, struct xfs_buf_map *cmap, struct xfs_buf_map *map, @@ -622,31 +637,33 @@ xfs_buf_find_insert( if (error) goto out_drop_pag; - /* - * For buffers that fit entirely within a single page, first attempt to - * allocate the memory from the heap to minimise memory usage. If we - * can't get heap memory for these small buffers, we fall back to using - * the page allocator. 
- */ - if (BBTOB(new_bp->b_length) >= PAGE_SIZE || - xfs_buf_alloc_kmem(new_bp, flags) < 0) { + if (xfs_buftarg_is_mem(new_bp->b_target)) { + error = xmbuf_map_page(new_bp); + } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE || + xfs_buf_alloc_kmem(new_bp, flags) < 0) { + /* + * For buffers that fit entirely within a single page, first + * attempt to allocate the memory from the heap to minimise + * memory usage. If we can't get heap memory for these small + * buffers, we fall back to using the page allocator. + */ error = xfs_buf_alloc_pages(new_bp, flags); - if (error) - goto out_free_buf; } + if (error) + goto out_free_buf; - spin_lock(&pag->pag_buf_lock); - bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash, + spin_lock(&bch->bc_lock); + bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { error = PTR_ERR(bp); - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); goto out_free_buf; } if (bp) { /* found an existing buffer */ atomic_inc(&bp->b_hold); - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); @@ -657,17 +674,40 @@ xfs_buf_find_insert( /* The new buffer keeps the perag reference until it is freed. 
*/ new_bp->b_pag = pag; - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); *bpp = new_bp; return 0; out_free_buf: xfs_buf_free(new_bp); out_drop_pag: - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); return error; } +static inline struct xfs_perag * +xfs_buftarg_get_pag( + struct xfs_buftarg *btp, + const struct xfs_buf_map *map) +{ + struct xfs_mount *mp = btp->bt_mount; + + if (xfs_buftarg_is_mem(btp)) + return NULL; + return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); +} + +static inline struct xfs_buf_cache * +xfs_buftarg_buf_cache( + struct xfs_buftarg *btp, + struct xfs_perag *pag) +{ + if (pag) + return &pag->pag_bcache; + return btp->bt_cache; +} + /* * Assembles a buffer covering the specified range. The code is optimised for * cache hits, as metadata intensive workloads will see 3 orders of magnitude @@ -681,6 +721,7 @@ xfs_buf_get_map( xfs_buf_flags_t flags, struct xfs_buf **bpp) { + struct xfs_buf_cache *bch; struct xfs_perag *pag; struct xfs_buf *bp = NULL; struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; @@ -696,10 +737,10 @@ xfs_buf_get_map( if (error) return error; - pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); + pag = xfs_buftarg_get_pag(btp, &cmap); + bch = xfs_buftarg_buf_cache(btp, pag); - error = xfs_buf_lookup(pag, &cmap, flags, &bp); + error = xfs_buf_lookup(bch, &cmap, flags, &bp); if (error && error != -ENOENT) goto out_put_perag; @@ -711,13 +752,14 @@ xfs_buf_get_map( goto out_put_perag; /* xfs_buf_find_insert() consumes the perag reference. */ - error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, + error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps, flags, &bp); if (error) return error; } else { XFS_STATS_INC(btp->bt_mount, xb_get_locked); - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); } /* We do not hold a perag reference anymore. 
*/ @@ -745,7 +787,8 @@ xfs_buf_get_map( return 0; out_put_perag: - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); return error; } @@ -892,6 +935,13 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; + /* + * Currently we don't have a good means or justification for performing + * xmbuf_map_page asynchronously, so we don't do readahead. + */ + if (xfs_buftarg_is_mem(target)) + return; + xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, __this_address); @@ -957,7 +1007,10 @@ xfs_buf_get_uncached( if (error) return error; - error = xfs_buf_alloc_pages(bp, flags); + if (xfs_buftarg_is_mem(bp->b_target)) + error = xmbuf_map_page(bp); + else + error = xfs_buf_alloc_pages(bp, flags); if (error) goto fail_free_buf; @@ -990,29 +1043,29 @@ xfs_buf_hold( atomic_inc(&bp->b_hold); } -/* - * Release a hold on the specified buffer. If the hold count is 1, the buffer is - * placed on LRU or freed (depending on b_lru_ref). - */ -void -xfs_buf_rele( +static void +xfs_buf_rele_uncached( struct xfs_buf *bp) { + ASSERT(list_empty(&bp->b_lru)); + if (atomic_dec_and_test(&bp->b_hold)) { + xfs_buf_ioacct_dec(bp); + xfs_buf_free(bp); + } +} + +static void +xfs_buf_rele_cached( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; struct xfs_perag *pag = bp->b_pag; + struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); bool release; bool freebuf = false; trace_xfs_buf_rele(bp, _RET_IP_); - if (!pag) { - ASSERT(list_empty(&bp->b_lru)); - if (atomic_dec_and_test(&bp->b_hold)) { - xfs_buf_ioacct_dec(bp); - xfs_buf_free(bp); - } - return; - } - ASSERT(atomic_read(&bp->b_hold) > 0); /* @@ -1026,7 +1079,7 @@ xfs_buf_rele( * leading to a use-after-free scenario. 
*/ spin_lock(&bp->b_lock); - release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock); if (!release) { /* * Drop the in-flight state if the buffer is already on the LRU @@ -1047,11 +1100,11 @@ xfs_buf_rele( * buffer for the LRU and clear the (now stale) dispose list * state flag */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) { bp->b_state &= ~XFS_BSTATE_DISPOSE; atomic_inc(&bp->b_hold); } - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); } else { /* * most of the time buffers will already be removed from the @@ -1060,16 +1113,17 @@ xfs_buf_rele( * was on was the disposal list */ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + list_lru_del_obj(&btp->bt_lru, &bp->b_lru); } else { ASSERT(list_empty(&bp->b_lru)); } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, - xfs_buf_hash_params); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); + rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, + xfs_buf_hash_params); + spin_unlock(&bch->bc_lock); + if (pag) + xfs_perag_put(pag); freebuf = true; } @@ -1080,6 +1134,19 @@ out_unlock: xfs_buf_free(bp); } +/* + * Release a hold on the specified buffer. + */ +void +xfs_buf_rele( + struct xfs_buf *bp) +{ + trace_xfs_buf_rele(bp, _RET_IP_); + if (xfs_buf_is_uncached(bp)) + xfs_buf_rele_uncached(bp); + else + xfs_buf_rele_cached(bp); +} /* * Lock a buffer object, if it is not already locked. @@ -1585,6 +1652,12 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ op |= REQ_META; + /* in-memory targets are directly mapped, no IO required. */ + if (xfs_buftarg_is_mem(bp->b_target)) { + xfs_buf_ioend(bp); + return; + } + /* * Walk all the vectors issuing IO on them. 
Set up the initial offset * into the buffer and the desired IO size before we start - @@ -1940,25 +2013,30 @@ xfs_buftarg_shrink_count( } void -xfs_free_buftarg( +xfs_destroy_buftarg( struct xfs_buftarg *btp) { shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); +} +void +xfs_free_buftarg( + struct xfs_buftarg *btp) +{ + xfs_destroy_buftarg(btp); fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - bdev_release(btp->bt_bdev_handle); - - kmem_free(btp); + bdev_fput(btp->bt_bdev_file); + kfree(btp); } int xfs_setsize_buftarg( - xfs_buftarg_t *btp, + struct xfs_buftarg *btp, unsigned int sectorsize) { /* Set up metadata sector size info */ @@ -1972,83 +2050,93 @@ xfs_setsize_buftarg( return -EINVAL; } - /* Set up device logical sector size mask */ - btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); - btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; - return 0; } -/* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used at this early stage. Play safe. - */ -STATIC int -xfs_setsize_buftarg_early( - xfs_buftarg_t *btp) +int +xfs_init_buftarg( + struct xfs_buftarg *btp, + size_t logical_sectorsize, + const char *descr) { - return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = logical_sectorsize; + btp->bt_logical_sectormask = logical_sectorsize - 1; + + /* + * Buffer IO error rate limiting. Limit it to no more than 10 messages + * per 30 seconds so as to not spam logs too much on repeated errors. 
+ */ + ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, + DEFAULT_RATELIMIT_BURST); + + if (list_lru_init(&btp->bt_lru)) + return -ENOMEM; + if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + goto out_destroy_lru; + + btp->bt_shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr); + if (!btp->bt_shrinker) + goto out_destroy_io_count; + btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker->private_data = btp; + shrinker_register(btp->bt_shrinker); + return 0; + +out_destroy_io_count: + percpu_counter_destroy(&btp->bt_io_count); +out_destroy_lru: + list_lru_destroy(&btp->bt_lru); + return -ENOMEM; } struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct bdev_handle *bdev_handle) + struct file *bdev_file) { - xfs_buftarg_t *btp; + struct xfs_buftarg *btp; const struct dax_holder_operations *ops = NULL; #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) ops = &xfs_dax_holder_operations; #endif - btp = kmem_zalloc(sizeof(*btp), KM_NOFS); + btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL); btp->bt_mount = mp; - btp->bt_bdev_handle = bdev_handle; - btp->bt_dev = bdev_handle->bdev->bd_dev; - btp->bt_bdev = bdev_handle->bdev; + btp->bt_bdev_file = bdev_file; + btp->bt_bdev = file_bdev(bdev_file); + btp->bt_dev = btp->bt_bdev->bd_dev; btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); /* - * Buffer IO error rate limiting. Limit it to no more than 10 messages - * per 30 seconds so as to not spam logs too much on repeated errors. + * When allocating the buftargs we have not yet read the super block and + * thus don't know the file system sector size yet. 
*/ - ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, - DEFAULT_RATELIMIT_BURST); - - if (xfs_setsize_buftarg_early(btp)) + if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) goto error_free; - - if (list_lru_init(&btp->bt_lru)) + if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev), + mp->m_super->s_id)) goto error_free; - if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) - goto error_lru; - - btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", - mp->m_super->s_id); - if (!btp->bt_shrinker) - goto error_pcpu; - - btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; - btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; - btp->bt_shrinker->private_data = btp; - - shrinker_register(btp->bt_shrinker); - return btp; -error_pcpu: - percpu_counter_destroy(&btp->bt_io_count); -error_lru: - list_lru_destroy(&btp->bt_lru); error_free: - kmem_free(btp); + kfree(btp); return NULL; } +static inline void +xfs_buf_list_del( + struct xfs_buf *bp) +{ + list_del_init(&bp->b_list); + wake_up_var(&bp->b_list); +} + /* * Cancel a delayed write list. * @@ -2066,7 +2154,7 @@ xfs_buf_delwri_cancel( xfs_buf_lock(bp); bp->b_flags &= ~_XBF_DELWRI_Q; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); } } @@ -2120,6 +2208,34 @@ xfs_buf_delwri_queue( } /* + * Queue a buffer to this delwri list as part of a data integrity operation. + * If the buffer is on any other delwri list, we'll wait for that to clear + * so that the caller can submit the buffer for IO and wait for the result. + * Callers must ensure the buffer is not already on the list. + */ +void +xfs_buf_delwri_queue_here( + struct xfs_buf *bp, + struct list_head *buffer_list) +{ + /* + * We need this buffer to end up on the /caller's/ delwri list, not any + * old list. This can happen if the buffer is marked stale (which + * clears DELWRI_Q) after the AIL queues the buffer to its list but + * before the AIL has a chance to submit the list. 
+ */ + while (!list_empty(&bp->b_list)) { + xfs_buf_unlock(bp); + wait_var_event(&bp->b_list, list_empty(&bp->b_list)); + xfs_buf_lock(bp); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + + xfs_buf_delwri_queue(bp, buffer_list); +} + +/* * Compare function is more complex than it needs to be because * the return value is only 32 bits and we are doing comparisons * on 64 bit values @@ -2181,7 +2297,7 @@ xfs_buf_delwri_submit_buffers( * reference and remove it from the list here. */ if (!(bp->b_flags & _XBF_DELWRI_Q)) { - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); continue; } @@ -2201,7 +2317,7 @@ xfs_buf_delwri_submit_buffers( list_move_tail(&bp->b_list, wait_list); } else { bp->b_flags |= XBF_ASYNC; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); } __xfs_buf_submit(bp, false); } @@ -2255,7 +2371,7 @@ xfs_buf_delwri_submit( while (!list_empty(&wait_list)) { bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); /* * Wait on the locked buffer, check for errors and unlock and diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c86e16419656..b1580644501f 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -83,6 +83,14 @@ typedef unsigned int xfs_buf_flags_t; #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ #define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ +struct xfs_buf_cache { + spinlock_t bc_lock; + struct rhashtable bc_hash; +}; + +int xfs_buf_cache_init(struct xfs_buf_cache *bch); +void xfs_buf_cache_destroy(struct xfs_buf_cache *bch); + /* * The xfs_buftarg contains 2 notions of "sector size" - * @@ -96,11 +104,12 @@ typedef unsigned int xfs_buf_flags_t; * The latter is derived from the underlying device, and controls direct IO * alignment constraints. 
*/ -typedef struct xfs_buftarg { +struct xfs_buftarg { dev_t bt_dev; - struct bdev_handle *bt_bdev_handle; + struct file *bt_bdev_file; struct block_device *bt_bdev; struct dax_device *bt_daxdev; + struct file *bt_file; u64 bt_dax_part_off; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; @@ -114,7 +123,10 @@ typedef struct xfs_buftarg { struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; -} xfs_buftarg_t; + + /* built-in cache, if we're not using the perag one */ + struct xfs_buf_cache bt_cache[]; +}; #define XB_PAGES 2 @@ -319,6 +331,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); @@ -365,7 +378,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) * Handling of buftargs. 
*/ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, - struct bdev_handle *bdev_handle); + struct file *bdev_file); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); @@ -378,4 +391,9 @@ int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); +/* for xfs_buf_mem.c only: */ +int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize, + const char *descr); +void xfs_destroy_buftarg(struct xfs_buftarg *btp); + #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 023d4e0385dd..43031842341a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -805,8 +805,8 @@ xfs_buf_item_get_format( return; } - bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), - 0); + bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format), + GFP_KERNEL | __GFP_NOFAIL); } STATIC void @@ -814,7 +814,7 @@ xfs_buf_item_free_format( struct xfs_buf_log_item *bip) { if (bip->bli_formats != &bip->__bli_format) { - kmem_free(bip->bli_formats); + kfree(bip->bli_formats); bip->bli_formats = NULL; } } @@ -1044,7 +1044,7 @@ xfs_buf_item_free( struct xfs_buf_log_item *bip) { xfs_buf_item_free_format(bip); - kmem_free(bip->bli_item.li_lv_shadow); + kvfree(bip->bli_item.li_lv_shadow); kmem_cache_free(xfs_buf_item_cache, bip); } diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 43167f543afc..09e893cf563c 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -85,7 +85,7 @@ xlog_add_buffer_cancelled( return false; } - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); + bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL); bcp->bc_blkno = blkno; bcp->bc_len = len; bcp->bc_refcount = 1; @@ -129,7 
+129,7 @@ xlog_put_buffer_cancelled( if (--bcp->bc_refcount == 0) { list_del(&bcp->bc_list); - kmem_free(bcp); + kfree(bcp); } return true; } @@ -1062,10 +1062,10 @@ xlog_free_buf_cancel_table( &log->l_buf_cancel_table[i], struct xfs_buf_cancel, bc_list))) { list_del(&bc->bc_list); - kmem_free(bc); + kfree(bc); } } - kmem_free(log->l_buf_cancel_table); + kfree(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; } diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c new file mode 100644 index 000000000000..9bb2d24de709 --- /dev/null +++ b/fs/xfs/xfs_buf_mem.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_buf.h" +#include "xfs_buf_mem.h" +#include "xfs_trace.h" +#include <linux/shmem_fs.h> +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_error.h" + +/* + * Buffer Cache for In-Memory Files + * ================================ + * + * Online fsck wants to create ephemeral ordered recordsets. The existing + * btree infrastructure can do this, but we need the buffer cache to target + * memory instead of block devices. + * + * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those + * requirements. Therefore, the xmbuf mechanism uses an unlinked shmem file to + * store our staging data. This file is not installed in the file descriptor + * table so that user programs cannot access the data, which means that the + * xmbuf must be freed with xmbuf_destroy. + * + * xmbufs assume that the caller will handle all required concurrency + * management; standard vfs locks (freezer and inode) are not taken. Reads + * and writes are satisfied directly from the page cache. + * + * The only supported block size is PAGE_SIZE, and we cannot use highmem. 
+ */ + +/* + * shmem files used to back an in-memory buffer cache must not be exposed to + * userspace. Upper layers must coordinate access to the one handle returned + * by the constructor, so establish a separate lock class for xmbufs to avoid + * confusing lockdep. + */ +static struct lock_class_key xmbuf_i_mutex_key; + +/* + * Allocate a buffer cache target for a memory-backed file and set up the + * buffer target. + */ +int +xmbuf_alloc( + struct xfs_mount *mp, + const char *descr, + struct xfs_buftarg **btpp) +{ + struct file *file; + struct inode *inode; + struct xfs_buftarg *btp; + int error; + + btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL); + if (!btp) + return -ENOMEM; + + file = shmem_kernel_file_setup(descr, 0, 0); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_btp; + } + inode = file_inode(file); + + /* private file, private locking */ + lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key); + + /* + * We don't want to bother with kmapping data during repair, so don't + * allow highmem pages to back this mapping. + */ + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); + + /* ensure all writes are below EOF to avoid pagecache zeroing */ + i_size_write(inode, inode->i_sb->s_maxbytes); + + error = xfs_buf_cache_init(btp->bt_cache); + if (error) + goto out_file; + + /* Initialize buffer target */ + btp->bt_mount = mp; + btp->bt_dev = (dev_t)-1U; + btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */ + btp->bt_file = file; + btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE; + btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1; + + error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr); + if (error) + goto out_bcache; + + trace_xmbuf_create(btp); + + *btpp = btp; + return 0; + +out_bcache: + xfs_buf_cache_destroy(btp->bt_cache); +out_file: + fput(file); +out_free_btp: + kfree(btp); + return error; +} + +/* Free a buffer cache target for a memory-backed buffer cache. 
*/ +void +xmbuf_free( + struct xfs_buftarg *btp) +{ + ASSERT(xfs_buftarg_is_mem(btp)); + ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + + trace_xmbuf_free(btp); + + xfs_destroy_buftarg(btp); + xfs_buf_cache_destroy(btp->bt_cache); + fput(btp->bt_file); + kfree(btp); +} + +/* Directly map a shmem page into the buffer cache. */ +int +xmbuf_map_page( + struct xfs_buf *bp) +{ + struct inode *inode = file_inode(bp->b_target->bt_file); + struct folio *folio = NULL; + struct page *page; + loff_t pos = BBTOB(xfs_buf_daddr(bp)); + int error; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + if (bp->b_map_count != 1) + return -ENOMEM; + if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE) + return -ENOMEM; + if (offset_in_page(pos) != 0) { + ASSERT(offset_in_page(pos)); + return -ENOMEM; + } + + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE); + if (error) + return error; + + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); + return -EIO; + } + + page = folio_file_page(folio, pos >> PAGE_SHIFT); + + /* + * Mark the page dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xmbuf_unmap_page. + */ + set_page_dirty(page); + unlock_page(page); + + bp->b_addr = page_address(page); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = page; + bp->b_page_count = 1; + return 0; +} + +/* Unmap a shmem page that was mapped into the buffer cache. */ +void +xmbuf_unmap_page( + struct xfs_buf *bp) +{ + struct page *page = bp->b_pages[0]; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + put_page(page); + + bp->b_addr = NULL; + bp->b_pages[0] = NULL; + bp->b_pages = NULL; + bp->b_page_count = 0; +} + +/* Is this a valid daddr within the buftarg? 
*/ +bool +xmbuf_verify_daddr( + struct xfs_buftarg *btp, + xfs_daddr_t daddr) +{ + struct inode *inode = file_inode(btp->bt_file); + + ASSERT(xfs_buftarg_is_mem(btp)); + + return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); +} + +/* Discard the page backing this buffer. */ +static void +xmbuf_stale( + struct xfs_buf *bp) +{ + struct inode *inode = file_inode(bp->b_target->bt_file); + loff_t pos; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + pos = BBTOB(xfs_buf_daddr(bp)); + shmem_truncate_range(inode, pos, pos + BBTOB(bp->b_length) - 1); +} + +/* + * Finalize a buffer -- discard the backing page if it's stale, or run the + * write verifier to detect problems. + */ +int +xmbuf_finalize( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + int error = 0; + + if (bp->b_flags & XBF_STALE) { + xmbuf_stale(bp); + return 0; + } + + /* + * Although this btree is ephemeral, validate the buffer structure so + * that we can detect memory corruption errors and software bugs. + */ + fa = bp->b_ops->verify_struct(bp); + if (fa) { + error = -EFSCORRUPTED; + xfs_verifier_error(bp, error, fa); + } + + return error; +} + +/* + * Detach this xmbuf buffer from the transaction by any means necessary. + * All buffers are direct-mapped, so they do not need bwrite. + */ +void +xmbuf_trans_bdetach( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bli = bp->b_log_item; + + ASSERT(bli != NULL); + + bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED | + XFS_BLI_LOGGED | XFS_BLI_STALE); + clear_bit(XFS_LI_DIRTY, &bli->bli_item.li_flags); + + while (bp->b_log_item != NULL) + xfs_trans_bdetach(tp, bp); +} diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h new file mode 100644 index 000000000000..eed4a7b63232 --- /dev/null +++ b/fs/xfs/xfs_buf_mem.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_BUF_MEM_H__ +#define __XFS_BUF_MEM_H__ + +#define XMBUF_BLOCKSIZE (PAGE_SIZE) +#define XMBUF_BLOCKSHIFT (PAGE_SHIFT) + +#ifdef CONFIG_XFS_MEMORY_BUFS +static inline bool xfs_buftarg_is_mem(const struct xfs_buftarg *btp) +{ + return btp->bt_bdev == NULL; +} + +int xmbuf_alloc(struct xfs_mount *mp, const char *descr, + struct xfs_buftarg **btpp); +void xmbuf_free(struct xfs_buftarg *btp); + +int xmbuf_map_page(struct xfs_buf *bp); +void xmbuf_unmap_page(struct xfs_buf *bp); +bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr); +void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); +int xmbuf_finalize(struct xfs_buf *bp); +#else +# define xfs_buftarg_is_mem(...) (false) +# define xmbuf_map_page(...) (-ENOMEM) +# define xmbuf_unmap_page(...) ((void)0) +# define xmbuf_verify_daddr(...) (false) +#endif /* CONFIG_XFS_MEMORY_BUFS */ + +#endif /* __XFS_BUF_MEM_H__ */ diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 9f3ceb461515..cf9296b7e06f 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -18,6 +18,7 @@ #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_error.h" +#include "xfs_health.h" /* * Directory file type support functions @@ -51,7 +52,7 @@ xfs_dir2_sf_getdents( struct xfs_mount *mp = dp->i_mount; xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_dir2_dataptr_t dot_offset; xfs_dir2_dataptr_t dotdot_offset; xfs_ino_t ino; @@ -59,9 +60,7 @@ xfs_dir2_sf_getdents( ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* * If the block number in the offset is out of range, we're done. 
@@ -119,8 +118,10 @@ xfs_dir2_sf_getdents( ctx->pos = off & 0x7fffffff; if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(sfep->name, - sfep->namelen))) + sfep->namelen))) { + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; + } if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, xfs_dir3_get_dtype(mp, filetype))) return 0; @@ -212,6 +213,7 @@ xfs_dir2_block_getdents( if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(dep->name, dep->namelen))) { + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_rele; } @@ -466,6 +468,7 @@ xfs_dir2_leaf_getdents( if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(dep->name, dep->namelen))) { + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); error = -EFSCORRUPTED; break; } @@ -519,9 +522,11 @@ xfs_readdir( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL); XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index d5787991bb5b..268bb734dc0a 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -8,6 +8,7 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_trans.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_alloc_btree.h" @@ -18,6 +19,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_ag.h" +#include "xfs_health.h" /* * Notes on an efficient, low latency fstrim algorithm @@ -79,7 +81,7 @@ xfs_discard_endio_work( container_of(work, struct xfs_busy_extents, endio_work); xfs_extent_busy_clear(extents->mount, &extents->extent_list, false); - kmem_free(extents->owner); + kfree(extents->owner); } /* @@ -120,7 +122,7 @@ xfs_discard_extents( error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, XFS_AGB_TO_DADDR(mp, busyp->agno, 
busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, &bio); + GFP_KERNEL, &bio); if (error && error != -EOPNOTSUPP) { xfs_info(mp, "discard failed for extent [0x%llx,%u], error %d", @@ -155,6 +157,7 @@ xfs_trim_gather_extents( uint64_t *blocks_trimmed) { struct xfs_mount *mp = pag->pag_mount; + struct xfs_trans *tp; struct xfs_btree_cur *cur; struct xfs_buf *agbp; int error; @@ -168,11 +171,15 @@ xfs_trim_gather_extents( */ xfs_log_force(mp, XFS_LOG_SYNC); - error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); + if (error) + goto out_trans_cancel; + + cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); /* * Look up the extent length requested in the AGF and start with it. @@ -204,6 +211,7 @@ xfs_trim_gather_extents( if (error) break; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; break; } @@ -279,7 +287,8 @@ next_extent: xfs_extent_busy_clear(mp, &extents->extent_list, false); out_del_cursor: xfs_btree_del_cursor(cur, error); - xfs_buf_relse(agbp); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index ac6ba646624d..c98cb468c357 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -24,6 +24,7 @@ #include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_error.h" +#include "xfs_health.h" /* * Lock order: @@ -44,6 +45,29 @@ static struct kmem_cache *xfs_dquot_cache; static struct lock_class_key xfs_dquot_group_class; static struct lock_class_key xfs_dquot_project_class; +/* Record observations of quota corruption with the health tracking system. 
*/ +static void +xfs_dquot_mark_sick( + struct xfs_dquot *dqp) +{ + struct xfs_mount *mp = dqp->q_mount; + + switch (dqp->q_type) { + case XFS_DQTYPE_USER: + xfs_fs_mark_sick(mp, XFS_SICK_FS_UQUOTA); + break; + case XFS_DQTYPE_GROUP: + xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA); + break; + case XFS_DQTYPE_PROJ: + xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA); + break; + default: + ASSERT(0); + break; + } +} + /* * This is called to free all the memory associated with a dquot */ @@ -53,7 +77,7 @@ xfs_qm_dqdestroy( { ASSERT(list_empty(&dqp->q_lru)); - kmem_free(dqp->q_logitem.qli_item.li_lv_shadow); + kvfree(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); @@ -172,14 +196,14 @@ xfs_qm_adjust_dqtimers( /* * initialize a buffer full of dquots and log the whole thing */ -STATIC void +void xfs_qm_init_dquot_blk( struct xfs_trans *tp, - struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_buf *bp) { + struct xfs_mount *mp = tp->t_mountp; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dqblk *d; xfs_dqid_t curid; @@ -353,7 +377,7 @@ xfs_dquot_disk_alloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp); + xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* @@ -451,6 +475,8 @@ xfs_dquot_disk_read( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_dquot_mark_sick(dqp); if (error) { ASSERT(bp == NULL); return error; @@ -562,7 +588,8 @@ xfs_dquot_from_disk( struct xfs_dquot *dqp, struct xfs_buf *bp) { - struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; + struct xfs_dqblk *dqb = xfs_buf_offset(bp, dqp->q_bufoffset); + struct xfs_disk_dquot *ddqp = &dqb->dd_diskdq; /* * Ensure that we got the type and ID we were looking for. 
@@ -573,6 +600,7 @@ xfs_dquot_from_disk( "Metadata corruption detected at %pS, quota %u", __this_address, dqp->q_id); xfs_alert(bp->b_mount, "Unmount and run xfs_repair"); + xfs_dquot_mark_sick(dqp); return -EFSCORRUPTED; } @@ -783,6 +811,12 @@ restart: * caller should throw away the dquot and start over. Otherwise, the dquot * is returned locked (and held by the cache) as if there had been a cache * hit. + * + * The insert needs to be done under memalloc_nofs context because the radix + * tree can do memory allocation during insert. The qi->qi_tree_lock is taken in + * memory reclaim when freeing unused dquots, so we cannot have the radix tree + * node allocation recursing into filesystem reclaim whilst we hold the + * qi_tree_lock. */ static int xfs_qm_dqget_cache_insert( @@ -792,25 +826,27 @@ xfs_qm_dqget_cache_insert( xfs_dqid_t id, struct xfs_dquot *dqp) { + unsigned int nofs_flags; int error; + nofs_flags = memalloc_nofs_save(); mutex_lock(&qi->qi_tree_lock); error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { /* Duplicate found! Caller must try again. */ - mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_dup(dqp); - return error; + goto out_unlock; } /* Return a locked dquot to the caller, with a reference taken. */ xfs_dqlock(dqp); dqp->q_nrefs = 1; - qi->qi_dquots++; - mutex_unlock(&qi->qi_tree_lock); - return 0; +out_unlock: + mutex_unlock(&qi->qi_tree_lock); + memalloc_nofs_restore(nofs_flags); + return error; } /* Check our input parameters. 
*/ @@ -949,7 +985,7 @@ xfs_qm_dqget_inode( if (error) return error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(xfs_inode_dquot(ip, type) == NULL); id = xfs_qm_id_for_quotatype(ip, type); @@ -1006,7 +1042,7 @@ restart: } dqret: - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); trace_xfs_dqget_miss(dqp); *O_dqpp = dqp; return 0; @@ -1064,7 +1100,7 @@ xfs_qm_dqput( struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; trace_xfs_dqput_free(dqp); - if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } xfs_dqunlock(dqp); @@ -1237,6 +1273,8 @@ xfs_qm_dqflush( &bp, &xfs_dquot_buf_ops); if (error == -EAGAIN) goto out_unlock; + if (xfs_metadata_is_sick(error)) + xfs_dquot_mark_sick(dqp); if (error) goto out_abort; @@ -1245,12 +1283,13 @@ xfs_qm_dqflush( xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", dqp->q_id, fa); xfs_buf_relse(bp); + xfs_dquot_mark_sick(dqp); error = -EFSCORRUPTED; goto out_abort; } /* Flush the incore dquot to the ondisk buffer. */ - dqblk = bp->b_addr + dqp->q_bufoffset; + dqblk = xfs_buf_offset(bp, dqp->q_bufoffset); xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp); /* @@ -1361,34 +1400,3 @@ xfs_qm_exit(void) kmem_cache_destroy(xfs_dqtrx_cache); kmem_cache_destroy(xfs_dquot_cache); } - -/* - * Iterate every dquot of a particular type. The caller must ensure that the - * particular quota type is active. iter_fn can return negative error codes, - * or -ECANCELED to indicate that it wants to stop iterating. 
- */ -int -xfs_qm_dqiterate( - struct xfs_mount *mp, - xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, - void *priv) -{ - struct xfs_dquot *dq; - xfs_dqid_t id = 0; - int error; - - do { - error = xfs_qm_dqget_next(mp, id, type, &dq); - if (error == -ENOENT) - return 0; - if (error) - return error; - - error = iter_fn(dq, type, priv); - id = dq->q_id + 1; - xfs_qm_dqput(dq); - } while (error == 0 && id != 0); - - return error; -} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 80c8f851a2f3..956272d9b302 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -234,12 +234,10 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } -typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, - xfs_dqtype_t type, void *priv); -int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, void *priv); - time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); time64_t xfs_dquot_set_grace_period(time64_t grace); +void xfs_qm_init_dquot_blk(struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t + type, struct xfs_buf *bp); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 8966ba842395..2c2720ce6923 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -19,6 +19,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_error.h" STATIC void xlog_recover_dquot_ra_pass2( @@ -65,6 +66,7 @@ xlog_recover_dquot_commit_pass2( { struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; + struct xfs_dqblk *dqb; struct xfs_disk_dquot *ddq, *recddq; struct xfs_dq_logformat *dq_f; xfs_failaddr_t fa; @@ -130,14 +132,14 @@ xlog_recover_dquot_commit_pass2( return error; ASSERT(bp); - ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); + dqb = xfs_buf_offset(bp, dq_f->qlf_boffset); + ddq = &dqb->dd_diskdq; /* * If the dquot has an LSN in it, recover the dquot only if it's less * than 
the lsn of the transaction we are replaying. */ if (xfs_has_crc(mp)) { - struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { @@ -147,10 +149,23 @@ xlog_recover_dquot_commit_pass2( memcpy(ddq, recddq, item->ri_buf[1].i_len); if (xfs_has_crc(mp)) { - xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } + /* Validate the recovered dquot. */ + fa = xfs_dqblk_verify(log->l_mp, dqb, dq_f->qlf_id); + if (fa) { + XFS_CORRUPTION_ERROR("Bad dquot after recovery", + XFS_ERRLEVEL_LOW, mp, dqb, + sizeof(struct xfs_dqblk)); + xfs_alert(mp, + "Metadata corruption detected at %pS, dquot 0x%x", + fa, dq_f->qlf_id); + error = -EFSCORRUPTED; + goto out_release; + } + ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index b2cbbba3e15a..7ad0e92c6b5b 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -240,15 +240,15 @@ xfs_errortag_init( { int ret; - mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, - KM_MAYFAIL); + mp->m_errortag = kzalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!mp->m_errortag) return -ENOMEM; ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, &mp->m_kobj, "errortag"); if (ret) - kmem_free(mp->m_errortag); + kfree(mp->m_errortag); return ret; } @@ -257,7 +257,7 @@ xfs_errortag_del( struct xfs_mount *mp) { xfs_sysfs_del(&mp->m_errortag_kobj); - kmem_free(mp->m_errortag); + kfree(mp->m_errortag); } static bool diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 9ecfdcdc752f..56cfa1498571 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -32,7 +32,8 @@ xfs_extent_busy_insert_list( struct rb_node **rbp; struct rb_node *parent = NULL; - new = kmem_zalloc(sizeof(struct 
xfs_extent_busy), 0); + new = kzalloc(sizeof(struct xfs_extent_busy), + GFP_KERNEL | __GFP_NOFAIL); new->agno = pag->pag_agno; new->bno = bno; new->length = len; @@ -530,7 +531,7 @@ xfs_extent_busy_clear_one( } list_del_init(&busyp->list); - kmem_free(busyp); + kfree(busyp); } static void @@ -678,3 +679,16 @@ xfs_extent_busy_ag_cmp( diff = b1->bno - b2->bno; return diff; } + +/* Are there any busy extents in this AG? */ +bool +xfs_extent_busy_list_empty( + struct xfs_perag *pag) +{ + bool res; + + spin_lock(&pag->pagb_lock); + res = RB_EMPTY_ROOT(&pag->pagb_tree); + spin_unlock(&pag->pagb_lock); + return res; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 0639aab336f3..470032de3139 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -85,4 +85,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) list_sort(NULL, list, xfs_extent_busy_ag_cmp); } +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); + #endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3fa8789820ad..8c382f092332 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -40,9 +40,9 @@ STATIC void xfs_efi_item_free( struct xfs_efi_log_item *efip) { - kmem_free(efip->efi_item.li_lv_shadow); + kvfree(efip->efi_item.li_lv_shadow); if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) - kmem_free(efip); + kfree(efip); else kmem_cache_free(xfs_efi_cache, efip); } @@ -229,9 +229,9 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) STATIC void xfs_efd_item_free(struct xfs_efd_log_item *efdp) { - kmem_free(efdp->efd_item.li_lv_shadow); + kvfree(efdp->efd_item.li_lv_shadow); if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) - kmem_free(efdp); + kfree(efdp); else kmem_cache_free(xfs_efd_cache, efdp); } @@ -304,39 +304,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = { }; /* - * Allocate an "extent free done" log item that 
will hold nextents worth of - * extents. The caller must use all nextents extents, because we are not - * flexible about this at all. - */ -static struct xfs_efd_log_item * -xfs_trans_get_efd( - struct xfs_trans *tp, - struct xfs_efi_log_item *efip, - unsigned int nextents) -{ - struct xfs_efd_log_item *efdp; - - ASSERT(nextents > 0); - - if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - efdp = kzalloc(xfs_efd_log_item_sizeof(nextents), - GFP_KERNEL | __GFP_NOFAIL); - } else { - efdp = kmem_cache_zalloc(xfs_efd_cache, - GFP_KERNEL | __GFP_NOFAIL); - } - - xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, - &xfs_efd_item_ops); - efdp->efd_efip = efip; - efdp->efd_format.efd_nextents = nextents; - efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - - xfs_trans_add_item(tp, &efdp->efd_item); - return efdp; -} - -/* * Fill the EFD with all extents from the EFI when we need to roll the * transaction and continue with a new EFI. * @@ -364,69 +331,6 @@ xfs_efd_from_efi( efdp->efd_next_extent = efip->efi_format.efi_nextents; } -/* - * Free an extent and log it to the EFD. Note that the transaction is marked - * dirty regardless of whether the extent free succeeds or fails to support the - * EFI/EFD lifecycle rules. 
- */ -static int -xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, - struct xfs_extent_free_item *xefi) -{ - struct xfs_owner_info oinfo = { }; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent *extp; - uint next_extent; - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, - xefi->xefi_startblock); - int error; - - oinfo.oi_owner = xefi->xefi_owner; - if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - - trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, - agbno, xefi->xefi_blockcount); - - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, - xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - - /* - * If we need a new transaction to make progress, the caller will log a - * new EFI with the current contents. It will also log an EFD to cancel - * the existing EFI, and so we need to copy all the unprocessed extents - * in this EFI to the EFD so this works correctly. - */ - if (error == -EAGAIN) { - xfs_efd_from_efi(efdp); - return error; - } - - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = xefi->xefi_startblock; - extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - - return error; -} - /* Sort bmap items by AG. 
*/ static int xfs_extent_free_diff_items( @@ -453,9 +357,6 @@ xfs_extent_free_log_item( uint next_extent; struct xfs_extent *extp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -481,7 +382,6 @@ xfs_extent_free_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &efip->efi_item); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); list_for_each_entry(xefi, items, xefi_list) @@ -496,7 +396,26 @@ xfs_extent_free_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; + struct xfs_efi_log_item *efip = EFI_ITEM(intent); + struct xfs_efd_log_item *efdp; + + ASSERT(count > 0); + + if (count > XFS_EFD_MAX_FAST_EXTENTS) { + efdp = kzalloc(xfs_efd_log_item_sizeof(count), + GFP_KERNEL | __GFP_NOFAIL); + } else { + efdp = kmem_cache_zalloc(xfs_efd_cache, + GFP_KERNEL | __GFP_NOFAIL); + } + + xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, + &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = count; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return &efdp->efd_item; } /* Take a passive ref to the AG containing the space we're freeing. 
*/ @@ -527,19 +446,49 @@ xfs_extent_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { + struct xfs_owner_info oinfo = { }; struct xfs_extent_free_item *xefi; - int error; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; + xfs_agblock_t agbno; + int error = 0; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); + + oinfo.oi_owner = xefi->xefi_owner; + if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, + agbno, xefi->xefi_blockcount); /* - * Don't free the XEFI if we need a new transaction to complete - * processing of it. + * If we need a new transaction to make progress, the caller will log a + * new EFI with the current contents. It will also log an EFD to cancel + * the existing EFI, and so we need to copy all the unprocessed extents + * in this EFI to the EFD so this works correctly. 
*/ - if (error == -EAGAIN) + if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); return error; + } + + /* Add the work we finished to the EFD, even though nobody uses that */ + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; + efdp->efd_next_extent++; xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); @@ -567,15 +516,6 @@ xfs_extent_free_cancel_item( kmem_cache_free(xfs_extfree_item_cache, xefi); } -const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_extent_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* * AGFL blocks are accounted differently in the reserve pools and are not * inserted into the busy extent list. @@ -610,16 +550,6 @@ xfs_agfl_free_finish_item( error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno, agbno, agbp, &oinfo); - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) 
shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); @@ -632,16 +562,6 @@ xfs_agfl_free_finish_item( return error; } -/* sub-type with special handling for AGFL deferred frees */ -const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_agfl_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* Is this recovered EFI ok? */ static inline bool xfs_efi_validate_ext( @@ -651,23 +571,41 @@ xfs_efi_validate_ext( return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len); } +static inline void +xfs_efi_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_extent *extp) +{ + struct xfs_extent_free_item *xefi; + + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); + xefi->xefi_startblock = extp->ext_start; + xefi->xefi_blockcount = extp->ext_len; + xefi->xefi_agresv = XFS_AG_RESV_NONE; + xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_extent_free_get_group(mp, xefi); + + xfs_defer_add_item(dfp, &xefi->xefi_list); +} + /* * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. 
*/ STATIC int -xfs_efi_item_recover( - struct xfs_log_item *lip, +xfs_extent_free_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_efd_log_item *efdp; struct xfs_trans *tp; int i; int error = 0; - bool requeue_only = false; /* * First check the validity of the extents described by the @@ -682,55 +620,22 @@ xfs_efi_item_recover( sizeof(efip->efi_format)); return -EFSCORRUPTED; } + + xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp); if (error) return error; - efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - - for (i = 0; i < efip->efi_format.efi_nextents; i++) { - struct xfs_extent_free_item fake = { - .xefi_owner = XFS_RMAP_OWN_UNKNOWN, - .xefi_agresv = XFS_AG_RESV_NONE, - }; - struct xfs_extent *extp; - extp = &efip->efi_format.efi_extents[i]; - - fake.xefi_startblock = extp->ext_start; - fake.xefi_blockcount = extp->ext_len; - - if (!requeue_only) { - xfs_extent_free_get_group(mp, &fake); - error = xfs_trans_free_extent(tp, efdp, &fake); - xfs_extent_free_put_group(&fake); - } - - /* - * If we can't free the extent without potentially deadlocking, - * requeue the rest of the extents to a new so that they get - * run again later with a new transaction context. 
- */ - if (error == -EAGAIN || requeue_only) { - error = xfs_free_extent_later(tp, fake.xefi_startblock, - fake.xefi_blockcount, - &XFS_RMAP_OINFO_ANY_OWNER, - fake.xefi_agresv); - if (!error) { - requeue_only = true; - continue; - } - } - - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - extp, sizeof(*extp)); - if (error) - goto abort_error; - - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &efip->efi_format, + sizeof(efip->efi_format)); + if (error) + goto abort_error; return xfs_defer_ops_capture_and_commit(tp, capture_list); @@ -739,21 +644,14 @@ abort_error: return error; } -STATIC bool -xfs_efi_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return EFI_ITEM(lip)->efi_format.efi_id == intent_id; -} - /* Relog an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_efi_item_relog( +xfs_extent_free_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_efd_log_item *efdp; + struct xfs_efd_log_item *efdp = EFD_ITEM(done_item); struct xfs_efi_log_item *efip; struct xfs_extent *extp; unsigned int count; @@ -761,29 +659,56 @@ xfs_efi_item_relog( count = EFI_ITEM(intent)->efi_format.efi_nextents; extp = EFI_ITEM(intent)->efi_format.efi_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); efdp->efd_next_extent = count; memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); efip = xfs_efi_init(tp->t_mountp, count); memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); atomic_set(&efip->efi_next_extent, count); - xfs_trans_add_item(tp, &efip->efi_item); - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; } +const struct xfs_defer_op_type xfs_extent_free_defer_type = { + 
.name = "extent_free", + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +/* sub-type with special handling for AGFL deferred frees */ +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .name = "agfl_free", + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +STATIC bool +xfs_efi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return EFI_ITEM(lip)->efi_format.efi_id == intent_id; +} + static const struct xfs_item_ops xfs_efi_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_unpin = xfs_efi_item_unpin, .iop_release = xfs_efi_item_release, - .iop_recover = xfs_efi_item_recover, .iop_match = xfs_efi_item_match, - .iop_relog = xfs_efi_item_relog, }; /* @@ -820,12 +745,9 @@ xlog_recover_efi_commit_pass2( return error; } atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn); - xfs_efi_release(efip); + + xlog_recover_intent_item(log, &efip->efi_item, lsn, + &xfs_extent_free_defer_type); return 0; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 203700278ddb..2ce302b4885f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -214,6 +214,43 @@ xfs_ilock_iocb( return 0; } +static int +xfs_ilock_iocb_for_write( + struct kiocb *iocb, + unsigned int *lock_mode) +{ + ssize_t ret; + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + + ret = xfs_ilock_iocb(iocb, *lock_mode); + if (ret) + return ret; + + if (*lock_mode == XFS_IOLOCK_EXCL) + return 0; + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) + return 0; + + xfs_iunlock(ip, *lock_mode); + *lock_mode = XFS_IOLOCK_EXCL; + return xfs_ilock_iocb(iocb, *lock_mode); +} + +static unsigned int +xfs_ilock_for_write_fault( + struct xfs_inode *ip) +{ + /* get a shared lock if no remapping in progress */ + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) + return XFS_MMAPLOCK_SHARED; + + /* wait for remapping to complete */ + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + return XFS_MMAPLOCK_EXCL; +} + STATIC ssize_t xfs_file_dio_read( struct kiocb *iocb, @@ -551,7 +588,7 @@ xfs_file_dio_write_aligned( unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; - ret = xfs_ilock_iocb(iocb, iolock); + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; ret = xfs_file_write_checks(iocb, from, &iolock); @@ -618,7 +655,7 @@ retry_exclusive: flags = IOMAP_DIO_FORCE_WAIT; } - ret = xfs_ilock_iocb(iocb, iolock); + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -842,7 +879,7 @@ xfs_break_dax_layouts( { struct page *page; - ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL)); + xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL); page = dax_layout_busy_page(inode->i_mapping); if (!page) @@ -863,7 +900,7 @@ xfs_break_layouts( bool retry; 
int error; - ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); + xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL); do { retry = false; @@ -1180,7 +1217,7 @@ xfs_file_remap_range( if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) xfs_log_force_inode(dest); out_unlock: - xfs_iunlock2_io_mmap(src, dest); + xfs_iunlock2_remapping(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return remapped > 0 ? remapped : ret; @@ -1193,8 +1230,7 @@ xfs_file_open( { if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | - FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT; + file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return generic_file_open(inode, file); } @@ -1207,7 +1243,9 @@ xfs_dir_open( unsigned int mode; int error; - error = xfs_file_open(inode, file); + if (xfs_is_shutdown(ip->i_mount)) + return -EIO; + error = generic_file_open(inode, file); if (error) return error; @@ -1328,6 +1366,7 @@ __xfs_filemap_fault( struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); vm_fault_t ret; + unsigned int lock_mode = 0; trace_xfs_filemap_fault(ip, order, write_fault); @@ -1336,25 +1375,24 @@ __xfs_filemap_fault( file_update_time(vmf->vma->vm_file); } + if (IS_DAX(inode) || write_fault) + lock_mode = xfs_ilock_for_write_fault(XFS_I(inode)); + if (IS_DAX(inode)) { pfn_t pfn; - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = xfs_dax_fault(vmf, order, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, order, pfn); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + } else if (write_fault) { + ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); } else { - if (write_fault) { - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = iomap_page_mkwrite(vmf, - &xfs_page_mkwrite_iomap_ops); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - } else { - ret = 
filemap_fault(vmf); - } + ret = filemap_fault(vmf); } + if (lock_mode) + xfs_iunlock(XFS_I(inode), lock_mode); + if (write_fault) sb_end_pagefault(inode->i_sb); return ret; @@ -1453,7 +1491,6 @@ const struct file_operations xfs_file_operations = { .compat_ioctl = xfs_file_compat_ioctl, #endif .mmap = xfs_file_mmap, - .mmap_supported_flags = MAP_SYNC, .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, @@ -1461,6 +1498,8 @@ const struct file_operations xfs_file_operations = { .fallocate = xfs_file_fallocate, .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, + .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | + FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 2fc98d313708..e3aaa0555597 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -44,7 +44,7 @@ xfs_fstrm_free_func( atomic_dec(&pag->pagf_fstrms); xfs_perag_rele(pag); - kmem_free(item); + kfree(item); } /* @@ -313,7 +313,7 @@ xfs_filestream_create_association( * we return a referenced AG, the allocation can still go ahead just * fine. */ - item = kmem_alloc(sizeof(*item), KM_MAYFAIL); + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!item) goto out_put_fstrms; @@ -326,7 +326,7 @@ xfs_filestream_create_association( out_free_item: xfs_perag_rele(item->pag); - kmem_free(item); + kfree(item); out_put_fstrms: atomic_dec(&args->pag->pagf_fstrms); return 0; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 736e5545f584..de59eec74765 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -23,7 +23,7 @@ #include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_alloc_btree.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_ag.h" /* Convert an xfs_fsmap to an fsmap. 
*/ @@ -483,11 +483,11 @@ xfs_getfsmap_rtdev_rtbitmap_helper( xfs_rtblock_t rtbno; xfs_daddr_t rec_daddr, len_daddr; - rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; + rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); rec_daddr = XFS_FSB_TO_BB(mp, rtbno); irec.rm_startblock = rtbno; - rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize; + rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount); len_daddr = XFS_FSB_TO_BB(mp, rtbno); irec.rm_blockcount = rtbno; @@ -514,7 +514,7 @@ xfs_getfsmap_rtdev_rtbitmap( uint64_t eofs; int error; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize); + eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents)); if (keys[0].fmr_physical >= eofs) return 0; start_rtb = XFS_BB_TO_FSBT(mp, @@ -539,11 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap( * Set up query parameters to return free rtextents covering the range * we want. */ - alow.ar_startext = start_rtb; - ahigh.ar_startext = end_rtb; - do_div(alow.ar_startext, mp->m_sb.sb_rextsize); - if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) - ahigh.ar_startext++; + alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb); + ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb); error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) @@ -766,8 +763,8 @@ xfs_getfsmap_datadev_bnobt_query( return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info); /* Allocate cursor for this AG and query_range it. 
*/ - *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->pag, XFS_BTNUM_BNO); + *curpp = xfs_bnobt_init_cursor(tp->t_mountp, tp, info->agf_bp, + info->pag); key->ar_startblock = info->low.rm_startblock; key[1].ar_startblock = info->high.rm_startblock; return xfs_alloc_query_range(*curpp, key, &key[1], diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7cb75cb6b8e9..83f708f62ed9 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -134,6 +134,10 @@ xfs_growfs_data_private( if (delta < 0 && nagcount < 2) return -EINVAL; + /* No work to do */ + if (delta == 0) + return 0; + oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ if (nagcount > oagcount) { @@ -153,7 +157,7 @@ xfs_growfs_data_private( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0, 0, &tp); if (error) - return error; + goto out_free_unused_perag; last_pag = xfs_perag_get(mp, oagcount - 1); if (delta > 0) { @@ -227,6 +231,9 @@ xfs_growfs_data_private( out_trans_cancel: xfs_trans_cancel(tp); +out_free_unused_perag: + if (nagcount > oagcount) + xfs_free_unused_perag_range(mp, oagcount, nagcount); return error; } @@ -344,59 +351,20 @@ xfs_growfs_log( } /* - * exported through ioctl XFS_IOC_FSCOUNTS - */ - -void -xfs_fs_counts( - xfs_mount_t *mp, - xfs_fsop_counts_t *cnt) -{ - cnt->allocino = percpu_counter_read_positive(&mp->m_icount); - cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); - cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); -} - -/* - * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS - * - * xfs_reserve_blocks is called to set m_resblks - * in the in-core mount table. The number of unused reserved blocks - * is kept in m_resblks_avail. - * * Reserve the requested number of blocks if available. Otherwise return * as many as possible to satisfy the request. 
The actual number - * reserved are returned in outval - * - * A null inval pointer indicates that only the current reserved blocks - * available should be returned no settings are changed. + * reserved are returned in outval. */ - int xfs_reserve_blocks( - xfs_mount_t *mp, - uint64_t *inval, - xfs_fsop_resblks_t *outval) + struct xfs_mount *mp, + uint64_t request) { int64_t lcounter, delta; int64_t fdblks_delta = 0; - uint64_t request; int64_t free; int error = 0; - /* If inval is null, report current values and return */ - if (inval == (uint64_t *)NULL) { - if (!outval) - return -EINVAL; - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - return 0; - } - - request = *inval; - /* * With per-cpu counters, this becomes an interesting problem. we need * to work out if we are freeing or allocation blocks first, then we can @@ -466,11 +434,6 @@ xfs_reserve_blocks( spin_lock(&mp->m_sb_lock); } out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); return error; } @@ -482,9 +445,9 @@ xfs_fs_goingdown( { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { - if (!freeze_bdev(mp->m_super->s_bdev)) { + if (!bdev_freeze(mp->m_super->s_bdev)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); - thaw_bdev(mp->m_super->s_bdev); + bdev_thaw(mp->m_super->s_bdev); } break; } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 2cffe51a31e8..44457b0a0593 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -6,14 +6,12 @@ #ifndef __XFS_FSOPS_H__ #define __XFS_FSOPS_H__ -extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); -extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); -extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, - xfs_fsop_resblks_t *outval); -extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); +int 
xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); +int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); +int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); +int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); -extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); -extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); +int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); +int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 9edc1f2bc939..f18fec0adf66 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -44,4 +44,16 @@ struct xfs_globals xfs_globals = { .pwork_threads = -1, /* automatic thread detection */ .larp = false, /* log attribute replay */ #endif + + /* + * Leave this many record slots empty when bulk loading btrees. By + * default we load new btree leaf blocks 75% full. + */ + .bload_leaf_slack = -1, + + /* + * Leave this many key/ptr slots empty when bulk loading btrees. By + * default we load new btree node blocks 75% full. + */ + .bload_node_slack = -1, }; diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 72a075bb2c10..b39f959146bc 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -14,6 +14,10 @@ #include "xfs_trace.h" #include "xfs_health.h" #include "xfs_ag.h" +#include "xfs_btree.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_quota_defs.h" /* * Warn about metadata corruption that we detected but haven't fixed, and @@ -93,11 +97,25 @@ xfs_fs_mark_sick( struct xfs_mount *mp, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_sick(mp, mask); spin_lock(&mp->m_sb_lock); mp->m_fs_sick |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark per-fs metadata as having been checked and found unhealthy by fsck. 
*/ +void +xfs_fs_mark_corrupt( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_FS_ALL)); + trace_xfs_fs_mark_corrupt(mp, mask); + + spin_lock(&mp->m_sb_lock); + mp->m_fs_sick |= mask; mp->m_fs_checked |= mask; spin_unlock(&mp->m_sb_lock); } @@ -108,11 +126,13 @@ xfs_fs_mark_healthy( struct xfs_mount *mp, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_healthy(mp, mask); spin_lock(&mp->m_sb_lock); mp->m_fs_sick &= ~mask; + if (!(mp->m_fs_sick & XFS_SICK_FS_PRIMARY)) + mp->m_fs_sick &= ~XFS_SICK_FS_SECONDARY; mp->m_fs_checked |= mask; spin_unlock(&mp->m_sb_lock); } @@ -136,11 +156,25 @@ xfs_rt_mark_sick( struct xfs_mount *mp, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_RT_ALL)); trace_xfs_rt_mark_sick(mp, mask); spin_lock(&mp->m_sb_lock); mp->m_rt_sick |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark realtime metadata as having been checked and found unhealthy by fsck. */ +void +xfs_rt_mark_corrupt( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_RT_ALL)); + trace_xfs_rt_mark_corrupt(mp, mask); + + spin_lock(&mp->m_sb_lock); + mp->m_rt_sick |= mask; mp->m_rt_checked |= mask; spin_unlock(&mp->m_sb_lock); } @@ -151,11 +185,13 @@ xfs_rt_mark_healthy( struct xfs_mount *mp, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_RT_ALL)); trace_xfs_rt_mark_healthy(mp, mask); spin_lock(&mp->m_sb_lock); mp->m_rt_sick &= ~mask; + if (!(mp->m_rt_sick & XFS_SICK_RT_PRIMARY)) + mp->m_rt_sick &= ~XFS_SICK_RT_SECONDARY; mp->m_rt_checked |= mask; spin_unlock(&mp->m_sb_lock); } @@ -173,17 +209,48 @@ xfs_rt_measure_sickness( spin_unlock(&mp->m_sb_lock); } +/* Mark unhealthy per-ag metadata given a raw AG number. 
*/ +void +xfs_agno_mark_sick( + struct xfs_mount *mp, + xfs_agnumber_t agno, + unsigned int mask) +{ + struct xfs_perag *pag = xfs_perag_get(mp, agno); + + /* per-ag structure not set up yet? */ + if (!pag) + return; + + xfs_ag_mark_sick(pag, mask); + xfs_perag_put(pag); +} + /* Mark unhealthy per-ag metadata. */ void xfs_ag_mark_sick( struct xfs_perag *pag, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_AG_ALL)); trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask); spin_lock(&pag->pag_state_lock); pag->pag_sick |= mask; + spin_unlock(&pag->pag_state_lock); +} + +/* Mark per-ag metadata as having been checked and found unhealthy by fsck. */ +void +xfs_ag_mark_corrupt( + struct xfs_perag *pag, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_AG_ALL)); + trace_xfs_ag_mark_corrupt(pag->pag_mount, pag->pag_agno, mask); + + spin_lock(&pag->pag_state_lock); + pag->pag_sick |= mask; pag->pag_checked |= mask; spin_unlock(&pag->pag_state_lock); } @@ -194,11 +261,13 @@ xfs_ag_mark_healthy( struct xfs_perag *pag, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_AG_ALL)); trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask); spin_lock(&pag->pag_state_lock); pag->pag_sick &= ~mask; + if (!(pag->pag_sick & XFS_SICK_AG_PRIMARY)) + pag->pag_sick &= ~XFS_SICK_AG_SECONDARY; pag->pag_checked |= mask; spin_unlock(&pag->pag_state_lock); } @@ -222,11 +291,34 @@ xfs_inode_mark_sick( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_sick(ip, mask); spin_lock(&ip->i_flags_lock); ip->i_sick |= mask; + spin_unlock(&ip->i_flags_lock); + + /* + * Keep this inode around so we don't lose the sickness report. Scrub + * grabs inodes with DONTCACHE assuming that most inode are ok, which + * is not the case here. 
+ */ + spin_lock(&VFS_I(ip)->i_lock); + VFS_I(ip)->i_state &= ~I_DONTCACHE; + spin_unlock(&VFS_I(ip)->i_lock); +} + +/* Mark inode metadata as having been checked and found unhealthy by fsck. */ +void +xfs_inode_mark_corrupt( + struct xfs_inode *ip, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_INO_ALL)); + trace_xfs_inode_mark_corrupt(ip, mask); + + spin_lock(&ip->i_flags_lock); + ip->i_sick |= mask; ip->i_checked |= mask; spin_unlock(&ip->i_flags_lock); @@ -246,11 +338,13 @@ xfs_inode_mark_healthy( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_healthy(ip, mask); spin_lock(&ip->i_flags_lock); ip->i_sick &= ~mask; + if (!(ip->i_sick & XFS_SICK_INO_PRIMARY)) + ip->i_sick &= ~XFS_SICK_INO_SECONDARY; ip->i_checked |= mask; spin_unlock(&ip->i_flags_lock); } @@ -280,6 +374,8 @@ static const struct ioctl_sick_map fs_map[] = { { XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA }, { XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA }, { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA }, + { XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK }, + { XFS_SICK_FS_NLINKS, XFS_FSOP_GEOM_SICK_NLINKS }, { 0, 0 }, }; @@ -335,6 +431,7 @@ static const struct ioctl_sick_map ag_map[] = { { XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT }, { XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT }, { XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT }, + { XFS_SICK_AG_INODES, XFS_AG_GEOM_SICK_INODES }, { 0, 0 }, }; @@ -369,6 +466,10 @@ static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR }, { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK }, { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT }, + { XFS_SICK_INO_BMBTD_ZAPPED, XFS_BS_SICK_BMBTD }, + { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA }, + { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, + { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, { 0, 0 }, }; @@ -393,3 +494,92 @@ xfs_bulkstat_health( bs->bs_sick |= m->ioctl_mask; 
} } + +/* Mark a block mapping sick. */ +void +xfs_bmap_mark_sick( + struct xfs_inode *ip, + int whichfork) +{ + unsigned int mask; + + switch (whichfork) { + case XFS_DATA_FORK: + mask = XFS_SICK_INO_BMBTD; + break; + case XFS_ATTR_FORK: + mask = XFS_SICK_INO_BMBTA; + break; + case XFS_COW_FORK: + mask = XFS_SICK_INO_BMBTC; + break; + default: + ASSERT(0); + return; + } + + xfs_inode_mark_sick(ip, mask); +} + +/* Record observations of btree corruption with the health tracking system. */ +void +xfs_btree_mark_sick( + struct xfs_btree_cur *cur) +{ + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + /* no health state tracking for ephemeral btrees */ + return; + case XFS_BTREE_TYPE_AG: + ASSERT(cur->bc_ops->sick_mask); + xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask); + return; + case XFS_BTREE_TYPE_INODE: + if (xfs_btree_is_bmap(cur->bc_ops)) { + xfs_bmap_mark_sick(cur->bc_ino.ip, + cur->bc_ino.whichfork); + return; + } + fallthrough; + default: + ASSERT(0); + return; + } +} + +/* + * Record observations of dir/attr btree corruption with the health tracking + * system. + */ +void +xfs_dirattr_mark_sick( + struct xfs_inode *ip, + int whichfork) +{ + unsigned int mask; + + switch (whichfork) { + case XFS_DATA_FORK: + mask = XFS_SICK_INO_DIR; + break; + case XFS_ATTR_FORK: + mask = XFS_SICK_INO_XATTR; + break; + default: + ASSERT(0); + return; + } + + xfs_inode_mark_sick(ip, mask); +} + +/* + * Record observations of dir/attr btree corruption with the health tracking + * system. + */ +void +xfs_da_mark_sick( + struct xfs_da_args *args) +{ + xfs_dirattr_mark_sick(args->dp, args->whichfork); +} diff --git a/fs/xfs/xfs_hooks.c b/fs/xfs/xfs_hooks.c new file mode 100644 index 000000000000..a58d1de2d37d --- /dev/null +++ b/fs/xfs/xfs_hooks.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_trace.h" + +/* Initialize a notifier chain. */ +void +xfs_hooks_init( + struct xfs_hooks *chain) +{ + BLOCKING_INIT_NOTIFIER_HEAD(&chain->head); +} + +/* Make it so a function gets called whenever we hit a certain hook point. */ +int +xfs_hooks_add( + struct xfs_hooks *chain, + struct xfs_hook *hook) +{ + ASSERT(hook->nb.notifier_call != NULL); + BUILD_BUG_ON(offsetof(struct xfs_hook, nb) != 0); + + return blocking_notifier_chain_register(&chain->head, &hook->nb); +} + +/* Remove a previously installed hook. */ +void +xfs_hooks_del( + struct xfs_hooks *chain, + struct xfs_hook *hook) +{ + blocking_notifier_chain_unregister(&chain->head, &hook->nb); +} + +/* Call a hook. Returns the NOTIFY_* value returned by the last hook. */ +int +xfs_hooks_call( + struct xfs_hooks *chain, + unsigned long val, + void *priv) +{ + return blocking_notifier_call_chain(&chain->head, val, priv); +} diff --git a/fs/xfs/xfs_hooks.h b/fs/xfs/xfs_hooks.h new file mode 100644 index 000000000000..60b8a5831536 --- /dev/null +++ b/fs/xfs/xfs_hooks.h @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef XFS_HOOKS_H_ +#define XFS_HOOKS_H_ + +#ifdef CONFIG_XFS_LIVE_HOOKS +struct xfs_hooks { + struct blocking_notifier_head head; +}; + +/* + * If jump labels are enabled in Kconfig, the static key uses nop sleds and + * code patching to eliminate the overhead of taking the rwsem in + * blocking_notifier_call_chain when there are no hooks configured. If not, + * the static key per-call overhead is an atomic read. Most arches that can + * handle XFS also support jump labels. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. 
Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) \ + static DEFINE_STATIC_KEY_FALSE(name) +# define xfs_hooks_switch_on(name) static_branch_inc(name) +# define xfs_hooks_switch_off(name) static_branch_dec(name) +# define xfs_hooks_switched_on(name) static_branch_unlikely(name) + +struct xfs_hook { + /* This must come at the start of the structure. */ + struct notifier_block nb; +}; + +typedef int (*xfs_hook_fn_t)(struct xfs_hook *hook, unsigned long action, + void *data); + +void xfs_hooks_init(struct xfs_hooks *chain); +int xfs_hooks_add(struct xfs_hooks *chain, struct xfs_hook *hook); +void xfs_hooks_del(struct xfs_hooks *chain, struct xfs_hook *hook); +int xfs_hooks_call(struct xfs_hooks *chain, unsigned long action, + void *priv); + +static inline void xfs_hook_setup(struct xfs_hook *hook, notifier_fn_t fn) +{ + hook->nb.notifier_call = fn; + hook->nb.priority = 0; +} + +#else + +struct xfs_hooks { /* empty */ }; + +# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) +# define xfs_hooks_switch_on(name) ((void)0) +# define xfs_hooks_switch_off(name) ((void)0) +# define xfs_hooks_switched_on(name) (false) + +# define xfs_hooks_init(chain) ((void)0) +# define xfs_hooks_call(chain, val, priv) (NOTIFY_DONE) +#endif + +#endif /* XFS_HOOKS_H_ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index dba514a2c84d..74f1812b03cb 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -24,6 +24,7 @@ #include "xfs_ialloc.h" #include "xfs_ag.h" #include "xfs_log_priv.h" +#include "xfs_health.h" #include <linux/iversion.h> @@ -415,6 +416,9 @@ xfs_iget_check_free_state( xfs_warn(ip->i_mount, "Corruption detected! Free inode 0x%llx not marked free! 
(mode 0x%x)", ip->i_ino, VFS_I(ip)->i_mode); + xfs_agno_mark_sick(ip->i_mount, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_SICK_AG_INOBT); return -EFSCORRUPTED; } @@ -422,6 +426,9 @@ xfs_iget_check_free_state( xfs_warn(ip->i_mount, "Corruption detected! Free inode 0x%llx has blocks allocated!", ip->i_ino); + xfs_agno_mark_sick(ip->i_mount, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_SICK_AG_INOBT); return -EFSCORRUPTED; } return 0; @@ -640,6 +647,8 @@ xfs_iget_cache_miss( xfs_buf_offset(bp, ip->i_imap.im_boffset)); if (!error) xfs_buf_set_ref(bp, XFS_INO_REF); + else + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); xfs_trans_brelse(tp, bp); if (error) @@ -659,10 +668,9 @@ xfs_iget_cache_miss( /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload - * region. Since we can be called from transaction context, don't - * recurse into the file system. + * region. */ - if (radix_tree_preload(GFP_NOFS)) { + if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) { error = -EAGAIN; goto out_destroy; } @@ -2031,8 +2039,10 @@ xfs_inodegc_want_queue_work( * - Memory shrinkers queued the inactivation worker and it hasn't finished. * - The queue depth exceeds the maximum allowable percpu backlog. * - * Note: If the current thread is running a transaction, we don't ever want to - * wait for other transactions because that could introduce a deadlock. + * Note: If we are in a NOFS context here (e.g. current thread is running a + * transaction) the we don't want to block here as inodegc progress may require + * filesystem resources we hold to make progress and that could result in a + * deadlock. Hence we skip out of here if we are in a scoped NOFS context. 
*/ static inline bool xfs_inodegc_want_flush_work( @@ -2040,7 +2050,7 @@ xfs_inodegc_want_flush_work( unsigned int items, unsigned int shrinker_hits) { - if (current->journal_info) + if (current->flags & PF_MEMALLOC_NOFS) return false; if (shrinker_hits > 0) diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index b05314d48176..4345db501714 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -63,7 +63,7 @@ STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { - kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow); + kvfree(ICR_ITEM(lip)->ic_item.li_lv_shadow); kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip)); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 36f5cf802c07..d55b42b2480d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -37,15 +37,10 @@ #include "xfs_reflink.h" #include "xfs_ag.h" #include "xfs_log_priv.h" +#include "xfs_health.h" struct kmem_cache *xfs_inode_cache; -/* - * Used in xfs_itruncate_extents(). This is the maximum number of extents - * freed from a file in a single transaction. 
- */ -#define XFS_ITRUNC_MAX_EXTENTS 2 - STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inode *); @@ -208,9 +203,9 @@ xfs_ilock( } if (lock_flags & XFS_ILOCK_EXCL) - mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); else if (lock_flags & XFS_ILOCK_SHARED) - mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); } /* @@ -251,10 +246,10 @@ xfs_ilock_nowait( } if (lock_flags & XFS_ILOCK_EXCL) { - if (!mrtryupdate(&ip->i_lock)) + if (!down_write_trylock(&ip->i_lock)) goto out_undo_mmaplock; } else if (lock_flags & XFS_ILOCK_SHARED) { - if (!mrtryaccess(&ip->i_lock)) + if (!down_read_trylock(&ip->i_lock)) goto out_undo_mmaplock; } return 1; @@ -303,9 +298,9 @@ xfs_iunlock( up_read(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_ILOCK_EXCL) - mrunlock_excl(&ip->i_lock); + up_write(&ip->i_lock); else if (lock_flags & XFS_ILOCK_SHARED) - mrunlock_shared(&ip->i_lock); + up_read(&ip->i_lock); trace_xfs_iunlock(ip, lock_flags, _RET_IP_); } @@ -324,7 +319,7 @@ xfs_ilock_demote( ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); if (lock_flags & XFS_ILOCK_EXCL) - mrdemote(&ip->i_lock); + downgrade_write(&ip->i_lock); if (lock_flags & XFS_MMAPLOCK_EXCL) downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_IOLOCK_EXCL) @@ -333,52 +328,30 @@ xfs_ilock_demote( trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } -#if defined(DEBUG) || defined(XFS_WARN) -static inline bool -__xfs_rwsem_islocked( - struct rw_semaphore *rwsem, - bool shared) -{ - if (!debug_locks) - return rwsem_is_locked(rwsem); - - if (!shared) - return lockdep_is_held_type(rwsem, 0); - - /* - * We are checking that the lock is held at least in shared - * mode but don't care that it might be held exclusively - * (i.e. shared | excl). 
Hence we check if the lock is held - * in any mode rather than an explicit shared mode. - */ - return lockdep_is_held_type(rwsem, -1); -} - -bool -xfs_isilocked( +void +xfs_assert_ilocked( struct xfs_inode *ip, uint lock_flags) { - if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { - if (!(lock_flags & XFS_ILOCK_SHARED)) - return !!ip->i_lock.mr_writer; - return rwsem_is_locked(&ip->i_lock.mr_lock); - } - - if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock, - (lock_flags & XFS_MMAPLOCK_SHARED)); - } + /* + * Sometimes we assert the ILOCK is held exclusively, but we're in + * a workqueue, so lockdep doesn't know we're the owner. + */ + if (lock_flags & XFS_ILOCK_SHARED) + rwsem_assert_held(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_EXCL) + rwsem_assert_held_write_nolockdep(&ip->i_lock); - if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) { - return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, - (lock_flags & XFS_IOLOCK_SHARED)); - } + if (lock_flags & XFS_MMAPLOCK_SHARED) + rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock); + else if (lock_flags & XFS_MMAPLOCK_EXCL) + rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock); - ASSERT(0); - return false; + if (lock_flags & XFS_IOLOCK_SHARED) + rwsem_assert_held(&VFS_I(ip)->i_rwsem); + else if (lock_flags & XFS_IOLOCK_EXCL) + rwsem_assert_held_write(&VFS_I(ip)->i_rwsem); } -#endif /* * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when @@ -661,6 +634,8 @@ xfs_lookup( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) @@ -674,7 +649,7 @@ xfs_lookup( out_free_name: if (ci_name) - kmem_free(ci_name->name); + kfree(ci_name->name); out_unlock: *ipp = NULL; return error; @@ -805,6 +780,8 @@ xfs_init_new_inode( */ if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) { 
xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); + xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino), + XFS_SICK_AG_INOBT); return -EFSCORRUPTED; } @@ -875,7 +852,7 @@ xfs_init_new_inode( case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_bytes = 0; - ip->i_df.if_u1.if_root = NULL; + ip->i_df.if_data = NULL; break; default: ASSERT(0); @@ -918,6 +895,13 @@ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) { + if (VFS_I(ip)->i_nlink == 0) { + xfs_alert(ip->i_mount, + "%s: Attempt to drop inode (%llu) with nlink zero.", + __func__, ip->i_ino); + return -EFSCORRUPTED; + } + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); drop_nlink(VFS_I(ip)); @@ -943,6 +927,81 @@ xfs_bumplink( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of directory live update hooks. + * If the compiler supports jump labels, the static branch will be replaced by + * a nop sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); + +void +xfs_dir_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_dir_hooks_switch); +} + +void +xfs_dir_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_dir_hooks_switch); +} + +/* Call hooks for a directory update relating to a child dirent update. 
*/ +inline void +xfs_dir_update_hook( + struct xfs_inode *dp, + struct xfs_inode *ip, + int delta, + const struct xfs_name *name) +{ + if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { + struct xfs_dir_update_params p = { + .dp = dp, + .ip = ip, + .delta = delta, + .name = name, + }; + struct xfs_mount *mp = ip->i_mount; + + xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p); + } +} + +/* Call the specified function during a directory update. */ +int +xfs_dir_hook_add( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Stop calling the specified function during a directory update. */ +void +xfs_dir_hook_del( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Configure directory update hook functions. */ +void +xfs_dir_hook_setup( + struct xfs_dir_hook *hook, + notifier_fn_t mod_fn) +{ + xfs_hook_setup(&hook->dirent_hook, mod_fn); +} +#endif /* CONFIG_XFS_LIVE_HOOKS */ + int xfs_create( struct mnt_idmap *idmap, @@ -971,6 +1030,8 @@ xfs_create( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; prid = xfs_get_initial_prid(dp); @@ -1052,6 +1113,12 @@ xfs_create( } /* + * Create ip with a reference from dp, and add '.' and '..' references + * if it's a directory. + */ + xfs_dir_update_hook(dp, ip, 1, name); + + /* * If this is a synchronous mount, make sure that the * create transaction goes to disk before returning to * the user. 
@@ -1210,6 +1277,8 @@ xfs_link( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(tdp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(sip); if (error) @@ -1232,8 +1301,19 @@ xfs_link( */ if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && tdp->i_projid != sip->i_projid)) { - error = -EXDEV; - goto error_return; + /* + * Project quota setup skips special files which can + * leave inodes in a PROJINHERIT directory without a + * project ID set. We need to allow links to be made + * to these "project-less" inodes because userspace + * expects them to succeed after project ID setup, + * but everything else should be rejected. + */ + if (!special_file(VFS_I(sip)->i_mode) || + sip->i_projid != 0) { + error = -EXDEV; + goto error_return; + } } if (!resblks) { @@ -1263,6 +1343,7 @@ xfs_link( xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); xfs_bumplink(tp, sip); + xfs_dir_update_hook(tdp, sip, 1, target_name); /* * If this is a synchronous mount, make sure that the @@ -1332,12 +1413,11 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_filblks_t unmap_len; int error = 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(!atomic_read(&VFS_I(ip)->i_count) || - xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + if (atomic_read(&VFS_I(ip)->i_count)) + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); @@ -1364,19 +1444,10 @@ xfs_itruncate_extents_flags( return 0; } - unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; - while (unmap_len > 0) { - ASSERT(tp->t_highest_agno == NULLAGNUMBER); - error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, - flags, XFS_ITRUNC_MAX_EXTENTS); - if (error) - goto out; - - /* free the just unmapped extents */ - error = xfs_defer_finish(&tp); - if (error) - goto out; - } + error = xfs_bunmapi_range(&tp, 
ip, flags, first_unmap_block, + XFS_MAX_FILEOFF); + if (error) + goto out; if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ @@ -1598,7 +1669,7 @@ xfs_inactive_ifree( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_ifree(tp, ip); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (error) { /* * If we fail to free the inode, shut down. The cancel @@ -1679,6 +1750,39 @@ xfs_inode_needs_inactive( } /* + * Save health status somewhere, if we're dumping an inode with uncorrected + * errors and online repair isn't running. + */ +static inline void +xfs_inactive_health( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + unsigned int sick; + unsigned int checked; + + xfs_inode_measure_sickness(ip, &sick, &checked); + if (!sick) + return; + + trace_xfs_inode_unfixed_corruption(ip, sick); + + if (sick & XFS_SICK_INO_FORGET) + return; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + if (!pag) { + /* There had better still be a perag structure! */ + ASSERT(0); + return; + } + + xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES); + xfs_perag_put(pag); +} + +/* * xfs_inactive * * This is called when the vnode reference count for the vnode @@ -1706,6 +1810,8 @@ xfs_inactive( mp = ip->i_mount; ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); + xfs_inactive_health(ip); + /* * If this is a read-only mount, don't do this (would generate I/O) * unless we're in log recovery and cleaning the iunlinked list. 
@@ -1912,6 +2018,7 @@ xfs_iunlink_update_bucket( */ if (old_value == new_agino) { xfs_buf_mark_corrupt(agibp); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return -EFSCORRUPTED; } @@ -1961,11 +2068,14 @@ xfs_iunlink_reload_next( */ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); - if (error) + if (error) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return error; + } /* If this is not an unlinked inode, something is very wrong. */ if (VFS_I(next_ip)->i_nlink != 0) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); error = -EFSCORRUPTED; goto rele; } @@ -2003,6 +2113,7 @@ xfs_iunlink_insert_inode( if (next_agino == agino || !xfs_verify_agino_or_null(pag, next_agino)) { xfs_buf_mark_corrupt(agibp); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return -EFSCORRUPTED; } @@ -2090,6 +2201,7 @@ xfs_iunlink_remove_inode( if (!xfs_verify_agino(pag, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return -EFSCORRUPTED; } @@ -2118,8 +2230,10 @@ xfs_iunlink_remove_inode( struct xfs_inode *prev_ip; prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); - if (!prev_ip) + if (!prev_ip) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; + } error = xfs_iunlink_log_inode(tp, prev_ip, pag, ip->i_next_unlinked); @@ -2352,7 +2466,7 @@ xfs_ifree( struct xfs_inode_log_item *iip = ip->i_itemp; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(ip->i_df.if_nextents == 0); ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); @@ -2380,8 +2494,8 @@ xfs_ifree( * already been freed by xfs_attr_inactive. 
*/ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - kmem_free(ip->i_df.if_u1.if_data); - ip->i_df.if_u1.if_data = NULL; + kfree(ip->i_df.if_data); + ip->i_df.if_data = NULL; ip->i_df.if_bytes = 0; } @@ -2421,7 +2535,7 @@ static void xfs_iunpin( struct xfs_inode *ip) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); trace_xfs_inode_unpin_nowait(ip, _RET_IP_); @@ -2499,6 +2613,8 @@ xfs_remove( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(dp); if (error) @@ -2585,6 +2701,12 @@ xfs_remove( } /* + * Drop the link from dp to ip, and if ip was a directory, remove the + * '.' and '..' references since we freed the directory. + */ + xfs_dir_update_hook(dp, ip, -1, name); + + /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to * the user. @@ -2774,6 +2896,20 @@ xfs_cross_rename( } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); + + /* + * Inform our hook clients that we've finished an exchange operation as + * follows: removed the source and target files from their directories; + * added the target to the source directory; and added the source to + * the target directory. All inodes are locked, so it's ok to model a + * rename this way so long as we say we deleted entries before we add + * new ones. 
+ */ + xfs_dir_update_hook(dp1, ip1, -1, name1); + xfs_dir_update_hook(dp2, ip2, -1, name2); + xfs_dir_update_hook(dp1, ip2, 1, name1); + xfs_dir_update_hook(dp2, ip1, 1, name2); + return xfs_finish_rename(tp); out_trans_abort: @@ -3157,6 +3293,21 @@ retry: if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + /* + * Inform our hook clients that we've finished a rename operation as + * follows: removed the source and target files from their directories; + * that we've added the source to the target directory; and finally + * that we've added the whiteout, if there was one. All inodes are + * locked, so it's ok to model a rename this way so long as we say we + * deleted entries before we add new ones. + */ + if (target_ip) + xfs_dir_update_hook(target_dp, target_ip, -1, target_name); + xfs_dir_update_hook(src_dp, src_ip, -1, src_name); + xfs_dir_update_hook(target_dp, src_ip, 1, target_name); + if (wip) + xfs_dir_update_hook(src_dp, wip, 1, src_name); + error = xfs_finish_rename(tp); if (wip) xfs_irele(wip); @@ -3182,7 +3333,7 @@ xfs_iflush( struct xfs_mount *mp = ip->i_mount; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); @@ -3317,6 +3468,8 @@ flush_out: /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); + if (error) + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return error; } @@ -3621,6 +3774,23 @@ xfs_iunlock2_io_mmap( inode_unlock(VFS_I(ip1)); } +/* Drop the MMAPLOCK and the IOLOCK after a remap completes. 
*/ +void +xfs_iunlock2_remapping( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + xfs_iflags_clear(ip1, XFS_IREMAPPING); + + if (ip1 != ip2) + xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED); + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + + if (ip1 != ip2) + inode_unlock_shared(VFS_I(ip1)); + inode_unlock(VFS_I(ip2)); +} + /* * Reload the incore inode list for this inode. Caller should ensure that * the link count cannot change, either by taking ILOCK_SHARED or otherwise @@ -3734,3 +3904,45 @@ xfs_inode_reload_unlinked( return error; } + +/* Has this inode fork been zapped by repair? */ +bool +xfs_ifork_zapped( + const struct xfs_inode *ip, + int whichfork) +{ + unsigned int datamask = 0; + + switch (whichfork) { + case XFS_DATA_FORK: + switch (ip->i_vnode.i_mode & S_IFMT) { + case S_IFDIR: + datamask = XFS_SICK_INO_DIR_ZAPPED; + break; + case S_IFLNK: + datamask = XFS_SICK_INO_SYMLINK_ZAPPED; + break; + } + return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask); + case XFS_ATTR_FORK: + return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED; + default: + return false; + } +} + +/* Compute the number of data and realtime blocks used by a file. */ +void +xfs_inode_count_blocks( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_filblks_t *dblocks, + xfs_filblks_t *rblocks) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + + *rblocks = 0; + if (XFS_IS_REALTIME_INODE(ip)) + xfs_bmap_count_leaves(ifp, rblocks); + *dblocks = ip->i_nblocks - *rblocks; +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0c5bdb91152e..ab46ffb3ac19 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -39,7 +39,7 @@ typedef struct xfs_inode { /* Transaction and locking information. 
*/ struct xfs_inode_log_item *i_itemp; /* logging information */ - mrlock_t i_lock; /* inode lock */ + struct rw_semaphore i_lock; /* inode lock */ atomic_t i_pincount; /* inode pin count */ struct llist_node i_gclist; /* deferred inactivation list */ @@ -171,6 +171,12 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) return &ip->i_vnode; } +/* convert from const xfs inode to const vfs inode */ +static inline const struct inode *VFS_IC(const struct xfs_inode *ip) +{ + return &ip->i_vnode; +} + /* * For regular files we only update the on-disk filesize when actually * writing data back to disk. Until then only the copy in the VFS inode @@ -347,6 +353,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) /* Quotacheck is running but inode has not been added to quota counts. */ #define XFS_IQUOTAUNCHECKED (1 << 14) +/* + * Remap in progress. Callers that wish to update file data while + * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake + * the lock in exclusive mode. Relocking the file will block until + * IREMAPPING is cleared. + */ +#define XFS_IREMAPPING (1U << 15) + /* All inode state flags related to inode reclaim. 
*/ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ XFS_IRECLAIM | \ @@ -515,7 +529,7 @@ void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); -bool xfs_isilocked(struct xfs_inode *, uint); +void xfs_assert_ilocked(struct xfs_inode *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); @@ -561,6 +575,14 @@ extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); +static inline void xfs_update_stable_writes(struct xfs_inode *ip) +{ + if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev)) + mapping_set_stable_writes(VFS_I(ip)->i_mapping); + else + mapping_clear_stable_writes(VFS_I(ip)->i_mapping); +} + /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at @@ -595,6 +617,7 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); static inline bool xfs_inode_unlinked_incomplete( @@ -605,4 +628,33 @@ xfs_inode_unlinked_incomplete( int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_inode_reload_unlinked(struct xfs_inode *ip); +bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); +void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_filblks_t *dblocks, xfs_filblks_t *rblocks); + +struct xfs_dir_update_params { + const struct xfs_inode *dp; + const struct xfs_inode *ip; + const struct xfs_name *name; + int delta; +}; + +#ifdef CONFIG_XFS_LIVE_HOOKS +void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip, + int delta, const 
struct xfs_name *name); + +struct xfs_dir_hook { + struct xfs_hook dirent_hook; +}; + +void xfs_dir_hook_disable(void); +void xfs_dir_hook_enable(void); + +int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn); +#else +# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 17c51804f9c6..f28d653300d1 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -19,6 +19,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_error.h" +#include "xfs_rtbitmap.h" #include <linux/iversion.h> @@ -107,7 +108,7 @@ xfs_inode_item_precommit( */ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT); ip->i_extsize = 0; @@ -351,11 +352,10 @@ xfs_inode_item_format_data_fork( ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_df.if_data != NULL); ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, - ip->i_df.if_u1.if_data, - ip->i_df.if_bytes); + ip->i_df.if_data, ip->i_df.if_bytes); ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes; ilf->ilf_size++; } else { @@ -430,10 +430,9 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_af.if_bytes > 0) { - ASSERT(ip->i_af.if_u1.if_data != NULL); + ASSERT(ip->i_af.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, - ip->i_af.if_u1.if_data, - ip->i_af.if_bytes); + ip->i_af.if_data, ip->i_af.if_bytes); ilf->ilf_asize 
= (unsigned)ip->i_af.if_bytes; ilf->ilf_size++; } else { @@ -556,6 +555,9 @@ xfs_inode_to_log_dinode( memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_v3_pad = 0; + + /* dummy value for initialisation */ + to->di_crc = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; @@ -648,7 +650,7 @@ xfs_inode_item_pin( { struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(lip->li_buf); trace_xfs_inode_pin(ip, _RET_IP_); @@ -754,7 +756,7 @@ xfs_inode_item_release( unsigned short lock_flags; ASSERT(ip->i_itemp != NULL); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; @@ -854,7 +856,7 @@ xfs_inode_item_destroy( ASSERT(iip->ili_item.li_buf == NULL); ip->i_itemp = NULL; - kmem_free(iip->ili_item.li_lv_shadow); + kvfree(iip->ili_item.li_lv_shadow); kmem_cache_free(xfs_ili_cache, iip); } diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 0e5dba2343ea..dbdab4ce7c44 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -286,11 +286,13 @@ xlog_recover_inode_commit_pass2( struct xfs_log_dinode *ldip; uint isize; int need_free = 0; + xfs_failaddr_t fa; if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; } else { - in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); + in_f = kmalloc(sizeof(struct xfs_inode_log_format), + GFP_KERNEL | __GFP_NOFAIL); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) @@ -369,24 +371,26 @@ xlog_recover_inode_commit_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_has_v3inodes(mp) && - ldip->di_flushiter < 
be16_to_cpu(dip->di_flushiter)) { - /* - * Deal with the wrap case, DI_MAX_FLUSH is less - * than smaller numbers - */ - if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && - ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { - /* do nothing */ - } else { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_release; + if (!xfs_has_v3inodes(mp)) { + if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } } + + /* Take the opportunity to reset the flush iteration count */ + ldip->di_flushiter = 0; } - /* Take the opportunity to reset the flush iteration count */ - ldip->di_flushiter = 0; if (unlikely(S_ISREG(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && @@ -528,8 +532,19 @@ out_owner_change: (dip->di_mode != 0)) error = xfs_recover_inode_owner_change(mp, dip, in_f, buffer_list); - /* re-generate the checksum. */ + /* re-generate the checksum and validate the recovered inode. 
*/ xfs_dinode_calc_crc(log->l_mp, dip); + fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip); + if (fa) { + XFS_CORRUPTION_ERROR( + "Bad dinode after recovery", + XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip)); + xfs_alert(mp, + "Metadata corruption detected at %pS, inode 0x%llx", + fa, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; @@ -539,7 +554,7 @@ out_release: xfs_buf_relse(bp); error: if (need_free) - kmem_free(in_f); + kfree(in_f); return error; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 55bb01173cde..d0e2cec6210d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -38,6 +38,7 @@ #include "xfs_reflink.h" #include "xfs_ioctl.h" #include "xfs_xattr.h" +#include "xfs_rtbitmap.h" #include <linux/mount.h> #include <linux/namei.h> @@ -434,7 +435,7 @@ xfs_ioc_attr_list( copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) error = -EFAULT; out_free: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -492,7 +493,7 @@ xfs_attrmulti_attr_get( error = -EFAULT; out_kfree: - kmem_free(args.value); + kvfree(args.value); return error; } @@ -1004,7 +1005,7 @@ xfs_fill_fsxattr( * later. */ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - ip->i_extsize % mp->m_sb.sb_rextsize > 0) { + xfs_extlen_to_rtxmod(mp, ip->i_extsize) > 0) { fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); fa->fsx_extsize = 0; @@ -1120,23 +1121,25 @@ xfs_ioctl_setattr_xflags( struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; + bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); uint64_t i_flags2; - /* Can't change realtime flag if any extents are allocated. */ - if ((ip->i_df.if_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) - return -EINVAL; + if (rtflag != XFS_IS_REALTIME_INODE(ip)) { + /* Can't change realtime flag if any extents are allocated. 
*/ + if (ip->i_df.if_nextents || ip->i_delayed_blks) + return -EINVAL; + } - /* If realtime flag is set then must have realtime device */ - if (fa->fsx_xflags & FS_XFLAG_REALTIME) { + if (rtflag) { + /* If realtime flag is set then must have realtime device */ if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || - (ip->i_extsize % mp->m_sb.sb_rextsize)) + xfs_extlen_to_rtxmod(mp, ip->i_extsize)) return -EINVAL; - } - /* Clear reflink if we are actually able to set the rt flag. */ - if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) - ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + /* Clear reflink if we are actually able to set the rt flag. */ + if (xfs_is_reflink_inode(ip)) + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + } /* diflags2 only valid for v3 inodes. */ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); @@ -1147,6 +1150,14 @@ xfs_ioctl_setattr_xflags( ip->i_diflags2 = i_flags2; xfs_diflags_to_iflags(ip, false); + + /* + * Make the stable writes flag match that of the device the inode + * resides on when flipping the RT flag. 
+ */ + if (rtflag != XFS_IS_REALTIME_INODE(ip) && S_ISREG(VFS_I(ip)->i_mode)) + xfs_update_stable_writes(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -1495,7 +1506,7 @@ xfs_ioc_getbmap( error = 0; out_free_buf: - kmem_free(buf); + kvfree(buf); return error; } @@ -1625,7 +1636,7 @@ xfs_ioc_getfsmap( } out_free: - kmem_free(recs); + kvfree(recs); return error; } @@ -1861,6 +1872,63 @@ xfs_fs_eofblocks_from_user( return 0; } +static int +xfs_ioctl_getset_resblocks( + struct file *filp, + unsigned int cmd, + void __user *arg) +{ + struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount; + struct xfs_fsop_resblks fsop = { }; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (cmd == XFS_IOC_SET_RESBLKS) { + if (xfs_is_readonly(mp)) + return -EROFS; + + if (copy_from_user(&fsop, arg, sizeof(fsop))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_reserve_blocks(mp, fsop.resblks); + mnt_drop_write_file(filp); + if (error) + return error; + } + + spin_lock(&mp->m_sb_lock); + fsop.resblks = mp->m_resblks; + fsop.resblks_avail = mp->m_resblks_avail; + spin_unlock(&mp->m_sb_lock); + + if (copy_to_user(arg, &fsop, sizeof(fsop))) + return -EFAULT; + return 0; +} + +static int +xfs_ioctl_fs_counts( + struct xfs_mount *mp, + struct xfs_fsop_counts __user *uarg) +{ + struct xfs_fsop_counts out = { + .allocino = percpu_counter_read_positive(&mp->m_icount), + .freeino = percpu_counter_read_positive(&mp->m_ifree), + .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp), + .freertx = percpu_counter_read_positive(&mp->m_frextents), + }; + + if (copy_to_user(uarg, &out, sizeof(out))) + return -EFAULT; + return 0; +} + /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. 
@@ -1997,60 +2065,12 @@ xfs_file_ioctl( return error; } - case XFS_IOC_FSCOUNTS: { - xfs_fsop_counts_t out; - - xfs_fs_counts(mp, &out); - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - return 0; - } - - case XFS_IOC_SET_RESBLKS: { - xfs_fsop_resblks_t inout; - uint64_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (xfs_is_readonly(mp)) - return -EROFS; - - if (copy_from_user(&inout, arg, sizeof(inout))) - return -EFAULT; - - error = mnt_want_write_file(filp); - if (error) - return error; - - /* input parameter is passed in resblks field of structure */ - in = inout.resblks; - error = xfs_reserve_blocks(mp, &in, &inout); - mnt_drop_write_file(filp); - if (error) - return error; - - if (copy_to_user(arg, &inout, sizeof(inout))) - return -EFAULT; - return 0; - } - - case XFS_IOC_GET_RESBLKS: { - xfs_fsop_resblks_t out; + case XFS_IOC_FSCOUNTS: + return xfs_ioctl_fs_counts(mp, arg); - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_reserve_blocks(mp, NULL, &out); - if (error) - return error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - - return 0; - } + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + return xfs_ioctl_getset_resblocks(filp, cmd, arg); case XFS_IOC_FSGROWFSDATA: { struct xfs_growfs_data in; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 18c8f168b153..4087af7f3c9f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -27,6 +27,7 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" +#include "xfs_health.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -45,6 +46,7 @@ xfs_alert_fsblock_zero( (unsigned long long)imap->br_startoff, (unsigned long long)imap->br_blockcount, imap->br_state); + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); return -EFSCORRUPTED; } @@ -99,8 +101,10 @@ xfs_bmbt_to_iomap( struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); - if 
(unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); return xfs_alert_fsblock_zero(ip, imap); + } if (imap->br_startblock == HOLESTARTBLOCK) { iomap->addr = IOMAP_NULL_ADDR; @@ -325,8 +329,10 @@ xfs_iomap_write_direct( goto out_unlock; } - if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = xfs_alert_fsblock_zero(ip, imap); + } out_unlock: *seq = xfs_iomap_inode_sequence(ip, 0); @@ -639,8 +645,10 @@ xfs_iomap_write_unwritten( if (error) return error; - if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) + if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); return xfs_alert_fsblock_zero(ip, &imap); + } if ((numblks_fsb = imap.br_blockcount) == 0) { /* @@ -986,6 +994,7 @@ xfs_buffered_write_iomap_begin( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; } @@ -1323,7 +1332,7 @@ xfs_seek_iomap_begin( if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); - xfs_trim_extent(&cmap, offset_fsb, end_fsb); + xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb); seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); @@ -1348,7 +1357,7 @@ xfs_seek_iomap_begin( imap.br_state = XFS_EXT_NORM; done: seq = xfs_iomap_inode_sequence(ip, 0); - xfs_trim_extent(&imap, offset_fsb, end_fsb); + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); out_unlock: xfs_iunlock(ip, lockmode); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 
fdfda4fba12b..66f8c47642e8 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -346,7 +346,7 @@ xfs_vn_ci_lookup( dname.name = ci_name.name; dname.len = ci_name.len; dentry = d_add_ci(dentry, VFS_I(ip), &dname); - kmem_free(ci_name.name); + kfree(ci_name.name); return dentry; } @@ -796,8 +796,7 @@ xfs_setattr_size( uint lock_flags = 0; bool did_zeroing = false; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); ASSERT(S_ISREG(inode->i_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0); @@ -1285,9 +1284,9 @@ xfs_setup_inode( */ lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_dir_key); - lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); + lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class); } else { - lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); + lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class); } /* @@ -1299,6 +1298,13 @@ xfs_setup_inode( mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); /* + * For real-time inodes update the stable write flags to that of the RT + * device instead of the data device. + */ + if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip)) + xfs_update_stable_writes(ip); + + /* * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. 
*/ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 14462614fcc8..95fc31b9f87d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -197,8 +197,8 @@ xfs_bulkstat_one( ASSERT(breq->icount == 1); - bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), - KM_MAYFAIL); + bc.buf = kzalloc(sizeof(struct xfs_bulkstat), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!bc.buf) return -ENOMEM; @@ -214,7 +214,7 @@ xfs_bulkstat_one( breq->startino, &bc); xfs_trans_cancel(tp); out: - kmem_free(bc.buf); + kfree(bc.buf); /* * If we reported one inode to userspace then we abort because we hit @@ -289,8 +289,8 @@ xfs_bulkstat( if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0; - bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), - KM_MAYFAIL); + bc.buf = kzalloc(sizeof(struct xfs_bulkstat), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!bc.buf) return -ENOMEM; @@ -309,7 +309,7 @@ xfs_bulkstat( xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); out: - kmem_free(bc.buf); + kfree(bc.buf); /* * We found some inodes, so clear the error status and return them. 
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index b3275e8d47b6..01b55f03a102 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -22,6 +22,7 @@ #include "xfs_trans.h" #include "xfs_pwork.h" #include "xfs_ag.h" +#include "xfs_bit.h" /* * Walking Inodes in the Filesystem @@ -99,6 +100,7 @@ xfs_iwalk_ichunk_ra( struct xfs_inobt_rec_incore *irec) { struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agnumber_t agno = pag->pag_agno; xfs_agblock_t agbno; struct blk_plug plug; int i; /* inode chunk index */ @@ -111,8 +113,9 @@ xfs_iwalk_ichunk_ra( imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster); if (imask & ~irec->ir_free) { - xfs_btree_reada_bufs(mp, pag->pag_agno, agbno, - igeo->blocks_per_cluster, + xfs_buf_readahead(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, agbno), + igeo->blocks_per_cluster * mp->m_bsize, &xfs_inode_buf_ops); } agbno += igeo->blocks_per_cluster; @@ -131,21 +134,11 @@ xfs_iwalk_adjust_start( struct xfs_inobt_rec_incore *irec) /* btree record */ { int idx; /* index into inode chunk */ - int i; idx = agino - irec->ir_startino; - /* - * We got a right chunk with some left inodes allocated at it. Grab - * the chunk record. Mark all the uninteresting inodes free because - * they're before our start point. - */ - for (i = 0; i < idx; i++) { - if (XFS_INOBT_MASK(i) & ~irec->ir_free) - irec->ir_freecount++; - } - irec->ir_free |= xfs_inobt_maskn(0, idx); + irec->ir_freecount = hweight64(irec->ir_free); } /* Allocate memory for a walk. */ @@ -160,7 +153,7 @@ xfs_iwalk_alloc( /* Allocate a prefetch buffer for inobt records. 
*/ size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); - iwag->recs = kmem_alloc(size, KM_MAYFAIL); + iwag->recs = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (iwag->recs == NULL) return -ENOMEM; @@ -172,7 +165,7 @@ STATIC void xfs_iwalk_free( struct xfs_iwalk_ag *iwag) { - kmem_free(iwag->recs); + kfree(iwag->recs); iwag->recs = NULL; } @@ -275,9 +268,10 @@ xfs_iwalk_ag_start( /* Set up a fresh cursor and empty the inobt cache. */ iwag->nr_recs = 0; - error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, agi_bpp); if (error) return error; + *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp); /* Starting at the beginning of the AG? That's easy! */ if (agino == 0) @@ -306,8 +300,10 @@ xfs_iwalk_ag_start( error = xfs_inobt_get_rec(*curpp, irec, has_more); if (error) return error; - if (XFS_IS_CORRUPT(mp, *has_more != 1)) + if (XFS_IS_CORRUPT(mp, *has_more != 1)) { + xfs_btree_mark_sick(*curpp); return -EFSCORRUPTED; + } iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino + XFS_INODES_PER_CHUNK - 1); @@ -390,11 +386,10 @@ xfs_iwalk_run_callbacks( } /* ...and recreate the cursor just past where we left off. 
*/ - error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp, - agi_bpp); + error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp); if (error) return error; - + *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp); return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more); } @@ -434,6 +429,7 @@ xfs_iwalk_ag( rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino); if (iwag->lastino != NULLFSINO && XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } @@ -627,7 +623,7 @@ xfs_iwalk_ag_work( xfs_iwalk_free(iwag); out: xfs_perag_put(iwag->pag); - kmem_free(iwag); + kfree(iwag); return error; } @@ -663,7 +659,8 @@ xfs_iwalk_threaded( if (xfs_pwork_ctl_want_abort(&pctl)) break; - iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0); + iwag = kzalloc(sizeof(struct xfs_iwalk_ag), + GFP_KERNEL | __GFP_NOFAIL); iwag->mp = mp; /* diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index e9d317a3dafe..8f07c9f6157f 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -21,15 +21,13 @@ typedef __u32 xfs_nlink_t; #include "xfs_types.h" -#include "kmem.h" -#include "mrlock.h" - #include <linux/semaphore.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <linux/crc32c.h> #include <linux/module.h> #include <linux/mutex.h> @@ -51,6 +49,7 @@ typedef __u32 xfs_nlink_t; #include <linux/notifier.h> #include <linux/delay.h> #include <linux/log2.h> +#include <linux/rwsem.h> #include <linux/spinlock.h> #include <linux/random.h> #include <linux/ctype.h> @@ -82,6 +81,7 @@ typedef __u32 xfs_nlink_t; #include "xfs_buf.h" #include "xfs_message.h" #include "xfs_drain.h" +#include "xfs_hooks.h" #ifdef __BIG_ENDIAN #define XFS_NATIVE_HOST 1 @@ -198,6 +198,18 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) return x; } +/* If @b is a power of 2, return log2(b). 
Else return -1. */ +static inline int8_t log2_if_power2(unsigned long b) +{ + return is_power_of_2(b) ? ilog2(b) : -1; +} + +/* If @b is a power of 2, return a mask of the lower bits, else return zero. */ +static inline unsigned long long mask64_if_power2(unsigned long b) +{ + return is_power_of_2(b) ? b - 1 : 0; +} + int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, enum req_op op); @@ -257,4 +269,15 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, # define PTR_FMT "%p" #endif +/* + * Helper for IO routines to grab backing pages from allocated kernel memory. + */ +static inline struct page * +kmem_to_page(void *addr) +{ + if (is_vmalloc_addr(addr)) + return vmalloc_to_page(addr); + return virt_to_page(addr); +} + #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 51c100c86177..5004f23d344e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -633,14 +633,14 @@ xlog_state_release_iclog( */ int xfs_log_mount( - xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks) + xfs_mount_t *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks) { - struct xlog *log; - int error = 0; - int min_logfsbs; + struct xlog *log; + int error = 0; + int min_logfsbs; if (!xfs_has_norecovery(mp)) { xfs_notice(mp, "Mounting V%d Filesystem %pU", @@ -1528,7 +1528,7 @@ xlog_alloc_log( int error = -ENOMEM; uint log2_size = 0; - log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); + log = kzalloc(sizeof(struct xlog), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!log) { xfs_warn(mp, "Log allocation failed: No memory!"); goto out; @@ -1542,6 +1542,7 @@ xlog_alloc_log( log->l_covered_state = XLOG_STATE_COVER_IDLE; set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); + INIT_LIST_HEAD(&log->r_dfops); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ @@ 
-1604,7 +1605,8 @@ xlog_alloc_log( size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec); - iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL); + iclog = kzalloc(sizeof(*iclog) + bvec_size, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog) goto out_free_iclog; @@ -1660,13 +1662,13 @@ out_destroy_workqueue: out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - kmem_free(iclog->ic_data); - kmem_free(iclog); + kvfree(iclog->ic_data); + kfree(iclog); if (prev_iclog == log->l_iclog) break; } out_free_log: - kmem_free(log); + kfree(log); out: return ERR_PTR(error); } /* xlog_alloc_log */ @@ -1893,9 +1895,7 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. */ - xlog_state_done_syncing(iclog); - up(&iclog->ic_sema); - return; + goto sync; } /* @@ -1925,20 +1925,17 @@ xlog_write_iclog( * avoid shutdown re-entering this path and erroring out again. */ if (log->l_targ != log->l_mp->m_ddev_targp && - blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) + goto shutdown; } if (iclog->ic_flags & XLOG_ICL_NEED_FUA) iclog->ic_bio.bi_opf |= REQ_FUA; iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) + goto shutdown; + if (is_vmalloc_addr(iclog->ic_data)) flush_kernel_vmap_range(iclog->ic_data, count); @@ -1959,6 +1956,12 @@ xlog_write_iclog( } submit_bio(&iclog->ic_bio); + return; +shutdown: + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); +sync: + xlog_state_done_syncing(iclog); + up(&iclog->ic_sema); } /* @@ -2116,14 +2119,14 @@ xlog_dealloc_log( iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) 
{ next_iclog = iclog->ic_next; - kmem_free(iclog->ic_data); - kmem_free(iclog); + kvfree(iclog->ic_data); + kfree(iclog); iclog = next_iclog; } log->l_mp->m_log = NULL; destroy_workqueue(log->l_ioend_workqueue); - kmem_free(log); + kfree(log); } /* @@ -3515,7 +3518,8 @@ xlog_ticket_alloc( struct xlog_ticket *tic; int unit_res; - tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL); + tic = kmem_cache_zalloc(xfs_log_ticket_cache, + GFP_KERNEL | __GFP_NOFAIL); unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 67a99d94701e..73f5b7f628f4 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -100,7 +100,7 @@ xlog_cil_ctx_alloc(void) { struct xfs_cil_ctx *ctx; - ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents.extent_list); INIT_LIST_HEAD(&ctx->log_items); @@ -339,7 +339,7 @@ xlog_cil_alloc_shadow_bufs( * the buffer, only the log vector header and the iovec * storage. */ - kmem_free(lip->li_lv_shadow); + kvfree(lip->li_lv_shadow); lv = xlog_kvmalloc(buf_size); memset(lv, 0, xlog_cil_iovec_space(niovecs)); @@ -703,7 +703,7 @@ xlog_cil_free_logvec( while (!list_empty(lv_chain)) { lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list); list_del_init(&lv->lv_list); - kmem_free(lv); + kvfree(lv); } } @@ -753,7 +753,7 @@ xlog_cil_committed( return; } - kmem_free(ctx); + kfree(ctx); } void @@ -1116,11 +1116,18 @@ xlog_cil_cleanup_whiteouts( * same sequence twice. If we get a race between multiple pushes for the same * sequence they will block on the first one and then abort, hence avoiding * needless pushes. + * + * This runs from a workqueue so it does not inherit any specific memory + * allocation context.
However, we do not want to block on memory reclaim + * recursing back into the filesystem because this push may have been triggered + * by memory reclaim itself. Hence we really need to run under full GFP_NOFS + * constraints here. */ static void xlog_cil_push_work( struct work_struct *work) { + unsigned int nofs_flags = memalloc_nofs_save(); struct xfs_cil_ctx *ctx = container_of(work, struct xfs_cil_ctx, push_work); struct xfs_cil *cil = ctx->cil; @@ -1334,12 +1341,14 @@ xlog_cil_push_work( spin_unlock(&log->l_icloglock); xlog_cil_cleanup_whiteouts(&whiteouts); xfs_log_ticket_ungrant(log, ticket); + memalloc_nofs_restore(nofs_flags); return; out_skip: up_write(&cil->xc_ctx_lock); xfs_log_ticket_put(new_ctx->ticket); - kmem_free(new_ctx); + kfree(new_ctx); + memalloc_nofs_restore(nofs_flags); return; out_abort_free_ticket: @@ -1348,6 +1357,7 @@ out_abort_free_ticket: if (!ctx->commit_iclog) { xfs_log_ticket_ungrant(log, ctx->ticket); xlog_cil_committed(ctx); + memalloc_nofs_restore(nofs_flags); return; } spin_lock(&log->l_icloglock); @@ -1356,6 +1366,7 @@ out_abort_free_ticket: /* Not safe to reference ctx now!
*/ spin_unlock(&log->l_icloglock); xfs_log_ticket_ungrant(log, ticket); + memalloc_nofs_restore(nofs_flags); } /* @@ -1533,7 +1544,7 @@ xlog_cil_process_intents( set_bit(XFS_LI_WHITEOUT, &ilip->li_flags); trace_xfs_cil_whiteout_mark(ilip); len += ilip->li_lv->lv_bytes; - kmem_free(ilip->li_lv); + kvfree(ilip->li_lv); ilip->li_lv = NULL; xfs_trans_del_item(lip); @@ -1747,7 +1758,7 @@ xlog_cil_init( struct xlog_cil_pcp *cilpcp; int cpu; - cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); + cil = kzalloc(sizeof(*cil), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!cil) return -ENOMEM; /* @@ -1786,7 +1797,7 @@ xlog_cil_init( out_destroy_wq: destroy_workqueue(cil->xc_push_wq); out_destroy_cil: - kmem_free(cil); + kfree(cil); return -ENOMEM; } @@ -1799,12 +1810,12 @@ xlog_cil_destroy( if (cil->xc_ctx) { if (cil->xc_ctx->ticket) xfs_log_ticket_put(cil->xc_ctx->ticket); - kmem_free(cil->xc_ctx); + kfree(cil->xc_ctx); } ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); free_percpu(cil->xc_pcp); destroy_workqueue(cil->xc_push_wq); - kmem_free(cil); + kfree(cil); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index fa3ad1d7b31c..e30c06ec20e3 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -407,6 +407,7 @@ struct xlog { long l_opstate; /* operational state */ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; + struct list_head r_dfops; /* recovered log intent items */ int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 13b94d2e605b..13f1d2e91540 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -361,7 +361,7 @@ xlog_find_verify_cycle( *new_blk = -1; out: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -477,7 +477,7 @@ xlog_find_verify_log_record( *last_blk = i; out: - kmem_free(buffer); + kvfree(buffer); return error; } @@ 
-731,7 +731,7 @@ validate_head: goto out_free_buffer; } - kmem_free(buffer); + kvfree(buffer); if (head_blk == log_bbnum) *return_head_blk = 0; else @@ -745,7 +745,7 @@ validate_head: return 0; out_free_buffer: - kmem_free(buffer); + kvfree(buffer); if (error) xfs_warn(log->l_mp, "failed to find log head"); return error; @@ -999,7 +999,7 @@ xlog_verify_tail( "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", orig_tail, *tail_blk); out: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -1046,7 +1046,7 @@ xlog_verify_head( error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk, XLOG_MAX_ICLOGS, tmp_buffer, &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped); - kmem_free(tmp_buffer); + kvfree(tmp_buffer); if (error < 0) return error; @@ -1365,7 +1365,7 @@ xlog_find_tail( error = xlog_clear_stale_blocks(log, tail_lsn); done: - kmem_free(buffer); + kvfree(buffer); if (error) xfs_warn(log->l_mp, "failed to locate log tail"); @@ -1399,6 +1399,7 @@ xlog_find_zeroed( xfs_daddr_t new_blk, last_blk, start_blk; xfs_daddr_t num_scan_bblks; int error, log_bbnum = log->l_logBBsize; + int ret = 1; *blk_no = 0; @@ -1413,8 +1414,7 @@ xlog_find_zeroed( first_cycle = xlog_get_cycle(offset); if (first_cycle == 0) { /* completely zeroed log */ *blk_no = 0; - kmem_free(buffer); - return 1; + goto out_free_buffer; } /* check partially zeroed log */ @@ -1424,8 +1424,8 @@ xlog_find_zeroed( last_cycle = xlog_get_cycle(offset); if (last_cycle != 0) { /* log completely written to */ - kmem_free(buffer); - return 0; + ret = 0; + goto out_free_buffer; } /* we have a partially zeroed log */ @@ -1471,10 +1471,10 @@ xlog_find_zeroed( *blk_no = last_blk; out_free_buffer: - kmem_free(buffer); + kvfree(buffer); if (error) return error; - return 1; + return ret; } /* @@ -1583,7 +1583,7 @@ xlog_write_log_records( } out_free_buffer: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -1723,30 +1723,24 @@ xlog_clear_stale_blocks( */ void xlog_recover_release_intent( - struct xlog *log, - 
unsigned short intent_type, - uint64_t intent_id) + struct xlog *log, + unsigned short intent_type, + uint64_t intent_id) { - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp = log->l_ailp; + struct xfs_defer_pending *dfp, *n; + + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + struct xfs_log_item *lip = dfp->dfp_intent; - spin_lock(&ailp->ail_lock); - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { if (lip->li_type != intent_type) continue; if (!lip->li_ops->iop_match(lip, intent_id)) continue; - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - break; - } + ASSERT(xlog_item_is_intent(lip)); - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + xfs_defer_cancel_recovery(log->l_mp, dfp); + } } int @@ -1939,6 +1933,29 @@ xlog_buf_readahead( xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } +/* + * Create a deferred work structure for resuming and tracking the progress of a + * log intent item that was found during recovery. + */ +void +xlog_recover_intent_item( + struct xlog *log, + struct xfs_log_item *lip, + xfs_lsn_t lsn, + const struct xfs_defer_op_type *ops) +{ + ASSERT(xlog_item_is_intent(lip)); + + xfs_defer_start_recovery(lip, &log->r_dfops, ops); + + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. 
+ */ + xfs_trans_ail_insert(log->l_ailp, lip, lsn); + lip->li_ops->iop_unpin(lip, 0); +} + STATIC int xlog_recover_items_pass2( struct xlog *log, @@ -2040,7 +2057,8 @@ xlog_recover_add_item( { struct xlog_recover_item *item; - item = kmem_zalloc(sizeof(struct xlog_recover_item), 0); + item = kzalloc(sizeof(struct xlog_recover_item), + GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } @@ -2143,7 +2161,7 @@ xlog_recover_add_to_trans( return 0; } - ptr = kmem_alloc(len, 0); + ptr = xlog_kvmalloc(len); memcpy(ptr, dp, len); in_f = (struct xfs_inode_log_format *)ptr; @@ -2165,14 +2183,13 @@ xlog_recover_add_to_trans( "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); - kmem_free(ptr); + kvfree(ptr); return -EFSCORRUPTED; } item->ri_total = in_f->ilf_size; - item->ri_buf = - kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), - 0); + item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t), + GFP_KERNEL | __GFP_NOFAIL); } if (item->ri_total <= item->ri_cnt) { @@ -2180,7 +2197,7 @@ xlog_recover_add_to_trans( "log item region count (%d) overflowed size (%d)", item->ri_cnt, item->ri_total); ASSERT(0); - kmem_free(ptr); + kvfree(ptr); return -EFSCORRUPTED; } @@ -2210,13 +2227,13 @@ xlog_recover_free_trans( /* Free the regions in the item. */ list_del(&item->ri_list); for (i = 0; i < item->ri_cnt; i++) - kmem_free(item->ri_buf[i].i_addr); + kvfree(item->ri_buf[i].i_addr); /* Free the item itself */ - kmem_free(item->ri_buf); - kmem_free(item); + kfree(item->ri_buf); + kfree(item); } /* Free the transaction recover structure */ - kmem_free(trans); + kfree(trans); } /* @@ -2315,7 +2332,7 @@ xlog_recover_ophdr_to_trans( * This is a new transaction so allocate a new recovery container to * hold the recovery ops that will follow. 
*/ - trans = kmem_zalloc(sizeof(struct xlog_recover), 0); + trans = kzalloc(sizeof(struct xlog_recover), GFP_KERNEL | __GFP_NOFAIL); trans->r_log_tid = tid; trans->r_lsn = be64_to_cpu(rhead->h_lsn); INIT_LIST_HEAD(&trans->r_itemq); @@ -2511,7 +2528,7 @@ xlog_abort_defer_ops( list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { list_del_init(&dfc->dfc_list); - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); } } @@ -2533,36 +2550,26 @@ xlog_abort_defer_ops( */ STATIC int xlog_recover_process_intents( - struct xlog *log) + struct xlog *log) { LIST_HEAD(capture_list); - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp; - int error = 0; + struct xfs_defer_pending *dfp, *n; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) - xfs_lsn_t last_lsn; -#endif + xfs_lsn_t last_lsn; - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); -#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - const struct xfs_item_ops *ops; - if (!xlog_item_is_intent(lip)) - break; + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); /* * We should never see a redo item with a LSN higher than * the last transaction we found in the log at the start * of recovery. */ - ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0); /* * NOTE: If your intent processing routine can create more @@ -2571,21 +2578,14 @@ xlog_recover_process_intents( * replayed in the wrong order! * * The recovery function can free the log item, so we must not - * access lip after it returns. + * access dfp->dfp_intent after it returns. It must dispose of + * @dfp if it returns 0. 
*/ - spin_unlock(&ailp->ail_lock); - ops = lip->li_ops; - error = ops->iop_recover(lip, &capture_list); - spin_lock(&ailp->ail_lock); - if (error) { - trace_xlog_intent_recovery_failed(log->l_mp, error, - ops->iop_recover); + error = xfs_defer_finish_recovery(log->l_mp, dfp, + &capture_list); + if (error) break; - } } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); if (error) goto err; @@ -2606,27 +2606,34 @@ err: */ STATIC void xlog_recover_cancel_intents( - struct xlog *log) + struct xlog *log) { - struct xfs_log_item *lip; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp; - - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (!xlog_item_is_intent(lip)) - break; + struct xfs_defer_pending *dfp, *n; - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_next(ailp, &cur); + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); + + xfs_defer_cancel_recovery(log->l_mp, dfp); } +} + +/* + * Transfer ownership of the recovered pending work to the recovery transaction + * and try to finish the work. If there is more work to be done, the dfp will + * remain attached to the transaction. If not, the dfp is freed. 
+ */ +int +xlog_recover_finish_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + int error; - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + list_move(&dfp->dfp_list, &tp->t_dfops); + error = xfs_defer_finish_one(tp, dfp); + if (error == -EAGAIN) + return 0; + return error; } /* @@ -3017,7 +3024,7 @@ xlog_do_recovery_pass( hblks = xlog_logrec_hblks(log, rhead); if (hblks != 1) { - kmem_free(hbp); + kvfree(hbp); hbp = xlog_alloc_buffer(log, hblks); } } else { @@ -3031,7 +3038,7 @@ xlog_do_recovery_pass( return -ENOMEM; dbp = xlog_alloc_buffer(log, BTOBB(h_size)); if (!dbp) { - kmem_free(hbp); + kvfree(hbp); return -ENOMEM; } @@ -3192,16 +3199,33 @@ xlog_do_recovery_pass( } bread_err2: - kmem_free(dbp); + kvfree(dbp); bread_err1: - kmem_free(hbp); + kvfree(hbp); /* - * Submit buffers that have been added from the last record processed, - * regardless of error status. + * Submit buffers that have been dirtied by the last record recovered. */ - if (!list_empty(&buffer_list)) + if (!list_empty(&buffer_list)) { + if (error) { + /* + * If there has been an item recovery error then we + * cannot allow partial checkpoint writeback to + * occur. We might have multiple checkpoints with the + * same start LSN in this buffer list, and partial + * writeback of a checkpoint in this situation can + * prevent future recovery of all the changes in the + * checkpoints at this start LSN. + * + * Note: Shutting down the filesystem will result in the + * delwri submission marking all the buffers stale, + * completing them and cleaning up _XBF_LOGRECOVERY + * state without doing any IO. + */ + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + } error2 = xfs_buf_delwri_submit(&buffer_list); + } if (error && first_bad) *first_bad = rhead_blk; @@ -3436,12 +3460,19 @@ xlog_recover( * part of recovery so that the root and real-time bitmap inodes can be read in * from disk in between the two stages. 
This is necessary so that we can free * space in the real-time portion of the file system. + * + * We run this whole process under GFP_NOFS allocation context. We do a + * combination of non-transactional and transactional work, yet we really don't + * want to recurse into the filesystem from direct reclaim during any of this + * processing. This allows all the recovery code run here not to care about the + * memory allocation context it is running in. */ int xlog_recover_finish( struct xlog *log) { - int error; + unsigned int nofs_flags = memalloc_nofs_save(); + int error; error = xlog_recover_process_intents(log); if (error) { @@ -3455,7 +3486,7 @@ xlog_recover_finish( xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return error; + goto out_error; } /* @@ -3476,7 +3507,7 @@ xlog_recover_finish( if (error < 0) { xfs_alert(log->l_mp, "Failed to clear log incompat features on recovery"); - return error; + goto out_error; } } @@ -3501,9 +3532,13 @@ xlog_recover_finish( * and AIL. 
*/ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + error = 0; + goto out_error; } - return 0; +out_error: + memalloc_nofs_restore(nofs_flags); + return error; } void diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aed5be5508fe..df370eb5dc15 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -45,7 +45,7 @@ xfs_uuid_table_free(void) { if (xfs_uuid_table_size == 0) return; - kmem_free(xfs_uuid_table); + kfree(xfs_uuid_table); xfs_uuid_table = NULL; xfs_uuid_table_size = 0; } @@ -62,7 +62,7 @@ xfs_uuid_mount( int hole, i; /* Publish UUID in struct super_block */ - uuid_copy(&mp->m_super->s_uuid, uuid); + super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid)); if (xfs_has_nouuid(mp)) return 0; @@ -637,7 +637,6 @@ xfs_mountfs( struct xfs_sb *sbp = &(mp->m_sb); struct xfs_inode *rip; struct xfs_ino_geometry *igeo = M_IGEO(mp); - uint64_t resblks; uint quotamount = 0; uint quotaflags = 0; int error = 0; @@ -707,6 +706,8 @@ xfs_mountfs( /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; + super_set_sysfs_name_id(mp->m_super); + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_super->s_id); if (error) @@ -974,8 +975,7 @@ xfs_mountfs( * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { - resblks = xfs_default_resblks(mp); - error = xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); @@ -1053,7 +1053,6 @@ void xfs_unmountfs( struct xfs_mount *mp) { - uint64_t resblks; int error; /* @@ -1090,8 +1089,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - resblks = 0; - error = xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. 
" "Freespace may not be correct on next mount."); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 219681d29fbc..e880aa48de68 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -94,16 +94,16 @@ typedef struct xfs_mount { struct xfs_inode *m_rsumip; /* pointer to summary inode */ struct xfs_inode *m_rootip; /* pointer to root directory */ struct xfs_quotainfo *m_quotainfo; /* disk quota information */ - xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ - xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ - xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ + struct xfs_buftarg *m_ddev_targp; /* data device */ + struct xfs_buftarg *m_logdev_targp;/* log device */ + struct xfs_buftarg *m_rtdev_targp; /* rt device */ void __percpu *m_inodegc; /* percpu inodegc structures */ /* * Optional cache of rt summary level per bitmap block with the - * invariant that m_rsum_cache[bbno] <= the minimum i for which - * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip - * inode lock. + * invariant that m_rsum_cache[bbno] > the maximum i for which + * rsum[i][bbno] != 0, or 0 if rsum[i][bbno] == 0 for all i. + * Reads and writes are serialized by the rsumip inode lock. 
*/ uint8_t *m_rsum_cache; struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ @@ -119,6 +119,7 @@ typedef struct xfs_mount { uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ uint8_t m_agno_log; /* log #ag's */ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + int8_t m_rtxblklog; /* log2 of rextsize, if possible */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ @@ -152,6 +153,7 @@ typedef struct xfs_mount { uint64_t m_features; /* active filesystem features */ uint64_t m_low_space[XFS_LOWSP_MAX]; uint64_t m_low_rtexts[XFS_LOWSP_MAX]; + uint64_t m_rtxblkmask; /* rt extent block mask */ struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ /* low free space thresholds */ @@ -250,6 +252,9 @@ typedef struct xfs_mount { /* cpus that have inodes queued for inactivation */ struct cpumask m_inodegc_cpumask; + + /* Hook to feed dirent updates to an active online repair. */ + struct xfs_hooks m_dir_update_hooks; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) @@ -500,9 +505,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } -int xfs_buf_hash_init(struct xfs_perag *pag); -void xfs_buf_hash_destroy(struct xfs_perag *pag); - extern void xfs_uuid_table_free(void); extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index f85e3b07ab44..7443debaffd6 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -333,13 +333,14 @@ xfs_mru_cache_create( if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) return -EINVAL; - if (!(mru = kmem_zalloc(sizeof(*mru), 0))) + mru = kzalloc(sizeof(*mru), GFP_KERNEL | __GFP_NOFAIL); + if (!mru) return -ENOMEM; /* An extra list is needed to avoid reaping up to a grp_time early. 
*/ mru->grp_count = grp_count + 1; - mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0); - + mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists), + GFP_KERNEL | __GFP_NOFAIL); if (!mru->lists) { err = -ENOMEM; goto exit; @@ -364,9 +365,9 @@ xfs_mru_cache_create( exit: if (err && mru && mru->lists) - kmem_free(mru->lists); + kfree(mru->lists); if (err && mru) - kmem_free(mru); + kfree(mru); return err; } @@ -406,8 +407,8 @@ xfs_mru_cache_destroy( xfs_mru_cache_flush(mru); - kmem_free(mru->lists); - kmem_free(mru); + kfree(mru->lists); + kfree(mru); } /* @@ -427,7 +428,7 @@ xfs_mru_cache_insert( if (!mru || !mru->lists) return -EINVAL; - if (radix_tree_preload(GFP_NOFS)) + if (radix_tree_preload(GFP_KERNEL)) return -ENOMEM; INIT_LIST_HEAD(&elem->list_node); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index a7daa522e00f..fa50e5308292 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/dax.h> +#include <linux/fs.h> struct xfs_failure_info { xfs_agblock_t startblock; @@ -73,10 +74,16 @@ xfs_dax_failure_fn( struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip; struct xfs_failure_info *notify = data; + struct address_space *mapping; + pgoff_t pgoff; + unsigned long pgcnt; int error = 0; if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { + /* Continue the query because this isn't a failure. */ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + return 0; notify->want_shutdown = true; return 0; } @@ -92,15 +99,61 @@ xfs_dax_failure_fn( return 0; } - error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, - xfs_failure_pgoff(mp, rec, notify), - xfs_failure_pgcnt(mp, rec, notify), - notify->mf_flags); + mapping = VFS_I(ip)->i_mapping; + pgoff = xfs_failure_pgoff(mp, rec, notify); + pgcnt = xfs_failure_pgcnt(mp, rec, notify); + + /* Continue the rmap query if the inode isn't a dax file. 
*/ + if (dax_mapping(mapping)) + error = mf_dax_kill_procs(mapping, pgoff, pgcnt, + notify->mf_flags); + + /* Invalidate the cache in dax pages. */ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + invalidate_inode_pages2_range(mapping, pgoff, + pgoff + pgcnt - 1); + xfs_irele(ip); return error; } static int +xfs_dax_notify_failure_freeze( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + int error; + + error = freeze_super(sb, FREEZE_HOLDER_KERNEL); + if (error) + xfs_emerg(mp, "already frozen by kernel, err=%d", error); + + return error; +} + +static void +xfs_dax_notify_failure_thaw( + struct xfs_mount *mp, + bool kernel_frozen) +{ + struct super_block *sb = mp->m_super; + int error; + + if (kernel_frozen) { + error = thaw_super(sb, FREEZE_HOLDER_KERNEL); + if (error) + xfs_emerg(mp, "still frozen after notify failure, err=%d", + error); + } + + /* + * Also thaw userspace call anyway because the device is about to be + * removed immediately. + */ + thaw_super(sb, FREEZE_HOLDER_USERSPACE); +} + +static int xfs_dax_notify_ddev_failure( struct xfs_mount *mp, xfs_daddr_t daddr, @@ -112,15 +165,29 @@ xfs_dax_notify_ddev_failure( struct xfs_btree_cur *cur = NULL; struct xfs_buf *agf_bp = NULL; int error = 0; + bool kernel_frozen = false; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + if (mf_flags & MF_MEM_PRE_REMOVE) { + xfs_info(mp, "Device is about to be removed!"); + /* + * Freeze fs to prevent new mappings from being created. + * - Keep going on if others already hold the kernel frozen. + * - Keep going on if other errors too because this device is + * starting to fail. + * - If kernel frozen state is held successfully here, thaw it + * here as well at the end.
+ */ kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; + } + error = xfs_trans_alloc_empty(mp, &tp); if (error) - return error; + goto out; for (; agno <= end_agno; agno++) { struct xfs_rmap_irec ri_low = { }; @@ -165,11 +232,26 @@ xfs_dax_notify_ddev_failure( } xfs_trans_cancel(tp); - if (error || notify.want_shutdown) { + + /* + * Shutdown fs from a force umount in pre-remove case which won't fail, + * so errors can be ignored. Otherwise, shutdown the filesystem with + * CORRUPT flag if error occurred or notify.want_shutdown was set during + * RMAP querying. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + else if (error || notify.want_shutdown) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); if (!error) error = -EFSCORRUPTED; } + +out: + /* Thaw the fs if it has been frozen before. */ + if (mf_flags & MF_MEM_PRE_REMOVE) + xfs_dax_notify_failure_thaw(mp, kernel_frozen); + return error; } @@ -197,6 +279,14 @@ xfs_dax_notify_failure( if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && mp->m_logdev_targp != mp->m_ddev_targp) { + /* + * In the pre-remove case the failure notification is attempting + * to trigger a force unmount. The expectation is that the + * device is still present, but its removal is in progress and + * can not be cancelled, proceed with accessing the log device. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + return 0; xfs_err(mp, "ondisk log corrupt, shutting down fs!"); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); return -EFSCORRUPTED; @@ -210,6 +300,12 @@ xfs_dax_notify_failure( ddev_start = mp->m_ddev_targp->bt_dax_part_off; ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; + /* Notify failure on the whole device.
*/ + if (offset == 0 && len == U64_MAX) { + offset = ddev_start; + len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); + } + /* Ignore the range out of filesystem area */ if (offset + len - 1 < ddev_start) return -ENXIO; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 94a7932ac570..0f4cf4170c35 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -26,6 +26,7 @@ #include "xfs_ag.h" #include "xfs_ialloc.h" #include "xfs_log_priv.h" +#include "xfs_health.h" /* * The global quota manager. There is only one of these for the entire @@ -171,7 +172,7 @@ xfs_qm_dqpurge( * hits zero, so it really should be on the freelist here. */ ASSERT(!list_empty(&dqp->q_lru)); - list_lru_del(&qi->qi_lru, &dqp->q_lru); + list_lru_del_obj(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); @@ -254,7 +255,7 @@ xfs_qm_dqattach_one( struct xfs_dquot *dqp; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); error = 0; /* @@ -322,7 +323,7 @@ xfs_qm_dqattach_locked( if (!xfs_qm_need_dqattach(ip)) return 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER, @@ -353,7 +354,7 @@ done: * Don't worry about the dquots that we may have attached before any * error - they'll get detached later if it has not already been done. 
*/ - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); return error; } @@ -628,7 +629,8 @@ xfs_qm_init_quotainfo( ASSERT(XFS_IS_QUOTA_ON(mp)); - qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0); + qinf = mp->m_quotainfo = kzalloc(sizeof(struct xfs_quotainfo), + GFP_KERNEL | __GFP_NOFAIL); error = list_lru_init(&qinf->qi_lru); if (error) @@ -642,9 +644,9 @@ xfs_qm_init_quotainfo( if (error) goto out_free_lru; - INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); - INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); - INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_KERNEL); + INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_KERNEL); + INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_KERNEL); mutex_init(&qinf->qi_tree_lock); /* mutex used to serialize quotaoffs */ @@ -691,6 +693,9 @@ xfs_qm_init_quotainfo( shrinker_register(qinf->qi_shrinker); + xfs_hooks_init(&qinf->qi_mod_ino_dqtrx_hooks); + xfs_hooks_init(&qinf->qi_apply_dqtrx_hooks); + return 0; out_free_inos: @@ -700,7 +705,7 @@ out_free_inos: out_free_lru: list_lru_destroy(&qinf->qi_lru); out_free_qinf: - kmem_free(qinf); + kfree(qinf); mp->m_quotainfo = NULL; return error; } @@ -724,7 +729,7 @@ xfs_qm_destroy_quotainfo( xfs_qm_destroy_quotainos(qi); mutex_destroy(&qi->qi_tree_lock); mutex_destroy(&qi->qi_quotaofflock); - kmem_free(qi); + kfree(qi); mp->m_quotainfo = NULL; } @@ -758,14 +763,18 @@ xfs_qm_qino_alloc( (mp->m_sb.sb_gquotino != NULLFSINO)) { ino = mp->m_sb.sb_gquotino; if (XFS_IS_CORRUPT(mp, - mp->m_sb.sb_pquotino != NULLFSINO)) + mp->m_sb.sb_pquotino != NULLFSINO)) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA); return -EFSCORRUPTED; + } } else if ((flags & XFS_QMOPT_GQUOTA) && (mp->m_sb.sb_pquotino != NULLFSINO)) { ino = mp->m_sb.sb_pquotino; if (XFS_IS_CORRUPT(mp, - mp->m_sb.sb_gquotino != NULLFSINO)) + mp->m_sb.sb_gquotino != NULLFSINO)) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA); return -EFSCORRUPTED; + } } if 
(ino != NULLFSINO) { error = xfs_iget(mp, NULL, ino, 0, 0, ipp); @@ -996,7 +1005,8 @@ xfs_qm_reset_dqcounts_buf( if (qip->i_nblocks == 0) return 0; - map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0); + map = kmalloc(XFS_DQITER_MAP_SIZE * sizeof(*map), + GFP_KERNEL | __GFP_NOFAIL); lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); @@ -1058,7 +1068,7 @@ xfs_qm_reset_dqcounts_buf( } while (nmaps > 0); out: - kmem_free(map); + kfree(map); return error; } @@ -1406,8 +1416,12 @@ error_return: xfs_warn(mp, "Quotacheck: Failed to reset quota flags."); } - } else + xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK); + } else { xfs_notice(mp, "Quotacheck: Done."); + xfs_fs_mark_healthy(mp, XFS_SICK_FS_QUOTACHECK); + } + return error; error_purge: @@ -1809,7 +1823,7 @@ xfs_qm_vop_chown( XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(XFS_IS_QUOTA_ON(ip->i_mount)); /* old dquot */ @@ -1817,12 +1831,12 @@ xfs_qm_vop_chown( ASSERT(prevdq); ASSERT(prevdq != newdq); - xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_nblocks)); - xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); + xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks)); + xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1); /* the sparkling new dquot */ - xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_nblocks); - xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); + xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks); + xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* * Back when we made quota reservations for the chown, we reserved the @@ -1897,29 +1911,28 @@ xfs_qm_vop_create_dqattach( if (!XFS_IS_QUOTA_ON(mp)) return; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id); ip->i_udquot = 
xfs_qm_dqhold(udqp); - xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id); ip->i_gdquot = xfs_qm_dqhold(gdqp); - xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); ASSERT(ip->i_projid == pdqp->q_id); ip->i_pdquot = xfs_qm_dqhold(pdqp); - xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); } + + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, 1); } /* Decide if this inode's dquot is near an enforcement boundary. */ diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index d5c9fc4ba591..f5993012bf98 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -68,6 +68,10 @@ struct xfs_quotainfo { /* Minimum and maximum quota expiration timestamp values. */ time64_t qi_expiry_min; time64_t qi_expiry_max; + + /* Hook to feed quota counter updates to an active online repair. */ + struct xfs_hooks qi_mod_ino_dqtrx_hooks; + struct xfs_hooks qi_apply_dqtrx_hooks; }; static inline struct radix_tree_root * @@ -104,6 +108,18 @@ xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type) return NULL; } +/* + * Parameters for tracking dqtrx changes on behalf of an inode. The hook + * function arg parameter is the field being updated. 
+ */ +struct xfs_mod_ino_dqtrx_params { + uintptr_t tx_id; + xfs_ino_t ino; + xfs_dqtype_t q_type; + xfs_dqid_t q_id; + int64_t delta; +}; + extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp, uint field, int64_t delta); extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index b77673dd0558..271c1021c733 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -9,6 +9,7 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_inode.h" diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index dcc785fdd345..85a4ae1a17f6 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -74,6 +74,22 @@ struct xfs_dqtrx { int64_t qt_icount_delta; /* dquot inode count changes */ }; +enum xfs_apply_dqtrx_type { + XFS_APPLY_DQTRX_COMMIT = 0, + XFS_APPLY_DQTRX_UNRESERVE, +}; + +/* + * Parameters for applying dqtrx changes to a dquot. The hook function arg + * parameter is enum xfs_apply_dqtrx_type. 
+ */ +struct xfs_apply_dqtrx_params { + uintptr_t tx_id; + xfs_ino_t ino; + xfs_dqtype_t q_type; + xfs_dqid_t q_id; +}; + #ifdef CONFIG_XFS_QUOTA extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *); extern void xfs_trans_free_dqinfo(struct xfs_trans *); @@ -114,6 +130,30 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); } bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type); + +# ifdef CONFIG_XFS_LIVE_HOOKS +void xfs_trans_mod_ino_dquot(struct xfs_trans *tp, struct xfs_inode *ip, + struct xfs_dquot *dqp, unsigned int field, int64_t delta); + +struct xfs_quotainfo; + +struct xfs_dqtrx_hook { + struct xfs_hook mod_hook; + struct xfs_hook apply_hook; +}; + +void xfs_dqtrx_hook_disable(void); +void xfs_dqtrx_hook_enable(void); + +int xfs_dqtrx_hook_add(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook); +void xfs_dqtrx_hook_del(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook); +void xfs_dqtrx_hook_setup(struct xfs_dqtrx_hook *hook, notifier_fn_t mod_fn, + notifier_fn_t apply_fn); +# else +# define xfs_trans_mod_ino_dquot(tp, ip, dqp, field, delta) \ + xfs_trans_mod_dquot((tp), (dqp), (field), (delta)) +# endif /* CONFIG_XFS_LIVE_HOOKS */ + #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -127,7 +167,10 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, } #define xfs_trans_dup_dqinfo(tp, tp2) #define xfs_trans_free_dqinfo(tp) -#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0) +static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp, + struct xfs_inode *ip, uint field, int64_t delta) +{ +} #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, @@ -170,6 +213,12 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot 
*udqp, #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) #define xfs_inode_near_dquot_enforcement(ip, type) (false) + +# ifdef CONFIG_XFS_LIVE_HOOKS +# define xfs_dqtrx_hook_enable() ((void)0) +# define xfs_dqtrx_hook_disable() ((void)0) +# endif /* CONFIG_XFS_LIVE_HOOKS */ + #endif /* CONFIG_XFS_QUOTA */ static inline int diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2d4444d61e98..14919b33e4fe 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -36,9 +36,9 @@ STATIC void xfs_cui_item_free( struct xfs_cui_log_item *cuip) { - kmem_free(cuip->cui_item.li_lv_shadow); + kvfree(cuip->cui_item.li_lv_shadow); if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) - kmem_free(cuip); + kfree(cuip); else kmem_cache_free(xfs_cui_cache, cuip); } @@ -143,8 +143,8 @@ xfs_cui_init( ASSERT(nextents > 0); if (nextents > XFS_CUI_MAX_FAST_EXTENTS) - cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), - 0); + cuip = kzalloc(xfs_cui_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); else cuip = kmem_cache_zalloc(xfs_cui_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -207,7 +207,7 @@ xfs_cud_item_release( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); xfs_cui_release(cudp->cud_cuip); - kmem_free(cudp->cud_item.li_lv_shadow); + kvfree(cudp->cud_item.li_lv_shadow); kmem_cache_free(xfs_cud_cache, cudp); } @@ -227,52 +227,6 @@ static const struct xfs_item_ops xfs_cud_item_ops = { .iop_intent = xfs_cud_item_intent, }; -static struct xfs_cud_log_item * -xfs_trans_get_cud( - struct xfs_trans *tp, - struct xfs_cui_log_item *cuip) -{ - struct xfs_cud_log_item *cudp; - - cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, - &xfs_cud_item_ops); - cudp->cud_cuip = cuip; - cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; - - xfs_trans_add_item(tp, &cudp->cud_item); - return cudp; -} - -/* - * Finish an refcount update and log it to the CUD. 
Note that the - * transaction is marked dirty regardless of whether the refcount - * update succeeds or fails to support the CUI/CUD lifecycle rules. - */ -static int -xfs_trans_log_finish_refcount_update( - struct xfs_trans *tp, - struct xfs_cud_log_item *cudp, - struct xfs_refcount_intent *ri, - struct xfs_btree_cur **pcur) -{ - int error; - - error = xfs_refcount_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the CUI and frees the CUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - - return error; -} - /* Sort refcount intents by AG. */ static int xfs_refcount_update_diff_items( @@ -318,9 +272,6 @@ xfs_refcount_update_log_item( uint next_extent; struct xfs_phys_extent *pmap; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -347,7 +298,6 @@ xfs_refcount_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &cuip->cui_item); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -362,7 +312,16 @@ xfs_refcount_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; + struct xfs_cui_log_item *cuip = CUI_ITEM(intent); + struct xfs_cud_log_item *cudp; + + cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, + &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + return &cudp->cud_item; } /* Take a passive ref to the AG containing the space we're refcounting. 
*/ @@ -397,10 +356,9 @@ xfs_refcount_update_finish_item( int error; ri = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, - state); /* Did we run out of reservation? Requeue what we didn't finish. */ + error = xfs_refcount_finish_one(tp, ri, state); if (!error && ri->ri_blockcount > 0) { ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || ri->ri_type == XFS_REFCOUNT_DECREASE); @@ -433,16 +391,6 @@ xfs_refcount_update_cancel_item( kmem_cache_free(xfs_refcount_intent_cache, ri); } -const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .create_intent = xfs_refcount_update_create_intent, - .abort_intent = xfs_refcount_update_abort_intent, - .create_done = xfs_refcount_update_create_done, - .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_finish_one_cleanup, - .cancel_item = xfs_refcount_update_cancel_item, -}; - /* Is this recovered CUI ok? */ static inline bool xfs_cui_validate_phys( @@ -468,23 +416,38 @@ xfs_cui_validate_phys( return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } +static inline void +xfs_cui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_phys_extent *pmap) +{ + struct xfs_refcount_intent *ri; + + ri = kmem_cache_alloc(xfs_refcount_intent_cache, + GFP_KERNEL | __GFP_NOFAIL); + ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + ri->ri_startblock = pmap->pe_startblock; + ri->ri_blockcount = pmap->pe_len; + xfs_refcount_update_get_group(mp, ri); + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process a refcount update intent item that was recovered from the log. * We need to update the refcountbt. 
*/ STATIC int -xfs_cui_item_recover( - struct xfs_log_item *lip, +xfs_refcount_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); - struct xfs_cud_log_item *cudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - unsigned int refc_type; - bool requeue_only = false; int i; int error = 0; @@ -501,6 +464,8 @@ xfs_cui_item_recover( sizeof(cuip->cui_format)); return -EFSCORRUPTED; } + + xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]); } /* @@ -521,100 +486,28 @@ xfs_cui_item_recover( if (error) return error; - cudp = xfs_trans_get_cud(tp, cuip); - - for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - struct xfs_refcount_intent fake = { }; - struct xfs_phys_extent *pmap; - - pmap = &cuip->cui_format.cui_extents[i]; - refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; - switch (refc_type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: - fake.ri_type = refc_type; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_startblock = pmap->pe_startblock; - fake.ri_blockcount = pmap->pe_len; - - if (!requeue_only) { - xfs_refcount_update_get_group(mp, &fake); - error = xfs_trans_log_finish_refcount_update(tp, cudp, - &fake, &rcur); - xfs_refcount_update_put_group(&fake); - } - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - if (error) - goto abort_error; - - /* Requeue what we didn't finish. 
*/ - if (fake.ri_blockcount > 0) { - struct xfs_bmbt_irec irec = { - .br_startblock = fake.ri_startblock, - .br_blockcount = fake.ri_blockcount, - }; - - switch (fake.ri_type) { - case XFS_REFCOUNT_INCREASE: - xfs_refcount_increase_extent(tp, &irec); - break; - case XFS_REFCOUNT_DECREASE: - xfs_refcount_decrease_extent(tp, &irec); - break; - case XFS_REFCOUNT_ALLOC_COW: - xfs_refcount_alloc_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - case XFS_REFCOUNT_FREE_COW: - xfs_refcount_free_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - default: - ASSERT(0); - } - requeue_only = true; - } - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &cuip->cui_format, + sizeof(cuip->cui_format)); + if (error) + goto abort_error; - xfs_refcount_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_refcount_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } -STATIC bool -xfs_cui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return CUI_ITEM(lip)->cui_format.cui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_cui_item_relog( +xfs_refcount_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_cud_log_item *cudp; struct xfs_cui_log_item *cuip; struct xfs_phys_extent *pmap; unsigned int count; @@ -622,27 +515,41 @@ xfs_cui_item_relog( count = CUI_ITEM(intent)->cui_format.cui_nextents; pmap = CUI_ITEM(intent)->cui_format.cui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - cuip = xfs_cui_init(tp->t_mountp, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); - xfs_trans_add_item(tp, &cuip->cui_item); - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; } +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .name = "refcount", + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, + .relog_intent = xfs_refcount_relog_intent, +}; + +STATIC bool +xfs_cui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return CUI_ITEM(lip)->cui_format.cui_id == intent_id; +} + static const struct xfs_item_ops xfs_cui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, .iop_unpin = xfs_cui_item_unpin, .iop_release = xfs_cui_item_release, - .iop_recover = xfs_cui_item_recover, .iop_match = xfs_cui_item_match, - .iop_relog = xfs_cui_item_relog, }; static inline void @@ -696,12 +603,9 @@ xlog_recover_cui_commit_pass2( cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); 
xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn); - xfs_cui_release(cuip); + + xlog_recover_intent_item(log, &cuip->cui_item, lsn, + &xfs_refcount_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index eb9102453aff..7da0e8f961d3 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -29,6 +29,7 @@ #include "xfs_iomap.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_health.h" /* * Copy on Write of Shared Blocks @@ -527,7 +528,7 @@ xfs_reflink_allocate_cow( int error; bool found; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (!ip->i_cowfp) { ASSERT(!xfs_is_reflink_inode(ip)); xfs_ifork_init_cow(ip); @@ -618,7 +619,7 @@ xfs_reflink_cancel_cow_blocks( error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) break; @@ -784,6 +785,7 @@ xfs_reflink_end_cow_extent( } } del = got; + xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb); /* Grab the corresponding mapping in the data fork. */ nmaps = 1; @@ -804,7 +806,7 @@ xfs_reflink_end_cow_extent( * If the extent we're remapping is backed by storage (written * or not), unmap the extent and drop its refcount. */ - xfs_bmap_unmap_extent(tp, ip, &data); + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); xfs_refcount_decrease_extent(tp, &data); xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -data.br_blockcount); @@ -828,7 +830,7 @@ xfs_reflink_end_cow_extent( xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); /* Map the new blocks into the data fork. 
*/ - xfs_bmap_map_extent(tp, ip, &del); + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del); /* Charge this new data fork mapping to the on-disk quota. */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, @@ -1226,8 +1228,10 @@ xfs_reflink_remap_extent( * extent if they're both holes or both the same physical extent. */ if (dmap->br_startblock == smap.br_startblock) { - if (dmap->br_state != smap.br_state) + if (dmap->br_state != smap.br_state) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; + } goto out_cancel; } @@ -1290,7 +1294,7 @@ xfs_reflink_remap_extent( * If the extent we're unmapping is backed by storage (written * or not), unmap the extent and drop its refcount. */ - xfs_bmap_unmap_extent(tp, ip, &smap); + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap); xfs_refcount_decrease_extent(tp, &smap); qdelta -= smap.br_blockcount; } else if (smap.br_startblock == DELAYSTARTBLOCK) { @@ -1315,7 +1319,7 @@ xfs_reflink_remap_extent( */ if (dmap_written) { xfs_refcount_increase_extent(tp, dmap); - xfs_bmap_map_extent(tp, ip, dmap); + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap); qdelta += dmap->br_blockcount; } @@ -1390,6 +1394,7 @@ xfs_reflink_remap_blocks( ASSERT(nimaps == 1 && imap.br_startoff == srcoff); if (imap.br_startblock == DELAYSTARTBLOCK) { ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + xfs_bmap_mark_sick(src, XFS_DATA_FORK); error = -EFSCORRUPTED; break; } @@ -1540,6 +1545,10 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; + xfs_iflags_set(src, XFS_IREMAPPING); + if (inode_in != inode_out) + xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + return 0; out_unlock: xfs_iunlock2_io_mmap(src, dest); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 0e0e747028da..e473124e29cc 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -36,9 +36,9 @@ STATIC void xfs_rui_item_free( struct xfs_rui_log_item *ruip) { - kmem_free(ruip->rui_item.li_lv_shadow); + 
kvfree(ruip->rui_item.li_lv_shadow); if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) - kmem_free(ruip); + kfree(ruip); else kmem_cache_free(xfs_rui_cache, ruip); } @@ -142,7 +142,8 @@ xfs_rui_init( ASSERT(nextents > 0); if (nextents > XFS_RUI_MAX_FAST_EXTENTS) - ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); + ruip = kzalloc(xfs_rui_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); else ruip = kmem_cache_zalloc(xfs_rui_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -205,7 +206,7 @@ xfs_rud_item_release( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); xfs_rui_release(rudp->rud_ruip); - kmem_free(rudp->rud_item.li_lv_shadow); + kvfree(rudp->rud_item.li_lv_shadow); kmem_cache_free(xfs_rud_cache, rudp); } @@ -225,23 +226,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = { .iop_intent = xfs_rud_item_intent, }; -static struct xfs_rud_log_item * -xfs_trans_get_rud( - struct xfs_trans *tp, - struct xfs_rui_log_item *ruip) -{ - struct xfs_rud_log_item *rudp; - - rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, - &xfs_rud_item_ops); - rudp->rud_ruip = ruip; - rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; - - xfs_trans_add_item(tp, &rudp->rud_item); - return rudp; -} - /* Set the map extent flags for this reverse mapping. */ static void xfs_trans_set_rmap_flags( @@ -285,35 +269,6 @@ xfs_trans_set_rmap_flags( } } -/* - * Finish an rmap update and log it to the RUD. Note that the transaction is - * marked dirty regardless of whether the rmap update succeeds or fails to - * support the RUI/RUD lifecycle rules. - */ -static int -xfs_trans_log_finish_rmap_update( - struct xfs_trans *tp, - struct xfs_rud_log_item *rudp, - struct xfs_rmap_intent *ri, - struct xfs_btree_cur **pcur) -{ - int error; - - error = xfs_rmap_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) 
releases the RUI and frees the RUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - - return error; -} - /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( @@ -340,9 +295,6 @@ xfs_rmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -372,7 +324,6 @@ xfs_rmap_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &ruip->rui_item); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -387,7 +338,16 @@ xfs_rmap_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; + struct xfs_rui_log_item *ruip = RUI_ITEM(intent); + struct xfs_rud_log_item *rudp; + + rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, + &xfs_rud_item_ops); + rudp->rud_ruip = ruip; + rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; + + return &rudp->rud_item; } /* Take a passive ref to the AG containing the space we're rmapping. 
*/ @@ -423,8 +383,7 @@ xfs_rmap_update_finish_item( ri = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, - state); + error = xfs_rmap_finish_one(tp, ri, state); xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); @@ -452,16 +411,6 @@ xfs_rmap_update_cancel_item( kmem_cache_free(xfs_rmap_intent_cache, ri); } -const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .create_intent = xfs_rmap_update_create_intent, - .abort_intent = xfs_rmap_update_abort_intent, - .create_done = xfs_rmap_update_create_done, - .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_finish_one_cleanup, - .cancel_item = xfs_rmap_update_cancel_item, -}; - /* Is this recovered RUI ok? */ static inline bool xfs_rui_validate_map( @@ -498,20 +447,72 @@ xfs_rui_validate_map( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline void +xfs_rui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + const struct xfs_map_extent *map) +{ + struct xfs_rmap_intent *ri; + + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); + + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + ri->ri_type = XFS_RMAP_MAP; + break; + case XFS_RMAP_EXTENT_MAP_SHARED: + ri->ri_type = XFS_RMAP_MAP_SHARED; + break; + case XFS_RMAP_EXTENT_UNMAP: + ri->ri_type = XFS_RMAP_UNMAP; + break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: + ri->ri_type = XFS_RMAP_UNMAP_SHARED; + break; + case XFS_RMAP_EXTENT_CONVERT: + ri->ri_type = XFS_RMAP_CONVERT; + break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: + ri->ri_type = XFS_RMAP_CONVERT_SHARED; + break; + case XFS_RMAP_EXTENT_ALLOC: + ri->ri_type = XFS_RMAP_ALLOC; + break; + case XFS_RMAP_EXTENT_FREE: + ri->ri_type = XFS_RMAP_FREE; + break; + default: + ASSERT(0); + return; + } + + ri->ri_owner = map->me_owner; + ri->ri_whichfork = (map->me_flags 
& XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + ri->ri_bmap.br_startblock = map->me_startblock; + ri->ri_bmap.br_startoff = map->me_startoff; + ri->ri_bmap.br_blockcount = map->me_len; + ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_rmap_update_get_group(mp, ri); + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. */ STATIC int -xfs_rui_item_recover( - struct xfs_log_item *lip, +xfs_rmap_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); - struct xfs_rud_log_item *rudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; int i; int error = 0; @@ -529,6 +530,8 @@ xfs_rui_item_recover( sizeof(ruip->rui_format)); return -EFSCORRUPTED; } + + xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -536,91 +539,29 @@ xfs_rui_item_recover( XFS_TRANS_RESERVE, &tp); if (error) return error; - rudp = xfs_trans_get_rud(tp, ruip); - for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - struct xfs_rmap_intent fake = { }; - struct xfs_map_extent *map; - - map = &ruip->rui_format.rui_extents[i]; - switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { - case XFS_RMAP_EXTENT_MAP: - fake.ri_type = XFS_RMAP_MAP; - break; - case XFS_RMAP_EXTENT_MAP_SHARED: - fake.ri_type = XFS_RMAP_MAP_SHARED; - break; - case XFS_RMAP_EXTENT_UNMAP: - fake.ri_type = XFS_RMAP_UNMAP; - break; - case XFS_RMAP_EXTENT_UNMAP_SHARED: - fake.ri_type = XFS_RMAP_UNMAP_SHARED; - break; - case XFS_RMAP_EXTENT_CONVERT: - fake.ri_type = XFS_RMAP_CONVERT; - break; - case XFS_RMAP_EXTENT_CONVERT_SHARED: - fake.ri_type = XFS_RMAP_CONVERT_SHARED; - break; - case 
XFS_RMAP_EXTENT_ALLOC: - fake.ri_type = XFS_RMAP_ALLOC; - break; - case XFS_RMAP_EXTENT_FREE: - fake.ri_type = XFS_RMAP_FREE; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &ruip->rui_format, - sizeof(ruip->rui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_owner = map->me_owner; - fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.ri_bmap.br_startblock = map->me_startblock; - fake.ri_bmap.br_startoff = map->me_startoff; - fake.ri_bmap.br_blockcount = map->me_len; - fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - - xfs_rmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, - &rcur); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - map, sizeof(*map)); - xfs_rmap_update_put_group(&fake); - if (error) - goto abort_error; - - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &ruip->rui_format, + sizeof(ruip->rui_format)); + if (error) + goto abort_error; - xfs_rmap_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_rmap_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } -STATIC bool -xfs_rui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return RUI_ITEM(lip)->rui_format.rui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_rui_item_relog( +xfs_rmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_rud_log_item *rudp; struct xfs_rui_log_item *ruip; struct xfs_map_extent *map; unsigned int count; @@ -628,27 +569,41 @@ xfs_rui_item_relog( count = RUI_ITEM(intent)->rui_format.rui_nextents; map = RUI_ITEM(intent)->rui_format.rui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - ruip = xfs_rui_init(tp->t_mountp, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); - xfs_trans_add_item(tp, &ruip->rui_item); - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; } +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .name = "rmap", + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_finish_one_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, + .recover_work = xfs_rmap_recover_work, + .relog_intent = xfs_rmap_relog_intent, +}; + +STATIC bool +xfs_rui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return RUI_ITEM(lip)->rui_format.rui_id == intent_id; +} + static const struct xfs_item_ops xfs_rui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, .iop_release = xfs_rui_item_release, - .iop_recover = xfs_rui_item_recover, .iop_match = xfs_rui_item_match, - .iop_relog = xfs_rui_item_relog, }; static inline void @@ -702,12 +657,9 @@ xlog_recover_rui_commit_pass2( ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); xfs_rui_copy_format(&ruip->rui_format, rui_formatp); 
atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn); - xfs_rui_release(ruip); + + xlog_recover_intent_item(log, &ruip->rui_item, lsn, + &xfs_rmap_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 2e1a4e5cd03d..e66f9bd5de5c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -14,62 +14,51 @@ #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_icache.h" #include "xfs_rtalloc.h" #include "xfs_sb.h" - -/* - * Read and return the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -static int -xfs_rtget_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ -{ - return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum); -} +#include "xfs_rtbitmap.h" +#include "xfs_quota.h" +#include "xfs_log_priv.h" +#include "xfs_health.h" /* * Return whether there are any free extents in the size range given * by low and high, for the bitmap block bbno. 
*/ -STATIC int /* error */ +STATIC int xfs_rtany_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int low, /* low log2 extent size */ - int high, /* high log2 extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - int *stat) /* out: any good extents here? */ + struct xfs_rtalloc_args *args, + int low, /* low log2 extent size */ + int high, /* high log2 extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + int *maxlog) /* out: max log2 extent size free */ { - int error; /* error value */ - int log; /* loop counter, log2 of ext. size */ - xfs_suminfo_t sum; /* summary data */ - - /* There are no extents at levels < m_rsum_cache[bbno]. */ - if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno]) - low = mp->m_rsum_cache[bbno]; + struct xfs_mount *mp = args->mp; + int error; + int log; /* loop counter, log2 of ext. size */ + xfs_suminfo_t sum; /* summary data */ + + /* There are no extents at levels >= m_rsum_cache[bbno]. */ + if (mp->m_rsum_cache) { + high = min(high, mp->m_rsum_cache[bbno] - 1); + if (low > high) { + *maxlog = -1; + return 0; + } + } /* * Loop over logs of extent sizes. */ - for (log = low; log <= high; log++) { + for (log = high; log >= low; log--) { /* * Get one summary datum. */ - error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); + error = xfs_rtget_summary(args, log, bbno, &sum); if (error) { return error; } @@ -77,18 +66,18 @@ xfs_rtany_summary( * If there are any, return success. */ if (sum) { - *stat = 1; + *maxlog = log; goto out; } } /* * Found nothing, return failure. */ - *stat = 0; + *maxlog = -1; out: - /* There were no extents at levels < log. */ - if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; + /* There were no extents at levels > log. 
*/ + if (mp->m_rsum_cache && log + 1 < mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log + 1; return 0; } @@ -97,60 +86,54 @@ out: * Copy and transform the summary file, given the old and new * parameters in the mount structures. */ -STATIC int /* error */ +STATIC int xfs_rtcopy_summary( - xfs_mount_t *omp, /* old file system mount point */ - xfs_mount_t *nmp, /* new file system mount point */ - xfs_trans_t *tp) /* transaction pointer */ + struct xfs_rtalloc_args *oargs, + struct xfs_rtalloc_args *nargs) { - xfs_rtblock_t bbno; /* bitmap block number */ - struct xfs_buf *bp; /* summary buffer */ - int error; /* error return value */ - int log; /* summary level number (log length) */ - xfs_suminfo_t sum; /* summary data */ - xfs_fsblock_t sumbno; /* summary block number */ + xfs_fileoff_t bbno; /* bitmap block number */ + int error; + int log; /* summary level number (log length) */ + xfs_suminfo_t sum; /* summary data */ - bp = NULL; - for (log = omp->m_rsumlevels - 1; log >= 0; log--) { - for (bbno = omp->m_sb.sb_rbmblocks - 1; + for (log = oargs->mp->m_rsumlevels - 1; log >= 0; log--) { + for (bbno = oargs->mp->m_sb.sb_rbmblocks - 1; (xfs_srtblock_t)bbno >= 0; bbno--) { - error = xfs_rtget_summary(omp, tp, log, bbno, &bp, - &sumbno, &sum); + error = xfs_rtget_summary(oargs, log, bbno, &sum); if (error) - return error; + goto out; if (sum == 0) continue; - error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, - &bp, &sumbno); + error = xfs_rtmodify_summary(oargs, log, bbno, -sum); if (error) - return error; - error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, - &bp, &sumbno); + goto out; + error = xfs_rtmodify_summary(nargs, log, bbno, sum); if (error) - return error; + goto out; ASSERT(sum > 0); } } + error = 0; +out: + xfs_rtbuf_cache_relse(oargs); return 0; } /* * Mark an extent specified by start and len allocated. * Updates all the summary information as well as the bitmap. 
*/ -STATIC int /* error */ +STATIC int xfs_rtallocate_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* start block to allocate */ - xfs_extlen_t len, /* length to allocate */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* start rtext to allocate */ + xfs_rtxlen_t len) /* in/out: summary block number */ { - xfs_rtblock_t end; /* end of the allocated extent */ - int error; /* error value */ - xfs_rtblock_t postblock = 0; /* first block allocated > end */ - xfs_rtblock_t preblock = 0; /* first block allocated < start */ + struct xfs_mount *mp = args->mp; + xfs_rtxnum_t end; /* end of the allocated rtext */ + int error; + xfs_rtxnum_t postblock = 0; /* first rtext allocated > end */ + xfs_rtxnum_t preblock = 0; /* first rtext allocated < start */ end = start + len - 1; /* @@ -158,119 +141,128 @@ xfs_rtallocate_range( * We need to find the beginning and end of the extent so we can * properly update the summary. */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); - if (error) { + error = xfs_rtfind_back(args, start, 0, &preblock); + if (error) return error; - } + /* * Find the next allocated block (end of free extent). */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); - if (error) { + error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) return error; - } + /* * Decrement the summary information corresponding to the entire * (old) free extent. 
*/ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); - if (error) { + error = xfs_rtmodify_summary(args, + xfs_highbit64(postblock + 1 - preblock), + xfs_rtx_to_rbmblock(mp, preblock), -1); + if (error) return error; - } + /* * If there are blocks not being allocated at the front of the * old extent, add summary data for them to be free. */ if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); - if (error) { + error = xfs_rtmodify_summary(args, + xfs_highbit64(start - preblock), + xfs_rtx_to_rbmblock(mp, preblock), 1); + if (error) return error; - } } + /* * If there are blocks not being allocated at the end of the * old extent, add summary data for them to be free. */ if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); - if (error) { + error = xfs_rtmodify_summary(args, + xfs_highbit64(postblock - end), + xfs_rtx_to_rbmblock(mp, end + 1), 1); + if (error) return error; - } } + /* * Modify the bitmap to mark this extent allocated. */ - error = xfs_rtmodify_range(mp, tp, start, len, 0); - return error; + return xfs_rtmodify_range(args, start, len, 0); +} + +/* + * Make sure we don't run off the end of the rt volume. Be careful that + * adjusting maxlen downwards doesn't cause us to fail the alignment checks. + */ +static inline xfs_rtxlen_t +xfs_rtallocate_clamp_len( + struct xfs_mount *mp, + xfs_rtxnum_t startrtx, + xfs_rtxlen_t rtxlen, + xfs_rtxlen_t prod) +{ + xfs_rtxlen_t ret; + + ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx; + return rounddown(ret, prod); } /* * Attempt to allocate an extent minlen<=len<=maxlen starting from * bitmap block bbno. If we don't get maxlen then use prod to trim - * the length, if given. Returns error; returns starting block in *rtblock. 
+ * the length, if given. Returns error; returns starting block in *rtx. * The lengths are all in rtextents. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_block( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bbno, /* bitmap block number */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - xfs_rtblock_t *nextp, /* out: next block to try */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_fileoff_t bbno, /* bitmap block number */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxnum_t *nextp, /* out: next rtext to try */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - xfs_rtblock_t besti; /* best rtblock found so far */ - xfs_rtblock_t bestlen; /* best length found so far */ - xfs_rtblock_t end; /* last rtblock in chunk */ - int error; /* error value */ - xfs_rtblock_t i; /* current rtblock trying */ - xfs_rtblock_t next; /* next rtblock to try */ - int stat; /* status from internal calls */ + struct xfs_mount *mp = args->mp; + xfs_rtxnum_t besti; /* best rtext found so far */ + xfs_rtxnum_t bestlen;/* best length found so far */ + xfs_rtxnum_t end; /* last rtext in chunk */ + int error; + xfs_rtxnum_t i; /* current rtext trying */ + xfs_rtxnum_t next; /* next rtext to try */ + int stat; /* status from internal calls */ /* * Loop over all the extents starting in this bitmap block, * looking for one that's long enough. 
*/ - for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0, - end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; + for (i = xfs_rbmblock_to_rtx(mp, bbno), besti = -1, bestlen = 0, + end = xfs_rbmblock_to_rtx(mp, bbno + 1) - 1; i <= end; i++) { /* Make sure we don't scan off the end of the rt volume. */ - maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i; + maxlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod); /* * See if there's a free extent of maxlen starting at i. * If it's not so then next will contain the first non-free. */ - error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat); - if (error) { + error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat); + if (error) return error; - } if (stat) { /* * i for maxlen is all free, allocate and return that. */ - error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp, - rsb); - if (error) { - return error; - } - *len = maxlen; - *rtblock = i; - return 0; + bestlen = maxlen; + besti = i; + goto allocate; } + /* * In the case where we have a variable-sized allocation * request, figure out how big this free piece is, @@ -278,7 +270,7 @@ xfs_rtallocate_extent_block( * so far, remember it. */ if (minlen < maxlen) { - xfs_rtblock_t thislen; /* this extent size */ + xfs_rtxnum_t thislen; /* this extent size */ thislen = next - i; if (thislen >= minlen && thislen > bestlen) { @@ -289,187 +281,157 @@ xfs_rtallocate_extent_block( /* * If not done yet, find the start of the next free space. */ - if (next < end) { - error = xfs_rtfind_forw(mp, tp, next, end, &i); - if (error) { - return error; - } - } else + if (next >= end) break; + error = xfs_rtfind_forw(args, next, end, &i); + if (error) + return error; } + /* * Searched the whole thing & didn't find a maxlen free extent. */ - if (minlen < maxlen && besti != -1) { - xfs_extlen_t p; /* amount to trim length by */ - + if (minlen > maxlen || besti == -1) { /* - * If size should be a multiple of prod, make that so. + * Allocation failed. 
Set *nextp to the next block to try. */ - if (prod > 1) { - div_u64_rem(bestlen, prod, &p); - if (p) - bestlen -= p; - } + *nextp = next; + return -ENOSPC; + } - /* - * Allocate besti for bestlen & return that. - */ - error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb); - if (error) { - return error; - } - *len = bestlen; - *rtblock = besti; - return 0; + /* + * If size should be a multiple of prod, make that so. + */ + if (prod > 1) { + xfs_rtxlen_t p; /* amount to trim length by */ + + div_u64_rem(bestlen, prod, &p); + if (p) + bestlen -= p; } + /* - * Allocation failed. Set *nextp to the next block to try. + * Allocate besti for bestlen & return that. */ - *nextp = next; - *rtblock = NULLRTBLOCK; +allocate: + error = xfs_rtallocate_range(args, besti, bestlen); + if (error) + return error; + *len = bestlen; + *rtx = besti; return 0; } /* * Allocate an extent of length minlen<=len<=maxlen, starting at block * bno. If we don't get maxlen then use prod to trim the length, if given. - * Returns error; returns starting block in *rtblock. + * Returns error; returns starting block in *rtx. * The lengths are all in rtextents. 
*/ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_exact( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number to allocate */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - int error; /* error value */ - xfs_extlen_t i; /* extent length trimmed due to prod */ - int isfree; /* extent is free */ - xfs_rtblock_t next; /* next block to try (dummy) */ + int error; + xfs_rtxlen_t i; /* extent length trimmed due to prod */ + int isfree; /* extent is free */ + xfs_rtxnum_t next; /* next rtext to try (dummy) */ - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); /* * Check if the range in question (for maxlen) is free. */ - error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree); - if (error) { + error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree); + if (error) return error; - } - if (isfree) { + + if (!isfree) { /* - * If it is, allocate it and return success. + * If not, allocate what there is, if it's at least minlen. 
*/ - error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); - if (error) { - return error; - } - *len = maxlen; - *rtblock = bno; - return 0; - } - /* - * If not, allocate what there is, if it's at least minlen. - */ - maxlen = next - bno; - if (maxlen < minlen) { + maxlen = next - start; + if (maxlen < minlen) + return -ENOSPC; + /* - * Failed, return failure status. + * Trim off tail of extent, if prod is specified. */ - *rtblock = NULLRTBLOCK; - return 0; - } - /* - * Trim off tail of extent, if prod is specified. - */ - if (prod > 1 && (i = maxlen % prod)) { - maxlen -= i; - if (maxlen < minlen) { - /* - * Now we can't do it, return failure status. - */ - *rtblock = NULLRTBLOCK; - return 0; + if (prod > 1 && (i = maxlen % prod)) { + maxlen -= i; + if (maxlen < minlen) + return -ENOSPC; } } + /* * Allocate what we can and return it. */ - error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); - if (error) { + error = xfs_rtallocate_range(args, start, maxlen); + if (error) return error; - } *len = maxlen; - *rtblock = bno; + *rtx = start; return 0; } /* * Allocate an extent of length minlen<=len<=maxlen, starting as near - * to bno as possible. If we don't get maxlen then use prod to trim + * to start as possible. If we don't get maxlen then use prod to trim * the length, if given. The lengths are all in rtextents. 
*/ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_near( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number to allocate */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - int any; /* any useful extents from summary */ - xfs_rtblock_t bbno; /* bitmap block number */ - int error; /* error value */ - int i; /* bitmap block offset (loop control) */ - int j; /* secondary loop control */ - int log2len; /* log2 of minlen */ - xfs_rtblock_t n; /* next block to try */ - xfs_rtblock_t r; /* result block */ - - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + struct xfs_mount *mp = args->mp; + int maxlog; /* max useful extent from summary */ + xfs_fileoff_t bbno; /* bitmap block number */ + int error; + int i; /* bitmap block offset (loop control) */ + int j; /* secondary loop control */ + int log2len; /* log2 of minlen */ + xfs_rtxnum_t n; /* next rtext to try */ + + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); + /* * If the block number given is off the end, silently set it to * the last block. 
*/ - if (bno >= mp->m_sb.sb_rextents) - bno = mp->m_sb.sb_rextents - 1; + if (start >= mp->m_sb.sb_rextents) + start = mp->m_sb.sb_rextents - 1; /* Make sure we don't run off the end of the rt volume. */ - maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno; - if (maxlen < minlen) { - *rtblock = NULLRTBLOCK; - return 0; - } + maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); + if (maxlen < minlen) + return -ENOSPC; /* * Try the exact allocation first. */ - error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len, - rbpp, rsb, prod, &r); - if (error) { + error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len, + prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If the exact allocation worked, return that. - */ - if (r != NULLRTBLOCK) { - *rtblock = r; - return 0; - } - bbno = XFS_BITTOBLOCK(mp, bno); + + + bbno = xfs_rtx_to_rbmblock(mp, start); i = 0; + j = -1; ASSERT(minlen != 0); log2len = xfs_highbit32(minlen); /* @@ -480,16 +442,19 @@ xfs_rtallocate_extent_near( * Get summary information of extents of all useful levels * starting in this bitmap block. */ - error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1, - bbno + i, rbpp, rsb, &any); - if (error) { + error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1, + bbno + i, &maxlog); + if (error) return error; - } + /* * If there are any useful extents starting here, try * allocating one. */ - if (any) { + if (maxlog >= 0) { + xfs_extlen_t maxavail = + min_t(xfs_rtblock_t, maxlen, + (1ULL << (maxlog + 1)) - 1); /* * On the positive side of the starting location. */ @@ -498,85 +463,47 @@ xfs_rtallocate_extent_near( * Try to allocate an extent starting in * this block. 
*/ - error = xfs_rtallocate_extent_block(mp, tp, - bbno + i, minlen, maxlen, len, &n, rbpp, - rsb, prod, &r); - if (error) { + error = xfs_rtallocate_extent_block(args, + bbno + i, minlen, maxavail, len, + &n, prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If it worked, return it. - */ - if (r != NULLRTBLOCK) { - *rtblock = r; - return 0; - } } /* * On the negative side of the starting location. */ else { /* i < 0 */ + int maxblocks; + /* - * Loop backwards through the bitmap blocks from - * the starting point-1 up to where we are now. - * There should be an extent which ends in this - * bitmap block and is long enough. - */ - for (j = -1; j > i; j--) { - /* - * Grab the summary information for - * this bitmap block. - */ - error = xfs_rtany_summary(mp, tp, - log2len, mp->m_rsumlevels - 1, - bbno + j, rbpp, rsb, &any); - if (error) { - return error; - } - /* - * If there's no extent given in the - * summary that means the extent we - * found must carry over from an - * earlier block. If there is an - * extent given, we've already tried - * that allocation, don't do it again. - */ - if (any) - continue; - error = xfs_rtallocate_extent_block(mp, - tp, bbno + j, minlen, maxlen, - len, &n, rbpp, rsb, prod, &r); - if (error) { - return error; - } - /* - * If it works, return the extent. - */ - if (r != NULLRTBLOCK) { - *rtblock = r; - return 0; - } - } - /* - * There weren't intervening bitmap blocks - * with a long enough extent, or the - * allocation didn't work for some reason - * (i.e. it's a little * too short). - * Try to allocate from the summary block - * that we found. + * Loop backwards to find the end of the extent + * we found in the realtime summary. + * + * maxblocks is the maximum possible number of + * bitmap blocks from the start of the extent + * to the end of the extent. 
*/ - error = xfs_rtallocate_extent_block(mp, tp, - bbno + i, minlen, maxlen, len, &n, rbpp, - rsb, prod, &r); - if (error) { - return error; - } + if (maxlog == 0) + maxblocks = 0; + else if (maxlog < mp->m_blkbit_log) + maxblocks = 1; + else + maxblocks = 2 << (maxlog - mp->m_blkbit_log); + /* - * If it works, return the extent. + * We need to check bbno + i + maxblocks down to + * bbno + i. We already checked bbno down to + * bbno + j + 1, so we don't need to check those + * again. */ - if (r != NULLRTBLOCK) { - *rtblock = r; - return 0; + j = min(i + maxblocks, j); + for (; j >= i; j--) { + error = xfs_rtallocate_extent_block(args, + bbno + j, minlen, + maxavail, len, &n, prod, + rtx); + if (error != -ENOSPC) + return error; } } } @@ -610,8 +537,53 @@ xfs_rtallocate_extent_near( else break; } - *rtblock = NULLRTBLOCK; - return 0; + return -ENOSPC; +} + +static int +xfs_rtalloc_sumlevel( + struct xfs_rtalloc_args *args, + int l, /* level number */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ +{ + xfs_fileoff_t i; /* bitmap block number */ + + for (i = 0; i < args->mp->m_sb.sb_rbmblocks; i++) { + xfs_suminfo_t sum; /* summary information for extents */ + xfs_rtxnum_t n; /* next rtext to be tried */ + int error; + + error = xfs_rtget_summary(args, l, i, &sum); + if (error) + return error; + + /* + * Nothing there, on to the next block. + */ + if (!sum) + continue; + + /* + * Try allocating the extent. + */ + error = xfs_rtallocate_extent_block(args, i, minlen, maxlen, + len, &n, prod, rtx); + if (error != -ENOSPC) + return error; + + /* + * If the "next block to try" returned from the allocator is + * beyond the next bitmap block, skip to that bitmap block. 
+ */ + if (xfs_rtx_to_rbmblock(args->mp, n) > i + 1) + i = xfs_rtx_to_rbmblock(args->mp, n) - 1; + } + + return -ENOSPC; } /* @@ -619,145 +591,64 @@ xfs_rtallocate_extent_near( * specified. If we don't get maxlen then use prod to trim * the length, if given. The lengths are all in rtextents. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_size( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - int error; /* error value */ - int i; /* bitmap block number */ - int l; /* level number (loop control) */ - xfs_rtblock_t n; /* next block to be tried */ - xfs_rtblock_t r; /* result block number */ - xfs_suminfo_t sum; /* summary information for extents */ - - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + int error; + int l; /* level number (loop control) */ + + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); ASSERT(maxlen != 0); /* * Loop over all the levels starting with maxlen. - * At each level, look at all the bitmap blocks, to see if there - * are extents starting there that are long enough (>= maxlen). - * Note, only on the initial level can the allocation fail if - * the summary says there's an extent. 
+ * + * At each level, look at all the bitmap blocks, to see if there are + * extents starting there that are long enough (>= maxlen). + * + * Note, only on the initial level can the allocation fail if the + * summary says there's an extent. */ - for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) { - /* - * Loop over all the bitmap blocks. - */ - for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - /* - * Get the summary for this level/block. - */ - error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, - &sum); - if (error) { - return error; - } - /* - * Nothing there, on to the next block. - */ - if (!sum) - continue; - /* - * Try allocating the extent. - */ - error = xfs_rtallocate_extent_block(mp, tp, i, maxlen, - maxlen, len, &n, rbpp, rsb, prod, &r); - if (error) { - return error; - } - /* - * If it worked, return that. - */ - if (r != NULLRTBLOCK) { - *rtblock = r; - return 0; - } - /* - * If the "next block to try" returned from the - * allocator is beyond the next bitmap block, - * skip to that bitmap block. - */ - if (XFS_BITTOBLOCK(mp, n) > i + 1) - i = XFS_BITTOBLOCK(mp, n) - 1; - } + for (l = xfs_highbit32(maxlen); l < args->mp->m_rsumlevels; l++) { + error = xfs_rtalloc_sumlevel(args, l, minlen, maxlen, prod, len, + rtx); + if (error != -ENOSPC) + return error; } + /* - * Didn't find any maxlen blocks. Try smaller ones, unless - * we're asking for a fixed size extent. + * Didn't find any maxlen blocks. Try smaller ones, unless we are + * looking for a fixed size extent. */ - if (minlen > --maxlen) { - *rtblock = NULLRTBLOCK; - return 0; - } + if (minlen > --maxlen) + return -ENOSPC; ASSERT(minlen != 0); ASSERT(maxlen != 0); /* * Loop over sizes, from maxlen down to minlen. - * This time, when we do the allocations, allow smaller ones - * to succeed. + * + * This time, when we do the allocations, allow smaller ones to succeed, + * but make sure the specified minlen/maxlen are in the possible range + * for this summary level. 
*/ for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { - /* - * Loop over all the bitmap blocks, try an allocation - * starting in that block. - */ - for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - /* - * Get the summary information for this level/block. - */ - error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, - &sum); - if (error) { - return error; - } - /* - * If nothing there, go on to next. - */ - if (!sum) - continue; - /* - * Try the allocation. Make sure the specified - * minlen/maxlen are in the possible range for - * this summary level. - */ - error = xfs_rtallocate_extent_block(mp, tp, i, - XFS_RTMAX(minlen, 1 << l), - XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), - len, &n, rbpp, rsb, prod, &r); - if (error) { - return error; - } - /* - * If it worked, return that extent. - */ - if (r != NULLRTBLOCK) { - *rtblock = r; - return 0; - } - /* - * If the "next block to try" returned from the - * allocator is beyond the next bitmap block, - * skip to that bitmap block. - */ - if (XFS_BITTOBLOCK(mp, n) > i + 1) - i = XFS_BITTOBLOCK(mp, n) - 1; - } + error = xfs_rtalloc_sumlevel(args, l, + max_t(xfs_rtxlen_t, minlen, 1 << l), + min_t(xfs_rtxlen_t, maxlen, (1 << (l + 1)) - 1), + prod, len, rtx); + if (error != -ENOSPC) + return error; } - /* - * Got nothing, return failure. - */ - *rtblock = NULLRTBLOCK; - return 0; + + return -ENOSPC; } /* @@ -886,12 +777,14 @@ xfs_alloc_rsum_cache( xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */ { /* - * The rsum cache is initialized to all zeroes, which is trivially a - * lower bound on the minimum level with any free extents. We can - * continue without the cache if it couldn't be allocated. + * The rsum cache is initialized to the maximum value, which is + * trivially an upper bound on the maximum level with any free extents. + * We can continue without the cache if it couldn't be allocated. 
*/ - mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL); - if (!mp->m_rsum_cache) + mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL); + if (mp->m_rsum_cache) + memset(mp->m_rsum_cache, -1, rbmblocks); + else xfs_warn(mp, "could not allocate realtime summary cache"); } @@ -907,13 +800,13 @@ xfs_growfs_rt( xfs_mount_t *mp, /* mount point for filesystem */ xfs_growfs_rt_t *in) /* growfs rt input struct */ { - xfs_rtblock_t bmbno; /* bitmap block number */ + xfs_fileoff_t bmbno; /* bitmap block number */ struct xfs_buf *bp; /* temporary buffer */ int error; /* error return value */ xfs_mount_t *nmp; /* new (fake) mount structure */ xfs_rfsblock_t nrblocks; /* new number of realtime blocks */ xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ - xfs_rtblock_t nrextents; /* new number of realtime extents */ + xfs_rtxnum_t nrextents; /* new number of realtime extents */ uint8_t nrextslog; /* new log2 of sb_rextents */ xfs_extlen_t nrsumblocks; /* new number of summary blocks */ uint nrsumlevels; /* new rt summary levels */ @@ -922,7 +815,6 @@ xfs_growfs_rt( xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ - xfs_fsblock_t sumbno; /* summary block number */ uint8_t *rsum_cache; /* old summary cache */ sbp = &mp->m_sb; @@ -954,7 +846,7 @@ xfs_growfs_rt( return -EINVAL; /* Unsupported realtime features. 
*/ - if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)) + if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp)) return -EOPNOTSUPP; nrblocks = in->newblocks; @@ -976,11 +868,12 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); - nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize); - nrextslog = xfs_highbit32(nrextents); + if (!xfs_validate_rtextents(nrextents)) + return -EINVAL; + nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); + nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; - nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; - nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks); nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); /* * New summary size can't be more than half the size of @@ -1012,7 +905,7 @@ xfs_growfs_rt( /* * Allocate a new (fake) mount/sb. */ - nmp = kmem_alloc(sizeof(*nmp), 0); + nmp = kmalloc(sizeof(*nmp), GFP_KERNEL | __GFP_NOFAIL); /* * Loop over the bitmap blocks. * We will do everything one bitmap block at a time. @@ -1023,6 +916,12 @@ xfs_growfs_rt( ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); bmbno < nrbmblocks; bmbno++) { + struct xfs_rtalloc_args args = { + .mp = mp, + }; + struct xfs_rtalloc_args nargs = { + .mp = nmp, + }; struct xfs_trans *tp; xfs_rfsblock_t nrblocks_step; @@ -1032,20 +931,21 @@ xfs_growfs_rt( * Calculate new sb and mount fields for this round. 
*/ nsbp->sb_rextsize = in->extsize; + nmp->m_rtxblklog = -1; /* don't use shift or masking */ nsbp->sb_rbmblocks = bmbno + 1; nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize * nsbp->sb_rextsize; nsbp->sb_rblocks = min(nrblocks, nrblocks_step); - nsbp->sb_rextents = nsbp->sb_rblocks; - do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks); ASSERT(nsbp->sb_rextents != 0); - nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; - nrsumsize = - (uint)sizeof(xfs_suminfo_t) * nrsumlevels * - nsbp->sb_rbmblocks; - nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, + nsbp->sb_rbmblocks); nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(nmp, &nmp->m_resv); + /* * Start a transaction, get the log reservation. */ @@ -1053,6 +953,9 @@ xfs_growfs_rt( &tp); if (error) break; + args.tp = tp; + nargs.tp = tp; + /* * Lock out other callers by grabbing the bitmap inode lock. */ @@ -1086,7 +989,7 @@ xfs_growfs_rt( */ if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks || mp->m_rsumlevels != nmp->m_rsumlevels) { - error = xfs_rtcopy_summary(mp, nmp, tp); + error = xfs_rtcopy_summary(&args, &nargs); if (error) goto error_cancel; } @@ -1111,9 +1014,9 @@ xfs_growfs_rt( /* * Free new extent. 
*/ - bp = NULL; - error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, - nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); + error = xfs_rtfree_range(&nargs, sbp->sb_rextents, + nsbp->sb_rextents - sbp->sb_rextents); + xfs_rtbuf_cache_relse(&nargs); if (error) { error_cancel: xfs_trans_cancel(tp); @@ -1129,6 +1032,8 @@ error_cancel: */ mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(mp, &mp->m_resv); error = xfs_trans_commit(tp); if (error) @@ -1147,7 +1052,7 @@ out_free: /* * Free the fake mp structure. */ - kmem_free(nmp); + kfree(nmp); /* * If we had to allocate a new rsum_cache, we either need to free the @@ -1156,10 +1061,10 @@ out_free: */ if (rsum_cache != mp->m_rsum_cache) { if (error) { - kmem_free(mp->m_rsum_cache); + kvfree(mp->m_rsum_cache); mp->m_rsum_cache = rsum_cache; } else { - kmem_free(rsum_cache); + kvfree(rsum_cache); } } @@ -1167,80 +1072,6 @@ out_free: } /* - * Allocate an extent in the realtime subvolume, with the usual allocation - * parameters. The length units are all in realtime extents, as is the - * result block number. 
- */ -int /* error */ -xfs_rtallocate_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ -{ - xfs_mount_t *mp = tp->t_mountp; - int error; /* error value */ - xfs_rtblock_t r; /* result allocated block */ - xfs_fsblock_t sb; /* summary file block number */ - struct xfs_buf *sumbp; /* summary file block buffer */ - - ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - ASSERT(minlen > 0 && minlen <= maxlen); - - /* - * If prod is set then figure out what to do to minlen and maxlen. - */ - if (prod > 1) { - xfs_extlen_t i; - - if ((i = maxlen % prod)) - maxlen -= i; - if ((i = minlen % prod)) - minlen += prod - i; - if (maxlen < minlen) { - *rtblock = NULLRTBLOCK; - return 0; - } - } - -retry: - sumbp = NULL; - if (bno == 0) { - error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, - &sumbp, &sb, prod, &r); - } else { - error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, - len, &sumbp, &sb, prod, &r); - } - - if (error) - return error; - - /* - * If it worked, update the superblock. - */ - if (r != NULLRTBLOCK) { - long slen = (long)*len; - - ASSERT(*len >= minlen && *len <= maxlen); - if (wasdel) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); - else - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); - } else if (prod > 1) { - prod = 1; - goto retry; - } - - *rtblock = r; - return 0; -} - -/* * Initialize realtime fields in the mount structure. 
*/ int /* error */ @@ -1250,6 +1081,7 @@ xfs_rtmount_init( struct xfs_buf *bp; /* buffer for last block of subvolume */ struct xfs_sb *sbp; /* filesystem superblock copy in mount */ xfs_daddr_t d; /* address of last block of subvolume */ + unsigned int rsumblocks; int error; sbp = &mp->m_sb; @@ -1261,10 +1093,9 @@ xfs_rtmount_init( return -ENODEV; } mp->m_rsumlevels = sbp->sb_rextslog + 1; - mp->m_rsumsize = - (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels * - sbp->sb_rbmblocks; - mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize); + rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels, + mp->m_sb.sb_rbmblocks); + mp->m_rsumsize = XFS_FSB_TO_B(mp, rsumblocks); mp->m_rbmip = mp->m_rsumip = NULL; /* * Check that the realtime section is an ok size. @@ -1373,6 +1204,8 @@ xfs_rtmount_inodes( sbp = &mp->m_sb; error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); + if (xfs_metadata_is_sick(error)) + xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP); if (error) return error; ASSERT(mp->m_rbmip != NULL); @@ -1382,6 +1215,8 @@ xfs_rtmount_inodes( goto out_rele_bitmap; error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); + if (xfs_metadata_is_sick(error)) + xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY); if (error) goto out_rele_bitmap; ASSERT(mp->m_rsumip != NULL); @@ -1404,7 +1239,7 @@ void xfs_rtunmount_inodes( struct xfs_mount *mp) { - kmem_free(mp->m_rsum_cache); + kvfree(mp->m_rsum_cache); if (mp->m_rbmip) xfs_irele(mp->m_rbmip); if (mp->m_rsumip) @@ -1418,27 +1253,27 @@ xfs_rtunmount_inodes( * of rtextents and the fraction. * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... 
*/ -int /* error */ +static int xfs_rtpick_extent( xfs_mount_t *mp, /* file system mount point */ xfs_trans_t *tp, /* transaction pointer */ - xfs_extlen_t len, /* allocation length (rtextents) */ - xfs_rtblock_t *pick) /* result rt extent */ - { - xfs_rtblock_t b; /* result block */ + xfs_rtxlen_t len, /* allocation length (rtextents) */ + xfs_rtxnum_t *pick) /* result rt extent */ +{ + xfs_rtxnum_t b; /* result rtext */ int log2; /* log of sequence number */ uint64_t resid; /* residual after log removed */ uint64_t seq; /* sequence number of file creation */ - struct timespec64 ts; /* temporary timespec64 storage */ + struct timespec64 ts; /* timespec in inode */ - ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL); + ts = inode_get_atime(VFS_I(mp->m_rbmip)); if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; seq = 0; } else { - ts = inode_get_atime(VFS_I(mp->m_rbmip)); - seq = (uint64_t)ts.tv_sec; + seq = ts.tv_sec; } if ((log2 = xfs_highbit64(seq)) == -1) b = 0; @@ -1451,9 +1286,183 @@ xfs_rtpick_extent( if (b + len > mp->m_sb.sb_rextents) b = mp->m_sb.sb_rextents - len; } - ts.tv_sec = (time64_t)seq + 1; + ts.tv_sec = seq + 1; inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); *pick = b; return 0; } + +static void +xfs_rtalloc_align_minmax( + xfs_rtxlen_t *raminlen, + xfs_rtxlen_t *ramaxlen, + xfs_rtxlen_t *prod) +{ + xfs_rtxlen_t newmaxlen = *ramaxlen; + xfs_rtxlen_t newminlen = *raminlen; + xfs_rtxlen_t slack; + + slack = newmaxlen % *prod; + if (slack) + newmaxlen -= slack; + slack = newminlen % *prod; + if (slack) + newminlen += *prod - slack; + + /* + * If adjusting for extent size hint alignment produces an invalid + * min/max len combination, go ahead without it. 
+ */ + if (newmaxlen < newminlen) { + *prod = 1; + return; + } + *ramaxlen = newmaxlen; + *raminlen = newminlen; +} + +int +xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_fileoff_t orig_offset = ap->offset; + xfs_rtxnum_t start; /* allocation hint rtextent no */ + xfs_rtxnum_t rtx; /* actually allocated rtextent no */ + xfs_rtxlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t mod = 0; /* product factor for allocators */ + xfs_rtxlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t orig_length = ap->length; + xfs_extlen_t minlen = mp->m_sb.sb_rextsize; + xfs_rtxlen_t raminlen; + bool rtlocked = false; + bool ignore_locality = false; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = ap->tp, + }; + int error; + + align = xfs_get_extsz_hint(ap->ip); +retry: + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 1, ap->eof, 0, + ap->conv, &ap->offset, &ap->length); + if (error) + return error; + ASSERT(ap->length); + ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); + + /* + * If we shifted the file offset downward to satisfy an extent size + * hint, increase minlen by that amount so that the allocator won't + * give us an allocation that's too short to cover at least one of the + * blocks that the caller asked for. + */ + if (ap->offset != orig_offset) + minlen += orig_offset - ap->offset; + + /* + * Set ralen to be the actual requested length in rtextents. + * + * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that + * we rounded up to it, cut it back so it's valid again. + * Note that if it's a really large request (bigger than + * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. 
+ */ + ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); + raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); + ASSERT(raminlen > 0); + ASSERT(raminlen <= ralen); + + /* + * Lock out modifications to both the RT bitmap and summary inodes + */ + if (!rtlocked) { + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); + xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + rtlocked = true; + } + + if (ignore_locality) { + start = 0; + } else if (xfs_bmap_adjacent(ap)) { + start = xfs_rtb_to_rtx(mp, ap->blkno); + } else if (ap->eof && ap->offset == 0) { + /* + * If it's an allocation to an empty file at offset 0, pick an + * extent that will space things out in the rt area. + */ + error = xfs_rtpick_extent(mp, ap->tp, ralen, &start); + if (error) + return error; + } else { + start = 0; + } + + /* + * Only bother calculating a real prod factor if offset & length are + * perfectly aligned, otherwise it will just get us in trouble. + */ + div_u64_rem(ap->offset, align, &mod); + if (mod || ap->length % align) { + prod = 1; + } else { + prod = xfs_extlen_to_rtxlen(mp, align); + if (prod > 1) + xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod); + } + + if (start) { + error = xfs_rtallocate_extent_near(&args, start, raminlen, + ralen, &ralen, prod, &rtx); + } else { + error = xfs_rtallocate_extent_size(&args, raminlen, + ralen, &ralen, prod, &rtx); + } + xfs_rtbuf_cache_relse(&args); + + if (error == -ENOSPC) { + if (align > mp->m_sb.sb_rextsize) { + /* + * We previously enlarged the request length to try to + * satisfy an extent size hint. The allocator didn't + * return anything, so reset the parameters to the + * original values and try again without alignment + * criteria. 
+ */ + ap->offset = orig_offset; + ap->length = orig_length; + minlen = align = mp->m_sb.sb_rextsize; + goto retry; + } + + if (!ignore_locality && start != 0) { + /* + * If we can't allocate near a specific rt extent, try + * again without locality criteria. + */ + ignore_locality = true; + goto retry; + } + + ap->blkno = NULLFSBLOCK; + ap->length = 0; + return 0; + } + if (error) + return error; + + xfs_trans_mod_sb(ap->tp, ap->wasdel ? + XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS, + -(long)ralen); + ap->blkno = xfs_rtx_to_rtb(mp, rtx); + ap->length = xfs_rtxlen_to_extlen(mp, ralen); + xfs_bmap_alloc_account(ap); + return 0; +} diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 62c7ad79cbb6..a6836da9bebe 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -11,54 +11,8 @@ struct xfs_mount; struct xfs_trans; -/* - * XXX: Most of the realtime allocation functions deal in units of realtime - * extents, not realtime blocks. This looks funny when paired with the type - * name and screams for a larger cleanup. - */ -struct xfs_rtalloc_rec { - xfs_rtblock_t ar_startext; - xfs_rtblock_t ar_extcount; -}; - -typedef int (*xfs_rtalloc_query_range_fn)( - struct xfs_mount *mp, - struct xfs_trans *tp, - const struct xfs_rtalloc_rec *rec, - void *priv); - #ifdef CONFIG_XFS_RT /* - * Function prototypes for exported functions. - */ - -/* - * Allocate an extent in the realtime subvolume, with the usual allocation - * parameters. The length units are all in realtime extents, as is the - * result block number. 
- */ -int /* error */ -xfs_rtallocate_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock); /* out: start block allocated */ - -/* - * Free an extent in the realtime subvolume. Length is expressed in - * realtime extents, as is the block number. - */ -int /* error */ -xfs_rtfree_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to free */ - xfs_extlen_t len); /* length of extent freed */ - -/* * Initialize realtime fields in the mount structure. */ int /* error */ @@ -77,20 +31,6 @@ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ /* - * Pick an extent for allocation at the start of a new realtime file. - * Use the sequence number stored in the atime field of the bitmap inode. - * Translate this to a fraction of the rtextents, and return the product - * of rtextents and the fraction. - * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... - */ -int /* error */ -xfs_rtpick_extent( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_extlen_t len, /* allocation length (rtextents) */ - xfs_rtblock_t *pick); /* result rt extent */ - -/* * Grow the realtime area of the filesystem. 
*/ int @@ -98,55 +38,10 @@ xfs_growfs_rt( struct xfs_mount *mp, /* file system mount structure */ xfs_growfs_rt_t *in); /* user supplied growfs struct */ -/* - * From xfs_rtbitmap.c - */ -int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t block, int issum, struct xfs_buf **bpp); -int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, int val, - xfs_rtblock_t *new, int *stat); -int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_rtblock_t limit, - xfs_rtblock_t *rtblock); -int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_rtblock_t limit, - xfs_rtblock_t *rtblock); -int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, int val); -int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp, - int log, xfs_rtblock_t bbno, int delta, - struct xfs_buf **rbpp, xfs_fsblock_t *rsb, - xfs_suminfo_t *sum); -int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, - xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp, - xfs_fsblock_t *rsb); -int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, - struct xfs_buf **rbpp, xfs_fsblock_t *rsb); -int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, - const struct xfs_rtalloc_rec *low_rec, - const struct xfs_rtalloc_rec *high_rec, - xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtalloc_query_range_fn fn, - void *priv); -bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); -int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, - bool *is_free); int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else -# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) -# 
define xfs_rtfree_extent(t,b,l) (ENOSYS) -# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) -# define xfs_growfs_rt(mp,in) (ENOSYS) -# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) -# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS) -# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) -# define xfs_verify_rtbno(m, r) (false) -# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) -# define xfs_rtalloc_reinit_frextents(m) (0) +# define xfs_growfs_rt(mp,in) (-ENOSYS) +# define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ @@ -157,7 +52,7 @@ xfs_rtmount_init( xfs_warn(mp, "Not built with CONFIG_XFS_RT"); return -ENOSYS; } -# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS)) # define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 90a77cd3ebad..ed97d72caa66 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -50,7 +50,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "ibt2", xfsstats_offset(xs_fibt_2) }, { "fibt2", xfsstats_offset(xs_rmap_2) }, { "rmapbt", xfsstats_offset(xs_refcbt_2) }, - { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)}, + { "refcntbt", xfsstats_offset(xs_rmap_mem_2) }, + { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) }, + { "rcbagbt", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 43ffba74f045..a61fb56ed2e6 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -125,6 +125,8 @@ struct __xfsstats { uint32_t xs_fibt_2[__XBTS_MAX]; uint32_t xs_rmap_2[__XBTS_MAX]; uint32_t xs_refcbt_2[__XBTS_MAX]; + uint32_t xs_rmap_mem_2[__XBTS_MAX]; + uint32_t xs_rcbag_2[__XBTS_MAX]; uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; 
uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f0ae07828153..bce020374c5e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -42,7 +42,9 @@ #include "xfs_xattr.h" #include "xfs_iunlink_item.h" #include "xfs_dahash_test.h" +#include "xfs_rtbitmap.h" #include "scrub/stats.h" +#include "scrub/rcbag_btree.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -349,7 +351,6 @@ xfs_setup_dax_always( return -EINVAL; } - xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); return 0; disable_dax: @@ -361,15 +362,16 @@ STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, - struct bdev_handle **handlep) + struct file **bdev_filep) { int error = 0; - *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, - mp->m_super, &fs_holder_ops); - if (IS_ERR(*handlep)) { - error = PTR_ERR(*handlep); - *handlep = NULL; + *bdev_filep = bdev_file_open_by_path(name, + BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, + mp->m_super, &fs_holder_ops); + if (IS_ERR(*bdev_filep)) { + error = PTR_ERR(*bdev_filep); + *bdev_filep = NULL; xfs_warn(mp, "Invalid device [%s], error=%d", name, error); } @@ -434,32 +436,26 @@ xfs_open_devices( { struct super_block *sb = mp->m_super; struct block_device *ddev = sb->s_bdev; - struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL; + struct file *logdev_file = NULL, *rtdev_file = NULL; int error; /* - * blkdev_put() can't be called under s_umount, see the comment - * in get_tree_bdev() for more details - */ - up_write(&sb->s_umount); - - /* * Open real time and log devices - order is important. 
*/ if (mp->m_logname) { - error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle); + error = xfs_blkdev_get(mp, mp->m_logname, &logdev_file); if (error) - goto out_relock; + return error; } if (mp->m_rtname) { - error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle); + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_file); if (error) goto out_close_logdev; - if (rtdev_handle->bdev == ddev || - (logdev_handle && - rtdev_handle->bdev == logdev_handle->bdev)) { + if (file_bdev(rtdev_file) == ddev || + (logdev_file && + file_bdev(rtdev_file) == file_bdev(logdev_file))) { xfs_warn(mp, "Cannot mount filesystem with identical rtdev and ddev/logdev."); error = -EINVAL; @@ -471,31 +467,28 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file); if (!mp->m_ddev_targp) goto out_close_rtdev; - if (rtdev_handle) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle); + if (rtdev_file) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } - if (logdev_handle && logdev_handle->bdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle); + if (logdev_file && file_bdev(logdev_file) != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ - if (logdev_handle) - bdev_release(logdev_handle); + if (logdev_file) + bdev_fput(logdev_file); } - error = 0; -out_relock: - down_write(&sb->s_umount); - return error; + return 0; out_free_rtdev_targ: if (mp->m_rtdev_targp) @@ -503,12 +496,12 @@ out_relock: out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - if (rtdev_handle) - bdev_release(rtdev_handle); + if (rtdev_file) + bdev_fput(rtdev_file); out_close_logdev: - if (logdev_handle) - 
bdev_release(logdev_handle); - goto out_relock; + if (logdev_file) + bdev_fput(logdev_file); + return error; } /* @@ -723,9 +716,7 @@ xfs_fs_inode_init_once( /* xfs inode */ atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - - mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", ip->i_ino); + init_rwsem(&ip->i_lock); } /* @@ -758,10 +749,6 @@ static void xfs_mount_free( struct xfs_mount *mp) { - /* - * Free the buftargs here because blkdev_put needs to be called outside - * of sb->s_umount, which is held around the call to ->put_super. - */ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_logdev_targp); if (mp->m_rtdev_targp) @@ -772,7 +759,7 @@ xfs_mount_free( debugfs_remove(mp->m_debugfs); kfree(mp->m_rtname); kfree(mp->m_logname); - kmem_free(mp); + kfree(mp); } STATIC int @@ -896,7 +883,7 @@ xfs_fs_statfs( statp->f_blocks = sbp->sb_rblocks; freertx = percpu_counter_sum_positive(&mp->m_frextents); - statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize; + statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx); } return 0; @@ -905,10 +892,8 @@ xfs_fs_statfs( STATIC void xfs_save_resvblks(struct xfs_mount *mp) { - uint64_t resblks = 0; - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_reserve_blocks(mp, 0); } STATIC void @@ -922,7 +907,7 @@ xfs_restore_resvblks(struct xfs_mount *mp) } else resblks = xfs_default_resblks(mp); - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_reserve_blocks(mp, resblks); } /* @@ -1509,6 +1494,18 @@ xfs_fs_fill_super( mp->m_super = sb; + /* + * Copy VFS mount flags from the context now that all parameter parsing + * is guaranteed to have been completed by either the old mount API or + * the newer fsopen/fsconfig API. 
+ */ + if (fc->sb_flags & SB_RDONLY) + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + if (fc->sb_flags & SB_DIRSYNC) + mp->m_features |= XFS_FEAT_DIRSYNC; + if (fc->sb_flags & SB_SYNCHRONOUS) + mp->m_features |= XFS_FEAT_WSYNC; + error = xfs_fs_validate_params(mp); if (error) return error; @@ -1978,12 +1975,17 @@ static const struct fs_context_operations xfs_context_ops = { .free = xfs_fs_free, }; +/* + * WARNING: do not initialise any parameters in this function that depend on + * mount option parsing having already been performed as this can be called from + * fsopen() before any parameters have been set. + */ static int xfs_init_fs_context( struct fs_context *fc) { struct xfs_mount *mp; - mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO); + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL); if (!mp) return -ENOMEM; @@ -2009,15 +2011,7 @@ static int xfs_init_fs_context( mp->m_logbsize = -1; mp->m_allocsize_log = 16; /* 64k */ - /* - * Copy binary VFS mount flags we are interested in. 
- */ - if (fc->sb_flags & SB_RDONLY) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); - if (fc->sb_flags & SB_DIRSYNC) - mp->m_features |= XFS_FEAT_DIRSYNC; - if (fc->sb_flags & SB_SYNCHRONOUS) - mp->m_features |= XFS_FEAT_WSYNC; + xfs_hooks_init(&mp->m_dir_update_hooks); fc->s_fs_info = mp; fc->ops = &xfs_context_ops; @@ -2050,8 +2044,7 @@ xfs_init_caches(void) xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, SLAB_HWCACHE_ALIGN | - SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT, NULL); if (!xfs_buf_cache) goto out; @@ -2066,10 +2059,14 @@ xfs_init_caches(void) if (error) goto out_destroy_log_ticket_cache; - error = xfs_defer_init_item_caches(); + error = rcbagbt_init_cur_cache(); if (error) goto out_destroy_btree_cur_cache; + error = xfs_defer_init_item_caches(); + if (error) + goto out_destroy_rcbagbt_cur_cache; + xfs_da_state_cache = kmem_cache_create("xfs_da_state", sizeof(struct xfs_da_state), 0, 0, NULL); @@ -2116,14 +2113,14 @@ xfs_init_caches(void) sizeof(struct xfs_inode), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | SLAB_ACCOUNT), + SLAB_ACCOUNT), xfs_fs_inode_init_once); if (!xfs_inode_cache) goto out_destroy_efi_cache; xfs_ili_cache = kmem_cache_create("xfs_ili", sizeof(struct xfs_inode_log_item), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT, NULL); if (!xfs_ili_cache) goto out_destroy_inode_cache; @@ -2226,6 +2223,8 @@ xfs_init_caches(void) kmem_cache_destroy(xfs_da_state_cache); out_destroy_defer_item_cache: xfs_defer_destroy_item_caches(); + out_destroy_rcbagbt_cur_cache: + rcbagbt_destroy_cur_cache(); out_destroy_btree_cur_cache: xfs_btree_destroy_cur_caches(); out_destroy_log_ticket_cache: @@ -2263,6 +2262,7 @@ xfs_destroy_caches(void) kmem_cache_destroy(xfs_ifork_cache); kmem_cache_destroy(xfs_da_state_cache); xfs_defer_destroy_item_caches(); + rcbagbt_destroy_cur_cache(); xfs_btree_destroy_cur_caches(); kmem_cache_destroy(xfs_log_ticket_cache); 
kmem_cache_destroy(xfs_buf_cache); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 85e433df6a3f..3e376d24c7c1 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -23,77 +23,8 @@ #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_error.h" - -/* ----- Kernel only functions below ----- */ -int -xfs_readlink_bmap_ilocked( - struct xfs_inode *ip, - char *link) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; - struct xfs_buf *bp; - xfs_daddr_t d; - char *cur_chunk; - int pathlen = ip->i_disk_size; - int nmaps = XFS_SYMLINK_MAPS; - int byte_cnt; - int n; - int error = 0; - int fsblocks = 0; - int offset; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - - fsblocks = xfs_symlink_blocks(mp, pathlen); - error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); - if (error) - goto out; - - offset = 0; - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - - error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, - &bp, &xfs_symlink_buf_ops); - if (error) - return error; - byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); - if (pathlen < byte_cnt) - byte_cnt = pathlen; - - cur_chunk = bp->b_addr; - if (xfs_has_crc(mp)) { - if (!xfs_symlink_hdr_ok(ip->i_ino, offset, - byte_cnt, bp)) { - error = -EFSCORRUPTED; - xfs_alert(mp, -"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", - offset, byte_cnt, ip->i_ino); - xfs_buf_relse(bp); - goto out; - - } - - cur_chunk += sizeof(struct xfs_dsymlink_hdr); - } - - memcpy(link + offset, cur_chunk, byte_cnt); - - pathlen -= byte_cnt; - offset += byte_cnt; - - xfs_buf_relse(bp); - } - ASSERT(pathlen == 0); - - link[ip->i_disk_size] = '\0'; - error = 0; - - out: - return error; -} +#include "xfs_health.h" +#include "xfs_symlink_remote.h" int xfs_readlink( @@ -102,25 +33,27 @@ xfs_readlink( { struct xfs_mount *mp = ip->i_mount; xfs_fsize_t 
pathlen; - int error = -EFSCORRUPTED; + int error; trace_xfs_readlink(ip); if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(ip, XFS_DATA_FORK)) + return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); pathlen = ip->i_disk_size; if (!pathlen) - goto out; + goto out_corrupt; if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) { xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", __func__, (unsigned long long) ip->i_ino, (long long) pathlen); ASSERT(0); - goto out; + goto out_corrupt; } if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { @@ -128,18 +61,21 @@ xfs_readlink( * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED * if if_data is junk. */ - if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data)) - goto out; + if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data)) + goto out_corrupt; - memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1); + memcpy(link, ip->i_df.if_data, pathlen + 1); error = 0; } else { - error = xfs_readlink_bmap_ilocked(ip, link); + error = xfs_symlink_remote_read(ip, link); } - out: xfs_iunlock(ip, XFS_ILOCK_SHARED); return error; + out_corrupt: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + return -EFSCORRUPTED; } int @@ -157,15 +93,7 @@ xfs_symlink( int error = 0; int pathlen; bool unlock_dp_on_error = false; - xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; - int nmaps; - struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; - xfs_daddr_t d; - const char *cur_chunk; - int byte_cnt; - int n; - struct xfs_buf *bp; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -253,62 +181,11 @@ xfs_symlink( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); resblks -= XFS_IALLOC_SPACE_RES(mp); - /* - * If the symlink will fit into the inode, write it inline. 
- */ - if (pathlen <= xfs_inode_data_fork_size(ip)) { - xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); - - ip->i_disk_size = pathlen; - ip->i_df.if_format = XFS_DINODE_FMT_LOCAL; - xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - } else { - int offset; - - first_fsb = 0; - nmaps = XFS_SYMLINK_MAPS; - - error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_METADATA, resblks, mval, &nmaps); - if (error) - goto out_trans_cancel; - - resblks -= fs_blocks; - ip->i_disk_size = pathlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - cur_chunk = target_path; - offset = 0; - for (n = 0; n < nmaps; n++) { - char *buf; - - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0, &bp); - if (error) - goto out_trans_cancel; - bp->b_ops = &xfs_symlink_buf_ops; - - byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); - byte_cnt = min(byte_cnt, pathlen); - - buf = bp->b_addr; - buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, - byte_cnt, bp); - - memcpy(buf, cur_chunk, byte_cnt); - - cur_chunk += byte_cnt; - pathlen -= byte_cnt; - offset += byte_cnt; - - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); - xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - - (char *)bp->b_addr); - } - ASSERT(pathlen == 0); - } + error = xfs_symlink_write_target(tp, ip, target_path, pathlen, + fs_blocks, resblks); + if (error) + goto out_trans_cancel; + resblks -= fs_blocks; i_size_write(VFS_I(ip), ip->i_disk_size); /* @@ -319,6 +196,7 @@ xfs_symlink( goto out_trans_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + xfs_dir_update_hook(dp, ip, 1, link_name); /* * If this is a synchronous mount, make sure that the @@ -493,6 +371,7 @@ xfs_inactive_symlink( __func__, (unsigned long long)ip->i_ino, pathlen); xfs_iunlock(ip, XFS_ILOCK_EXCL); ASSERT(0); + 
xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index d1ca1ce62a93..0d29a50e66fd 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -10,7 +10,6 @@ int xfs_symlink(struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); -int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link); int xfs_readlink(struct xfs_inode *ip, char *link); int xfs_inactive_symlink(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index fade33735393..a191f6560f98 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -206,8 +206,6 @@ static struct ctl_table xfs_table[] = { .extra2 = &xfs_params.stats_clear.max }, #endif /* CONFIG_PROC_FS */ - - {} }; int diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index f78ad6b10ea5..276696a07040 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,8 @@ struct xfs_globals { int pwork_threads; /* parallel workqueue threads */ bool larp; /* log attribute replay */ #endif + int bload_leaf_slack; /* btree bulk load leaf slack */ + int bload_node_slack; /* btree bulk load node slack */ int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index a3c6b1548723..d2391eec37fe 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -193,7 +193,6 @@ always_cow_show( } XFS_SYSFS_ATTR_RW(always_cow); -#ifdef DEBUG /* * Override how many threads the parallel work queue is allowed to create. 
* This has to be a debug-only global (instead of an errortag) because one of @@ -229,6 +228,15 @@ pwork_threads_show( } XFS_SYSFS_ATTR_RW(pwork_threads); +/* + * The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob + * sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on + * V5 filesystems. As a result, the intermediate progress of all setxattr and + * removexattr operations are tracked via the log and can be restarted during + * recovery. This is useful for testing xattr recovery prior to merging of the + * parent pointer feature which requires it to maintain consistency, and may be + * enabled for userspace xattrs in the future. + */ static ssize_t larp_store( struct kobject *kobject, @@ -251,17 +259,68 @@ larp_show( return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp); } XFS_SYSFS_ATTR_RW(larp); -#endif /* DEBUG */ + +STATIC ssize_t +bload_leaf_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_leaf_slack = val; + return count; +} + +STATIC ssize_t +bload_leaf_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack); +} +XFS_SYSFS_ATTR_RW(bload_leaf_slack); + +STATIC ssize_t +bload_node_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_node_slack = val; + return count; +} + +STATIC ssize_t +bload_node_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack); +} +XFS_SYSFS_ATTR_RW(bload_node_slack); static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), ATTR_LIST(always_cow), -#ifdef DEBUG ATTR_LIST(pwork_threads), ATTR_LIST(larp), -#endif + 
ATTR_LIST(bload_leaf_slack), + ATTR_LIST(bload_node_slack), NULL, }; ATTRIBUTE_GROUPS(xfs_dbg); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 8a5dc1538aa8..1a963382e5e9 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -36,6 +36,9 @@ #include "xfs_error.h" #include <linux/iomap.h> #include "xfs_iomap.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_bmap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3926cf7f2a6e..aea97fc074f8 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -67,6 +67,7 @@ struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; +struct xfs_defer_op_type; struct xfs_refcount_irec; struct xfs_fsmap; struct xfs_rmap_irec; @@ -78,6 +79,9 @@ union xfs_btree_ptr; struct xfs_dqtrx; struct xfs_icwalk; struct xfs_perag; +struct xfbtree; +struct xfs_btree_ops; +struct xfs_bmap_intent; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -145,21 +149,23 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); TRACE_EVENT(xlog_intent_recovery_failed, - TP_PROTO(struct xfs_mount *mp, int error, void *function), - TP_ARGS(mp, error, function), + TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops, + int error), + TP_ARGS(mp, ops, error), TP_STRUCT__entry( __field(dev_t, dev) + __string(name, ops->name) __field(int, error) - __field(void *, function) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; + __assign_str(name, ops->name); __entry->error = error; - __entry->function = function; ), - TP_printk("dev %d:%d error %d function %pS", + TP_printk("dev %d:%d optype %s error %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->error, __entry->function) + __get_str(name), + __entry->error) ); DECLARE_EVENT_CLASS(xfs_perag_class, @@ -637,6 +643,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf); 
DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bdetach); DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); @@ -1707,12 +1714,10 @@ DECLARE_EVENT_CLASS(xfs_agf_class, __entry->agno = be32_to_cpu(agf->agf_seqno), __entry->flags = flags; __entry->length = be32_to_cpu(agf->agf_length), - __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]), - __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]), - __entry->bno_level = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), - __entry->cnt_level = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), + __entry->bno_root = be32_to_cpu(agf->agf_bno_root), + __entry->cnt_root = be32_to_cpu(agf->agf_cnt_root), + __entry->bno_level = be32_to_cpu(agf->agf_bno_level), + __entry->cnt_level = be32_to_cpu(agf->agf_cnt_level), __entry->flfirst = be32_to_cpu(agf->agf_flfirst), __entry->fllast = be32_to_cpu(agf->agf_fllast), __entry->flcount = be32_to_cpu(agf->agf_flcount), @@ -1887,28 +1892,28 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_near_bno); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_finish); TRACE_EVENT(xfs_alloc_cur_check, - TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, xfs_extlen_t diff, bool new), - TP_ARGS(mp, btnum, bno, len, diff, new), + TP_ARGS(cur, bno, len, diff, new), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(xfs_agblock_t, bno) __field(xfs_extlen_t, len) __field(xfs_extlen_t, diff) __field(bool, new) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->btnum = btnum; + __entry->dev = cur->bc_mp->m_super->s_dev; + __assign_str(name, cur->bc_ops->name); __entry->bno = bno; __entry->len = len; __entry->diff = diff; __entry->new = new; ), - 
TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d", + TP_printk("dev %d:%d %sbt agbno 0x%x fsbcount 0x%x diff 0x%x new %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->bno, __entry->len, __entry->diff, __entry->new) ) @@ -2449,21 +2454,12 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall); DEFINE_DISCARD_EVENT(xfs_discard_exclude); DEFINE_DISCARD_EVENT(xfs_discard_busy); -/* btree cursor events */ -TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); -TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); - DECLARE_EVENT_CLASS(xfs_btree_cur_class, TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), TP_ARGS(cur, level, bp), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(int, nlevels) __field(int, ptr) @@ -2471,15 +2467,15 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_levels[level].ptr; __entry->daddr = bp ? 
xfs_buf_daddr(bp) : -1; ), - TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx", + TP_printk("dev %d:%d %sbt level %d/%d ptr %d daddr 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->nlevels, __entry->ptr, @@ -2493,6 +2489,90 @@ DEFINE_EVENT(xfs_btree_cur_class, name, \ DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys); DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range); +TRACE_EVENT(xfs_btree_alloc_block, + TP_PROTO(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, int stat, + int error), + TP_ARGS(cur, ptr, stat, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __string(name, cur->bc_ops->name) + __field(int, error) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_INODE: + __entry->agno = 0; + __entry->ino = cur->bc_ino.ip->i_ino; + break; + case XFS_BTREE_TYPE_AG: + __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->ino = 0; + break; + case XFS_BTREE_TYPE_MEM: + __entry->agno = 0; + __entry->ino = 0; + break; + } + __assign_str(name, cur->bc_ops->name); + __entry->error = error; + if (!error && stat) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { + xfs_fsblock_t fsb = be64_to_cpu(ptr->l); + + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, + fsb); + __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, + fsb); + } else { + __entry->agbno = be32_to_cpu(ptr->s); + } + } else { + __entry->agbno = NULLAGBLOCK; + } + ), + TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __get_str(name), + __entry->agno, + __entry->ino, + __entry->agbno, + __entry->error) +); + +TRACE_EVENT(xfs_btree_free_block, + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_buf *bp), + TP_ARGS(cur, bp), + TP_STRUCT__entry( + __field(dev_t, dev) + 
__field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __string(name, cur->bc_ops->name) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = xfs_daddr_to_agno(cur->bc_mp, + xfs_buf_daddr(bp)); + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) + __entry->ino = cur->bc_ino.ip->i_ino; + else + __entry->ino = 0; + __assign_str(name, cur->bc_ops->name); + __entry->agbno = xfs_daddr_to_agbno(cur->bc_mp, + xfs_buf_daddr(bp)); + ), + TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __get_str(name), + __entry->agno, + __entry->ino, + __entry->agbno) +); + /* deferred ops */ struct xfs_defer_pending; @@ -2549,22 +2629,25 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, TP_ARGS(mp, dfp), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) + __field(unsigned int, flags) __field(char, committed) __field(int, nr) ), TP_fast_assign( __entry->dev = mp ? 
mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; + __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -2573,7 +2656,25 @@ DEFINE_EVENT(xfs_defer_pending_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \ TP_ARGS(mp, dfp)) -DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class, +DEFINE_DEFER_EVENT(xfs_defer_cancel); +DEFINE_DEFER_EVENT(xfs_defer_trans_roll); +DEFINE_DEFER_EVENT(xfs_defer_trans_abort); +DEFINE_DEFER_EVENT(xfs_defer_finish); +DEFINE_DEFER_EVENT(xfs_defer_finish_done); + +DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); +DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); + +DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); + +DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int type, xfs_agblock_t agbno, xfs_extlen_t len), TP_ARGS(mp, agno, type, agbno, len), @@ -2598,89 +2699,17 @@ DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class, __entry->agbno, __entry->len) ); -#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \ -DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \ +#define DEFINE_FREE_EXTENT_DEFERRED_EVENT(name) \ 
+DEFINE_EVENT(xfs_free_extent_deferred_class, name, \ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ int type, \ xfs_agblock_t bno, \ xfs_extlen_t len), \ TP_ARGS(mp, agno, type, bno, len)) - -DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int op, - xfs_agblock_t agbno, - xfs_ino_t ino, - int whichfork, - xfs_fileoff_t offset, - xfs_filblks_t len, - xfs_exntst_t state), - TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(xfs_ino_t, ino) - __field(xfs_agblock_t, agbno) - __field(int, whichfork) - __field(xfs_fileoff_t, l_loff) - __field(xfs_filblks_t, l_len) - __field(xfs_exntst_t, l_state) - __field(int, op) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->ino = ino; - __entry->agbno = agbno; - __entry->whichfork = whichfork; - __entry->l_loff = offset; - __entry->l_len = len; - __entry->l_state = state; - __entry->op = op; - ), - TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->op, - __entry->agno, - __entry->agbno, - __entry->ino, - __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), - __entry->l_loff, - __entry->l_len, - __entry->l_state) -); -#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \ -DEFINE_EVENT(xfs_map_extent_deferred_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - int op, \ - xfs_agblock_t agbno, \ - xfs_ino_t ino, \ - int whichfork, \ - xfs_fileoff_t offset, \ - xfs_filblks_t len, \ - xfs_exntst_t state), \ - TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state)) - -DEFINE_DEFER_EVENT(xfs_defer_cancel); -DEFINE_DEFER_EVENT(xfs_defer_trans_roll); -DEFINE_DEFER_EVENT(xfs_defer_trans_abort); -DEFINE_DEFER_EVENT(xfs_defer_finish); -DEFINE_DEFER_EVENT(xfs_defer_finish_done); - 
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); -DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); - -DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); - -#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT -DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); -DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); -DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer); -DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); +DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_defer); +DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_deferred); +DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer); +DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred); DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, @@ -2688,25 +2717,28 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, TP_ARGS(mp, dfp, item), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) __field(void *, item) __field(char, committed) + __field(unsigned int, flags) __field(int, nr) ), TP_fast_assign( __entry->dev = mp ? 
mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; + __entry->flags = dfp->dfp_flags; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p item %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, __entry->item, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -2842,12 +2874,63 @@ DEFINE_EVENT(xfs_rmapbt_class, name, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT +DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int op, + xfs_agblock_t agbno, + xfs_ino_t ino, + int whichfork, + xfs_fileoff_t offset, + xfs_filblks_t len, + xfs_exntst_t state), + TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __field(xfs_agblock_t, agbno) + __field(int, whichfork) + __field(xfs_fileoff_t, l_loff) + __field(xfs_filblks_t, l_len) + __field(xfs_exntst_t, l_state) + __field(int, op) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->ino = ino; + __entry->agbno = agbno; + __entry->whichfork = whichfork; + __entry->l_loff = offset; + __entry->l_len = len; + __entry->l_state = state; + __entry->op = op; + ), + TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->op, + __entry->agno, + __entry->agbno, + __entry->ino, + __print_symbolic(__entry->whichfork, 
XFS_WHICHFORK_STRINGS), + __entry->l_loff, + __entry->l_len, + __entry->l_state) +); +#define DEFINE_RMAP_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_rmap_deferred_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + int op, \ + xfs_agblock_t agbno, \ + xfs_ino_t ino, \ + int whichfork, \ + xfs_fileoff_t offset, \ + xfs_filblks_t len, \ + xfs_exntst_t state), \ + TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state)) DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer); DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred); -DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block); -DEFINE_BUSY_EVENT(xfs_rmapbt_free_block); DEFINE_RMAPBT_EVENT(xfs_rmap_update); DEFINE_RMAPBT_EVENT(xfs_rmap_insert); DEFINE_RMAPBT_EVENT(xfs_rmap_delete); @@ -2864,7 +2947,66 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); /* deferred bmbt updates */ -#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT +TRACE_DEFINE_ENUM(XFS_BMAP_MAP); +TRACE_DEFINE_ENUM(XFS_BMAP_UNMAP); + +DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, + TP_PROTO(struct xfs_bmap_intent *bi), + TP_ARGS(bi), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, opdev) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __field(xfs_agblock_t, agbno) + __field(xfs_fsblock_t, rtbno) + __field(int, whichfork) + __field(xfs_fileoff_t, l_loff) + __field(xfs_filblks_t, l_len) + __field(xfs_exntst_t, l_state) + __field(int, op) + ), + TP_fast_assign( + struct xfs_inode *ip = bi->bi_owner; + + __entry->dev = ip->i_mount->m_super->s_dev; + if (xfs_ifork_is_realtime(ip, bi->bi_whichfork)) { + __entry->agno = 0; + __entry->agbno = 0; + __entry->rtbno = bi->bi_bmap.br_startblock; + __entry->opdev = ip->i_mount->m_rtdev_targp->bt_dev; + } else { + __entry->agno = XFS_FSB_TO_AGNO(ip->i_mount, + bi->bi_bmap.br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount, + bi->bi_bmap.br_startblock); + __entry->rtbno = 0; + __entry->opdev = 
__entry->dev; + } + __entry->ino = ip->i_ino; + __entry->whichfork = bi->bi_whichfork; + __entry->l_loff = bi->bi_bmap.br_startoff; + __entry->l_len = bi->bi_bmap.br_blockcount; + __entry->l_state = bi->bi_bmap.br_state; + __entry->op = bi->bi_type; + ), + TP_printk("dev %d:%d op %s opdev %d:%d ino 0x%llx agno 0x%x agbno 0x%x rtbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS), + MAJOR(__entry->opdev), MINOR(__entry->opdev), + __entry->ino, + __entry->agno, + __entry->agbno, + __entry->rtbno, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->l_loff, + __entry->l_len, + __entry->l_state) +); +#define DEFINE_BMAP_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_bmap_deferred_class, name, \ + TP_PROTO(struct xfs_bmap_intent *bi), \ + TP_ARGS(bi)) DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer); DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred); @@ -3205,8 +3347,6 @@ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ TP_ARGS(mp, agno, i1, i2, i3)) /* refcount btree tracepoints */ -DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block); -DEFINE_BUSY_EVENT(xfs_refcountbt_free_block); DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); @@ -3243,7 +3383,39 @@ DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error); DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared); DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result); DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error); -#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT + +DECLARE_EVENT_CLASS(xfs_refcount_deferred_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int type, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, type, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, type) + __field(xfs_agblock_t, agbno) + 
__field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->type = type; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_deferred_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + int type, \ + xfs_agblock_t bno, \ + xfs_extlen_t len), \ + TP_ARGS(mp, agno, type, bno, len)) DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer); DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); @@ -3914,9 +4086,11 @@ DEFINE_EVENT(xfs_fs_corrupt_class, name, \ TP_PROTO(struct xfs_mount *mp, unsigned int flags), \ TP_ARGS(mp, flags)) DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick); +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_corrupt); DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy); DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption); DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick); +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_corrupt); DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy); DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption); @@ -3943,6 +4117,7 @@ DEFINE_EVENT(xfs_ag_corrupt_class, name, \ unsigned int flags), \ TP_ARGS(mp, agno, flags)) DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick); +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_corrupt); DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy); DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption); @@ -3968,7 +4143,9 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned int flags), \ TP_ARGS(ip, flags)) DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick); +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy); +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_unfixed_corruption); TRACE_EVENT(xfs_iwalk_ag, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -4028,31 +4205,6 
@@ TRACE_EVENT(xfs_pwork_init, __entry->nr_threads, __entry->pid) ) -DECLARE_EVENT_CLASS(xfs_kmem_class, - TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), - TP_ARGS(size, flags, caller_ip), - TP_STRUCT__entry( - __field(ssize_t, size) - __field(int, flags) - __field(unsigned long, caller_ip) - ), - TP_fast_assign( - __entry->size = size; - __entry->flags = flags; - __entry->caller_ip = caller_ip; - ), - TP_printk("size %zd flags 0x%x caller %pS", - __entry->size, - __entry->flags, - (char *)__entry->caller_ip) -) - -#define DEFINE_KMEM_EVENT(name) \ -DEFINE_EVENT(xfs_kmem_class, name, \ - TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ - TP_ARGS(size, flags, caller_ip)) -DEFINE_KMEM_EVENT(kmem_alloc); - TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), TP_ARGS(mp, new_dalign, calc_rootino), @@ -4079,7 +4231,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, TP_ARGS(cur), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(unsigned int, levels) @@ -4087,15 +4239,15 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = cur->bc_ag.afake->af_root; __entry->levels = cur->bc_ag.afake->af_levels; __entry->blocks = cur->bc_ag.afake->af_blocks; ), - TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u", + TP_printk("dev %d:%d %sbt agno 0x%x levels %u blocks %u root %u", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->agno, __entry->levels, __entry->blocks, @@ -4107,7 +4259,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot, TP_ARGS(cur), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + 
__string(name, cur->bc_ops->name) __field(xfs_agnumber_t, agno) __field(xfs_agino_t, agino) __field(unsigned int, levels) @@ -4116,7 +4268,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp, cur->bc_ino.ip->i_ino); __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp, @@ -4125,9 +4277,9 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot, __entry->blocks = cur->bc_ino.ifake->if_blocks; __entry->whichfork = cur->bc_ino.whichfork; ), - TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u", + TP_printk("dev %d:%d %sbt agno 0x%x agino 0x%x whichfork %s levels %u blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->agno, __entry->agino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), @@ -4144,7 +4296,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry, blocks_with_extra), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(unsigned int, level) __field(unsigned int, nlevels) __field(uint64_t, nr_this_level) @@ -4155,7 +4307,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->nr_this_level = nr_this_level; @@ -4164,9 +4316,9 @@ TRACE_EVENT(xfs_btree_bload_level_geometry, __entry->blocks = blocks; __entry->blocks_with_extra = blocks_with_extra; ), - TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu", + TP_printk("dev %d:%d %sbt level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu", MAJOR(__entry->dev), 
MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->nlevels, __entry->nr_this_level, @@ -4183,7 +4335,7 @@ TRACE_EVENT(xfs_btree_bload_block, TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(unsigned int, level) __field(unsigned long long, block_idx) __field(unsigned long long, nr_blocks) @@ -4193,11 +4345,11 @@ TRACE_EVENT(xfs_btree_bload_block, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->block_idx = block_idx; __entry->nr_blocks = nr_blocks; - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { xfs_fsblock_t fsb = be64_to_cpu(ptr->l); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); @@ -4208,9 +4360,9 @@ TRACE_EVENT(xfs_btree_bload_block, } __entry->nr_records = nr_records; ), - TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u", + TP_printk("dev %d:%d %sbt level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->block_idx, __entry->nr_blocks, @@ -4399,8 +4551,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove); TRACE_EVENT(xfs_force_shutdown, @@ -4462,6 +4612,164 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents); #endif /* CONFIG_XFS_DRAIN_INTENTS */ +#ifdef CONFIG_XFS_MEMORY_BUFS +TRACE_EVENT(xmbuf_create, + TP_PROTO(struct xfs_buftarg *btp), + TP_ARGS(btp), + TP_STRUCT__entry( + 
__field(dev_t, dev) + __field(unsigned long, ino) + __array(char, pathname, 256) + ), + TP_fast_assign( + char pathname[257]; + char *path; + struct file *file = btp->bt_file; + + __entry->dev = btp->bt_mount->m_super->s_dev; + __entry->ino = file_inode(file)->i_ino; + memset(pathname, 0, sizeof(pathname)); + path = file_path(file, pathname, sizeof(pathname) - 1); + if (IS_ERR(path)) + path = "(unknown)"; + strncpy(__entry->pathname, path, sizeof(__entry->pathname)); + ), + TP_printk("dev %d:%d xmino 0x%lx path '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pathname) +); + +TRACE_EVENT(xmbuf_free, + TP_PROTO(struct xfs_buftarg *btp), + TP_ARGS(btp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, ino) + __field(unsigned long long, bytes) + __field(loff_t, size) + ), + TP_fast_assign( + struct file *file = btp->bt_file; + struct inode *inode = file_inode(file); + + __entry->dev = btp->bt_mount->m_super->s_dev; + __entry->size = i_size_read(inode); + __entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes; + __entry->ino = inode->i_ino; + ), + TP_printk("dev %d:%d xmino 0x%lx mem_bytes 0x%llx isize 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->bytes, + __entry->size) +); +#endif /* CONFIG_XFS_MEMORY_BUFS */ + +#ifdef CONFIG_XFS_BTREE_IN_MEM +TRACE_EVENT(xfbtree_init, + TP_PROTO(struct xfs_mount *mp, struct xfbtree *xfbt, + const struct xfs_btree_ops *ops), + TP_ARGS(mp, xfbt, ops), + TP_STRUCT__entry( + __field(const void *, btree_ops) + __field(unsigned long, xfino) + __field(unsigned int, leaf_mxr) + __field(unsigned int, leaf_mnr) + __field(unsigned int, node_mxr) + __field(unsigned int, node_mnr) + __field(unsigned long long, owner) + ), + TP_fast_assign( + __entry->btree_ops = ops; + __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; + __entry->leaf_mxr = xfbt->maxrecs[0]; + __entry->node_mxr = xfbt->maxrecs[1]; + __entry->leaf_mnr = xfbt->minrecs[0]; + 
__entry->node_mnr = xfbt->minrecs[1]; + __entry->owner = xfbt->owner; + ), + TP_printk("xfino 0x%lx btree_ops %pS owner 0x%llx leaf_mxr %u leaf_mnr %u node_mxr %u node_mnr %u", + __entry->xfino, + __entry->btree_ops, + __entry->owner, + __entry->leaf_mxr, + __entry->leaf_mnr, + __entry->node_mxr, + __entry->node_mnr) +); + +DECLARE_EVENT_CLASS(xfbtree_buf_class, + TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), + TP_ARGS(xfbt, bp), + TP_STRUCT__entry( + __field(unsigned long, xfino) + __field(xfs_daddr_t, bno) + __field(int, nblks) + __field(int, hold) + __field(int, pincount) + __field(unsigned int, lockval) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; + __entry->bno = xfs_buf_daddr(bp); + __entry->nblks = bp->b_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->flags = bp->b_flags; + ), + TP_printk("xfino 0x%lx daddr 0x%llx bbcount 0x%x hold %d pincount %d lock %d flags %s", + __entry->xfino, + (unsigned long long)__entry->bno, + __entry->nblks, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS)) +) + +#define DEFINE_XFBTREE_BUF_EVENT(name) \ +DEFINE_EVENT(xfbtree_buf_class, name, \ + TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), \ + TP_ARGS(xfbt, bp)) +DEFINE_XFBTREE_BUF_EVENT(xfbtree_create_root_buf); +DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_commit_buf); +DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_cancel_buf); + +DECLARE_EVENT_CLASS(xfbtree_freesp_class, + TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, + xfs_fileoff_t fileoff), + TP_ARGS(xfbt, cur, fileoff), + TP_STRUCT__entry( + __field(unsigned long, xfino) + __string(btname, cur->bc_ops->name) + __field(int, nlevels) + __field(xfs_fileoff_t, fileoff) + ), + TP_fast_assign( + __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; + __assign_str(btname, 
cur->bc_ops->name); + __entry->nlevels = cur->bc_nlevels; + __entry->fileoff = fileoff; + ), + TP_printk("xfino 0x%lx %sbt nlevels %d fileoff 0x%llx", + __entry->xfino, + __get_str(btname), + __entry->nlevels, + (unsigned long long)__entry->fileoff) +) + +#define DEFINE_XFBTREE_FREESP_EVENT(name) \ +DEFINE_EVENT(xfbtree_freesp_class, name, \ + TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, \ + xfs_fileoff_t fileoff), \ + TP_ARGS(xfbt, cur, fileoff)) +DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block); +DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block); +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 8c0bfc9a33b1..7350640059cc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -24,6 +24,7 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_icache.h" +#include "xfs_rtbitmap.h" struct kmem_cache *xfs_trans_cache; @@ -655,6 +656,10 @@ xfs_trans_unreserve_and_mod_sb( mp->m_sb.sb_agcount += tp->t_agcount_delta; mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; mp->m_sb.sb_rextsize += tp->t_rextsize_delta; + if (tp->t_rextsize_delta) { + mp->m_rtxblklog = log2_if_power2(mp->m_sb.sb_rextsize); + mp->m_rtxblkmask = mask64_if_power2(mp->m_sb.sb_rextsize); + } mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta; mp->m_sb.sb_rblocks += tp->t_rblocks_delta; mp->m_sb.sb_rextents += tp->t_rextents_delta; @@ -1196,7 +1201,7 @@ xfs_trans_alloc_inode( retry: error = xfs_trans_alloc(mp, resv, dblocks, - rblocks / mp->m_sb.sb_rextsize, + xfs_extlen_to_rtxlen(mp, rblocks), force ? XFS_TRANS_RESERVE : 0, &tp); if (error) return error; @@ -1232,6 +1237,68 @@ out_cancel: } /* + * Try to reserve more blocks for a transaction. + * + * This is for callers that need to attach resources to a transaction, scan + * those resources to determine the space reservation requirements, and then + * modify the attached resources. In other words, online repair. 
This can + * fail due to ENOSPC, so the caller must be able to cancel the transaction + * without shutting down the fs. + */ +int +xfs_trans_reserve_more( + struct xfs_trans *tp, + unsigned int blocks, + unsigned int rtextents) +{ + struct xfs_trans_res resv = { }; + + return xfs_trans_reserve(tp, &resv, blocks, rtextents); +} + +/* + * Try to reserve more blocks and file quota for a transaction. Same + * conditions of usage as xfs_trans_reserve_more. + */ +int +xfs_trans_reserve_more_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + unsigned int dblocks, + unsigned int rblocks, + bool force_quota) +{ + struct xfs_trans_res resv = { }; + struct xfs_mount *mp = ip->i_mount; + unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks); + int error; + + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + + error = xfs_trans_reserve(tp, &resv, dblocks, rtx); + if (error) + return error; + + if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return 0; + + if (tp->t_flags & XFS_TRANS_RESERVE) + force_quota = true; + + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + force_quota); + if (!error) + return 0; + + /* Quota failed, give back the new reservation. */ + xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE); + tp->t_blk_res -= dblocks; + xfs_mod_frextents(mp, rtx); + tp->t_rtx_res -= rtx; + return error; +} + +/* * Allocate an transaction in preparation for inode creation by reserving quota * against the given dquots. Callers are not required to hold any inode locks. 
*/ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 6e3646d524ce..1636663707dc 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -78,11 +78,7 @@ struct xfs_item_ops { xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_release)(struct xfs_log_item *); - int (*iop_recover)(struct xfs_log_item *lip, - struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); - struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, - struct xfs_trans *tp); struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; @@ -168,6 +164,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_reserve_more(struct xfs_trans *tp, + unsigned int blocks, unsigned int rtextents); int xfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -217,6 +215,7 @@ struct xfs_buf *xfs_trans_getsb(struct xfs_trans *); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); @@ -247,19 +246,13 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern struct kmem_cache *xfs_trans_cache; -static inline struct xfs_log_item * -xfs_trans_item_relog( - struct xfs_log_item *lip, - struct xfs_trans *tp) -{ - return lip->li_ops->iop_relog(lip, tp); -} - struct xfs_dquot; int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, unsigned int dblocks, unsigned int rblocks, bool force, struct xfs_trans **tpp); +int xfs_trans_reserve_more_inode(struct xfs_trans 
*tp, struct xfs_inode *ip, + unsigned int dblocks, unsigned int rblocks, bool force_quota); int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int dblocks, @@ -275,19 +268,14 @@ static inline void xfs_trans_set_context( struct xfs_trans *tp) { - ASSERT(current->journal_info == NULL); tp->t_pflags = memalloc_nofs_save(); - current->journal_info = tp; } static inline void xfs_trans_clear_context( struct xfs_trans *tp) { - if (current->journal_info == tp) { - memalloc_nofs_restore(tp->t_pflags); - current->journal_info = NULL; - } + memalloc_nofs_restore(tp->t_pflags); } static inline void @@ -295,10 +283,8 @@ xfs_trans_switch_context( struct xfs_trans *old_tp, struct xfs_trans *new_tp) { - ASSERT(current->journal_info == old_tp); new_tp->t_pflags = old_tp->t_pflags; old_tp->t_pflags = 0; - current->journal_info = new_tp; } #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 1098452e7f95..e4c343096f95 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -901,7 +901,8 @@ xfs_trans_ail_init( { struct xfs_ail *ailp; - ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); + ailp = kzalloc(sizeof(struct xfs_ail), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!ailp) return -ENOMEM; @@ -921,7 +922,7 @@ xfs_trans_ail_init( return 0; out_free_ailp: - kmem_free(ailp); + kfree(ailp); return -ENOMEM; } @@ -932,5 +933,5 @@ xfs_trans_ail_destroy( struct xfs_ail *ailp = mp->m_ail; kthread_stop(ailp->ail_task); - kmem_free(ailp); + kfree(ailp); } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6549e50d852c..e28ab74af4f0 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -393,6 +393,48 @@ xfs_trans_brelse( } /* + * Forcibly detach a buffer previously joined to the transaction. The caller + * will retain its locked reference to the buffer after this function returns. 
+ * The buffer must be completely clean and must not be held to the transaction. + */ +void +xfs_trans_bdetach( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_log_item; + + ASSERT(tp != NULL); + ASSERT(bp->b_transp == tp); + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_bdetach(bip); + + /* + * Erase all recursion count, since we're removing this buffer from the + * transaction. + */ + bip->bli_recur = 0; + + /* + * The buffer must be completely clean. Specifically, it had better + * not be dirty, stale, logged, ordered, or held to the transaction. + */ + ASSERT(!test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); + ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY)); + ASSERT(!(bip->bli_flags & XFS_BLI_HOLD)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + + /* Unlink the log item from the transaction and drop the log item. */ + xfs_trans_del_item(&bip->bli_item); + xfs_buf_item_put(bip); + bp->b_transp = NULL; +} + +/* * Mark the buffer as not needing to be unlocked when the buf item's * iop_committing() routine is called. The buffer must already be locked * and associated with the given transaction. diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index aa00cf67ad72..577b535a595c 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -17,6 +17,7 @@ #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_health.h" STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); @@ -120,6 +121,116 @@ xfs_trans_dup_dqinfo( } } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of quota live updates. If the + * compiler supports jump labels, the static branch will be replaced by a nop + * sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. 
+ * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dqtrx_hooks_switch); + +void +xfs_dqtrx_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_dqtrx_hooks_switch); +} + +void +xfs_dqtrx_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_dqtrx_hooks_switch); +} + +/* Schedule a transactional dquot update on behalf of an inode. */ +void +xfs_trans_mod_ino_dquot( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *dqp, + unsigned int field, + int64_t delta) +{ + xfs_trans_mod_dquot(tp, dqp, field, delta); + + if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) { + struct xfs_mod_ino_dqtrx_params p = { + .tx_id = (uintptr_t)tp, + .ino = ip->i_ino, + .q_type = xfs_dquot_type(dqp), + .q_id = dqp->q_id, + .delta = delta + }; + struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo; + + xfs_hooks_call(&qi->qi_mod_ino_dqtrx_hooks, field, &p); + } +} + +/* Call the specified functions during a dquot counter update. */ +int +xfs_dqtrx_hook_add( + struct xfs_quotainfo *qi, + struct xfs_dqtrx_hook *hook) +{ + int error; + + /* + * Transactional dquot updates first call the mod hook when changes + * are attached to the transaction and then call the apply hook when + * those changes are committed (or canceled). + * + * The apply hook must be installed before the mod hook so that we + * never fail to catch the end of a quota update sequence. 
+ */ + error = xfs_hooks_add(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook); + if (error) + goto out; + + error = xfs_hooks_add(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook); + if (error) + goto out_apply; + + return 0; + +out_apply: + xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook); +out: + return error; +} + +/* Stop calling the specified function during a dquot counter update. */ +void +xfs_dqtrx_hook_del( + struct xfs_quotainfo *qi, + struct xfs_dqtrx_hook *hook) +{ + /* + * The mod hook must be removed before apply hook to avoid giving the + * hook consumer with an incomplete update. No hooks should be running + * after these functions return. + */ + xfs_hooks_del(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook); + xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook); +} + +/* Configure dquot update hook functions. */ +void +xfs_dqtrx_hook_setup( + struct xfs_dqtrx_hook *hook, + notifier_fn_t mod_fn, + notifier_fn_t apply_fn) +{ + xfs_hook_setup(&hook->mod_hook, mod_fn); + xfs_hook_setup(&hook->apply_hook, apply_fn); +} +#endif /* CONFIG_XFS_LIVE_HOOKS */ + /* * Wrap around mod_dquot to account for both user and group quotas. */ @@ -137,11 +248,11 @@ xfs_trans_mod_dquot_byino( return; if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) - (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta); + xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta); if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) - (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); + xfs_trans_mod_ino_dquot(tp, ip, ip->i_gdquot, field, delta); if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot) - (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta); + xfs_trans_mod_ino_dquot(tp, ip, ip->i_pdquot, field, delta); } STATIC struct xfs_dqtrx * @@ -321,6 +432,29 @@ xfs_apply_quota_reservation_deltas( } } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* Call downstream hooks now that it's time to apply dquot deltas. 
*/ +static inline void +xfs_trans_apply_dquot_deltas_hook( + struct xfs_trans *tp, + struct xfs_dquot *dqp) +{ + if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) { + struct xfs_apply_dqtrx_params p = { + .tx_id = (uintptr_t)tp, + .q_type = xfs_dquot_type(dqp), + .q_id = dqp->q_id, + }; + struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo; + + xfs_hooks_call(&qi->qi_apply_dqtrx_hooks, + XFS_APPLY_DQTRX_COMMIT, &p); + } +} +#else +# define xfs_trans_apply_dquot_deltas_hook(tp, dqp) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + /* * Called by xfs_trans_commit() and similar in spirit to * xfs_trans_apply_sb_deltas(). @@ -366,6 +500,8 @@ xfs_trans_apply_dquot_deltas( ASSERT(XFS_DQ_IS_LOCKED(dqp)); + xfs_trans_apply_dquot_deltas_hook(tp, dqp); + /* * adjust the actual number of blocks used */ @@ -465,6 +601,29 @@ xfs_trans_apply_dquot_deltas( } } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* Call downstream hooks now that it's time to cancel dquot deltas. */ +static inline void +xfs_trans_unreserve_and_mod_dquots_hook( + struct xfs_trans *tp, + struct xfs_dquot *dqp) +{ + if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) { + struct xfs_apply_dqtrx_params p = { + .tx_id = (uintptr_t)tp, + .q_type = xfs_dquot_type(dqp), + .q_id = dqp->q_id, + }; + struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo; + + xfs_hooks_call(&qi->qi_apply_dqtrx_hooks, + XFS_APPLY_DQTRX_UNRESERVE, &p); + } +} +#else +# define xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + /* * Release the reservations, and adjust the dquots accordingly. * This is called only when the transaction is being aborted. If by @@ -495,6 +654,9 @@ xfs_trans_unreserve_and_mod_dquots( */ if ((dqp = qtrx->qt_dquot) == NULL) break; + + xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp); + /* * Unreserve the original reservation. We don't care * about the number of blocks used field, or deltas. 
@@ -706,6 +868,7 @@ error_return: error_corrupt: xfs_dqunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK); return -EFSCORRUPTED; } @@ -796,7 +959,7 @@ xfs_trans_reserve_quota_nblks( return 0; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (force) qflags |= XFS_QMOPT_FORCE_RES; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 987843f84d03..364104e1b38a 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -136,6 +136,9 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, }; int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + error = xfs_attr_get(&args); if (error) return error; @@ -294,6 +297,9 @@ xfs_vn_listxattr( struct inode *inode = d_inode(dentry); int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + /* * First read the regular on-disk attributes. */ |