aboutsummaryrefslogtreecommitdiff
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Kconfig15
-rw-r--r--fs/xfs/Makefile36
-rw-r--r--fs/xfs/kmem.c30
-rw-r--r--fs/xfs/kmem.h83
-rw-r--r--fs/xfs/libxfs/xfs_ag.c113
-rw-r--r--fs/xfs/libxfs/xfs_ag.h28
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c395
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h24
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c194
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h10
-rw-r--r--fs/xfs/libxfs/xfs_attr.c136
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c260
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h8
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c37
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h24
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c583
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h28
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c171
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h6
-rw-r--r--fs/xfs/libxfs/xfs_btree.c1106
-rw-r--r--fs/xfs/libxfs/xfs_btree.h279
-rw-r--r--fs/xfs/libxfs/xfs_btree_mem.c347
-rw-r--r--fs/xfs/libxfs/xfs_btree_mem.h75
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c222
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.h43
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c128
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h44
-rw-r--r--fs/xfs/libxfs/xfs_defer.c470
-rw-r--r--fs/xfs/libxfs/xfs_defer.h61
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c61
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h13
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c7
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c107
-rw-r--r--fs/xfs/libxfs/xfs_format.h72
-rw-r--r--fs/xfs/libxfs/xfs_fs.h8
-rw-r--r--fs/xfs/libxfs/xfs_health.h105
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c268
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c157
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h11
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c79
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c15
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c121
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h14
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h4
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h8
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h (renamed from fs/xfs/xfs_ondisk.h)26
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c126
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h12
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c93
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c286
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h31
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c231
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h8
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c878
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h383
-rw-r--r--fs/xfs/libxfs/xfs_sb.c64
-rw-r--r--fs/xfs/libxfs/xfs_sb.h9
-rw-r--r--fs/xfs/libxfs/xfs_shared.h67
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c167
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.h26
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c6
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c10
-rw-r--r--fs/xfs/libxfs/xfs_types.c4
-rw-r--r--fs/xfs/libxfs/xfs_types.h54
-rw-r--r--fs/xfs/mrlock.h78
-rw-r--r--fs/xfs/scrub/agb_bitmap.c103
-rw-r--r--fs/xfs/scrub/agb_bitmap.h73
-rw-r--r--fs/xfs/scrub/agheader.c12
-rw-r--r--fs/xfs/scrub/agheader_repair.c66
-rw-r--r--fs/xfs/scrub/alloc.c52
-rw-r--r--fs/xfs/scrub/alloc_repair.c933
-rw-r--r--fs/xfs/scrub/attr.c17
-rw-r--r--fs/xfs/scrub/bitmap.c481
-rw-r--r--fs/xfs/scrub/bitmap.h111
-rw-r--r--fs/xfs/scrub/bmap.c166
-rw-r--r--fs/xfs/scrub/bmap_repair.c873
-rw-r--r--fs/xfs/scrub/btree.c58
-rw-r--r--fs/xfs/scrub/common.c168
-rw-r--r--fs/xfs/scrub/common.h69
-rw-r--r--fs/xfs/scrub/cow_repair.c614
-rw-r--r--fs/xfs/scrub/dir.c46
-rw-r--r--fs/xfs/scrub/dqiterate.c211
-rw-r--r--fs/xfs/scrub/fsb_bitmap.h37
-rw-r--r--fs/xfs/scrub/fscounters.c31
-rw-r--r--fs/xfs/scrub/fscounters.h20
-rw-r--r--fs/xfs/scrub/fscounters_repair.c72
-rw-r--r--fs/xfs/scrub/health.c174
-rw-r--r--fs/xfs/scrub/health.h7
-rw-r--r--fs/xfs/scrub/ialloc.c53
-rw-r--r--fs/xfs/scrub/ialloc_repair.c884
-rw-r--r--fs/xfs/scrub/inode.c23
-rw-r--r--fs/xfs/scrub/inode_repair.c1750
-rw-r--r--fs/xfs/scrub/iscan.c767
-rw-r--r--fs/xfs/scrub/iscan.h84
-rw-r--r--fs/xfs/scrub/newbt.c567
-rw-r--r--fs/xfs/scrub/newbt.h75
-rw-r--r--fs/xfs/scrub/nlinks.c930
-rw-r--r--fs/xfs/scrub/nlinks.h102
-rw-r--r--fs/xfs/scrub/nlinks_repair.c223
-rw-r--r--fs/xfs/scrub/off_bitmap.h37
-rw-r--r--fs/xfs/scrub/parent.c17
-rw-r--r--fs/xfs/scrub/quota.c107
-rw-r--r--fs/xfs/scrub/quota.h36
-rw-r--r--fs/xfs/scrub/quota_repair.c575
-rw-r--r--fs/xfs/scrub/quotacheck.c867
-rw-r--r--fs/xfs/scrub/quotacheck.h76
-rw-r--r--fs/xfs/scrub/quotacheck_repair.c261
-rw-r--r--fs/xfs/scrub/rcbag.c307
-rw-r--r--fs/xfs/scrub/rcbag.h28
-rw-r--r--fs/xfs/scrub/rcbag_btree.c370
-rw-r--r--fs/xfs/scrub/rcbag_btree.h81
-rw-r--r--fs/xfs/scrub/readdir.c10
-rw-r--r--fs/xfs/scrub/reap.c170
-rw-r--r--fs/xfs/scrub/reap.h5
-rw-r--r--fs/xfs/scrub/refcount.c14
-rw-r--r--fs/xfs/scrub/refcount_repair.c751
-rw-r--r--fs/xfs/scrub/repair.c497
-rw-r--r--fs/xfs/scrub/repair.h122
-rw-r--r--fs/xfs/scrub/rmap.c27
-rw-r--r--fs/xfs/scrub/rmap_repair.c1697
-rw-r--r--fs/xfs/scrub/rtbitmap.c136
-rw-r--r--fs/xfs/scrub/rtbitmap.h22
-rw-r--r--fs/xfs/scrub/rtbitmap_repair.c202
-rw-r--r--fs/xfs/scrub/rtsummary.c196
-rw-r--r--fs/xfs/scrub/scrub.c99
-rw-r--r--fs/xfs/scrub/scrub.h33
-rw-r--r--fs/xfs/scrub/stats.c6
-rw-r--r--fs/xfs/scrub/symlink.c25
-rw-r--r--fs/xfs/scrub/trace.c12
-rw-r--r--fs/xfs/scrub/trace.h1140
-rw-r--r--fs/xfs/scrub/xfarray.c234
-rw-r--r--fs/xfs/scrub/xfarray.h52
-rw-r--r--fs/xfs/scrub/xfile.c345
-rw-r--r--fs/xfs/scrub/xfile.h62
-rw-r--r--fs/xfs/xfs_acl.c4
-rw-r--r--fs/xfs/xfs_aops.c18
-rw-r--r--fs/xfs/xfs_attr_inactive.c4
-rw-r--r--fs/xfs/xfs_attr_item.c318
-rw-r--r--fs/xfs/xfs_attr_list.c39
-rw-r--r--fs/xfs/xfs_bmap_item.c311
-rw-r--r--fs/xfs/xfs_bmap_item.h4
-rw-r--r--fs/xfs/xfs_bmap_util.c201
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf.c376
-rw-r--r--fs/xfs/xfs_buf.h26
-rw-r--r--fs/xfs/xfs_buf_item.c8
-rw-r--r--fs/xfs/xfs_buf_item_recover.c8
-rw-r--r--fs/xfs/xfs_buf_mem.c270
-rw-r--r--fs/xfs/xfs_buf_mem.h34
-rw-r--r--fs/xfs/xfs_dir2_readdir.c17
-rw-r--r--fs/xfs/xfs_discard.c19
-rw-r--r--fs/xfs/xfs_dquot.c98
-rw-r--r--fs/xfs/xfs_dquot.h8
-rw-r--r--fs/xfs/xfs_dquot_item_recover.c21
-rw-r--r--fs/xfs/xfs_error.c8
-rw-r--r--fs/xfs/xfs_extent_busy.c18
-rw-r--r--fs/xfs/xfs_extent_busy.h2
-rw-r--r--fs/xfs/xfs_extfree_item.c340
-rw-r--r--fs/xfs/xfs_file.c77
-rw-r--r--fs/xfs/xfs_filestream.c6
-rw-r--r--fs/xfs/xfs_fsmap.c19
-rw-r--r--fs/xfs/xfs_fsops.c63
-rw-r--r--fs/xfs/xfs_fsops.h14
-rw-r--r--fs/xfs/xfs_globals.c12
-rw-r--r--fs/xfs/xfs_health.c206
-rw-r--r--fs/xfs/xfs_hooks.c52
-rw-r--r--fs/xfs/xfs_hooks.h65
-rw-r--r--fs/xfs/xfs_icache.c22
-rw-r--r--fs/xfs/xfs_icreate_item.c2
-rw-r--r--fs/xfs/xfs_inode.c376
-rw-r--r--fs/xfs/xfs_inode.h56
-rw-r--r--fs/xfs/xfs_inode_item.c22
-rw-r--r--fs/xfs/xfs_inode_item_recover.c51
-rw-r--r--fs/xfs/xfs_ioctl.c158
-rw-r--r--fs/xfs/xfs_iomap.c19
-rw-r--r--fs/xfs/xfs_iops.c16
-rw-r--r--fs/xfs/xfs_itable.c12
-rw-r--r--fs/xfs/xfs_iwalk.c41
-rw-r--r--fs/xfs/xfs_linux.h29
-rw-r--r--fs/xfs/xfs_log.c58
-rw-r--r--fs/xfs/xfs_log_cil.c31
-rw-r--r--fs/xfs/xfs_log_priv.h1
-rw-r--r--fs/xfs/xfs_log_recover.c235
-rw-r--r--fs/xfs/xfs_mount.c14
-rw-r--r--fs/xfs/xfs_mount.h20
-rw-r--r--fs/xfs/xfs_mru_cache.c17
-rw-r--r--fs/xfs/xfs_notify_failure.c108
-rw-r--r--fs/xfs/xfs_qm.c61
-rw-r--r--fs/xfs/xfs_qm.h16
-rw-r--r--fs/xfs/xfs_qm_bhv.c1
-rw-r--r--fs/xfs/xfs_quota.h51
-rw-r--r--fs/xfs/xfs_refcount_item.c244
-rw-r--r--fs/xfs/xfs_reflink.c23
-rw-r--r--fs/xfs/xfs_rmap_item.c266
-rw-r--r--fs/xfs/xfs_rtalloc.c1151
-rw-r--r--fs/xfs/xfs_rtalloc.h111
-rw-r--r--fs/xfs/xfs_stats.c4
-rw-r--r--fs/xfs/xfs_stats.h2
-rw-r--r--fs/xfs/xfs_super.c128
-rw-r--r--fs/xfs/xfs_symlink.c165
-rw-r--r--fs/xfs/xfs_symlink.h1
-rw-r--r--fs/xfs/xfs_sysctl.c2
-rw-r--r--fs/xfs/xfs_sysctl.h2
-rw-r--r--fs/xfs/xfs_sysfs.c67
-rw-r--r--fs/xfs/xfs_trace.c3
-rw-r--r--fs/xfs/xfs_trace.h648
-rw-r--r--fs/xfs/xfs_trans.c69
-rw-r--r--fs/xfs/xfs_trans.h26
-rw-r--r--fs/xfs/xfs_trans_ail.c7
-rw-r--r--fs/xfs/xfs_trans_buf.c42
-rw-r--r--fs/xfs/xfs_trans_dquot.c171
-rw-r--r--fs/xfs/xfs_xattr.c6
220 files changed, 27137 insertions, 7154 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index ed0bc8cbc703..d41edd30388b 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -124,12 +124,24 @@ config XFS_DRAIN_INTENTS
bool
select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+config XFS_LIVE_HOOKS
+ bool
+ select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+
+config XFS_MEMORY_BUFS
+ bool
+
+config XFS_BTREE_IN_MEM
+ bool
+
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
depends on XFS_FS
depends on TMPFS && SHMEM
+ select XFS_LIVE_HOOKS
select XFS_DRAIN_INTENTS
+ select XFS_MEMORY_BUFS
help
If you say Y here you will be able to check metadata on a
mounted XFS filesystem. This feature is intended to reduce
@@ -147,7 +159,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select XFS_DEBUG
+ select DEBUG_FS
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
@@ -164,6 +176,7 @@ config XFS_ONLINE_REPAIR
bool "XFS online metadata repair support"
default n
depends on XFS_FS && XFS_ONLINE_SCRUB
+ select XFS_BTREE_IN_MEM
help
If you say Y here you will be able to repair metadata on a
mounted XFS filesystem. This feature is intended to reduce
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7762c01a85cf..76674ad5833e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -92,8 +92,7 @@ xfs-y += xfs_aops.o \
xfs_symlink.o \
xfs_sysfs.o \
xfs_trans.o \
- xfs_xattr.o \
- kmem.o
+ xfs_xattr.o
# low-level transaction/log code
xfs-y += xfs_log.o \
@@ -137,6 +136,9 @@ xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o
endif
xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o
+xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o
+xfs-$(CONFIG_XFS_MEMORY_BUFS) += xfs_buf_mem.o
+xfs-$(CONFIG_XFS_BTREE_IN_MEM) += libxfs/xfs_btree_mem.o
# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
@@ -145,6 +147,7 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
xfs-y += $(addprefix scrub/, \
trace.o \
+ agb_bitmap.o \
agheader.o \
alloc.o \
attr.o \
@@ -158,6 +161,8 @@ xfs-y += $(addprefix scrub/, \
health.o \
ialloc.o \
inode.o \
+ iscan.o \
+ nlinks.o \
parent.o \
readdir.o \
refcount.o \
@@ -175,14 +180,39 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
rtsummary.o \
)
-xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
+xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
+ dqiterate.o \
+ quota.o \
+ quotacheck.o \
+ )
# online repair
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
+ alloc_repair.o \
+ bmap_repair.o \
+ cow_repair.o \
+ fscounters_repair.o \
+ ialloc_repair.o \
+ inode_repair.o \
+ newbt.o \
+ nlinks_repair.o \
+ rcbag_btree.o \
+ rcbag.o \
reap.o \
+ refcount_repair.o \
repair.o \
+ rmap_repair.o \
+ )
+
+xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
+ rtbitmap_repair.o \
+ )
+
+xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
+ quota_repair.o \
+ quotacheck_repair.o \
)
endif
endif
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
deleted file mode 100644
index c557a030acfe..000000000000
--- a/fs/xfs/kmem.c
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#include "xfs.h"
-#include "xfs_message.h"
-#include "xfs_trace.h"
-
-void *
-kmem_alloc(size_t size, xfs_km_flags_t flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- trace_kmem_alloc(size, flags, _RET_IP_);
-
- do {
- ptr = kmalloc(size, lflags);
- if (ptr || (flags & KM_MAYFAIL))
- return ptr;
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
- current->comm, current->pid,
- (unsigned int)size, __func__, lflags);
- memalloc_retry_wait(lflags);
- } while (1);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
deleted file mode 100644
index b987dc2c6851..000000000000
--- a/fs/xfs/kmem.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#ifndef __XFS_SUPPORT_KMEM_H__
-#define __XFS_SUPPORT_KMEM_H__
-
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-
-/*
- * General memory allocation interfaces
- */
-
-typedef unsigned __bitwise xfs_km_flags_t;
-#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
-#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
-#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
-#define KM_NOLOCKDEP ((__force xfs_km_flags_t)0x0020u)
-
-/*
- * We use a special process flag to avoid recursive callbacks into
- * the filesystem during transactions. We will also issue our own
- * warnings, so we explicitly skip any generic ones (silly of us).
- */
-static inline gfp_t
-kmem_flags_convert(xfs_km_flags_t flags)
-{
- gfp_t lflags;
-
- BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP));
-
- lflags = GFP_KERNEL | __GFP_NOWARN;
- if (flags & KM_NOFS)
- lflags &= ~__GFP_FS;
-
- /*
- * Default page/slab allocator behavior is to retry for ever
- * for small allocations. We can override this behavior by using
- * __GFP_RETRY_MAYFAIL which will tell the allocator to retry as long
- * as it is feasible but rather fail than retry forever for all
- * request sizes.
- */
- if (flags & KM_MAYFAIL)
- lflags |= __GFP_RETRY_MAYFAIL;
-
- if (flags & KM_ZERO)
- lflags |= __GFP_ZERO;
-
- if (flags & KM_NOLOCKDEP)
- lflags |= __GFP_NOLOCKDEP;
-
- return lflags;
-}
-
-extern void *kmem_alloc(size_t, xfs_km_flags_t);
-static inline void kmem_free(const void *ptr)
-{
- kvfree(ptr);
-}
-
-
-static inline void *
-kmem_zalloc(size_t size, xfs_km_flags_t flags)
-{
- return kmem_alloc(size, flags | KM_ZERO);
-}
-
-/*
- * Zone interfaces
- */
-static inline struct page *
-kmem_to_page(void *addr)
-{
- if (is_vmalloc_addr(addr))
- return vmalloc_to_page(addr);
- return virt_to_page(addr);
-}
-
-#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index f9f4d694640d..dc1873f76bff 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -217,6 +217,7 @@ xfs_initialize_perag_data(
*/
if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
error = -EFSCORRUPTED;
goto out;
}
@@ -241,7 +242,7 @@ __xfs_free_perag(
struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
- kmem_free(pag);
+ kfree(pag);
}
/*
@@ -263,7 +264,7 @@ xfs_free_perag(
xfs_defer_drain_free(&pag->pag_intents_drain);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_buf_hash_destroy(pag);
+ xfs_buf_cache_destroy(&pag->pag_bcache);
/* drop the mount's active reference */
xfs_perag_rele(pag);
@@ -332,6 +333,31 @@ xfs_agino_range(
return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
}
+/*
+ * Free perag within the specified AG range, it is only used to free unused
+ * perags under the error handling path.
+ */
+void
+xfs_free_unused_perag_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agstart,
+ xfs_agnumber_t agend)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t index;
+
+ for (index = agstart; index < agend; index++) {
+ spin_lock(&mp->m_perag_lock);
+ pag = radix_tree_delete(&mp->m_perag_tree, index);
+ spin_unlock(&mp->m_perag_lock);
+ if (!pag)
+ break;
+ xfs_buf_cache_destroy(&pag->pag_bcache);
+ xfs_defer_drain_free(&pag->pag_intents_drain);
+ kfree(pag);
+ }
+}
+
int
xfs_initialize_perag(
struct xfs_mount *mp,
@@ -356,7 +382,7 @@ xfs_initialize_perag(
continue;
}
- pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+ pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!pag) {
error = -ENOMEM;
goto out_unwind_new_pags;
@@ -364,7 +390,7 @@ xfs_initialize_perag(
pag->pag_agno = index;
pag->pag_mount = mp;
- error = radix_tree_preload(GFP_NOFS);
+ error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (error)
goto out_free_pag;
@@ -391,9 +417,10 @@ xfs_initialize_perag(
init_waitqueue_head(&pag->pag_active_wq);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
+ xfs_hooks_init(&pag->pag_rmap_update_hooks);
#endif /* __KERNEL__ */
- error = xfs_buf_hash_init(pag);
+ error = xfs_buf_cache_init(&pag->pag_bcache);
if (error)
goto out_remove_pag;
@@ -424,19 +451,14 @@ xfs_initialize_perag(
out_remove_pag:
xfs_defer_drain_free(&pag->pag_intents_drain);
+ spin_lock(&mp->m_perag_lock);
radix_tree_delete(&mp->m_perag_tree, index);
+ spin_unlock(&mp->m_perag_lock);
out_free_pag:
- kmem_free(pag);
+ kfree(pag);
out_unwind_new_pags:
/* unwind any prior newly initialized pags */
- for (index = first_initialised; index < agcount; index++) {
- pag = radix_tree_delete(&mp->m_perag_tree, index);
- if (!pag)
- break;
- xfs_buf_hash_destroy(pag);
- xfs_defer_drain_free(&pag->pag_intents_drain);
- kmem_free(pag);
- }
+ xfs_free_unused_perag_range(mp, first_initialised, agcount);
return error;
}
@@ -471,7 +493,7 @@ xfs_btroot_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno);
+ xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno);
}
/* Finish initializing a free space btree. */
@@ -529,7 +551,7 @@ xfs_freesp_init_recs(
}
/*
- * Alloc btree root block init functions
+ * bnobt/cntbt btree root block init functions
*/
static void
xfs_bnoroot_init(
@@ -537,17 +559,7 @@ xfs_bnoroot_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno);
- xfs_freesp_init_recs(mp, bp, id);
-}
-
-static void
-xfs_cntroot_init(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- struct aghdr_init_data *id)
-{
- xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno);
+ xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno);
xfs_freesp_init_recs(mp, bp, id);
}
@@ -563,7 +575,7 @@ xfs_rmaproot_init(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_rmap_rec *rrec;
- xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno);
+ xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 4, id->agno);
/*
* mark the AG header regions as static metadata The BNO
@@ -658,14 +670,13 @@ xfs_agfblock_init(
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
agf->agf_seqno = cpu_to_be32(id->agno);
agf->agf_length = cpu_to_be32(id->agsize);
- agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
- agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
- agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
- agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+ agf->agf_bno_root = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ agf->agf_cnt_root = cpu_to_be32(XFS_CNT_BLOCK(mp));
+ agf->agf_bno_level = cpu_to_be32(1);
+ agf->agf_cnt_level = cpu_to_be32(1);
if (xfs_has_rmapbt(mp)) {
- agf->agf_roots[XFS_BTNUM_RMAPi] =
- cpu_to_be32(XFS_RMAP_BLOCK(mp));
- agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+ agf->agf_rmap_root = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ agf->agf_rmap_level = cpu_to_be32(1);
agf->agf_rmap_blocks = cpu_to_be32(1);
}
@@ -776,7 +787,7 @@ struct xfs_aghdr_grow_data {
size_t numblks;
const struct xfs_buf_ops *ops;
aghdr_init_work_f work;
- xfs_btnum_t type;
+ const struct xfs_btree_ops *bc_ops;
bool need_init;
};
@@ -830,13 +841,15 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_bnobt_buf_ops,
.work = &xfs_bnoroot_init,
+ .bc_ops = &xfs_bnobt_ops,
.need_init = true
},
{ /* CNT root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_cntbt_buf_ops,
- .work = &xfs_cntroot_init,
+ .work = &xfs_bnoroot_init,
+ .bc_ops = &xfs_cntbt_ops,
.need_init = true
},
{ /* INO root block */
@@ -844,7 +857,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_inobt_buf_ops,
.work = &xfs_btroot_init,
- .type = XFS_BTNUM_INO,
+ .bc_ops = &xfs_inobt_ops,
.need_init = true
},
{ /* FINO root block */
@@ -852,7 +865,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
- .type = XFS_BTNUM_FINO,
+ .bc_ops = &xfs_finobt_ops,
.need_init = xfs_has_finobt(mp)
},
{ /* RMAP root block */
@@ -860,6 +873,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_rmapbt_buf_ops,
.work = &xfs_rmaproot_init,
+ .bc_ops = &xfs_rmapbt_ops,
.need_init = xfs_has_rmapbt(mp)
},
{ /* REFC root block */
@@ -867,7 +881,7 @@ xfs_ag_init_headers(
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_refcountbt_buf_ops,
.work = &xfs_btroot_init,
- .type = XFS_BTNUM_REFC,
+ .bc_ops = &xfs_refcountbt_ops,
.need_init = xfs_has_reflink(mp)
},
{ /* NULL terminating block */
@@ -885,7 +899,7 @@ xfs_ag_init_headers(
id->daddr = dp->daddr;
id->numblks = dp->numblks;
- id->type = dp->type;
+ id->bc_ops = dp->bc_ops;
error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops);
if (error)
break;
@@ -930,8 +944,10 @@ xfs_ag_shrink_space(
agf = agfbp->b_addr;
aglen = be32_to_cpu(agi->agi_length);
/* some extra paranoid checks before we shrink the ag */
- if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length))
+ if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
return -EFSCORRUPTED;
+ }
if (delta >= aglen)
return -EINVAL;
@@ -959,14 +975,23 @@ xfs_ag_shrink_space(
if (error) {
/*
- * if extent allocation fails, need to roll the transaction to
+ * If extent allocation fails, need to roll the transaction to
* ensure that the AGFL fixup has been committed anyway.
+ *
+ * We need to hold the AGF across the roll to ensure nothing can
+ * access the AG for allocation until the shrink is fully
+ * cleaned up. And due to the resetting of the AG block
+ * reservation space needing to lock the AGI, we also have to
+ * hold that so we don't get AGI/AGF lock order inversions in
+ * the error handling path.
*/
xfs_trans_bhold(*tpp, agfbp);
+ xfs_trans_bhold(*tpp, agibp);
err2 = xfs_trans_roll(tpp);
if (err2)
return err2;
xfs_trans_bjoin(*tpp, agfbp);
+ xfs_trans_bjoin(*tpp, agibp);
goto resv_init_out;
}
@@ -984,7 +1009,7 @@ xfs_ag_shrink_space(
if (err2 != -ENOSPC)
goto resv_err;
- err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
+ err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
XFS_AG_RESV_NONE, true);
if (err2)
goto resv_err;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 2e0aef87d633..35de09a2516c 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -36,8 +36,9 @@ struct xfs_perag {
atomic_t pag_active_ref; /* active reference count */
wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */
unsigned long pag_opstate;
- uint8_t pagf_levels[XFS_BTNUM_AGF];
- /* # of levels in bno & cnt btree */
+ uint8_t pagf_bno_level; /* # of levels in bno btree */
+ uint8_t pagf_cnt_level; /* # of levels in cnt btree */
+ uint8_t pagf_rmap_level;/* # of levels in rmap btree */
uint32_t pagf_flcount; /* count of blocks in freelist */
xfs_extlen_t pagf_freeblks; /* total free blocks */
xfs_extlen_t pagf_longest; /* longest free space */
@@ -80,6 +81,18 @@ struct xfs_perag {
*/
uint16_t pag_checked;
uint16_t pag_sick;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Alternate btree heights so that online repair won't trip the write
+ * verifiers while rebuilding the AG btrees.
+ */
+ uint8_t pagf_repair_bno_level;
+ uint8_t pagf_repair_cnt_level;
+ uint8_t pagf_repair_refcount_level;
+ uint8_t pagf_repair_rmap_level;
+#endif
+
spinlock_t pag_state_lock;
spinlock_t pagb_lock; /* lock for pagb_tree */
@@ -94,9 +107,7 @@ struct xfs_perag {
int pag_ici_reclaimable; /* reclaimable inodes */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
- /* buffer cache index */
- spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
- struct rhashtable pag_buf_hash;
+ struct xfs_buf_cache pag_bcache;
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
@@ -109,6 +120,9 @@ struct xfs_perag {
* inconsistencies.
*/
struct xfs_defer_drain pag_intents_drain;
+
+ /* Hook to feed rmapbt updates to an active online repair. */
+ struct xfs_hooks pag_rmap_update_hooks;
#endif /* __KERNEL__ */
};
@@ -133,6 +147,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA)
__XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES)
__XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET)
+void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart,
+ xfs_agnumber_t agend);
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
@@ -319,7 +335,7 @@ struct aghdr_init_data {
/* per header data */
xfs_daddr_t daddr; /* header location */
size_t numblks; /* size of header */
- xfs_btnum_t type; /* type of btree root block */
+ const struct xfs_btree_ops *bc_ops; /* btree ops */
};
int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 7fd1fea95552..da1057bd0e60 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -411,6 +411,8 @@ xfs_ag_resv_free_extent(
fallthrough;
case XFS_AG_RESV_NONE:
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+ fallthrough;
+ case XFS_AG_RESV_IGNORE:
return;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3069194527dd..9da52e92172a 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -26,6 +26,7 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_extfree_item_cache;
@@ -150,23 +151,38 @@ xfs_alloc_ag_max_usable(
return mp->m_sb.sb_agblocks - blocks;
}
+
+static int
+xfs_alloc_lookup(
+ struct xfs_btree_cur *cur,
+ xfs_lookup_t dir,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ int *stat)
+{
+ int error;
+
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ error = xfs_btree_lookup(cur, dir, stat);
+ if (*stat == 1)
+ cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
+ else
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
+ return error;
+}
+
/*
* Lookup the record equal to [bno, len] in the btree given by cur.
*/
-STATIC int /* error */
+static inline int /* error */
xfs_alloc_lookup_eq(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
-
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
}
/*
@@ -180,13 +196,7 @@ xfs_alloc_lookup_ge(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
-
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
}
/*
@@ -200,19 +210,14 @@ xfs_alloc_lookup_le(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
- int error;
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
- cur->bc_ag.abt.active = (*stat == 1);
- return error;
+ return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat);
}
static inline bool
xfs_alloc_cur_active(
struct xfs_btree_cur *cur)
{
- return cur && cur->bc_ag.abt.active;
+ return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE);
}
/*
@@ -246,11 +251,9 @@ xfs_alloc_btrec_to_irec(
/* Simple checks for free space records. */
xfs_failaddr_t
xfs_alloc_check_irec(
- struct xfs_btree_cur *cur,
- const struct xfs_alloc_rec_incore *irec)
+ struct xfs_perag *pag,
+ const struct xfs_alloc_rec_incore *irec)
{
- struct xfs_perag *pag = cur->bc_ag.pag;
-
if (irec->ar_blockcount == 0)
return __this_address;
@@ -270,12 +273,12 @@ xfs_alloc_complain_bad_rec(
struct xfs_mount *mp = cur->bc_mp;
xfs_warn(mp,
- "%s Freespace BTree record corruption in AG %d detected at %pS!",
- cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
- cur->bc_ag.pag->pag_agno, fa);
+ "%sbt record corruption in AG %d detected at %pS!",
+ cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"start block 0x%x block count 0x%x", irec->ar_startblock,
irec->ar_blockcount);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -299,7 +302,7 @@ xfs_alloc_get_rec(
return error;
xfs_alloc_btrec_to_irec(rec, &irec);
- fa = xfs_alloc_check_irec(cur, &irec);
+ fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
@@ -499,14 +502,18 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
i != 1 ||
nfbno1 != fbno ||
- nflen1 != flen))
+ nflen1 != flen)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
#endif
} else {
if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
/*
* Look up the record in the by-block tree if necessary.
@@ -518,14 +525,18 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
i != 1 ||
nfbno1 != fbno ||
- nflen1 != flen))
+ nflen1 != flen)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
#endif
} else {
if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
#ifdef DEBUG
@@ -538,8 +549,10 @@ xfs_alloc_fixup_trees(
if (XFS_IS_CORRUPT(mp,
bnoblock->bb_numrecs !=
- cntblock->bb_numrecs))
+ cntblock->bb_numrecs)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
#endif
@@ -569,30 +582,40 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
/*
* Add new by-size btree entry(s).
*/
if (nfbno1 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
if (nfbno2 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
return -EFSCORRUPTED;
+ }
}
/*
* Fix up the by-block btree entry(s).
@@ -603,8 +626,10 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(bno_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
} else {
/*
* Update the by-block entry to start later|be shorter.
@@ -618,12 +643,16 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 0))
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
if ((error = xfs_btree_insert(bno_cur, &i)))
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
return -EFSCORRUPTED;
+ }
}
return 0;
}
@@ -757,6 +786,8 @@ xfs_alloc_read_agfl(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
if (error)
return error;
xfs_buf_set_ref(bp, XFS_AGFL_REF);
@@ -778,6 +809,7 @@ xfs_alloc_update_counters(
if (unlikely(be32_to_cpu(agf->agf_freeblks) >
be32_to_cpu(agf->agf_length))) {
xfs_buf_mark_corrupt(agbp);
+ xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
return -EFSCORRUPTED;
}
@@ -830,8 +862,8 @@ xfs_alloc_cur_setup(
* attempt a small allocation.
*/
if (!acur->cnt)
- acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_CNT);
+ acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
if (error)
return error;
@@ -840,11 +872,11 @@ xfs_alloc_cur_setup(
* Allocate the bnobt left and right search cursors.
*/
if (!acur->bnolt)
- acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_BNO);
+ acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
if (!acur->bnogt)
- acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->pag, XFS_BTNUM_BNO);
+ acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
+ args->agbp, args->pag);
return i == 1 ? 0 : -ENOSPC;
}
@@ -886,15 +918,17 @@ xfs_alloc_cur_check(
bool busy;
unsigned busy_gen = 0;
bool deactivate = false;
- bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
+ bool isbnobt = xfs_btree_is_bno(cur->bc_ops);
*new = 0;
error = xfs_alloc_get_rec(cur, &bno, &len, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->mp, i != 1))
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
/*
* Check minlen and deactivate a cntbt cursor if out of acceptable size
@@ -960,9 +994,8 @@ xfs_alloc_cur_check(
deactivate = true;
out:
if (deactivate)
- cur->bc_ag.abt.active = false;
- trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
- *new);
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
+ trace_xfs_alloc_cur_check(cur, bno, len, diff, *new);
return 0;
}
@@ -1100,6 +1133,7 @@ xfs_alloc_ag_vextent_small(
if (error)
goto error;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(ccur);
error = -EFSCORRUPTED;
goto error;
}
@@ -1134,6 +1168,7 @@ xfs_alloc_ag_vextent_small(
*fbnop = args->agbno = fbno;
*flenp = args->len = 1;
if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
+ xfs_btree_mark_sick(ccur);
error = -EFSCORRUPTED;
goto error;
}
@@ -1199,8 +1234,8 @@ xfs_alloc_ag_vextent_exact(
/*
* Allocate/initialize a cursor for the by-number freespace btree.
*/
- bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
/*
* Lookup bno and minlen in the btree (minlen is irrelevant, really).
@@ -1220,6 +1255,7 @@ xfs_alloc_ag_vextent_exact(
if (error)
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1259,8 +1295,8 @@ xfs_alloc_ag_vextent_exact(
* We are allocating agbno for args->len
* Allocate/initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
args->len, XFSA_FIXUP_BNO_OK);
@@ -1332,7 +1368,7 @@ xfs_alloc_walk_iter(
if (error)
return error;
if (i == 0)
- cur->bc_ag.abt.active = false;
+ cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
if (count > 0)
count--;
@@ -1446,7 +1482,7 @@ xfs_alloc_ag_vextent_locality(
if (error)
return error;
if (i) {
- acur->cnt->bc_ag.abt.active = true;
+ acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
fbcur = acur->cnt;
fbinc = false;
}
@@ -1499,8 +1535,10 @@ xfs_alloc_ag_vextent_lastblock(
error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->mp, i != 1))
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(acur->cnt);
return -EFSCORRUPTED;
+ }
if (*len >= args->minlen)
break;
error = xfs_btree_increment(acur->cnt, 0, &i);
@@ -1672,8 +1710,8 @@ restart:
/*
* Allocate and initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
bno_cur = NULL;
/*
@@ -1712,6 +1750,7 @@ restart:
if (error)
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1758,6 +1797,7 @@ restart:
rlen != 0 &&
(rlen > flen ||
rbno + rlen > fbno + flen))) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1780,6 +1820,7 @@ restart:
&i)))
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1792,6 +1833,7 @@ restart:
rlen != 0 &&
(rlen > flen ||
rbno + rlen > fbno + flen))) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1808,6 +1850,7 @@ restart:
&i)))
goto error0;
if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1846,14 +1889,15 @@ restart:
rlen = args->len;
if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
/*
* Allocate and initialize a cursor for the by-block tree.
*/
- bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
+ args->pag);
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
rbno, rlen, XFSA_FIXUP_CNT_OK)))
goto error0;
@@ -1865,6 +1909,7 @@ restart:
if (XFS_IS_CORRUPT(args->mp,
args->agbno + args->len >
be32_to_cpu(agf->agf_length))) {
+ xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1926,7 +1971,7 @@ xfs_free_ag_extent(
/*
* Allocate and initialize a cursor for the by-block btree.
*/
- bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
+ bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
/*
* Look for a neighboring block on the left (lower block numbers)
* that is contiguous with this space.
@@ -1940,6 +1985,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1955,6 +2001,7 @@ xfs_free_ag_extent(
* Very bad.
*/
if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1973,6 +2020,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1988,6 +2036,7 @@ xfs_free_ag_extent(
* Very bad.
*/
if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1996,7 +2045,7 @@ xfs_free_ag_extent(
/*
* Now allocate and initialize a cursor for the by-size tree.
*/
- cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
/*
* Have both left and right contiguous neighbors.
* Merge all three into a single free block.
@@ -2008,12 +2057,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2023,12 +2074,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2038,6 +2091,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_delete(bno_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2047,6 +2101,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2066,6 +2121,7 @@ xfs_free_ag_extent(
i != 1 ||
xxbno != ltbno ||
xxlen != ltlen)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2090,12 +2146,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2106,6 +2164,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2125,12 +2184,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2153,6 +2214,7 @@ xfs_free_ag_extent(
if ((error = xfs_btree_insert(bno_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bno_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2165,12 +2227,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
if ((error = xfs_btree_insert(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2269,23 +2333,41 @@ xfs_alloc_min_freelist(
struct xfs_perag *pag)
{
/* AG btrees have at least 1 level. */
- static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
- const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
+ const unsigned int bno_level = pag ? pag->pagf_bno_level : 1;
+ const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1;
+ const unsigned int rmap_level = pag ? pag->pagf_rmap_level : 1;
unsigned int min_free;
ASSERT(mp->m_alloc_maxlevels > 0);
+ /*
+ * For a btree shorter than the maximum height, the worst case is that
+ * every level gets split and a new level is added, then while inserting
+ * another entry to refill the AGFL, every level under the old root gets
+ * split again. This is:
+ *
+ * (full height split reservation) + (AGFL refill split height)
+ * = (current height + 1) + (current height - 1)
+ * = (new height) + (new height - 2)
+ * = 2 * new height - 2
+ *
+ * For a btree of maximum height, the worst case is that every level
+ * under the root gets split, then while inserting another entry to
+ * refill the AGFL, every level under the root gets split again. This is
+ * also:
+ *
+ * 2 * (current height - 1)
+ * = 2 * (new height - 1)
+ * = 2 * new height - 2
+ */
+
/* space needed by-bno freespace btree */
- min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_alloc_maxlevels);
+ min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
/* space needed by-size freespace btree */
- min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_alloc_maxlevels);
+ min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
/* space needed reverse mapping used space btree */
if (xfs_has_rmapbt(mp))
- min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels);
-
+ min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2;
return min_free;
}
@@ -2493,7 +2575,7 @@ xfs_defer_agfl_block(
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
xfs_extent_free_get_group(mp, xefi);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
+ xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
return 0;
}
@@ -2501,14 +2583,15 @@ xfs_defer_agfl_block(
* Add the extent to the list of extents to be free at transaction end.
* The list is maintained sorted (by block number).
*/
-int
-__xfs_free_extent_later(
+static int
+xfs_defer_extent_free(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type,
- bool skip_discard)
+ bool skip_discard,
+ struct xfs_defer_pending **dfpp)
{
struct xfs_extent_free_item *xefi;
struct xfs_mount *mp = tp->t_mountp;
@@ -2556,10 +2639,105 @@ __xfs_free_extent_later(
XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
xfs_extent_free_get_group(mp, xefi);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
+ *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
+ return 0;
+}
+
+int
+xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type,
+ bool skip_discard)
+{
+ struct xfs_defer_pending *dontcare = NULL;
+
+ return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard,
+ &dontcare);
+}
+
+/*
+ * Set up automatic freeing of unwritten space in the filesystem.
+ *
+ * This function attached a paused deferred extent free item to the
+ * transaction. Pausing means that the EFI will be logged in the next
+ * transaction commit, but the pending EFI will not be finished until the
+ * pending item is unpaused.
+ *
+ * If the system goes down after the EFI has been persisted to the log but
+ * before the pending item is unpaused, log recovery will find the EFI, fail to
+ * find the EFD, and free the space.
+ *
+ * If the pending item is unpaused, the next transaction commit will log an EFD
+ * without freeing the space.
+ *
+ * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the
+ * @args structure are set to the relevant values.
+ */
+int
+xfs_alloc_schedule_autoreap(
+ const struct xfs_alloc_arg *args,
+ bool skip_discard,
+ struct xfs_alloc_autoreap *aarp)
+{
+ int error;
+
+ error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
+ &args->oinfo, args->resv, skip_discard, &aarp->dfp);
+ if (error)
+ return error;
+
+ xfs_defer_item_pause(args->tp, aarp->dfp);
return 0;
}
+/*
+ * Cancel automatic freeing of unwritten space in the filesystem.
+ *
+ * Earlier, we created a paused deferred extent free item and attached it to
+ * this transaction so that we could automatically roll back a new space
+ * allocation if the system went down. Now we want to cancel the paused work
+ * item by marking the EFI stale so we don't actually free the space, unpausing
+ * the pending item and logging an EFD.
+ *
+ * The caller generally should have already mapped the space into the ondisk
+ * filesystem. If the reserved space was partially used, the caller must call
+ * xfs_free_extent_later to create a new EFI to free the unused space.
+ */
+void
+xfs_alloc_cancel_autoreap(
+ struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp)
+{
+ struct xfs_defer_pending *dfp = aarp->dfp;
+ struct xfs_extent_free_item *xefi;
+
+ if (!dfp)
+ return;
+
+ list_for_each_entry(xefi, &dfp->dfp_work, xefi_list)
+ xefi->xefi_flags |= XFS_EFI_CANCELLED;
+
+ xfs_defer_item_unpause(tp, dfp);
+}
+
+/*
+ * Commit automatic freeing of unwritten space in the filesystem.
+ *
+ * This unpauses an earlier _schedule_autoreap and commits to freeing the
+ * allocated space. Call this if none of the reserved space was used.
+ */
+void
+xfs_alloc_commit_autoreap(
+ struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp)
+{
+ if (aarp->dfp)
+ xfs_defer_item_unpause(tp, aarp->dfp);
+}
+
#ifdef DEBUG
/*
* Check if an AGF has a free extent record whose length is equal to
@@ -2576,13 +2754,14 @@ xfs_exact_minlen_extent_available(
xfs_extlen_t flen;
int error = 0;
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
- args->pag, XFS_BTNUM_CNT);
+ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp,
+ args->pag);
error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
if (error)
goto out;
if (*stat == 0) {
+ xfs_btree_mark_sick(cnt_cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -2872,8 +3051,8 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_versionnum),
offsetof(xfs_agf_t, agf_seqno),
offsetof(xfs_agf_t, agf_length),
- offsetof(xfs_agf_t, agf_roots[0]),
- offsetof(xfs_agf_t, agf_levels[0]),
+ offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */
+ offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */
offsetof(xfs_agf_t, agf_flfirst),
offsetof(xfs_agf_t, agf_fllast),
offsetof(xfs_agf_t, agf_flcount),
@@ -3052,12 +3231,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_freeblks) > agf_length)
return __this_address;
- if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
- mp->m_alloc_maxlevels ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
- mp->m_alloc_maxlevels)
+ if (be32_to_cpu(agf->agf_bno_level) < 1 ||
+ be32_to_cpu(agf->agf_cnt_level) < 1 ||
+ be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels ||
+ be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels)
return __this_address;
if (xfs_has_lazysbcount(mp) &&
@@ -3068,9 +3245,8 @@ xfs_agf_verify(
if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
return __this_address;
- if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
- mp->m_rmap_maxlevels)
+ if (be32_to_cpu(agf->agf_rmap_level) < 1 ||
+ be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels)
return __this_address;
}
@@ -3153,6 +3329,8 @@ xfs_read_agf(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
if (error)
return error;
@@ -3194,12 +3372,9 @@ xfs_alloc_read_agf(
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
pag->pagf_longest = be32_to_cpu(agf->agf_longest);
- pag->pagf_levels[XFS_BTNUM_BNOi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
- pag->pagf_levels[XFS_BTNUM_CNTi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
- pag->pagf_levels[XFS_BTNUM_RMAPi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
+ pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
+ pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
if (xfs_agfl_needs_reset(pag->pag_mount, agf))
set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
@@ -3228,10 +3403,8 @@ xfs_alloc_read_agf(
ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
- ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
- ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+ ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
+ ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
}
#endif
if (agfbpp)
@@ -3780,17 +3953,23 @@ __xfs_free_extent(
return -EIO;
error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
- if (error)
+ if (error) {
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
return error;
+ }
+
agf = agbp->b_addr;
if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto err_release;
}
/* validate the extent size is legal now we have the agf locked */
if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
error = -EFSCORRUPTED;
goto err_release;
}
@@ -3827,7 +4006,7 @@ xfs_alloc_query_range_helper(
xfs_failaddr_t fa;
xfs_alloc_btrec_to_irec(rec, &irec);
- fa = xfs_alloc_check_irec(cur, &irec);
+ fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
@@ -3847,7 +4026,7 @@ xfs_alloc_query_range(
union xfs_btree_irec high_brec = { .a = *high_rec };
struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn };
- ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ ASSERT(xfs_btree_is_bno(cur->bc_ops));
return xfs_btree_query_range(cur, &low_brec, &high_brec,
xfs_alloc_query_range_helper, &query);
}
@@ -3861,7 +4040,7 @@ xfs_alloc_query_all(
{
struct xfs_alloc_query_range_info query;
- ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ ASSERT(xfs_btree_is_bno(cur->bc_ops));
query.priv = priv;
query.fn = fn;
return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6bb8d295c321..0b956f8b9d5a 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -185,7 +185,7 @@ xfs_alloc_get_rec(
union xfs_btree_rec;
void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_alloc_rec_incore *irec);
-xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_alloc_check_irec(struct xfs_perag *pag,
const struct xfs_alloc_rec_incore *irec);
int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
@@ -231,7 +231,7 @@ xfs_buf_to_agfl_bno(
return bp->b_addr;
}
-int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type, bool skip_discard);
@@ -255,18 +255,18 @@ void xfs_extent_free_get_group(struct xfs_mount *mp,
#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
+#define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */
-static inline int
-xfs_free_extent_later(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type)
-{
- return __xfs_free_extent_later(tp, bno, len, oinfo, type, false);
-}
+struct xfs_alloc_autoreap {
+ struct xfs_defer_pending *dfp;
+};
+int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args,
+ bool skip_discard, struct xfs_alloc_autoreap *aarp);
+void xfs_alloc_cancel_autoreap(struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp);
+void xfs_alloc_commit_autoreap(struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp);
extern struct kmem_cache *xfs_extfree_item_cache;
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index c65228efed4a..6ef5ddd89600 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -16,6 +16,7 @@
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
@@ -23,13 +24,22 @@
static struct kmem_cache *xfs_allocbt_cur_cache;
STATIC struct xfs_btree_cur *
-xfs_allocbt_dup_cursor(
+xfs_bnobt_dup_cursor(
struct xfs_btree_cur *cur)
{
- return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
+ return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
+ cur->bc_ag.pag);
}
+STATIC struct xfs_btree_cur *
+xfs_cntbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
+ cur->bc_ag.pag);
+}
+
+
STATIC void
xfs_allocbt_set_root(
struct xfs_btree_cur *cur,
@@ -38,13 +48,18 @@ xfs_allocbt_set_root(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- int btnum = cur->bc_btnum;
ASSERT(ptr->s != 0);
- agf->agf_roots[btnum] = ptr->s;
- be32_add_cpu(&agf->agf_levels[btnum], inc);
- cur->bc_ag.pag->pagf_levels[btnum] += inc;
+ if (xfs_btree_is_bno(cur->bc_ops)) {
+ agf->agf_bno_root = ptr->s;
+ be32_add_cpu(&agf->agf_bno_level, inc);
+ cur->bc_ag.pag->pagf_bno_level += inc;
+ } else {
+ agf->agf_cnt_root = ptr->s;
+ be32_add_cpu(&agf->agf_cnt_level, inc);
+ cur->bc_ag.pag->pagf_cnt_level += inc;
+ }
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -116,7 +131,7 @@ xfs_allocbt_update_lastrec(
__be32 len;
int numrecs;
- ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+ ASSERT(!xfs_btree_is_bno(cur->bc_ops));
switch (reason) {
case LASTREC_UPDATE:
@@ -226,7 +241,10 @@ xfs_allocbt_init_ptr_from_cur(
ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
- ptr->s = agf->agf_roots[cur->bc_btnum];
+ if (xfs_btree_is_bno(cur->bc_ops))
+ ptr->s = agf->agf_bno_root;
+ else
+ ptr->s = agf->agf_cnt_root;
}
STATIC int64_t
@@ -299,13 +317,12 @@ xfs_allocbt_verify(
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
- xfs_btnum_t btnum = XFS_BTNUM_BNOi;
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (xfs_has_crc(mp)) {
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
}
@@ -320,15 +337,32 @@ xfs_allocbt_verify(
* against.
*/
level = be16_to_cpu(block->bb_level);
- if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
- btnum = XFS_BTNUM_CNTi;
if (pag && xfs_perag_initialised_agf(pag)) {
- if (level >= pag->pagf_levels[btnum])
+ unsigned int maxlevel, repair_maxlevel = 0;
+
+ /*
+ * Online repair could be rewriting the free space btrees, so
+ * we'll validate against the larger of either tree while this
+ * is going on.
+ */
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) {
+ maxlevel = pag->pagf_cnt_level;
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ repair_maxlevel = pag->pagf_repair_cnt_level;
+#endif
+ } else {
+ maxlevel = pag->pagf_bno_level;
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ repair_maxlevel = pag->pagf_repair_bno_level;
+#endif
+ }
+
+ if (level >= max(maxlevel, repair_maxlevel))
return __this_address;
} else if (level >= mp->m_alloc_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
+ return xfs_btree_agblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
static void
@@ -337,7 +371,7 @@ xfs_allocbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_allocbt_verify(bp);
@@ -361,7 +395,7 @@ xfs_allocbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -443,11 +477,19 @@ xfs_allocbt_keys_contiguous(
be32_to_cpu(key2->alloc.ar_startblock));
}
-static const struct xfs_btree_ops xfs_bnobt_ops = {
+const struct xfs_btree_ops xfs_bnobt_ops = {
+ .name = "bno",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_ALLOC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_abtb_2),
+ .sick_mask = XFS_SICK_AG_BNOBT,
- .dup_cursor = xfs_allocbt_dup_cursor,
+ .dup_cursor = xfs_bnobt_dup_cursor,
.set_root = xfs_allocbt_set_root,
.alloc_block = xfs_allocbt_alloc_block,
.free_block = xfs_allocbt_free_block,
@@ -466,11 +508,20 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
.keys_contiguous = xfs_allocbt_keys_contiguous,
};
-static const struct xfs_btree_ops xfs_cntbt_ops = {
+const struct xfs_btree_ops xfs_cntbt_ops = {
+ .name = "cnt",
+ .type = XFS_BTREE_TYPE_AG,
+ .geom_flags = XFS_BTGEO_LASTREC_UPDATE,
+
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_ALLOC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_abtc_2),
+ .sick_mask = XFS_SICK_AG_CNTBT,
- .dup_cursor = xfs_allocbt_dup_cursor,
+ .dup_cursor = xfs_cntbt_dup_cursor,
.set_root = xfs_allocbt_set_root,
.alloc_block = xfs_allocbt_alloc_block,
.free_block = xfs_allocbt_free_block,
@@ -489,76 +540,55 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.keys_contiguous = NULL, /* not needed right now */
};
-/* Allocate most of a new allocation btree cursor. */
-STATIC struct xfs_btree_cur *
-xfs_allocbt_init_common(
+/*
+ * Allocate a new bnobt cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
+ */
+struct xfs_btree_cur *
+xfs_bnobt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_perag *pag,
- xfs_btnum_t btnum)
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-
- cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels,
- xfs_allocbt_cur_cache);
- cur->bc_ag.abt.active = false;
-
- if (btnum == XFS_BTNUM_CNT) {
- cur->bc_ops = &xfs_cntbt_ops;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
- cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
- } else {
- cur->bc_ops = &xfs_bnobt_ops;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
- }
-
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops,
+ mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
- if (xfs_has_crc(mp))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
+ cur->bc_nlevels = be32_to_cpu(agf->agf_bno_level);
+ }
return cur;
}
/*
- * Allocate a new allocation btree cursor.
+ * Allocate a new cntbt cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
*/
-struct xfs_btree_cur * /* new alloc btree cursor */
-xfs_allocbt_init_cursor(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for agf structure */
- struct xfs_perag *pag,
- xfs_btnum_t btnum) /* btree identifier */
-{
- struct xfs_agf *agf = agbp->b_addr;
- struct xfs_btree_cur *cur;
-
- cur = xfs_allocbt_init_common(mp, tp, pag, btnum);
- if (btnum == XFS_BTNUM_CNT)
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
- else
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
-
- cur->bc_ag.agbp = agbp;
-
- return cur;
-}
-
-/* Create a free space btree cursor with a fake root for staging. */
struct xfs_btree_cur *
-xfs_allocbt_stage_cursor(
+xfs_cntbt_init_cursor(
struct xfs_mount *mp,
- struct xbtree_afakeroot *afake,
- struct xfs_perag *pag,
- xfs_btnum_t btnum)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- cur = xfs_allocbt_init_common(mp, NULL, pag, btnum);
- xfs_btree_stage_afakeroot(cur, afake);
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops,
+ mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
+ cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
+
+ cur->bc_nlevels = be32_to_cpu(agf->agf_cnt_level);
+ }
return cur;
}
@@ -577,16 +607,16 @@ xfs_allocbt_commit_staged_btree(
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
- agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
- xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops);
+ if (xfs_btree_is_bno(cur->bc_ops)) {
+ agf->agf_bno_root = cpu_to_be32(afake->af_root);
+ agf->agf_bno_level = cpu_to_be32(afake->af_levels);
} else {
- cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops);
+ agf->agf_cnt_root = cpu_to_be32(afake->af_root);
+ agf->agf_cnt_level = cpu_to_be32(afake->af_levels);
}
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
/* Calculate number of records in an alloc btree block. */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45df893ef6bb..155b47f231ab 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -47,12 +47,12 @@ struct xbtree_afakeroot;
(maxrecs) * sizeof(xfs_alloc_key_t) + \
((index) - 1) * sizeof(xfs_alloc_ptr_t)))
-extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp,
+struct xfs_btree_cur *xfs_bnobt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_perag *pag, xfs_btnum_t btnum);
-struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, struct xfs_perag *pag,
- xfs_btnum_t btnum);
+ struct xfs_perag *pag);
+struct xfs_btree_cur *xfs_cntbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_perag *pag);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e28d93d232de..673a4b6d2e8d 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -224,7 +224,7 @@ int
xfs_attr_get_ilocked(
struct xfs_da_args *args)
{
- ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (!xfs_inode_hasattr(args->dp))
return -ENOATTR;
@@ -421,10 +421,10 @@ xfs_attr_complete_op(
bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
args->op_flags &= ~XFS_DA_OP_REPLACE;
- if (do_replace) {
- args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ if (do_replace)
return replace_state;
- }
+
return XFS_DAS_DONE;
}
@@ -862,8 +862,11 @@ xfs_attr_lookup(
if (!xfs_inode_hasattr(dp))
return -ENOATTR;
- if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
- return xfs_attr_sf_findname(args, NULL, NULL);
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) {
+ if (xfs_attr_sf_findname(args))
+ return -EEXIST;
+ return -ENOATTR;
+ }
if (xfs_attr_is_leaf(dp)) {
error = xfs_attr_leaf_hasname(args, &bp);
@@ -880,79 +883,35 @@ xfs_attr_lookup(
return error;
}
-static int
-xfs_attr_intent_init(
+static void
+xfs_attr_defer_add(
struct xfs_da_args *args,
- unsigned int op_flags, /* op flag (set or remove) */
- struct xfs_attr_intent **attr) /* new xfs_attr_intent */
+ unsigned int op_flags)
{
struct xfs_attr_intent *new;
- new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ new = kmem_cache_zalloc(xfs_attr_intent_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
new->xattri_op_flags = op_flags;
new->xattri_da_args = args;
- *attr = new;
- return 0;
-}
-
-/* Sets an attribute for an inode as a deferred operation */
-static int
-xfs_attr_defer_add(
- struct xfs_da_args *args)
-{
- struct xfs_attr_intent *new;
- int error = 0;
-
- error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new);
- if (error)
- return error;
+ switch (op_flags) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ default:
+ ASSERT(0);
+ }
- new->xattri_dela_state = xfs_attr_init_add_state(args);
- xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type);
trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
-
- return 0;
-}
-
-/* Sets an attribute for an inode as a deferred operation */
-static int
-xfs_attr_defer_replace(
- struct xfs_da_args *args)
-{
- struct xfs_attr_intent *new;
- int error = 0;
-
- error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new);
- if (error)
- return error;
-
- new->xattri_dela_state = xfs_attr_init_replace_state(args);
- xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
- trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
-
- return 0;
-}
-
-/* Removes an attribute for an inode as a deferred operation */
-static int
-xfs_attr_defer_remove(
- struct xfs_da_args *args)
-{
-
- struct xfs_attr_intent *new;
- int error;
-
- error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new);
- if (error)
- return error;
-
- new->xattri_dela_state = xfs_attr_init_remove_state(args);
- xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
- trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
-
- return 0;
}
/*
@@ -1038,16 +997,16 @@ xfs_attr_set(
error = xfs_attr_lookup(args);
switch (error) {
case -EEXIST:
- /* if no value, we are performing a remove operation */
if (!args->value) {
- error = xfs_attr_defer_remove(args);
+ /* if no value, we are performing a remove operation */
+ xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE);
break;
}
+
/* Pure create fails if the attr already exists */
if (args->attr_flags & XATTR_CREATE)
goto out_trans_cancel;
-
- error = xfs_attr_defer_replace(args);
+ xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE);
break;
case -ENOATTR:
/* Can't remove what isn't there. */
@@ -1057,14 +1016,11 @@ xfs_attr_set(
/* Pure replace fails if no existing attr to replace. */
if (args->attr_flags & XATTR_REPLACE)
goto out_trans_cancel;
-
- error = xfs_attr_defer_add(args);
+ xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET);
break;
default:
goto out_trans_cancel;
}
- if (error)
- goto out_trans_cancel;
/*
* If this is a synchronous mount, make sure that the
@@ -1097,10 +1053,9 @@ out_trans_cancel:
static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
{
- struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
- sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
- return be16_to_cpu(sf->hdr.totsize);
+ return be16_to_cpu(sf->totsize);
}
/*
@@ -1112,19 +1067,13 @@ xfs_attr_shortform_addname(
struct xfs_da_args *args)
{
int newsize, forkoff;
- int error;
trace_xfs_attr_sf_addname(args);
- error = xfs_attr_shortform_lookup(args);
- switch (error) {
- case -ENOATTR:
- if (args->op_flags & XFS_DA_OP_REPLACE)
- return error;
- break;
- case -EEXIST:
- if (!(args->op_flags & XFS_DA_OP_REPLACE))
- return error;
+ if (xfs_attr_sf_findname(args)) {
+ int error;
+
+ ASSERT(args->op_flags & XFS_DA_OP_REPLACE);
error = xfs_attr_sf_removename(args);
if (error)
@@ -1137,11 +1086,8 @@ xfs_attr_shortform_addname(
* around.
*/
args->op_flags &= ~XFS_DA_OP_REPLACE;
- break;
- case 0:
- break;
- default:
- return error;
+ } else {
+ ASSERT(!(args->op_flags & XFS_DA_OP_REPLACE));
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2580ae47209a..ac904cc1a97b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -29,6 +29,7 @@
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_errortag.h"
+#include "xfs_health.h"
/*
@@ -690,56 +691,32 @@ xfs_attr_shortform_create(
ASSERT(ifp->if_bytes == 0);
if (ifp->if_format == XFS_DINODE_FMT_EXTENTS)
ifp->if_format = XFS_DINODE_FMT_LOCAL;
- xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
- hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data;
+
+ hdr = xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
memset(hdr, 0, sizeof(*hdr));
hdr->totsize = cpu_to_be16(sizeof(*hdr));
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
/*
- * Return -EEXIST if attr is found, or -ENOATTR if not
- * args: args containing attribute name and namelen
- * sfep: If not null, pointer will be set to the last attr entry found on
- -EEXIST. On -ENOATTR pointer is left at the last entry in the list
- * basep: If not null, pointer is set to the byte offset of the entry in the
- * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of
- * the last entry in the list
+ * Return the entry if the attr in args is found, or NULL if not.
*/
-int
+struct xfs_attr_sf_entry *
xfs_attr_sf_findname(
- struct xfs_da_args *args,
- struct xfs_attr_sf_entry **sfep,
- unsigned int *basep)
+ struct xfs_da_args *args)
{
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
- unsigned int base = sizeof(struct xfs_attr_sf_hdr);
- int size = 0;
- int end;
- int i;
+ struct xfs_attr_sf_hdr *sf = args->dp->i_af.if_data;
+ struct xfs_attr_sf_entry *sfe;
- sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
- sfe = &sf->list[0];
- end = sf->hdr.count;
- for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
- base += size, i++) {
- size = xfs_attr_sf_entsize(sfe);
- if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
- continue;
- break;
+ for (sfe = xfs_attr_sf_firstentry(sf);
+ sfe < xfs_attr_sf_endptr(sf);
+ sfe = xfs_attr_sf_nextentry(sfe)) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return sfe;
}
- if (sfep != NULL)
- *sfep = sfe;
-
- if (basep != NULL)
- *basep = base;
-
- if (i == end)
- return -ENOATTR;
- return -EEXIST;
+ return NULL;
}
/*
@@ -751,38 +728,31 @@ xfs_attr_shortform_add(
struct xfs_da_args *args,
int forkoff)
{
- struct xfs_attr_shortform *sf;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_ifork *ifp = &dp->i_af;
+ struct xfs_attr_sf_hdr *sf = ifp->if_data;
struct xfs_attr_sf_entry *sfe;
- int offset, size;
- struct xfs_mount *mp;
- struct xfs_inode *dp;
- struct xfs_ifork *ifp;
+ int size;
trace_xfs_attr_sf_add(args);
- dp = args->dp;
- mp = dp->i_mount;
dp->i_forkoff = forkoff;
- ifp = &dp->i_af;
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
- ASSERT(0);
+ ASSERT(!xfs_attr_sf_findname(args));
- offset = (char *)sfe - (char *)sf;
size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset);
+ sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+ sfe = xfs_attr_sf_endptr(sf);
sfe->namelen = args->namelen;
sfe->valuelen = args->valuelen;
sfe->flags = args->attr_filter;
memcpy(sfe->nameval, args->name, args->namelen);
memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
- sf->hdr.count++;
- be16_add_cpu(&sf->hdr.totsize, size);
+ sf->count++;
+ be16_add_cpu(&sf->totsize, size);
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
xfs_sbversion_add_attr2(mp, args->trans);
@@ -811,48 +781,43 @@ int
xfs_attr_sf_removename(
struct xfs_da_args *args)
{
- struct xfs_attr_shortform *sf;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
struct xfs_attr_sf_entry *sfe;
- int size = 0, end, totsize;
- unsigned int base;
- struct xfs_mount *mp;
- struct xfs_inode *dp;
- int error;
+ uint16_t totsize = be16_to_cpu(sf->totsize);
+ void *next, *end;
+ int size = 0;
trace_xfs_attr_sf_remove(args);
- dp = args->dp;
- mp = dp->i_mount;
- sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
-
- error = xfs_attr_sf_findname(args, &sfe, &base);
-
- /*
- * If we are recovering an operation, finding nothing to
- * remove is not an error - it just means there was nothing
- * to clean up.
- */
- if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
- return 0;
- if (error != -EEXIST)
- return error;
- size = xfs_attr_sf_entsize(sfe);
+ sfe = xfs_attr_sf_findname(args);
+ if (!sfe) {
+ /*
+ * If we are recovering an operation, finding nothing to remove
+ * is not an error, it just means there was nothing to clean up.
+ */
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
+ return -ENOATTR;
+ }
/*
* Fix up the attribute fork data, covering the hole
*/
- end = base + size;
- totsize = be16_to_cpu(sf->hdr.totsize);
- if (end != totsize)
- memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
- sf->hdr.count--;
- be16_add_cpu(&sf->hdr.totsize, -size);
+ size = xfs_attr_sf_entsize(sfe);
+ next = xfs_attr_sf_nextentry(sfe);
+ end = xfs_attr_sf_endptr(sf);
+ if (next < end)
+ memmove(sfe, next, end - next);
+ sf->count--;
+ totsize -= size;
+ sf->totsize = cpu_to_be16(totsize);
/*
* Fix up the start offset of the attribute fork
*/
- totsize -= size;
- if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
+ if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
!(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
xfs_attr_fork_remove(dp, args->trans);
@@ -860,7 +825,7 @@ xfs_attr_sf_removename(
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
ASSERT(dp->i_forkoff);
- ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+ ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!xfs_has_attr2(mp) ||
dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
@@ -874,33 +839,6 @@ xfs_attr_sf_removename(
}
/*
- * Look up a name in a shortform attribute list structure.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_lookup(xfs_da_args_t *args)
-{
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
- int i;
- struct xfs_ifork *ifp;
-
- trace_xfs_attr_sf_lookup(args);
-
- ifp = &args->dp->i_af;
- ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count;
- sfe = xfs_attr_sf_nextentry(sfe), i++) {
- if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
- return -EEXIST;
- }
- return -ENOATTR;
-}
-
-/*
* Retrieve the attribute value and length.
*
* If args->valuelen is zero, only the length needs to be returned. Unlike a
@@ -909,23 +847,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
*/
int
xfs_attr_shortform_getvalue(
- struct xfs_da_args *args)
+ struct xfs_da_args *args)
{
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
- int i;
+ struct xfs_attr_sf_entry *sfe;
ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count;
- sfe = xfs_attr_sf_nextentry(sfe), i++) {
- if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
- return xfs_attr_copy_value(args,
- &sfe->nameval[args->namelen], sfe->valuelen);
- }
- return -ENOATTR;
+
+ trace_xfs_attr_sf_lookup(args);
+
+ sfe = xfs_attr_sf_findname(args);
+ if (!sfe)
+ return -ENOATTR;
+ return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
+ sfe->valuelen);
}
/* Convert from using the shortform to the leaf format. */
@@ -933,26 +867,22 @@ int
xfs_attr_shortform_to_leaf(
struct xfs_da_args *args)
{
- struct xfs_inode *dp;
- struct xfs_attr_shortform *sf;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_ifork *ifp = &dp->i_af;
+ struct xfs_attr_sf_hdr *sf = ifp->if_data;
struct xfs_attr_sf_entry *sfe;
+ int size = be16_to_cpu(sf->totsize);
struct xfs_da_args nargs;
char *tmpbuffer;
- int error, i, size;
+ int error, i;
xfs_dablk_t blkno;
struct xfs_buf *bp;
- struct xfs_ifork *ifp;
trace_xfs_attr_sf_to_leaf(args);
- dp = args->dp;
- ifp = &dp->i_af;
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- size = be16_to_cpu(sf->hdr.totsize);
- tmpbuffer = kmem_alloc(size, 0);
- ASSERT(tmpbuffer != NULL);
- memcpy(tmpbuffer, ifp->if_u1.if_data, size);
- sf = (struct xfs_attr_shortform *)tmpbuffer;
+ tmpbuffer = kmalloc(size, GFP_KERNEL | __GFP_NOFAIL);
+ memcpy(tmpbuffer, ifp->if_data, size);
+ sf = (struct xfs_attr_sf_hdr *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
@@ -975,8 +905,8 @@ xfs_attr_shortform_to_leaf(
nargs.trans = args->trans;
nargs.op_flags = XFS_DA_OP_OKNOENT;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count; i++) {
+ sfe = xfs_attr_sf_firstentry(sf);
+ for (i = 0; i < sf->count; i++) {
nargs.name = sfe->nameval;
nargs.namelen = sfe->namelen;
nargs.value = &sfe->nameval[nargs.namelen];
@@ -994,7 +924,7 @@ xfs_attr_shortform_to_leaf(
}
error = 0;
out:
- kmem_free(tmpbuffer);
+ kfree(tmpbuffer);
return error;
}
@@ -1040,23 +970,16 @@ xfs_attr_shortform_allfit(
return xfs_attr_shortform_bytesfit(dp, bytes);
}
-/* Verify the consistency of an inline attribute fork. */
+/* Verify the consistency of a raw inline attribute fork. */
xfs_failaddr_t
xfs_attr_shortform_verify(
- struct xfs_inode *ip)
+ struct xfs_attr_sf_hdr *sfp,
+ size_t size)
{
- struct xfs_attr_shortform *sfp;
- struct xfs_attr_sf_entry *sfep;
+ struct xfs_attr_sf_entry *sfep = xfs_attr_sf_firstentry(sfp);
struct xfs_attr_sf_entry *next_sfep;
char *endp;
- struct xfs_ifork *ifp;
int i;
- int64_t size;
-
- ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL);
- ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
- sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- size = ifp->if_bytes;
/*
* Give up if the attribute is way too short.
@@ -1067,8 +990,7 @@ xfs_attr_shortform_verify(
endp = (char *)sfp + size;
/* Check all reported entries */
- sfep = &sfp->list[0];
- for (i = 0; i < sfp->hdr.count; i++) {
+ for (i = 0; i < sfp->count; i++) {
/*
* struct xfs_attr_sf_entry has a variable length.
* Check the fixed-offset parts of the structure are
@@ -1137,7 +1059,7 @@ xfs_attr3_leaf_to_shortform(
trace_xfs_attr_leaf_to_sf(args);
- tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+ tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
if (!tmpbuffer)
return -ENOMEM;
@@ -1203,7 +1125,7 @@ xfs_attr3_leaf_to_shortform(
error = 0;
out:
- kmem_free(tmpbuffer);
+ kfree(tmpbuffer);
return error;
}
@@ -1244,14 +1166,10 @@ xfs_attr3_leaf_to_node(
if (error)
goto out;
- /* copy leaf to new buffer, update identifiers */
- xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
- bp2->b_ops = bp1->b_ops;
- memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
- if (xfs_has_crc(mp)) {
- struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
- hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2));
- }
+ /*
+ * Copy leaf to new buffer and log it.
+ */
+ xfs_da_buf_copy(bp2, bp1, args->geo->blksize);
xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
/*
@@ -1615,7 +1533,7 @@ xfs_attr3_leaf_compact(
trace_xfs_attr_leaf_compact(args);
- tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+ tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
memset(bp->b_addr, 0, args->geo->blksize);
leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -1653,7 +1571,7 @@ xfs_attr3_leaf_compact(
*/
xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
- kmem_free(tmpbuffer);
+ kfree(tmpbuffer);
}
/*
@@ -2332,7 +2250,8 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr_leafblock *tmp_leaf;
struct xfs_attr3_icleaf_hdr tmphdr;
- tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
+ tmp_leaf = kzalloc(state->args->geo->blksize,
+ GFP_KERNEL | __GFP_NOFAIL);
/*
* Copy the header into the temp leaf so that all the stuff
@@ -2372,7 +2291,7 @@ xfs_attr3_leaf_unbalance(
}
memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
savehdr = tmphdr; /* struct copy */
- kmem_free(tmp_leaf);
+ kfree(tmp_leaf);
}
xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
@@ -2425,6 +2344,7 @@ xfs_attr3_leaf_lookup_int(
entries = xfs_attr3_leaf_entryp(leaf);
if (ichdr.count >= args->geo->blksize / 8) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -2444,10 +2364,12 @@ xfs_attr3_leaf_lookup_int(
}
if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 368f4d9fa1d5..9b9948639c0f 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -47,16 +47,14 @@ struct xfs_attr3_icleaf_hdr {
*/
void xfs_attr_shortform_create(struct xfs_da_args *args);
void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
-int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_sf_removename(struct xfs_da_args *args);
-int xfs_attr_sf_findname(struct xfs_da_args *args,
- struct xfs_attr_sf_entry **sfep,
- unsigned int *basep);
+struct xfs_attr_sf_entry *xfs_attr_sf_findname(struct xfs_da_args *args);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
-xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);
+xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_sf_hdr *sfp,
+ size_t size);
void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
/*
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d440393b40eb..ff0412828772 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -22,6 +22,7 @@
#include "xfs_attr_remote.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
@@ -276,17 +277,18 @@ xfs_attr3_rmt_hdr_set(
*/
STATIC int
xfs_attr_rmtval_copyout(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_ino_t ino,
- int *offset,
- int *valuelen,
- uint8_t **dst)
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_inode *dp,
+ int *offset,
+ int *valuelen,
+ uint8_t **dst)
{
- char *src = bp->b_addr;
- xfs_daddr_t bno = xfs_buf_daddr(bp);
- int len = BBTOB(bp->b_length);
- int blksize = mp->m_attr_geo->blksize;
+ char *src = bp->b_addr;
+ xfs_ino_t ino = dp->i_ino;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
ASSERT(len >= blksize);
@@ -302,6 +304,7 @@ xfs_attr_rmtval_copyout(
xfs_alert(mp,
"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
bno, *offset, byte_cnt, ino);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
@@ -418,10 +421,12 @@ xfs_attr_rmtval_get(
dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt,
0, &bp, &xfs_attr3_rmt_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK);
if (error)
return error;
- error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+ error = xfs_attr_rmtval_copyout(mp, bp, args->dp,
&offset, &valuelen,
&dst);
xfs_buf_relse(bp);
@@ -545,11 +550,13 @@ xfs_attr_rmtval_stale(
struct xfs_buf *bp;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) ||
- XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
+ XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) {
+ xfs_bmap_mark_sick(ip, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
error = xfs_buf_incore(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map->br_startblock),
@@ -659,8 +666,10 @@ xfs_attr_rmtval_invalidate(
blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
if (error)
return error;
- if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1))
+ if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) {
+ xfs_bmap_mark_sick(args->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 37578b369d9b..bc4422223024 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -7,14 +7,6 @@
#define __XFS_ATTR_SF_H__
/*
- * Attribute storage when stored inside the inode.
- *
- * Small attribute lists are packed as tightly as possible so as
- * to fit into the literal area of the inode.
- */
-typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
-
-/*
* We generate this then sort it, attr_list() must return things in hash-order.
*/
typedef struct xfs_attr_sf_sort {
@@ -41,11 +33,25 @@ static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep)
return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen);
}
-/* next entry in struct */
+/* first entry in the SF attr fork */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_firstentry(struct xfs_attr_sf_hdr *hdr)
+{
+ return (struct xfs_attr_sf_entry *)(hdr + 1);
+}
+
+/* next entry after sfep */
static inline struct xfs_attr_sf_entry *
xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
{
return (void *)sfep + xfs_attr_sf_entsize(sfep);
}
+/* pointer to the space after the last entry, e.g. for adding a new one */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_endptr(struct xfs_attr_sf_hdr *sf)
+{
+ return (void *)sf + be16_to_cpu(sf->totsize);
+}
+
#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 30c931b38853..656c95a22f2e 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -21,7 +21,7 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
@@ -36,6 +36,9 @@
#include "xfs_refcount.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
+#include "xfs_health.h"
+#include "xfs_bmap_item.h"
+#include "xfs_symlink_remote.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -225,6 +228,28 @@ xfs_bmap_forkoff_reset(
}
}
+static int
+xfs_bmap_read_buf(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ struct xfs_buf **bpp) /* buffer for fsbno */
+{
+ struct xfs_buf *bp; /* return value */
+ int error;
+
+ if (!xfs_verify_fsbno(mp, fsbno))
+ return -EFSCORRUPTED;
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp,
+ &xfs_bmbt_buf_ops);
+ if (!error) {
+ xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
+ *bpp = bp;
+ }
+ return error;
+}
+
#ifdef DEBUG
STATIC struct xfs_buf *
xfs_bmap_get_bp(
@@ -364,9 +389,9 @@ xfs_bmap_check_leaf_extents(
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
if (!bp) {
bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ error = xfs_bmap_read_buf(mp, NULL, bno, &bp);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
goto error_norelse;
}
@@ -383,6 +408,7 @@ xfs_bmap_check_leaf_extents(
pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -450,9 +476,9 @@ xfs_bmap_check_leaf_extents(
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
if (!bp) {
bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ error = xfs_bmap_read_buf(mp, NULL, bno, &bp);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
goto error_norelse;
}
@@ -562,11 +588,14 @@ xfs_bmap_btree_to_extents(
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
#ifdef DEBUG
- if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1)))
+ if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
#endif
- error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ error = xfs_bmap_read_buf(mp, tp, cbno, &cbp);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
return error;
cblock = XFS_BUF_TO_BLOCK(cbp);
@@ -575,7 +604,7 @@ xfs_bmap_btree_to_extents(
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
return error;
@@ -634,14 +663,13 @@ xfs_bmap_extents_to_btree(
* Fill in the root.
*/
block = ifp->if_broot;
- xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
- XFS_BTNUM_BMAP, 1, 1, ip->i_ino,
- XFS_BTREE_LONG_PTRS);
+ xfs_bmbt_init_block(ip, block, NULL, 1, 1);
/*
* Need a cursor. Can't allocate until bb_level is filled in.
*/
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+ if (wasdel)
+ cur->bc_flags |= XFS_BTREE_BMBT_WASDEL;
/*
* Convert to a btree with two levels, one record in root.
*/
@@ -667,7 +695,7 @@ xfs_bmap_extents_to_btree(
goto out_root_realloc;
}
- cur->bc_ino.allocated++;
+ cur->bc_bmap.allocated++;
ip->i_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
@@ -679,11 +707,8 @@ xfs_bmap_extents_to_btree(
/*
* Fill in the child block.
*/
- abp->b_ops = &xfs_bmbt_buf_ops;
ablock = XFS_BUF_TO_BLOCK(abp);
- xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp),
- XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
- XFS_BTREE_LONG_PTRS);
+ xfs_bmbt_init_block(ip, ablock, abp, 0, 0);
for_each_xfs_iext(ifp, &icur, &rec) {
if (isnullstartblock(rec.br_startblock))
@@ -747,7 +772,7 @@ xfs_bmap_local_to_extents_empty(
ASSERT(ifp->if_nextents == 0);
xfs_bmap_forkoff_reset(ip, whichfork);
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
ifp->if_format = XFS_DINODE_FMT_EXTENTS;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -832,7 +857,7 @@ xfs_bmap_local_to_extents(
xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags |= XFS_ILOG_CORE;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
rec.br_startoff = 0;
@@ -878,6 +903,7 @@ xfs_bmap_add_attrfork_btree(
goto error0;
/* must be at least one entry */
if (XFS_IS_CORRUPT(mp, stat != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -887,7 +913,7 @@ xfs_bmap_add_attrfork_btree(
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return -ENOSPC;
}
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
}
return 0;
@@ -915,7 +941,7 @@ xfs_bmap_add_attrfork_extents(
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
XFS_DATA_FORK);
if (cur) {
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -960,6 +986,7 @@ xfs_bmap_add_attrfork_local(
/* should only be called for types that support local format data */
ASSERT(0);
+ xfs_bmap_mark_sick(ip, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -1143,6 +1170,7 @@ xfs_iread_bmbt_block(
(unsigned long long)ip->i_ino);
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block,
sizeof(*block), __this_address);
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -1158,6 +1186,7 @@ xfs_iread_bmbt_block(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iread_extents(2)", frp,
sizeof(*frp), fa);
+ xfs_bmap_mark_sick(ip, whichfork);
return xfs_bmap_complain_bad_rec(ip, whichfork, fa,
&new);
}
@@ -1189,7 +1218,7 @@ xfs_iread_extents(
if (!xfs_need_iread_extents(ifp))
return 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ir.loaded = 0;
xfs_iext_first(ifp, &ir.icur);
@@ -1201,6 +1230,7 @@ xfs_iread_extents(
goto out;
if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto out;
}
@@ -1213,6 +1243,8 @@ xfs_iread_extents(
smp_store_release(&ifp->if_needextents, 0);
return 0;
out:
+ if (xfs_metadata_is_sick(error))
+ xfs_bmap_mark_sick(ip, whichfork);
xfs_iext_destroy(ifp);
return error;
}
@@ -1292,6 +1324,7 @@ xfs_bmap_last_before(
break;
default:
ASSERT(0);
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -1388,8 +1421,10 @@ xfs_bmap_last_offset(
if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
return 0;
- if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp)))
+ if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
if (error || is_empty)
@@ -1429,8 +1464,7 @@ xfs_bmap_add_extent_delay_real(
ASSERT(whichfork != XFS_ATTR_FORK);
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!bma->cur ||
- (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+ ASSERT(!bma->cur || (bma->cur->bc_flags & XFS_BTREE_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -1528,6 +1562,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1535,6 +1570,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1542,6 +1578,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1571,6 +1608,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1604,6 +1642,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1632,6 +1671,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1639,6 +1679,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1673,6 +1714,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1698,6 +1740,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1705,6 +1748,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1721,7 +1765,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_ino.allocated : 0));
+ (bma->cur ? bma->cur->bc_bmap.allocated : 0));
PREV.br_startoff = new_endoff;
PREV.br_blockcount = temp;
@@ -1749,6 +1793,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1785,6 +1830,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1792,6 +1838,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1808,7 +1855,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_ino.allocated : 0));
+ (bma->cur ? bma->cur->bc_bmap.allocated : 0));
PREV.br_startblock = nullstartblock(da_new);
PREV.br_blockcount = temp;
@@ -1871,6 +1918,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1878,6 +1926,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(bma->cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1929,8 +1978,8 @@ xfs_bmap_add_extent_delay_real(
xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
if (bma->cur) {
- da_new += bma->cur->bc_ino.allocated;
- bma->cur->bc_ino.allocated = 0;
+ da_new += bma->cur->bc_bmap.allocated;
+ bma->cur->bc_bmap.allocated = 0;
}
/* adjust for changes in reserved delayed indirect blocks */
@@ -2074,30 +2123,35 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2126,18 +2180,21 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2169,18 +2226,21 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2207,6 +2267,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2240,6 +2301,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2277,6 +2339,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2287,6 +2350,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2317,6 +2381,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2353,6 +2418,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2363,12 +2429,14 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2405,6 +2473,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2417,6 +2486,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2429,6 +2499,7 @@ xfs_bmap_add_extent_unwritten_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2436,6 +2507,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_insert(cur, &i)))
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2472,7 +2544,7 @@ xfs_bmap_add_extent_unwritten_real(
/* clear out the allocated field, done with it now in any case. */
if (cur) {
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
*curp = cur;
}
@@ -2651,7 +2723,7 @@ xfs_bmap_add_extent_hole_real(
struct xfs_bmbt_irec old;
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+ ASSERT(!cur || !(cur->bc_flags & XFS_BTREE_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2721,6 +2793,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2728,6 +2801,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2735,6 +2809,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2764,6 +2839,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2794,6 +2870,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2820,6 +2897,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2827,6 +2905,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2853,7 +2932,7 @@ xfs_bmap_add_extent_hole_real(
/* clear out the allocated field, done with it now in any case. */
if (cur)
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_bmap_check_leaf_extents(cur, ip, whichfork);
done:
@@ -2989,7 +3068,7 @@ xfs_bmap_extsize_align(
* If realtime, and the result isn't a multiple of the realtime
* extent size we need to remove blocks until it is.
*/
- if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+ if (rt && (temp = xfs_extlen_to_rtxmod(mp, align_alen))) {
/*
* We're not covering the original request, or
* we won't be able to once we fix the length.
@@ -3016,7 +3095,7 @@ xfs_bmap_extsize_align(
else {
align_alen -= orig_off - align_off;
align_off = orig_off;
- align_alen -= align_alen % mp->m_sb.sb_rextsize;
+ align_alen -= xfs_extlen_to_rtxmod(mp, align_alen);
}
/*
* Result doesn't cover the request, fail it.
@@ -3044,7 +3123,8 @@ xfs_bmap_extsize_align(
#define XFS_ALLOC_GAP_UNITS 4
-void
+/* returns true if ap->blkno was modified */
+bool
xfs_bmap_adjacent(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
@@ -3079,13 +3159,14 @@ xfs_bmap_adjacent(
if (adjust &&
ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
ap->blkno += adjust;
+ return true;
}
/*
* If not at eof, then compare the two neighbor blocks.
* Figure out whether either one gives us a good starting point,
* and pick the better one.
*/
- else if (!ap->eof) {
+ if (!ap->eof) {
xfs_fsblock_t gotbno; /* right side block number */
xfs_fsblock_t gotdiff=0; /* right side difference */
xfs_fsblock_t prevbno; /* left side block number */
@@ -3165,14 +3246,21 @@ xfs_bmap_adjacent(
* If both valid, pick the better one, else the only good
* one, else ap->blkno is already set (to 0 or the inode block).
*/
- if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) {
ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
- else if (prevbno != NULLFSBLOCK)
+ return true;
+ }
+ if (prevbno != NULLFSBLOCK) {
ap->blkno = prevbno;
- else if (gotbno != NULLFSBLOCK)
+ return true;
+ }
+ if (gotbno != NULLFSBLOCK) {
ap->blkno = gotbno;
+ return true;
+ }
}
#undef ISVALID
+ return false;
}
int
@@ -3263,11 +3351,14 @@ xfs_bmap_btalloc_select_lengths(
}
/* Update all inode and quota accounting for the allocation we just did. */
-static void
-xfs_bmap_btalloc_accounting(
- struct xfs_bmalloca *ap,
- struct xfs_alloc_arg *args)
+void
+xfs_bmap_alloc_account(
+ struct xfs_bmalloca *ap)
{
+ bool isrt = XFS_IS_REALTIME_INODE(ap->ip) &&
+ !(ap->flags & XFS_BMAPI_ATTRFORK);
+ uint fld;
+
if (ap->flags & XFS_BMAPI_COWFORK) {
/*
* COW fork blocks are in-core only and thus are treated as
@@ -3279,7 +3370,7 @@ xfs_bmap_btalloc_accounting(
* yet.
*/
if (ap->wasdel) {
- xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
return;
}
@@ -3291,22 +3382,25 @@ xfs_bmap_btalloc_accounting(
* This essentially transfers the transaction quota reservation
* to that of a delalloc extent.
*/
- ap->ip->i_delayed_blks += args->len;
- xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS,
- -(long)args->len);
+ ap->ip->i_delayed_blks += ap->length;
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip, isrt ?
+ XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS,
+ -(long)ap->length);
return;
}
/* data/attr fork only */
- ap->ip->i_nblocks += args->len;
+ ap->ip->i_nblocks += ap->length;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel) {
- ap->ip->i_delayed_blks -= args->len;
- xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ ap->ip->i_delayed_blks -= ap->length;
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+ fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT;
+ } else {
+ fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
}
- xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
- ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT,
- args->len);
+
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip, fld, ap->length);
}
static int
@@ -3380,7 +3474,7 @@ xfs_bmap_process_allocated_extent(
ap->offset = orig_offset;
else if (ap->offset + ap->length < orig_offset + orig_length)
ap->offset = orig_offset + orig_length - ap->length;
- xfs_bmap_btalloc_accounting(ap, args);
+ xfs_bmap_alloc_account(ap);
}
#ifdef DEBUG
@@ -3883,14 +3977,18 @@ xfs_bmapi_read(
ASSERT(*nmap >= 1);
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE)));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
- if (WARN_ON_ONCE(!ifp))
+ if (WARN_ON_ONCE(!ifp)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT))
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
if (xfs_is_shutdown(mp))
return -EIO;
@@ -4145,9 +4243,8 @@ xfs_bmapi_allocate(
*/
bma->nallocs++;
- if (bma->cur)
- bma->cur->bc_ino.flags =
- bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+ if (bma->cur && bma->wasdel)
+ bma->cur->bc_flags |= XFS_BTREE_BMBT_WASDEL;
bma->got.br_startoff = bma->offset;
bma->got.br_startblock = bma->blkno;
@@ -4354,7 +4451,7 @@ xfs_bmapi_write(
ASSERT(tp != NULL);
ASSERT(len > 0);
ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!(flags & XFS_BMAPI_REMAP));
/* zeroing is for currently only for data extents, not metadata */
@@ -4371,6 +4468,7 @@ xfs_bmapi_write(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4598,9 +4696,11 @@ xfs_bmapi_convert_delalloc(
error = -ENOSPC;
if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
goto out_finish;
- error = -EFSCORRUPTED;
- if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
+ if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) {
+ xfs_bmap_mark_sick(ip, whichfork);
+ error = -EFSCORRUPTED;
goto out_finish;
+ }
XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
XFS_STATS_INC(mp, xs_xstrat_quick);
@@ -4651,7 +4751,7 @@ xfs_bmapi_remap(
ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(len > 0);
ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_NORMAP)));
ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
@@ -4659,6 +4759,7 @@ xfs_bmapi_remap(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4678,10 +4779,8 @@ xfs_bmapi_remap(
ip->i_nblocks += len;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE)
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
- }
got.br_startoff = bno;
got.br_startblock = startblock;
@@ -4816,7 +4915,7 @@ xfs_bmap_del_extent_delay(
XFS_STATS_INC(mp, xs_del_exlist);
- isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ isrt = xfs_ifork_is_realtime(ip, whichfork);
del_endoff = del->br_startoff + del->br_blockcount;
got_endoff = got->br_startoff + got->br_blockcount;
da_old = startblockval(got->br_startblock);
@@ -4826,12 +4925,8 @@ xfs_bmap_del_extent_delay(
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
- if (isrt) {
- uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
-
- do_div(rtexts, mp->m_sb.sb_rextsize);
- xfs_mod_frextents(mp, rtexts);
- }
+ if (isrt)
+ xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
/*
* Update the inode delalloc counter now and wait to update the
@@ -5014,7 +5109,6 @@ xfs_bmap_del_extent_real(
xfs_fileoff_t del_endoff; /* first offset past del */
int do_fx; /* free extent at end of routine */
int error; /* error return value */
- int flags = 0;/* inode logging flags */
struct xfs_bmbt_irec got; /* current extent entry */
xfs_fileoff_t got_endoff; /* first offset past got */
int i; /* temp state */
@@ -5027,6 +5121,8 @@ xfs_bmap_del_extent_real(
uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
+ *logflagsp = 0;
+
mp = ip->i_mount;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -5039,7 +5135,6 @@ xfs_bmap_del_extent_real(
ASSERT(got_endoff >= del_endoff);
ASSERT(!isnullstartblock(got.br_startblock));
qfield = 0;
- error = 0;
/*
* If it's the case where the directory code is running with no block
@@ -5055,44 +5150,31 @@ xfs_bmap_del_extent_real(
del->br_startoff > got.br_startoff && del_endoff < got_endoff)
return -ENOSPC;
- flags = XFS_ILOG_CORE;
- if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_filblks_t len;
- xfs_extlen_t mod;
-
- len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize,
- &mod);
- ASSERT(mod == 0);
-
+ *logflagsp = XFS_ILOG_CORE;
+ if (xfs_ifork_is_realtime(ip, whichfork)) {
if (!(bflags & XFS_BMAPI_REMAP)) {
- xfs_fsblock_t bno;
-
- bno = div_u64_rem(del->br_startblock,
- mp->m_sb.sb_rextsize, &mod);
- ASSERT(mod == 0);
-
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ error = xfs_rtfree_blocks(tp, del->br_startblock,
+ del->br_blockcount);
if (error)
- goto done;
+ return error;
}
do_fx = 0;
- nblks = len * mp->m_sb.sb_rextsize;
qfield = XFS_TRANS_DQ_RTBCOUNT;
} else {
do_fx = 1;
- nblks = del->br_blockcount;
qfield = XFS_TRANS_DQ_BCOUNT;
}
+ nblks = del->br_blockcount;
del_endblock = del->br_startblock + del->br_blockcount;
if (cur) {
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
- goto done;
+ return error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
}
}
@@ -5110,16 +5192,16 @@ xfs_bmap_del_extent_real(
xfs_iext_prev(ifp, icur);
ifp->if_nextents--;
- flags |= XFS_ILOG_CORE;
+ *logflagsp |= XFS_ILOG_CORE;
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
if ((error = xfs_btree_delete(cur, &i)))
- goto done;
+ return error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
}
break;
case BMAP_LEFT_FILLING:
@@ -5131,12 +5213,12 @@ xfs_bmap_del_extent_real(
got.br_blockcount -= del->br_blockcount;
xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
break;
case BMAP_RIGHT_FILLING:
/*
@@ -5145,12 +5227,12 @@ xfs_bmap_del_extent_real(
got.br_blockcount -= del->br_blockcount;
xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
break;
case 0:
/*
@@ -5167,18 +5249,18 @@ xfs_bmap_del_extent_real(
new.br_state = got.br_state;
new.br_startblock = del_endblock;
- flags |= XFS_ILOG_CORE;
+ *logflagsp |= XFS_ILOG_CORE;
if (cur) {
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
error = xfs_btree_increment(cur, 0, &i);
if (error)
- goto done;
+ return error;
cur->bc_rec.b = new;
error = xfs_btree_insert(cur, &i);
if (error && error != -ENOSPC)
- goto done;
+ return error;
/*
* If get no-space back from btree insert, it tried a
* split, and we have a zero block reservation. Fix up
@@ -5191,10 +5273,10 @@ xfs_bmap_del_extent_real(
*/
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
- goto done;
+ return error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
}
/*
* Update the btree record back
@@ -5202,22 +5284,21 @@ xfs_bmap_del_extent_real(
*/
error = xfs_bmbt_update(cur, &old);
if (error)
- goto done;
+ return error;
/*
* Reset the extent record back
* to the original value.
*/
xfs_iext_update_extent(ip, state, icur, &old);
- flags = 0;
- error = -ENOSPC;
- goto done;
+ *logflagsp = 0;
+ return -ENOSPC;
}
if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
}
} else
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
ifp->if_nextents++;
xfs_iext_next(ifp, icur);
@@ -5235,13 +5316,13 @@ xfs_bmap_del_extent_real(
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
- error = __xfs_free_extent_later(tp, del->br_startblock,
+ error = xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
XFS_AG_RESV_NONE,
((bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN));
if (error)
- goto done;
+ return error;
}
}
@@ -5256,9 +5337,7 @@ xfs_bmap_del_extent_real(
if (qfield && !(bflags & XFS_BMAPI_REMAP))
xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
-done:
- *logflagsp = flags;
- return error;
+ return 0;
}
/*
@@ -5267,7 +5346,7 @@ done:
* that value. If not all extents in the block range can be removed then
* *done is set.
*/
-int /* error */
+static int
__xfs_bunmapi(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_inode *ip, /* incore inode */
@@ -5289,7 +5368,6 @@ __xfs_bunmapi(
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
- xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
@@ -5300,12 +5378,14 @@ __xfs_bunmapi(
whichfork = xfs_bmapi_whichfork(flags);
ASSERT(whichfork != XFS_COW_FORK);
ifp = xfs_ifork_ptr(ip, whichfork);
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
+ }
if (xfs_is_shutdown(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(len > 0);
ASSERT(nexts >= 0);
@@ -5318,7 +5398,7 @@ __xfs_bunmapi(
return 0;
}
XFS_STATS_INC(mp, xs_blk_unmap);
- isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ isrt = xfs_ifork_is_realtime(ip, whichfork);
end = start + len;
if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
@@ -5331,7 +5411,6 @@ __xfs_bunmapi(
if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
} else
cur = NULL;
@@ -5381,11 +5460,11 @@ __xfs_bunmapi(
if (del.br_startoff + del.br_blockcount > end + 1)
del.br_blockcount = end + 1 - del.br_startoff;
- if (!isrt)
+ if (!isrt || (flags & XFS_BMAPI_REMAP))
goto delete;
- sum = del.br_startblock + del.br_blockcount;
- div_u64_rem(sum, mp->m_sb.sb_rextsize, &mod);
+ mod = xfs_rtb_to_rtxoff(mp,
+ del.br_startblock + del.br_blockcount);
if (mod) {
/*
* Realtime extent not lined up at the end.
@@ -5399,7 +5478,7 @@ __xfs_bunmapi(
* This piece is unwritten, or we're not
* using unwritten extents. Skip over it.
*/
- ASSERT(end >= mod);
+ ASSERT((flags & XFS_BMAPI_REMAP) || end >= mod);
end -= mod > del.br_blockcount ?
del.br_blockcount : mod;
if (end < got.br_startoff &&
@@ -5432,7 +5511,8 @@ __xfs_bunmapi(
goto error0;
goto nodelete;
}
- div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod);
+
+ mod = xfs_rtb_to_rtxoff(mp, del.br_startblock);
if (mod) {
xfs_extlen_t off = mp->m_sb.sb_rextsize - mod;
@@ -5568,7 +5648,7 @@ error0:
xfs_trans_log_inode(tp, ip, logflags);
if (cur) {
if (!error)
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -5648,8 +5728,7 @@ xfs_bmse_merge(
blockcount = left->br_blockcount + got->br_blockcount;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
ASSERT(xfs_bmse_can_merge(left, got, shift));
new = *left;
@@ -5670,21 +5749,27 @@ xfs_bmse_merge(
error = xfs_bmbt_lookup_eq(cur, got, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_btree_delete(cur, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
/* lookup and update size of the previous extent */
error = xfs_bmbt_lookup_eq(cur, left, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_bmbt_update(cur, &new);
if (error)
@@ -5732,8 +5817,10 @@ xfs_bmap_shift_update_extent(
error = xfs_bmbt_lookup_eq(cur, &prev, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, i != 1))
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_bmbt_update(cur, got);
if (error)
@@ -5771,28 +5858,28 @@ xfs_bmap_collapse_extents(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
if (xfs_is_shutdown(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
- if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE)
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
- }
if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
*done = true;
goto del_cursor;
}
if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -5850,7 +5937,7 @@ xfs_bmap_can_insert_extents(
int is_empty;
int error = 0;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
if (xfs_is_shutdown(ip->i_mount))
return -EIO;
@@ -5886,22 +5973,21 @@ xfs_bmap_insert_extents(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
if (xfs_is_shutdown(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
- if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE)
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
- }
if (*next_fsb == NULLFSBLOCK) {
xfs_iext_last(ifp, &icur);
@@ -5917,11 +6003,13 @@ xfs_bmap_insert_extents(
}
}
if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto del_cursor;
}
if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) {
+ xfs_bmap_mark_sick(ip, whichfork);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -5989,6 +6077,7 @@ xfs_bmap_split_extent(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -6015,11 +6104,11 @@ xfs_bmap_split_extent(
if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_ino.flags = 0;
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto del_cursor;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -6047,6 +6136,7 @@ xfs_bmap_split_extent(
if (error)
goto del_cursor;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -6054,6 +6144,7 @@ xfs_bmap_split_extent(
if (error)
goto del_cursor;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -6073,7 +6164,7 @@ xfs_bmap_split_extent(
del_cursor:
if (cur) {
- cur->bc_ino.allocated = 0;
+ cur->bc_bmap.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
@@ -6082,17 +6173,8 @@ del_cursor:
return error;
}
-/* Deferred mapping is only for real extents in the data fork. */
-static bool
-xfs_bmap_is_update_needed(
- struct xfs_bmbt_irec *bmap)
-{
- return bmap->br_startblock != HOLESTARTBLOCK &&
- bmap->br_startblock != DELAYSTARTBLOCK;
-}
-
/* Record a bmap intent. */
-static int
+static inline void
__xfs_bmap_add(
struct xfs_trans *tp,
enum xfs_bmap_intent_type type,
@@ -6102,25 +6184,19 @@ __xfs_bmap_add(
{
struct xfs_bmap_intent *bi;
- trace_xfs_bmap_defer(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
- type,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
- ip->i_ino, whichfork,
- bmap->br_startoff,
- bmap->br_blockcount,
- bmap->br_state);
+ if ((whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) ||
+ bmap->br_startblock == HOLESTARTBLOCK ||
+ bmap->br_startblock == DELAYSTARTBLOCK)
+ return;
- bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&bi->bi_list);
bi->bi_type = type;
bi->bi_owner = ip;
bi->bi_whichfork = whichfork;
bi->bi_bmap = *bmap;
- xfs_bmap_update_get_group(tp->t_mountp, bi);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
- return 0;
+ xfs_bmap_defer_add(tp, bi);
}
/* Map an extent into a file. */
@@ -6128,12 +6204,10 @@ void
xfs_bmap_map_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
+ int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_bmap_is_update_needed(PREV))
- return;
-
- __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+ __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, whichfork, PREV);
}
/* Unmap an extent out of a file. */
@@ -6141,12 +6215,10 @@ void
xfs_bmap_unmap_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
+ int whichfork,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_bmap_is_update_needed(PREV))
- return;
-
- __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+ __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, whichfork, PREV);
}
/*
@@ -6160,57 +6232,55 @@ xfs_bmap_finish_one(
{
struct xfs_bmbt_irec *bmap = &bi->bi_bmap;
int error = 0;
+ int flags = 0;
- ASSERT(tp->t_highest_agno == NULLAGNUMBER);
+ if (bi->bi_whichfork == XFS_ATTR_FORK)
+ flags |= XFS_BMAPI_ATTRFORK;
- trace_xfs_bmap_deferred(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
- bi->bi_type,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
- bi->bi_owner->i_ino, bi->bi_whichfork,
- bmap->br_startoff, bmap->br_blockcount,
- bmap->br_state);
+ ASSERT(tp->t_highest_agno == NULLAGNUMBER);
- if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK))
- return -EFSCORRUPTED;
+ trace_xfs_bmap_deferred(bi);
- if (XFS_TEST_ERROR(false, tp->t_mountp,
- XFS_ERRTAG_BMAP_FINISH_ONE))
+ if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
return -EIO;
switch (bi->bi_type) {
case XFS_BMAP_MAP:
+ if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN)
+ flags |= XFS_BMAPI_PREALLOC;
error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff,
- bmap->br_blockcount, bmap->br_startblock, 0);
+ bmap->br_blockcount, bmap->br_startblock,
+ flags);
bmap->br_blockcount = 0;
break;
case XFS_BMAP_UNMAP:
error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff,
- &bmap->br_blockcount, XFS_BMAPI_REMAP, 1);
+ &bmap->br_blockcount, flags | XFS_BMAPI_REMAP,
+ 1);
break;
default:
ASSERT(0);
+ xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork);
error = -EFSCORRUPTED;
}
return error;
}
-/* Check that an inode's extent does not have invalid flags or bad ranges. */
+/* Check that an extent does not have invalid flags or bad ranges. */
xfs_failaddr_t
-xfs_bmap_validate_extent(
- struct xfs_inode *ip,
+xfs_bmap_validate_extent_raw(
+ struct xfs_mount *mp,
+ bool rtfile,
int whichfork,
struct xfs_bmbt_irec *irec)
{
- struct xfs_mount *mp = ip->i_mount;
-
if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
return __this_address;
- if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) {
- if (!xfs_verify_rtext(mp, irec->br_startblock,
- irec->br_blockcount))
+ if (rtfile && whichfork == XFS_DATA_FORK) {
+ if (!xfs_verify_rtbext(mp, irec->br_startblock,
+ irec->br_blockcount))
return __this_address;
} else {
if (!xfs_verify_fsbext(mp, irec->br_startblock,
@@ -6238,3 +6308,96 @@ xfs_bmap_intent_destroy_cache(void)
kmem_cache_destroy(xfs_bmap_intent_cache);
xfs_bmap_intent_cache = NULL;
}
+
+/* Check that an inode's extent does not have invalid flags or bad ranges. */
+xfs_failaddr_t
+xfs_bmap_validate_extent(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *irec)
+{
+ return xfs_bmap_validate_extent_raw(ip->i_mount,
+ XFS_IS_REALTIME_INODE(ip), whichfork, irec);
+}
+
+/*
+ * Used in xfs_itruncate_extents(). This is the maximum number of extents
+ * freed from a file in a single transaction.
+ */
+#define XFS_ITRUNC_MAX_EXTENTS 2
+
+/*
+ * Unmap every extent in part of an inode's fork. We don't do any higher level
+ * invalidation work at all.
+ */
+int
+xfs_bunmapi_range(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ uint32_t flags,
+ xfs_fileoff_t startoff,
+ xfs_fileoff_t endoff)
+{
+ xfs_filblks_t unmap_len = endoff - startoff + 1;
+ int error = 0;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+
+ while (unmap_len > 0) {
+ ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
+ error = __xfs_bunmapi(*tpp, ip, startoff, &unmap_len, flags,
+ XFS_ITRUNC_MAX_EXTENTS);
+ if (error)
+ goto out;
+
+ /* free the just unmapped extents */
+ error = xfs_defer_finish(tpp);
+ if (error)
+ goto out;
+ }
+out:
+ return error;
+}
+
+struct xfs_bmap_query_range {
+ xfs_bmap_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_bmap_query_range_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_bmap_query_range *query = priv;
+ struct xfs_bmbt_irec irec;
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+ fa = xfs_bmap_validate_extent(cur->bc_ino.ip, cur->bc_ino.whichfork,
+ &irec);
+ if (fa) {
+ xfs_btree_mark_sick(cur);
+ return xfs_bmap_complain_bad_rec(cur->bc_ino.ip,
+ cur->bc_ino.whichfork, fa, &irec);
+ }
+
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all bmaps. */
+int
+xfs_bmap_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_bmap_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_bmap_query_range query = {
+ .priv = priv,
+ .fn = fn,
+ };
+
+ return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query);
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index e33470e39728..f7662595309d 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -116,6 +116,8 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
return XFS_DATA_FORK;
}
+void xfs_bmap_alloc_account(struct xfs_bmalloca *ap);
+
/*
* Special values for xfs_bmbt_irec_t br_startblock field.
*/
@@ -190,9 +192,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
-int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
- xfs_extnum_t nexts);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extnum_t nexts, int *done);
@@ -233,6 +232,10 @@ enum xfs_bmap_intent_type {
XFS_BMAP_UNMAP,
};
+#define XFS_BMAP_INTENT_STRINGS \
+ { XFS_BMAP_MAP, "map" }, \
+ { XFS_BMAP_UNMAP, "unmap" }
+
struct xfs_bmap_intent {
struct list_head bi_list;
enum xfs_bmap_intent_type bi_type;
@@ -242,14 +245,11 @@ struct xfs_bmap_intent {
struct xfs_bmbt_irec bi_bmap;
};
-void xfs_bmap_update_get_group(struct xfs_mount *mp,
- struct xfs_bmap_intent *bi);
-
int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
+ int whichfork, struct xfs_bmbt_irec *imap);
void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
+ int whichfork, struct xfs_bmbt_irec *imap);
static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
{
@@ -263,6 +263,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
}
}
+xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile,
+ int whichfork, struct xfs_bmbt_irec *irec);
xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *irec);
int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork,
@@ -271,10 +273,20 @@ int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork,
int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
uint32_t flags);
+int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip,
+ uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff);
extern struct kmem_cache *xfs_bmap_intent_cache;
int __init xfs_bmap_intent_init_cache(void);
void xfs_bmap_intent_destroy_cache(void);
+typedef int (*xfs_bmap_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *rec,
+ void *priv);
+
+int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn,
+ void *priv);
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index bf3f1b36fdd2..f5d84dcb58da 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -15,6 +15,7 @@
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
@@ -25,6 +26,22 @@
static struct kmem_cache *xfs_bmbt_cur_cache;
+void
+xfs_bmbt_init_block(
+ struct xfs_inode *ip,
+ struct xfs_btree_block *buf,
+ struct xfs_buf *bp,
+ __u16 level,
+ __u16 numrecs)
+{
+ if (bp)
+ xfs_btree_init_buf(ip->i_mount, bp, &xfs_bmbt_ops, level,
+ numrecs, ip->i_ino);
+ else
+ xfs_btree_init_block(ip->i_mount, buf, &xfs_bmbt_ops, level,
+ numrecs, ip->i_ino);
+}
+
/*
* Convert on-disk form of btree root to in-memory form.
*/
@@ -43,9 +60,7 @@ xfs_bmdr_to_bmbt(
xfs_bmbt_key_t *tkp;
__be64 *tpp;
- xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
- XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
- XFS_BTREE_LONG_PTRS);
+ xfs_bmbt_init_block(ip, rblock, NULL, 0, 0);
rblock->bb_level = dblock->bb_level;
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
rblock->bb_numrecs = dblock->bb_numrecs;
@@ -170,13 +185,8 @@ xfs_bmbt_dup_cursor(
new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
cur->bc_ino.ip, cur->bc_ino.whichfork);
-
- /*
- * Copy the firstblock, dfops, and flags values,
- * since init cursor doesn't get them.
- */
- new->bc_ino.flags = cur->bc_ino.flags;
-
+ new->bc_flags |= (cur->bc_flags &
+ (XFS_BTREE_BMBT_INVALID_OWNER | XFS_BTREE_BMBT_WASDEL));
return new;
}
@@ -188,10 +198,10 @@ xfs_bmbt_update_cursor(
ASSERT((dst->bc_tp->t_highest_agno != NULLAGNUMBER) ||
(dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME));
- dst->bc_ino.allocated += src->bc_ino.allocated;
+ dst->bc_bmap.allocated += src->bc_bmap.allocated;
dst->bc_tp->t_highest_agno = src->bc_tp->t_highest_agno;
- src->bc_ino.allocated = 0;
+ src->bc_bmap.allocated = 0;
}
STATIC int
@@ -210,7 +220,7 @@ xfs_bmbt_alloc_block(
xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino,
cur->bc_ino.whichfork);
args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL;
+ args.wasdel = cur->bc_flags & XFS_BTREE_BMBT_WASDEL;
if (!args.wasdel && args.tp->t_blk_res == 0)
return -ENOSPC;
@@ -246,7 +256,7 @@ xfs_bmbt_alloc_block(
}
ASSERT(args.len == 1);
- cur->bc_ino.allocated++;
+ cur->bc_bmap.allocated++;
cur->bc_ino.ip->i_nblocks++;
xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE);
xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip,
@@ -272,7 +282,7 @@ xfs_bmbt_free_block(
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
return error;
@@ -288,10 +298,7 @@ xfs_bmbt_get_minrecs(
int level)
{
if (level == cur->bc_nlevels - 1) {
- struct xfs_ifork *ifp;
-
- ifp = xfs_ifork_ptr(cur->bc_ino.ip,
- cur->bc_ino.whichfork);
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0) / 2;
@@ -306,10 +313,7 @@ xfs_bmbt_get_maxrecs(
int level)
{
if (level == cur->bc_nlevels - 1) {
- struct xfs_ifork *ifp;
-
- ifp = xfs_ifork_ptr(cur->bc_ino.ip,
- cur->bc_ino.whichfork);
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0);
@@ -365,14 +369,6 @@ xfs_bmbt_init_rec_from_cur(
xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
}
-STATIC void
-xfs_bmbt_init_ptr_from_cur(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
-{
- ptr->l = 0;
-}
-
STATIC int64_t
xfs_bmbt_key_diff(
struct xfs_btree_cur *cur,
@@ -424,7 +420,7 @@ xfs_bmbt_verify(
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
*/
- fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
if (fa)
return fa;
}
@@ -440,7 +436,7 @@ xfs_bmbt_verify(
if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
return __this_address;
- return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]);
+ return xfs_btree_fsblock_verify(bp, mp->m_bmap_dmxr[level != 0]);
}
static void
@@ -449,7 +445,7 @@ xfs_bmbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_lblock_verify_crc(bp))
+ if (!xfs_btree_fsblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_bmbt_verify(bp);
@@ -473,7 +469,7 @@ xfs_bmbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_lblock_calc_crc(bp);
+ xfs_btree_fsblock_calc_crc(bp);
}
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
@@ -520,9 +516,16 @@ xfs_bmbt_keys_contiguous(
be64_to_cpu(key2->bmbt.br_startoff));
}
-static const struct xfs_btree_ops xfs_bmbt_ops = {
+const struct xfs_btree_ops xfs_bmbt_ops = {
+ .name = "bmap",
+ .type = XFS_BTREE_TYPE_INODE,
+
.rec_len = sizeof(xfs_bmbt_rec_t),
.key_len = sizeof(xfs_bmbt_key_t),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_BMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2),
.dup_cursor = xfs_bmbt_dup_cursor,
.update_cursor = xfs_bmbt_update_cursor,
@@ -534,7 +537,6 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.init_key_from_rec = xfs_bmbt_init_key_from_rec,
.init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
- .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
.key_diff = xfs_bmbt_key_diff,
.diff_two_keys = xfs_bmbt_diff_two_keys,
.buf_ops = &xfs_bmbt_buf_ops,
@@ -544,35 +546,45 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
};
/*
- * Allocate a new bmap btree cursor.
+ * Create a new bmap btree cursor.
+ *
+ * For staging cursors -1 in passed in whichfork.
*/
-struct xfs_btree_cur * /* new bmap btree cursor */
+struct xfs_btree_cur *
xfs_bmbt_init_cursor(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* inode owning the btree */
- int whichfork) /* data or attr fork */
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
{
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
- ASSERT(whichfork != XFS_COW_FORK);
-
- cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
- mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
- cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
+ unsigned int maxlevels;
- cur->bc_ops = &xfs_bmbt_ops;
- cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
- if (xfs_has_crc(mp))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+ ASSERT(whichfork != XFS_COW_FORK);
- cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+ /*
+ * The Data fork always has larger maxlevel, so use that for staging
+ * cursors.
+ */
+ switch (whichfork) {
+ case XFS_STAGING_FORK:
+ maxlevels = mp->m_bm_maxlevels[XFS_DATA_FORK];
+ break;
+ default:
+ maxlevels = mp->m_bm_maxlevels[whichfork];
+ break;
+ }
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bmbt_ops, maxlevels,
+ xfs_bmbt_cur_cache);
cur->bc_ino.ip = ip;
- cur->bc_ino.allocated = 0;
- cur->bc_ino.flags = 0;
cur->bc_ino.whichfork = whichfork;
+ cur->bc_bmap.allocated = 0;
+ if (whichfork != XFS_STAGING_FORK) {
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+ }
return cur;
}
@@ -588,6 +600,49 @@ xfs_bmbt_block_maxrecs(
}
/*
+ * Swap in the new inode fork root. Once we pass this point the newly rebuilt
+ * mappings are in place and we have to kill off any old btree blocks.
+ */
+void
+xfs_bmbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ int whichfork)
+{
+ struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake;
+ struct xfs_ifork *ifp;
+ static const short brootflag[2] = {XFS_ILOG_DBROOT, XFS_ILOG_ABROOT};
+ static const short extflag[2] = {XFS_ILOG_DEXT, XFS_ILOG_AEXT};
+ int flags = XFS_ILOG_CORE;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(whichfork != XFS_COW_FORK);
+
+ /*
+ * Free any resources hanging off the real fork, then shallow-copy the
+ * staging fork's contents into the real fork to transfer everything
+ * we just built.
+ */
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip, whichfork);
+ xfs_idestroy_fork(ifp);
+ memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ flags |= extflag[whichfork];
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ flags |= brootflag[whichfork];
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+ xfs_btree_commit_ifakeroot(cur, tp, whichfork);
+}
+
+/*
* Calculate number of records in a bmap btree block.
*/
int
@@ -670,7 +725,7 @@ xfs_bmbt_change_owner(
ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
- cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
+ cur->bc_flags |= XFS_BTREE_BMBT_INVALID_OWNER;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 3e7a40a83835..de1b73f1225c 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -11,6 +11,7 @@ struct xfs_btree_block;
struct xfs_mount;
struct xfs_inode;
struct xfs_trans;
+struct xbtree_ifakeroot;
/*
* Btree block header size depends on a superblock flag.
@@ -106,6 +107,8 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
+void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, int whichfork);
extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
@@ -115,4 +118,7 @@ unsigned int xfs_bmbt_maxlevels_ondisk(void);
int __init xfs_bmbt_init_cur_cache(void);
void xfs_bmbt_destroy_cur_cache(void);
+void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf,
+ struct xfs_buf *bp, __u16 level, __u16 numrecs);
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 6a6503ab0cd7..d29547572a68 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -27,28 +27,24 @@
#include "xfs_bmap_btree.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
/*
* Btree magic numbers.
*/
-static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
- { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
- XFS_FIBT_MAGIC, 0 },
- { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
- XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
- XFS_REFC_CRC_MAGIC }
-};
-
uint32_t
xfs_btree_magic(
- int crc,
- xfs_btnum_t btnum)
+ struct xfs_mount *mp,
+ const struct xfs_btree_ops *ops)
{
- uint32_t magic = xfs_magics[crc][btnum];
+ int idx = xfs_has_crc(mp) ? 1 : 0;
+ __be32 magic = ops->buf_ops->magic[idx];
/* Ensure we asked for crc for crc-only magics. */
ASSERT(magic != 0);
- return magic;
+ return be32_to_cpu(magic);
}
/*
@@ -63,10 +59,8 @@ xfs_btree_magic(
* bytes.
*/
static inline xfs_failaddr_t
-xfs_btree_check_lblock_siblings(
+xfs_btree_check_fsblock_siblings(
struct xfs_mount *mp,
- struct xfs_btree_cur *cur,
- int level,
xfs_fsblock_t fsb,
__be64 dsibling)
{
@@ -78,22 +72,33 @@ xfs_btree_check_lblock_siblings(
sibling = be64_to_cpu(dsibling);
if (sibling == fsb)
return __this_address;
- if (level >= 0) {
- if (!xfs_btree_check_lptr(cur, sibling, level + 1))
- return __this_address;
- } else {
- if (!xfs_verify_fsbno(mp, sibling))
- return __this_address;
- }
+ if (!xfs_verify_fsbno(mp, sibling))
+ return __this_address;
+ return NULL;
+}
+
+static inline xfs_failaddr_t
+xfs_btree_check_memblock_siblings(
+ struct xfs_buftarg *btp,
+ xfbno_t bno,
+ __be64 dsibling)
+{
+ xfbno_t sibling;
+ if (dsibling == cpu_to_be64(NULLFSBLOCK))
+ return NULL;
+
+ sibling = be64_to_cpu(dsibling);
+ if (sibling == bno)
+ return __this_address;
+ if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
+ return __this_address;
return NULL;
}
static inline xfs_failaddr_t
-xfs_btree_check_sblock_siblings(
+xfs_btree_check_agblock_siblings(
struct xfs_perag *pag,
- struct xfs_btree_cur *cur,
- int level,
xfs_agblock_t agbno,
__be32 dsibling)
{
@@ -105,34 +110,21 @@ xfs_btree_check_sblock_siblings(
sibling = be32_to_cpu(dsibling);
if (sibling == agbno)
return __this_address;
- if (level >= 0) {
- if (!xfs_btree_check_sptr(cur, sibling, level + 1))
- return __this_address;
- } else {
- if (!xfs_verify_agbno(pag, sibling))
- return __this_address;
- }
+ if (!xfs_verify_agbno(pag, sibling))
+ return __this_address;
return NULL;
}
-/*
- * Check a long btree block header. Return the address of the failing check,
- * or NULL if everything is ok.
- */
-xfs_failaddr_t
-__xfs_btree_check_lblock(
+static xfs_failaddr_t
+__xfs_btree_check_lblock_hdr(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_btnum_t btnum = cur->bc_btnum;
- int crc = xfs_has_crc(mp);
- xfs_failaddr_t fa;
- xfs_fsblock_t fsb = NULLFSBLOCK;
- if (crc) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (block->bb_u.l.bb_blkno !=
@@ -142,7 +134,7 @@ __xfs_btree_check_lblock(
return __this_address;
}
- if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
return __this_address;
if (be16_to_cpu(block->bb_level) != level)
return __this_address;
@@ -150,44 +142,83 @@ __xfs_btree_check_lblock(
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (bp)
- fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ return NULL;
+}
+
+/*
+ * Check a long btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+static xfs_failaddr_t
+__xfs_btree_check_fsblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+ xfs_fsblock_t fsb;
+
+ fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
+ if (fa)
+ return fa;
- fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ /*
+ * For inode-rooted btrees, the root block sits in the inode fork. In
+ * that case bp is NULL, and the block must not have any siblings.
+ */
+ if (!bp) {
+ if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
+ return __this_address;
+ if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
+ return __this_address;
+ return NULL;
+ }
+
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_rightsib);
return fa;
}
-/* Check a long btree block header. */
-static int
-xfs_btree_check_lblock(
+/*
+ * Check an in-memory btree block header. Return the address of the failing
+ * check, or NULL if everything is ok.
+ */
+static xfs_failaddr_t
+__xfs_btree_check_memblock(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
struct xfs_buf *bp)
{
- struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target;
xfs_failaddr_t fa;
+ xfbno_t bno;
- fa = __xfs_btree_check_lblock(cur, block, level, bp);
- if (XFS_IS_CORRUPT(mp, fa != NULL) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
- if (bp)
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- return -EFSCORRUPTED;
- }
- return 0;
+ fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
+ if (fa)
+ return fa;
+
+ bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_rightsib);
+ return fa;
}
/*
* Check a short btree block header. Return the address of the failing check,
* or NULL if everything is ok.
*/
-xfs_failaddr_t
-__xfs_btree_check_sblock(
+static xfs_failaddr_t
+__xfs_btree_check_agblock(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
@@ -195,20 +226,17 @@ __xfs_btree_check_sblock(
{
struct xfs_mount *mp = cur->bc_mp;
struct xfs_perag *pag = cur->bc_ag.pag;
- xfs_btnum_t btnum = cur->bc_btnum;
- int crc = xfs_has_crc(mp);
xfs_failaddr_t fa;
- xfs_agblock_t agbno = NULLAGBLOCK;
+ xfs_agblock_t agbno;
- if (crc) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (block->bb_u.s.bb_blkno !=
- cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
return __this_address;
}
- if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
return __this_address;
if (be16_to_cpu(block->bb_level) != level)
return __this_address;
@@ -216,36 +244,45 @@ __xfs_btree_check_sblock(
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (bp)
- agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
-
- fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_agblock_siblings(pag, agbno,
block->bb_u.s.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ fa = xfs_btree_check_agblock_siblings(pag, agbno,
block->bb_u.s.bb_rightsib);
return fa;
}
-/* Check a short btree block header. */
-STATIC int
-xfs_btree_check_sblock(
+/*
+ * Internal btree block check.
+ *
+ * Return NULL if the block is ok or the address of the failed check otherwise.
+ */
+xfs_failaddr_t
+__xfs_btree_check_block(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
int level,
struct xfs_buf *bp)
{
- struct xfs_mount *mp = cur->bc_mp;
- xfs_failaddr_t fa;
-
- fa = __xfs_btree_check_sblock(cur, block, level, bp);
- if (XFS_IS_CORRUPT(mp, fa != NULL) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
- if (bp)
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- return -EFSCORRUPTED;
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ return __xfs_btree_check_memblock(cur, block, level, bp);
+ case XFS_BTREE_TYPE_AG:
+ return __xfs_btree_check_agblock(cur, block, level, bp);
+ case XFS_BTREE_TYPE_INODE:
+ return __xfs_btree_check_fsblock(cur, block, level, bp);
+ default:
+ ASSERT(0);
+ return __this_address;
}
- return 0;
+}
+
+static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
+{
+ if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
+ return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
+ return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
}
/*
@@ -258,34 +295,49 @@ xfs_btree_check_block(
int level, /* level of the btree block */
struct xfs_buf *bp) /* buffer containing block, if any */
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return xfs_btree_check_lblock(cur, block, level, bp);
- else
- return xfs_btree_check_sblock(cur, block, level, bp);
-}
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
-/* Check that this long pointer is valid and points within the fs. */
-bool
-xfs_btree_check_lptr(
- struct xfs_btree_cur *cur,
- xfs_fsblock_t fsbno,
- int level)
-{
- if (level <= 0)
- return false;
- return xfs_verify_fsbno(cur->bc_mp, fsbno);
+ fa = __xfs_btree_check_block(cur, block, level, bp);
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+ return 0;
}
-/* Check that this short pointer is valid and points within the AG. */
-bool
-xfs_btree_check_sptr(
- struct xfs_btree_cur *cur,
- xfs_agblock_t agbno,
- int level)
+int
+__xfs_btree_check_ptr(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int index,
+ int level)
{
if (level <= 0)
- return false;
- return xfs_verify_agbno(cur->bc_ag.pag, agbno);
+ return -EFSCORRUPTED;
+
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
+ be64_to_cpu((&ptr->l)[index])))
+ return -EFSCORRUPTED;
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ if (!xfs_verify_fsbno(cur->bc_mp,
+ be64_to_cpu((&ptr->l)[index])))
+ return -EFSCORRUPTED;
+ break;
+ case XFS_BTREE_TYPE_AG:
+ if (!xfs_verify_agbno(cur->bc_ag.pag,
+ be32_to_cpu((&ptr->s)[index])))
+ return -EFSCORRUPTED;
+ break;
+ }
+
+ return 0;
}
/*
@@ -299,26 +351,35 @@ xfs_btree_check_ptr(
int index,
int level)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
- level))
- return 0;
- xfs_err(cur->bc_mp,
-"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
+ int error;
+
+ error = __xfs_btree_check_ptr(cur, ptr, index, level);
+ if (error) {
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ xfs_err(cur->bc_mp,
+"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
+ cur->bc_ops->name, cur->bc_flags, level, index,
+ __this_address);
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ xfs_err(cur->bc_mp,
+"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
cur->bc_ino.ip->i_ino,
- cur->bc_ino.whichfork, cur->bc_btnum,
+ cur->bc_ino.whichfork, cur->bc_ops->name,
level, index);
- } else {
- if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
- level))
- return 0;
- xfs_err(cur->bc_mp,
-"AG %u: Corrupt btree %d pointer at level %d index %d.",
- cur->bc_ag.pag->pag_agno, cur->bc_btnum,
+ break;
+ case XFS_BTREE_TYPE_AG:
+ xfs_err(cur->bc_mp,
+"AG %u: Corrupt %sbt pointer at level %d index %d.",
+ cur->bc_ag.pag->pag_agno, cur->bc_ops->name,
level, index);
+ break;
+ }
+ xfs_btree_mark_sick(cur);
}
- return -EFSCORRUPTED;
+ return error;
}
#ifdef DEBUG
@@ -336,7 +397,7 @@ xfs_btree_check_ptr(
* it to disk.
*/
void
-xfs_btree_lblock_calc_crc(
+xfs_btree_fsblock_calc_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -350,7 +411,7 @@ xfs_btree_lblock_calc_crc(
}
bool
-xfs_btree_lblock_verify_crc(
+xfs_btree_fsblock_verify_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -374,7 +435,7 @@ xfs_btree_lblock_verify_crc(
* it to disk.
*/
void
-xfs_btree_sblock_calc_crc(
+xfs_btree_agblock_calc_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -388,7 +449,7 @@ xfs_btree_sblock_calc_crc(
}
bool
-xfs_btree_sblock_verify_crc(
+xfs_btree_agblock_verify_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
@@ -410,6 +471,17 @@ xfs_btree_free_block(
{
int error;
+ trace_xfs_btree_free_block(cur, bp);
+
+ /*
+ * Don't allow block freeing for a staging cursor, because staging
+ * cursors do not support regular btree modifications.
+ */
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
error = cur->bc_ops->free_block(cur, bp);
if (!error) {
xfs_trans_binval(cur->bc_tp, bp);
@@ -448,33 +520,70 @@ xfs_btree_del_cursor(
* zero, then we should be shut down or on our way to shutdown due to
* cancelling a dirty transaction on error.
*/
- ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
+ ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
xfs_is_shutdown(cur->bc_mp) || error != 0);
- if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
- kmem_free(cur->bc_ops);
- if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
- xfs_perag_put(cur->bc_ag.pag);
+
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
+ if (cur->bc_ag.pag)
+ xfs_perag_put(cur->bc_ag.pag);
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ /* nothing to do */
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ if (cur->bc_mem.pag)
+ xfs_perag_put(cur->bc_mem.pag);
+ break;
+ }
+
kmem_cache_free(cur->bc_cache, cur);
}
+/* Return the buffer target for this btree's buffer. */
+static inline struct xfs_buftarg *
+xfs_btree_buftarg(
+ struct xfs_btree_cur *cur)
+{
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
+ return cur->bc_mem.xfbtree->target;
+ return cur->bc_mp->m_ddev_targp;
+}
+
+/* Return the block size (in units of 512b sectors) for this btree. */
+static inline unsigned int
+xfs_btree_bbsize(
+ struct xfs_btree_cur *cur)
+{
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
+ return XFBNO_BBSIZE;
+ return cur->bc_mp->m_bsize;
+}
+
/*
* Duplicate the btree cursor.
* Allocate a new one, copy the record, re-get the buffers.
*/
-int /* error */
+int /* error */
xfs_btree_dup_cursor(
- struct xfs_btree_cur *cur, /* input cursor */
- struct xfs_btree_cur **ncur) /* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur) /* output cursor */
{
- struct xfs_buf *bp; /* btree block's buffer pointer */
- int error; /* error return value */
- int i; /* level number of btree block */
- xfs_mount_t *mp; /* mount structure for filesystem */
- struct xfs_btree_cur *new; /* new cursor value */
- xfs_trans_t *tp; /* transaction pointer, can be NULL */
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_trans *tp = cur->bc_tp;
+ struct xfs_buf *bp;
+ struct xfs_btree_cur *new;
+ int error;
+ int i;
- tp = cur->bc_tp;
- mp = cur->bc_mp;
+ /*
+ * Don't allow staging cursors to be duplicated because they're supposed
+ * to be kept private to a single thread.
+ */
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
/*
* Allocate a new cursor like the old one.
@@ -494,10 +603,13 @@ xfs_btree_dup_cursor(
new->bc_levels[i].ra = cur->bc_levels[i].ra;
bp = cur->bc_levels[i].bp;
if (bp) {
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- xfs_buf_daddr(bp), mp->m_bsize,
- 0, &bp,
- cur->bc_ops->buf_ops);
+ error = xfs_trans_read_buf(mp, tp,
+ xfs_btree_buftarg(cur),
+ xfs_buf_daddr(bp),
+ xfs_btree_bbsize(cur), 0, &bp,
+ cur->bc_ops->buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(new);
if (error) {
xfs_btree_del_cursor(new, error);
*ncur = NULL;
@@ -539,7 +651,7 @@ xfs_btree_dup_cursor(
* record, key or pointer (xfs_btree_*_addr). Note that all addressing
* inside the btree block is done using indices starting at one, not zero!
*
- * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
+ * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
* overlapping intervals. In such a tree, records are still sorted lowest to
* highest and indexed by the smallest key value that refers to the record.
* However, nodes are different: each pointer has two associated keys -- one
@@ -589,26 +701,17 @@ xfs_btree_dup_cursor(
*/
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+ if (xfs_has_crc(cur->bc_mp))
return XFS_BTREE_LBLOCK_CRC_LEN;
return XFS_BTREE_LBLOCK_LEN;
}
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ if (xfs_has_crc(cur->bc_mp))
return XFS_BTREE_SBLOCK_CRC_LEN;
return XFS_BTREE_SBLOCK_LEN;
}
/*
- * Return size of btree block pointers for this btree instance.
- */
-static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
-{
- return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
- sizeof(__be64) : sizeof(__be32);
-}
-
-/*
* Calculate offset of the n-th record in a btree block.
*/
STATIC size_t
@@ -655,7 +758,7 @@ xfs_btree_ptr_offset(
{
return xfs_btree_block_len(cur) +
cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
- (n - 1) * xfs_btree_ptr_len(cur);
+ (n - 1) * cur->bc_ops->ptr_len;
}
/*
@@ -718,7 +821,7 @@ struct xfs_ifork *
xfs_btree_ifork_ptr(
struct xfs_btree_cur *cur)
{
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
if (cur->bc_flags & XFS_BTREE_STAGING)
return cur->bc_ino.ifake->if_fork;
@@ -750,8 +853,7 @@ xfs_btree_get_block(
int level, /* level in btree */
struct xfs_buf **bpp) /* buffer containing the block */
{
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level == cur->bc_nlevels - 1)) {
+ if (xfs_btree_at_iroot(cur, level)) {
*bpp = NULL;
return xfs_btree_get_iroot(cur);
}
@@ -856,95 +958,52 @@ xfs_btree_offsets(
}
}
-/*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int
-xfs_btree_read_bufl(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- struct xfs_buf **bpp, /* buffer for fsbno */
- int refval, /* ref count value for buffer */
- const struct xfs_buf_ops *ops)
-{
- struct xfs_buf *bp; /* return value */
- xfs_daddr_t d; /* real disk block address */
- int error;
-
- if (!xfs_verify_fsbno(mp, fsbno))
- return -EFSCORRUPTED;
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, 0, &bp, ops);
- if (error)
- return error;
- if (bp)
- xfs_buf_set_ref(bp, refval);
- *bpp = bp;
- return 0;
-}
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufl(
- struct xfs_mount *mp, /* file system mount point */
- xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops)
+STATIC int
+xfs_btree_readahead_fsblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
{
- xfs_daddr_t d;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ int rval = 0;
- ASSERT(fsbno != NULLFSBLOCK);
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
-}
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
+ xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
+ mp->m_bsize, cur->bc_ops->buf_ops);
+ rval++;
+ }
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufs(
- struct xfs_mount *mp, /* file system mount point */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops)
-{
- xfs_daddr_t d;
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
+ xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
+ mp->m_bsize, cur->bc_ops->buf_ops);
+ rval++;
+ }
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agbno != NULLAGBLOCK);
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+ return rval;
}
STATIC int
-xfs_btree_readahead_lblock(
+xfs_btree_readahead_memblock(
struct xfs_btree_cur *cur,
int lr,
struct xfs_btree_block *block)
{
+ struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target;
+ xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
int rval = 0;
- xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
- xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
- xfs_btree_reada_bufl(cur->bc_mp, left, 1,
- cur->bc_ops->buf_ops);
+ xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
+ cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
- xfs_btree_reada_bufl(cur->bc_mp, right, 1,
- cur->bc_ops->buf_ops);
+ xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
+ cur->bc_ops->buf_ops);
rval++;
}
@@ -952,25 +1011,28 @@ xfs_btree_readahead_lblock(
}
STATIC int
-xfs_btree_readahead_sblock(
+xfs_btree_readahead_agblock(
struct xfs_btree_cur *cur,
int lr,
- struct xfs_btree_block *block)
+ struct xfs_btree_block *block)
{
- int rval = 0;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
-
+ int rval = 0;
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- left, 1, cur->bc_ops->buf_ops);
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, left),
+ mp->m_bsize, cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- right, 1, cur->bc_ops->buf_ops);
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, right),
+ mp->m_bsize, cur->bc_ops->buf_ops);
rval++;
}
@@ -993,8 +1055,7 @@ xfs_btree_readahead(
* No readahead needed if we are at the root level and the
* btree root is stored in the inode.
*/
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (lev == cur->bc_nlevels - 1))
+ if (xfs_btree_at_iroot(cur, lev))
return 0;
if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
@@ -1003,9 +1064,17 @@ xfs_btree_readahead(
cur->bc_levels[lev].ra |= lr;
block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return xfs_btree_readahead_lblock(cur, lr, block);
- return xfs_btree_readahead_sblock(cur, lr, block);
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
+ return xfs_btree_readahead_agblock(cur, lr, block);
+ case XFS_BTREE_TYPE_INODE:
+ return xfs_btree_readahead_fsblock(cur, lr, block);
+ case XFS_BTREE_TYPE_MEM:
+ return xfs_btree_readahead_memblock(cur, lr, block);
+ default:
+ ASSERT(0);
+ return 0;
+ }
}
STATIC int
@@ -1014,23 +1083,24 @@ xfs_btree_ptr_to_daddr(
const union xfs_btree_ptr *ptr,
xfs_daddr_t *daddr)
{
- xfs_fsblock_t fsbno;
- xfs_agblock_t agbno;
int error;
error = xfs_btree_check_ptr(cur, ptr, 0, 1);
if (error)
return error;
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- fsbno = be64_to_cpu(ptr->l);
- *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
- } else {
- agbno = be32_to_cpu(ptr->s);
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
*daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- agbno);
+ be32_to_cpu(ptr->s));
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
+ break;
}
-
return 0;
}
@@ -1050,8 +1120,9 @@ xfs_btree_readahead_ptr(
if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
return;
- xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
- cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+ xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
+ xfs_btree_bbsize(cur) * count,
+ cur->bc_ops->buf_ops);
}
/*
@@ -1072,7 +1143,7 @@ xfs_btree_setbuf(
cur->bc_levels[lev].ra = 0;
b = XFS_BUF_TO_BLOCK(bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
@@ -1090,7 +1161,7 @@ xfs_btree_ptr_is_null(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return ptr->l == cpu_to_be64(NULLFSBLOCK);
else
return ptr->s == cpu_to_be32(NULLAGBLOCK);
@@ -1101,12 +1172,23 @@ xfs_btree_set_ptr_null(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
ptr->l = cpu_to_be64(NULLFSBLOCK);
else
ptr->s = cpu_to_be32(NULLAGBLOCK);
}
+static inline bool
+xfs_btree_ptrs_equal(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr1,
+ union xfs_btree_ptr *ptr2)
+{
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
+ return ptr1->l == ptr2->l;
+ return ptr1->s == ptr2->s;
+}
+
/*
* Get/set/init sibling pointers
*/
@@ -1119,7 +1201,7 @@ xfs_btree_get_sibling(
{
ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (lr == XFS_BB_RIGHTSIB)
ptr->l = block->bb_u.l.bb_rightsib;
else
@@ -1141,7 +1223,7 @@ xfs_btree_set_sibling(
{
ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (lr == XFS_BB_RIGHTSIB)
block->bb_u.l.bb_rightsib = ptr->l;
else
@@ -1154,25 +1236,24 @@ xfs_btree_set_sibling(
}
}
-void
-xfs_btree_init_block_int(
+static void
+__xfs_btree_init_block(
struct xfs_mount *mp,
struct xfs_btree_block *buf,
+ const struct xfs_btree_ops *ops,
xfs_daddr_t blkno,
- xfs_btnum_t btnum,
__u16 level,
__u16 numrecs,
- __u64 owner,
- unsigned int flags)
+ __u64 owner)
{
- int crc = xfs_has_crc(mp);
- __u32 magic = xfs_btree_magic(crc, btnum);
+ bool crc = xfs_has_crc(mp);
+ __u32 magic = xfs_btree_magic(mp, ops);
buf->bb_magic = cpu_to_be32(magic);
buf->bb_level = cpu_to_be16(level);
buf->bb_numrecs = cpu_to_be16(numrecs);
- if (flags & XFS_BTREE_LONG_PTRS) {
+ if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
if (crc) {
@@ -1183,14 +1264,12 @@ xfs_btree_init_block_int(
buf->bb_u.l.bb_lsn = 0;
}
} else {
- /* owner is a 32 bit value on short blocks */
- __u32 __owner = (__u32)owner;
-
buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
if (crc) {
buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
- buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+ /* owner is a 32 bit value on short blocks */
+ buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
buf->bb_u.s.bb_lsn = 0;
}
@@ -1199,15 +1278,46 @@ xfs_btree_init_block_int(
void
xfs_btree_init_block(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_btnum_t btnum,
- __u16 level,
- __u16 numrecs,
- __u64 owner)
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
+ const struct xfs_btree_ops *ops,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner)
{
- xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
- btnum, level, numrecs, owner, 0);
+ __xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
+ numrecs, owner);
+}
+
+void
+xfs_btree_init_buf(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ const struct xfs_btree_ops *ops,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner)
+{
+ __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
+ xfs_buf_daddr(bp), level, numrecs, owner);
+ bp->b_ops = ops->buf_ops;
+}
+
+static inline __u64
+xfs_btree_owner(
+ struct xfs_btree_cur *cur)
+{
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ return cur->bc_mem.xfbtree->owner;
+ case XFS_BTREE_TYPE_INODE:
+ return cur->bc_ino.ip->i_ino;
+ case XFS_BTREE_TYPE_AG:
+ return cur->bc_ag.pag->pag_agno;
+ default:
+ ASSERT(0);
+ return 0;
+ }
}
void
@@ -1217,22 +1327,8 @@ xfs_btree_init_block_cur(
int level,
int numrecs)
{
- __u64 owner;
-
- /*
- * we can pull the owner from the cursor right now as the different
- * owners align directly with the pointer size of the btree. This may
- * change in future, but is safe for current users of the generic btree
- * code.
- */
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- owner = cur->bc_ino.ip->i_ino;
- else
- owner = cur->bc_ag.pag->pag_agno;
-
- xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
- xfs_buf_daddr(bp), cur->bc_btnum, level,
- numrecs, owner, cur->bc_flags);
+ xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
+ xfs_btree_owner(cur));
}
/*
@@ -1250,7 +1346,7 @@ xfs_btree_is_lastrec(
if (level > 0)
return 0;
- if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LASTREC_UPDATE))
return 0;
xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
@@ -1265,41 +1361,27 @@ xfs_btree_buf_to_ptr(
struct xfs_buf *bp,
union xfs_btree_ptr *ptr)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
- xfs_buf_daddr(bp)));
- else {
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_AG:
ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
xfs_buf_daddr(bp)));
+ break;
+ case XFS_BTREE_TYPE_INODE:
+ ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)));
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
+ break;
}
}
-STATIC void
+static inline void
xfs_btree_set_refs(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- switch (cur->bc_btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
- break;
- case XFS_BTNUM_INO:
- case XFS_BTNUM_FINO:
- xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
- break;
- case XFS_BTNUM_BMAP:
- xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
- break;
- case XFS_BTNUM_RMAP:
- xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
- break;
- case XFS_BTNUM_REFC:
- xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
- break;
- default:
- ASSERT(0);
- }
+ xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
}
int
@@ -1309,15 +1391,14 @@ xfs_btree_get_buf_block(
struct xfs_btree_block **block,
struct xfs_buf **bpp)
{
- struct xfs_mount *mp = cur->bc_mp;
- xfs_daddr_t d;
- int error;
+ xfs_daddr_t d;
+ int error;
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
- error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
- 0, bpp);
+ error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
+ xfs_btree_bbsize(cur), 0, bpp);
if (error)
return error;
@@ -1330,7 +1411,7 @@ xfs_btree_get_buf_block(
* Read in the buffer at the given ptr and return the buffer and
* the block pointer within the buffer.
*/
-STATIC int
+int
xfs_btree_read_buf_block(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr,
@@ -1348,9 +1429,11 @@ xfs_btree_read_buf_block(
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
- error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
- mp->m_bsize, flags, bpp,
- cur->bc_ops->buf_ops);
+ error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
+ xfs_btree_bbsize(cur), flags, bpp,
+ cur->bc_ops->buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_btree_mark_sick(cur);
if (error)
return error;
@@ -1398,7 +1481,7 @@ xfs_btree_copy_ptrs(
int numptrs)
{
ASSERT(numptrs >= 0);
- memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+ memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
}
/*
@@ -1454,8 +1537,8 @@ xfs_btree_shift_ptrs(
ASSERT(numptrs >= 0);
ASSERT(dir == 1 || dir == -1);
- dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
- memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+ dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
+ memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
}
/*
@@ -1566,7 +1649,7 @@ xfs_btree_log_block(
if (bp) {
int nbits;
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ if (xfs_has_crc(cur->bc_mp)) {
/*
* We don't log the CRC when updating a btree
* block but instead recreate it during log
@@ -1581,7 +1664,7 @@ xfs_btree_log_block(
nbits = XFS_BB_NUM_BITS;
}
xfs_btree_offsets(fields,
- (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
loffsets : soffsets,
nbits, &first, &last);
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
@@ -1658,9 +1741,10 @@ xfs_btree_increment(
* confused or have the tree root in an inode.
*/
if (lev == cur->bc_nlevels) {
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
goto out0;
ASSERT(0);
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1751,9 +1835,10 @@ xfs_btree_decrement(
* or the root of the tree is in an inode.
*/
if (lev == cur->bc_nlevels) {
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
goto out0;
ASSERT(0);
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1786,6 +1871,33 @@ error0:
return error;
}
+/*
+ * Check the btree block owner now that we have the context to know who the
+ * real owner is.
+ */
+static inline xfs_failaddr_t
+xfs_btree_check_block_owner(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block)
+{
+ __u64 owner;
+
+ if (!xfs_has_crc(cur->bc_mp) ||
+ (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
+ return NULL;
+
+ owner = xfs_btree_owner(cur);
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+ if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
+ return __this_address;
+ } else {
+ if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
int
xfs_btree_lookup_get_block(
struct xfs_btree_cur *cur, /* btree cursor */
@@ -1798,8 +1910,7 @@ xfs_btree_lookup_get_block(
int error = 0;
/* special case the root block if in an inode */
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level == cur->bc_nlevels - 1)) {
+ if (xfs_btree_at_iroot(cur, level)) {
*blkp = xfs_btree_get_iroot(cur);
return 0;
}
@@ -1824,11 +1935,7 @@ xfs_btree_lookup_get_block(
return error;
/* Check the inode owner since the verifiers don't. */
- if (xfs_has_crc(cur->bc_mp) &&
- !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
- (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
- be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
- cur->bc_ino.ip->i_ino)
+ if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
goto out_bad;
/* Did we get the level we were looking for? */
@@ -1846,6 +1953,7 @@ out_bad:
*blkp = NULL;
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(cur->bc_tp, bp);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -1872,6 +1980,27 @@ xfs_lookup_get_search_key(
}
/*
+ * Initialize a pointer to the root block.
+ */
+void
+xfs_btree_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
+ /*
+ * Inode-rooted btrees call xfs_btree_get_iroot to find the root
+ * in xfs_btree_lookup_get_block and don't need a pointer here.
+ */
+ ptr->l = 0;
+ } else if (cur->bc_flags & XFS_BTREE_STAGING) {
+ ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
+ } else {
+ cur->bc_ops->init_ptr_from_cur(cur, ptr);
+ }
+}
+
+/*
* Lookup the record. The cursor is made to point to it, based on dir.
* stat is set to 0 if can't find any such record, 1 for success.
*/
@@ -1892,14 +2021,16 @@ xfs_btree_lookup(
XFS_BTREE_STATS_INC(cur, lookup);
/* No such thing as a zero-level tree. */
- if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
+ if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
block = NULL;
keyno = 0;
/* initialise start pointer from cursor */
- cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ xfs_btree_init_ptr_from_cur(cur, &ptr);
pp = &ptr;
/*
@@ -1936,6 +2067,7 @@ xfs_btree_lookup(
XFS_ERRLEVEL_LOW,
cur->bc_mp, block,
sizeof(*block));
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -2012,8 +2144,10 @@ xfs_btree_lookup(
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
*stat = 1;
return 0;
}
@@ -2040,7 +2174,7 @@ xfs_btree_high_key_from_key(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
{
- ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+ ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
return (union xfs_btree_key *)((char *)key +
(cur->bc_ops->key_len / 2));
}
@@ -2061,7 +2195,7 @@ xfs_btree_get_leaf_keys(
rec = xfs_btree_rec_addr(cur, 1, block);
cur->bc_ops->init_key_from_rec(key, rec);
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
@@ -2088,7 +2222,7 @@ xfs_btree_get_node_keys(
union xfs_btree_key *high;
int n;
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
memcpy(key, xfs_btree_key_addr(cur, 1, block),
cur->bc_ops->key_len / 2);
@@ -2132,7 +2266,7 @@ xfs_btree_needs_key_update(
struct xfs_btree_cur *cur,
int ptr)
{
- return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
+ return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1;
}
/*
@@ -2156,7 +2290,7 @@ __xfs_btree_updkeys(
struct xfs_buf *bp;
int ptr;
- ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+ ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
/* Exit if there aren't any parent levels to update. */
if (level + 1 >= cur->bc_nlevels)
@@ -2225,7 +2359,7 @@ xfs_btree_update_keys(
ASSERT(level >= 0);
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)
return __xfs_btree_updkeys(cur, level, block, bp, false);
/*
@@ -2332,8 +2466,7 @@ xfs_btree_lshift(
int error; /* error return value */
int i;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- level == cur->bc_nlevels - 1)
+ if (xfs_btree_at_iroot(cur, level))
goto out0;
/* Set up variables for this block as "right". */
@@ -2460,12 +2593,13 @@ xfs_btree_lshift(
* Using a temporary cursor, update the parent key values of the
* block on the left.
*/
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
error = xfs_btree_dup_cursor(cur, &tcur);
if (error)
goto error0;
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2527,8 +2661,7 @@ xfs_btree_rshift(
int error; /* error return value */
int i; /* loop counter */
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level == cur->bc_nlevels - 1))
+ if (xfs_btree_at_iroot(cur, level))
goto out0;
/* Set up variables for this block as "left". */
@@ -2636,6 +2769,7 @@ xfs_btree_rshift(
goto error0;
i = xfs_btree_lastrec(tcur, level);
if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2645,7 +2779,7 @@ xfs_btree_rshift(
goto error1;
/* Update the parent high keys of the left block, if needed. */
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
error = xfs_btree_update_keys(cur, level);
if (error)
goto error1;
@@ -2673,6 +2807,32 @@ error1:
return error;
}
+static inline int
+xfs_btree_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *hint_block,
+ union xfs_btree_ptr *new_block,
+ int *stat)
+{
+ int error;
+
+ /*
+ * Don't allow block allocation for a staging cursor, because staging
+ * cursors do not support regular btree modifications.
+ *
+ * Bulk loading uses a separate callback to obtain new blocks from a
+ * preallocated list, which prevents ENOSPC failures during loading.
+ */
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat);
+ trace_xfs_btree_alloc_block(cur, new_block, *stat, error);
+ return error;
+}
+
/*
* Split cur/level block in half.
* Return new block number and the key to its first
@@ -2716,7 +2876,7 @@ __xfs_btree_split(
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+ error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2823,7 +2983,7 @@ __xfs_btree_split(
}
/* Update the parent high keys of the left block, if needed. */
- if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
error = xfs_btree_update_keys(cur, level);
if (error)
goto error0;
@@ -2941,7 +3101,7 @@ xfs_btree_split(
struct xfs_btree_split_args args;
DECLARE_COMPLETION_ONSTACK(done);
- if (cur->bc_btnum != XFS_BTNUM_BMAP ||
+ if (!xfs_btree_is_bmap(cur->bc_ops) ||
cur->bc_tp->t_highest_agno == NULLAGNUMBER)
return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
@@ -2963,7 +3123,6 @@ xfs_btree_split(
#define xfs_btree_split __xfs_btree_split
#endif /* __KERNEL__ */
-
/*
* Copy the old inode root contents into a real block and make the
* broot point to it.
@@ -2988,7 +3147,7 @@ xfs_btree_new_iroot(
XFS_BTREE_STATS_INC(cur, newroot);
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
level = cur->bc_nlevels - 1;
@@ -2996,7 +3155,7 @@ xfs_btree_new_iroot(
pp = xfs_btree_ptr_addr(cur, 1, block);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+ error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -3014,9 +3173,9 @@ xfs_btree_new_iroot(
* In that case have to also ensure the blkno remains correct
*/
memcpy(cblock, block, xfs_btree_block_len(cur));
- if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ if (xfs_has_crc(cur->bc_mp)) {
__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
cblock->bb_u.l.bb_blkno = bno;
else
cblock->bb_u.s.bb_blkno = bno;
@@ -3069,6 +3228,21 @@ error0:
return error;
}
+static void
+xfs_btree_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ if (cur->bc_flags & XFS_BTREE_STAGING) {
+ /* Update the btree root information for a per-AG fake root. */
+ cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s);
+ cur->bc_ag.afake->af_levels += inc;
+ } else {
+ cur->bc_ops->set_root(cur, ptr, inc);
+ }
+}
+
/*
* Allocate a new root block, fill it in.
*/
@@ -3093,10 +3267,10 @@ xfs_btree_new_root(
XFS_BTREE_STATS_INC(cur, newroot);
/* initialise our start point from the cursor */
- cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+ xfs_btree_init_ptr_from_cur(cur, &rptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+ error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -3109,7 +3283,7 @@ xfs_btree_new_root(
goto error0;
/* Set the root in the holding structure increasing the level by 1. */
- cur->bc_ops->set_root(cur, &lptr, 1);
+ xfs_btree_set_root(cur, &lptr, 1);
/*
* At the previous root level there are now two blocks: the old root,
@@ -3213,8 +3387,7 @@ xfs_btree_make_block_unfull(
{
int error = 0;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- level == cur->bc_nlevels - 1) {
+ if (xfs_btree_at_iroot(cur, level)) {
struct xfs_inode *ip = cur->bc_ino.ip;
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
@@ -3299,8 +3472,8 @@ xfs_btree_insrec(
* If we have an external root pointer, and we've made it to the
* root level, allocate a new root block and we're done.
*/
- if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- (level >= cur->bc_nlevels)) {
+ if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE &&
+ level >= cur->bc_nlevels) {
error = xfs_btree_new_root(cur, stat);
xfs_btree_set_ptr_null(cur, ptrp);
@@ -3524,6 +3697,7 @@ xfs_btree_insert(
}
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -3537,7 +3711,8 @@ xfs_btree_insert(
if (pcur != cur &&
(ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
/* Save the state from the cursor before we trash it */
- if (cur->bc_ops->update_cursor)
+ if (cur->bc_ops->update_cursor &&
+ !(cur->bc_flags & XFS_BTREE_STAGING))
cur->bc_ops->update_cursor(pcur, cur);
cur->bc_nlevels = pcur->bc_nlevels;
xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
@@ -3586,7 +3761,7 @@ xfs_btree_kill_iroot(
#endif
int i;
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
ASSERT(cur->bc_nlevels > 1);
/*
@@ -3680,7 +3855,7 @@ xfs_btree_kill_root(
* Update the root pointer, decreasing the level by 1 and then
* free the old root.
*/
- cur->bc_ops->set_root(cur, newroot, -1);
+ xfs_btree_set_root(cur, newroot, -1);
error = xfs_btree_free_block(cur, bp);
if (error)
@@ -3822,27 +3997,25 @@ xfs_btree_delrec(
* Try to get rid of the next level down. If we can't then there's
* nothing left to do.
*/
- if (level == cur->bc_nlevels - 1) {
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
- xfs_iroot_realloc(cur->bc_ino.ip, -1,
- cur->bc_ino.whichfork);
+ if (xfs_btree_at_iroot(cur, level)) {
+ xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork);
- error = xfs_btree_kill_iroot(cur);
- if (error)
- goto error0;
+ error = xfs_btree_kill_iroot(cur);
+ if (error)
+ goto error0;
- error = xfs_btree_dec_cursor(cur, level, stat);
- if (error)
- goto error0;
- *stat = 1;
- return 0;
- }
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ *stat = 1;
+ return 0;
+ }
- /*
- * If this is the root level, and there's only one entry left,
- * and it's NOT the leaf level, then we can get rid of this
- * level.
- */
+ /*
+ * If this is the root level, and there's only one entry left, and it's
+ * NOT the leaf level, then we can get rid of this level.
+ */
+ if (level == cur->bc_nlevels - 1) {
if (numrecs == 1 && level > 0) {
union xfs_btree_ptr *pp;
/*
@@ -3891,7 +4064,7 @@ xfs_btree_delrec(
xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
/*
* One child of root, need to get a chance to copy its contents
* into the root and delete it. Can't go up to next level,
@@ -3931,6 +4104,7 @@ xfs_btree_delrec(
*/
i = xfs_btree_lastrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -3939,12 +4113,14 @@ xfs_btree_delrec(
if (error)
goto error0;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
i = xfs_btree_lastrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -3992,6 +4168,7 @@ xfs_btree_delrec(
if (!xfs_btree_ptr_is_null(cur, &lptr)) {
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4000,6 +4177,7 @@ xfs_btree_delrec(
if (error)
goto error0;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4017,6 +4195,7 @@ xfs_btree_delrec(
*/
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4026,6 +4205,7 @@ xfs_btree_delrec(
goto error0;
i = xfs_btree_firstrec(tcur, level);
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -4201,8 +4381,8 @@ xfs_btree_delrec(
* If we joined with the right neighbor and there's a level above
* us, increment the cursor at that level.
*/
- else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
- (level + 1 < cur->bc_nlevels)) {
+ else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE ||
+ level + 1 < cur->bc_nlevels) {
error = xfs_btree_increment(cur, level + 1, &i);
if (error)
goto error0;
@@ -4270,7 +4450,7 @@ xfs_btree_delete(
* If we combined blocks as part of deleting the record, delrec won't
* have updated the parent high keys so we have to do that here.
*/
- if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
+ if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) {
error = xfs_btree_updkeys_force(cur, 0);
if (error)
goto error0;
@@ -4344,7 +4524,7 @@ xfs_btree_visit_block(
{
struct xfs_btree_block *block;
struct xfs_buf *bp;
- union xfs_btree_ptr rptr;
+ union xfs_btree_ptr rptr, bufptr;
int error;
/* do right sibling readahead */
@@ -4367,15 +4547,12 @@ xfs_btree_visit_block(
* return the same block without checking if the right sibling points
* back to us and creates a cyclic reference in the btree.
*/
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
- xfs_buf_daddr(bp)))
- return -EFSCORRUPTED;
- } else {
- if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
- xfs_buf_daddr(bp)))
- return -EFSCORRUPTED;
+ xfs_btree_buf_to_ptr(cur, bp, &bufptr);
+ if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
}
+
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
@@ -4393,7 +4570,7 @@ xfs_btree_visit_blocks(
struct xfs_btree_block *block = NULL;
int error = 0;
- cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+ xfs_btree_init_ptr_from_cur(cur, &lptr);
/* for each level */
for (level = cur->bc_nlevels - 1; level >= 0; level--) {
@@ -4471,7 +4648,7 @@ xfs_btree_block_change_owner(
/* modify the owner */
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
return 0;
block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
@@ -4489,7 +4666,7 @@ xfs_btree_block_change_owner(
* though, so everything is consistent in memory.
*/
if (!bp) {
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
ASSERT(level == cur->bc_nlevels - 1);
return 0;
}
@@ -4523,7 +4700,7 @@ xfs_btree_change_owner(
/* Verify the v5 fields of a long-format btree block. */
xfs_failaddr_t
-xfs_btree_lblock_v5hdr_verify(
+xfs_btree_fsblock_v5hdr_verify(
struct xfs_buf *bp,
uint64_t owner)
{
@@ -4544,7 +4721,7 @@ xfs_btree_lblock_v5hdr_verify(
/* Verify a long-format btree block. */
xfs_failaddr_t
-xfs_btree_lblock_verify(
+xfs_btree_fsblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
@@ -4553,28 +4730,60 @@ xfs_btree_lblock_verify(
xfs_fsblock_t fsb;
xfs_failaddr_t fa;
+ ASSERT(!xfs_buftarg_is_mem(bp->b_target));
+
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
- fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ fa = xfs_btree_check_fsblock_siblings(mp, fsb,
block->bb_u.l.bb_rightsib);
return fa;
}
+/* Verify an in-memory btree block. */
+xfs_failaddr_t
+xfs_btree_memblock_verify(
+ struct xfs_buf *bp,
+ unsigned int max_recs)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buftarg *btp = bp->b_target;
+ xfs_failaddr_t fa;
+ xfbno_t bno;
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > max_recs)
+ return __this_address;
+
+ /* sibling pointer verification */
+ bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_leftsib);
+ if (fa)
+ return fa;
+ fa = xfs_btree_check_memblock_siblings(btp, bno,
+ block->bb_u.l.bb_rightsib);
+ if (fa)
+ return fa;
+
+ return NULL;
+}
/**
- * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
+ * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
* btree block
*
* @bp: buffer containing the btree block
*/
xfs_failaddr_t
-xfs_btree_sblock_v5hdr_verify(
+xfs_btree_agblock_v5hdr_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_mount;
@@ -4593,13 +4802,13 @@ xfs_btree_sblock_v5hdr_verify(
}
/**
- * xfs_btree_sblock_verify() -- verify a short-format btree block
+ * xfs_btree_agblock_verify() -- verify a short-format btree block
*
* @bp: buffer containing the btree block
* @max_recs: maximum records allowed in this btree node
*/
xfs_failaddr_t
-xfs_btree_sblock_verify(
+xfs_btree_agblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
@@ -4608,16 +4817,18 @@ xfs_btree_sblock_verify(
xfs_agblock_t agbno;
xfs_failaddr_t fa;
+ ASSERT(!xfs_buftarg_is_mem(bp->b_target));
+
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
- fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
block->bb_u.s.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
block->bb_u.s.bb_rightsib);
return fa;
}
@@ -4815,7 +5026,7 @@ xfs_btree_overlapped_query_range(
/* Load the root of the btree. */
level = cur->bc_nlevels - 1;
- cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ xfs_btree_init_ptr_from_cur(cur, &ptr);
error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
if (error)
return error;
@@ -4966,7 +5177,7 @@ xfs_btree_query_range(
if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
return -EINVAL;
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return xfs_btree_simple_query_range(cur, &low_key,
&high_key, fn, priv);
return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
@@ -5020,7 +5231,7 @@ xfs_btree_diff_two_ptrs(
const union xfs_btree_ptr *a,
const union xfs_btree_ptr *b)
{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
}
@@ -5074,7 +5285,7 @@ xfs_btree_has_records_helper(
key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
&rec_key, info->key_mask);
if (key_contig == XBTREE_KEY_OVERLAP &&
- !(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return -EFSCORRUPTED;
if (key_contig == XBTREE_KEY_GAP)
return -ECANCELED;
@@ -5168,7 +5379,7 @@ xfs_btree_has_more_records(
return true;
/* There are more record blocks. */
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
else
return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
@@ -5212,3 +5423,30 @@ xfs_btree_destroy_cur_caches(void)
xfs_rmapbt_destroy_cur_cache();
xfs_refcountbt_destroy_cur_cache();
}
+
+/* Move the btree cursor before the first record. */
+int
+xfs_btree_goto_left_edge(
+ struct xfs_btree_cur *cur)
+{
+ int stat = 0;
+ int error;
+
+ memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return 0;
+
+ error = xfs_btree_decrement(cur, 0, &stat);
+ if (error)
+ return error;
+ if (stat != 0) {
+ ASSERT(0);
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4d68a58be160..f93374278aa1 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -55,15 +55,8 @@ union xfs_btree_rec {
#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
-#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
-#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
-#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
-#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
-#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
-#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
-#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi)
-
-uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
+struct xfs_btree_ops;
+uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops);
/*
* For logging record fields.
@@ -86,9 +79,11 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
* Generic stats interface
*/
#define XFS_BTREE_STATS_INC(cur, stat) \
- XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat)
+ XFS_STATS_INC_OFF((cur)->bc_mp, \
+ (cur)->bc_ops->statoff + __XBTS_ ## stat)
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
- XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
+ XFS_STATS_ADD_OFF((cur)->bc_mp, \
+ (cur)->bc_ops->statoff + __XBTS_ ## stat, val)
enum xbtree_key_contig {
XBTREE_KEY_GAP = 0,
@@ -111,10 +106,37 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
return XBTREE_KEY_OVERLAP;
}
+#define XFS_BTREE_LONG_PTR_LEN (sizeof(__be64))
+#define XFS_BTREE_SHORT_PTR_LEN (sizeof(__be32))
+
+enum xfs_btree_type {
+ XFS_BTREE_TYPE_AG,
+ XFS_BTREE_TYPE_INODE,
+ XFS_BTREE_TYPE_MEM,
+};
+
struct xfs_btree_ops {
- /* size of the key and record structures */
- size_t key_len;
- size_t rec_len;
+ const char *name;
+
+ /* Type of btree - AG-rooted or inode-rooted */
+ enum xfs_btree_type type;
+
+ /* XFS_BTGEO_* flags that determine the geometry of the btree */
+ unsigned int geom_flags;
+
+ /* size of the key, pointer, and record structures */
+ size_t key_len;
+ size_t ptr_len;
+ size_t rec_len;
+
+ /* LRU refcount to set on each btree buffer created */
+ unsigned int lru_refs;
+
+ /* offset of btree stats array */
+ unsigned int statoff;
+
+ /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */
+ unsigned int sick_mask;
/* cursor operations */
struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
@@ -199,6 +221,10 @@ struct xfs_btree_ops {
const union xfs_btree_key *mask);
};
+/* btree geometry flags */
+#define XFS_BTGEO_LASTREC_UPDATE (1U << 0) /* track last rec externally */
+#define XFS_BTGEO_OVERLAPPING (1U << 1) /* overlapping intervals */
+
/*
* Reasons for the update_lastrec method to be called.
*/
@@ -215,39 +241,6 @@ union xfs_btree_irec {
struct xfs_refcount_irec rc;
};
-/* Per-AG btree information. */
-struct xfs_btree_cur_ag {
- struct xfs_perag *pag;
- union {
- struct xfs_buf *agbp;
- struct xbtree_afakeroot *afake; /* for staging cursor */
- };
- union {
- struct {
- unsigned int nr_ops; /* # record updates */
- unsigned int shape_changes; /* # of extent splits */
- } refc;
- struct {
- bool active; /* allocation cursor state */
- } abt;
- };
-};
-
-/* Btree-in-inode cursor information */
-struct xfs_btree_cur_ino {
- struct xfs_inode *ip;
- struct xbtree_ifakeroot *ifake; /* for staging cursor */
- int allocated;
- short forksize;
- char whichfork;
- char flags;
-/* We are converting a delalloc reservation */
-#define XFS_BTCUR_BMBT_WASDEL (1 << 0)
-
-/* For extent swap, ignore owner check in verifier */
-#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
-};
-
struct xfs_btree_level {
/* buffer pointer */
struct xfs_buf *bp;
@@ -272,21 +265,38 @@ struct xfs_btree_cur
const struct xfs_btree_ops *bc_ops;
struct kmem_cache *bc_cache; /* cursor cache */
unsigned int bc_flags; /* btree features - below */
- xfs_btnum_t bc_btnum; /* identifies which btree type */
union xfs_btree_irec bc_rec; /* current insert/search record value */
uint8_t bc_nlevels; /* number of levels in the tree */
uint8_t bc_maxlevels; /* maximum levels for this btree type */
- int bc_statoff; /* offset of btree stats array */
- /*
- * Short btree pointers need an agno to be able to turn the pointers
- * into physical addresses for IO, so the btree cursor switches between
- * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the
- * cursor.
- */
+ /* per-type information */
union {
- struct xfs_btree_cur_ag bc_ag;
- struct xfs_btree_cur_ino bc_ino;
+ struct {
+ struct xfs_inode *ip;
+ short forksize;
+ char whichfork;
+ struct xbtree_ifakeroot *ifake; /* for staging cursor */
+ } bc_ino;
+ struct {
+ struct xfs_perag *pag;
+ struct xfs_buf *agbp;
+ struct xbtree_afakeroot *afake; /* for staging cursor */
+ } bc_ag;
+ struct {
+ struct xfbtree *xfbtree;
+ struct xfs_perag *pag;
+ } bc_mem;
+ };
+
+ /* per-format private data */
+ union {
+ struct {
+ int allocated;
+ } bc_bmap; /* bmapbt */
+ struct {
+ unsigned int nr_ops; /* # record updates */
+ unsigned int shape_changes; /* # of extent splits */
+ } bc_refc; /* refcountbt */
};
/* Must be at the end of the struct! */
@@ -304,18 +314,22 @@ xfs_btree_cur_sizeof(unsigned int nlevels)
return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels);
}
-/* cursor flags */
-#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
-#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
-#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
-#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
-#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
+/* cursor state flags */
/*
* The root of this btree is a fakeroot structure so that we can stage a btree
* rebuild without leaving it accessible via primary metadata. The ops struct
* is dynamically allocated and must be freed when the cursor is deleted.
*/
-#define XFS_BTREE_STAGING (1<<5)
+#define XFS_BTREE_STAGING (1U << 0)
+
+/* We are converting a delalloc reservation (only for bmbt btrees) */
+#define XFS_BTREE_BMBT_WASDEL (1U << 1)
+
+/* For extent swap, ignore owner check in verifier (only for bmbt btrees) */
+#define XFS_BTREE_BMBT_INVALID_OWNER (1U << 2)
+
+/* Cursor is active (only for allocbt btrees) */
+#define XFS_BTREE_ALLOCBT_ACTIVE (1U << 3)
#define XFS_BTREE_NOERROR 0
#define XFS_BTREE_ERROR 1
@@ -325,14 +339,10 @@ xfs_btree_cur_sizeof(unsigned int nlevels)
*/
#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
-/*
- * Internal long and short btree block checks. They return NULL if the
- * block is ok or the address of the failed check otherwise.
- */
-xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur,
- struct xfs_btree_block *block, int level, struct xfs_buf *bp);
-xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur,
+xfs_failaddr_t __xfs_btree_check_block(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, int level, struct xfs_buf *bp);
+int __xfs_btree_check_ptr(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, int index, int level);
/*
* Check that block header is ok.
@@ -345,24 +355,6 @@ xfs_btree_check_block(
struct xfs_buf *bp); /* buffer containing block, if any */
/*
- * Check that (long) pointer is ok.
- */
-bool /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_fsblock_t fsbno, /* btree block disk address */
- int level); /* btree block level */
-
-/*
- * Check that (short) pointer is ok.
- */
-bool /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agblock_t agbno, /* btree block disk address */
- int level); /* btree block level */
-
-/*
* Delete the btree cursor.
*/
void
@@ -392,63 +384,14 @@ xfs_btree_offsets(
int *last); /* output: last byte offset */
/*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int /* error */
-xfs_btree_read_bufl(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- struct xfs_buf **bpp, /* buffer for fsbno */
- int refval, /* ref count value for buffer */
- const struct xfs_buf_ops *ops);
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-void /* error */
-xfs_btree_reada_bufl(
- struct xfs_mount *mp, /* file system mount point */
- xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops);
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-void /* error */
-xfs_btree_reada_bufs(
- struct xfs_mount *mp, /* file system mount point */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count, /* count of filesystem blocks */
- const struct xfs_buf_ops *ops);
-
-/*
* Initialise a new btree block header
*/
-void
-xfs_btree_init_block(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_btnum_t btnum,
- __u16 level,
- __u16 numrecs,
- __u64 owner);
-
-void
-xfs_btree_init_block_int(
- struct xfs_mount *mp,
- struct xfs_btree_block *buf,
- xfs_daddr_t blkno,
- xfs_btnum_t btnum,
- __u16 level,
- __u16 numrecs,
- __u64 owner,
- unsigned int flags);
+void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp,
+ const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs,
+ __u64 owner);
+void xfs_btree_init_block(struct xfs_mount *mp,
+ struct xfs_btree_block *buf, const struct xfs_btree_ops *ops,
+ __u16 level, __u16 numrecs, __u64 owner);
/*
* Common btree core entry points.
@@ -467,10 +410,10 @@ int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner,
/*
* btree block CRC helpers
*/
-void xfs_btree_lblock_calc_crc(struct xfs_buf *);
-bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
-void xfs_btree_sblock_calc_crc(struct xfs_buf *);
-bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+void xfs_btree_fsblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_fsblock_verify_crc(struct xfs_buf *);
+void xfs_btree_agblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_agblock_verify_crc(struct xfs_buf *);
/*
* Internal btree helpers also used by xfs_bmap.c.
@@ -510,12 +453,14 @@ static inline int xfs_btree_get_level(const struct xfs_btree_block *block)
#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
-xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
-xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_agblock_v5hdr_verify(struct xfs_buf *bp);
+xfs_failaddr_t xfs_btree_agblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
-xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp,
uint64_t owner);
-xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp,
+ unsigned int max_recs);
+xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
@@ -690,7 +635,7 @@ xfs_btree_islastblock(
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
}
@@ -700,6 +645,9 @@ void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
int xfs_btree_get_buf_block(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr, struct xfs_btree_block **block,
struct xfs_buf **bpp);
+int xfs_btree_read_buf_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, int flags,
+ struct xfs_btree_block **block, struct xfs_buf **bpp);
void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, const union xfs_btree_ptr *ptr,
int lr);
@@ -711,21 +659,28 @@ void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
union xfs_btree_key *dst_key,
const union xfs_btree_key *src_key, int numkeys);
+void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
static inline struct xfs_btree_cur *
xfs_btree_alloc_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_btnum_t btnum,
+ const struct xfs_btree_ops *ops,
uint8_t maxlevels,
struct kmem_cache *cache)
{
struct xfs_btree_cur *cur;
- cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL);
+ ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN ||
+ ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN);
+
+ /* BMBT allocations can come through from non-transactional context. */
+ cur = kmem_cache_zalloc(cache,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+ cur->bc_ops = ops;
cur->bc_tp = tp;
cur->bc_mp = mp;
- cur->bc_btnum = btnum;
cur->bc_maxlevels = maxlevels;
cur->bc_cache = cache;
@@ -735,4 +690,16 @@ xfs_btree_alloc_cursor(
int __init xfs_btree_init_cur_caches(void);
void xfs_btree_destroy_cur_caches(void);
+int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur);
+
+/* Does this level of the cursor point to the inode root (and not a block)? */
+static inline bool
+xfs_btree_at_iroot(
+ const struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
+ level == cur->bc_nlevels - 1;
+}
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c
new file mode 100644
index 000000000000..036061fe32cc
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_mem.c
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_error.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_ag.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+
+/* Set the root of an in-memory btree. */
+void
+xfbtree_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ cur->bc_mem.xfbtree->root = *ptr;
+ cur->bc_mem.xfbtree->nlevels += inc;
+}
+
+/* Initialize a pointer from the in-memory btree header. */
+void
+xfbtree_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ *ptr = cur->bc_mem.xfbtree->root;
+}
+
+/* Duplicate an in-memory btree cursor. */
+struct xfs_btree_cur *
+xfbtree_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_cur *ncur;
+
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ops,
+ cur->bc_maxlevels, cur->bc_cache);
+ ncur->bc_flags = cur->bc_flags;
+ ncur->bc_nlevels = cur->bc_nlevels;
+ ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree;
+
+ if (cur->bc_mem.pag)
+ ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag);
+
+ return ncur;
+}
+
+/* Close the btree xfile and release all resources. */
+void
+xfbtree_destroy(
+ struct xfbtree *xfbt)
+{
+ xfs_buftarg_drain(xfbt->target);
+}
+
+/* Compute the number of bytes available for records. */
+static inline unsigned int
+xfbtree_rec_bytes(
+ struct xfs_mount *mp,
+ const struct xfs_btree_ops *ops)
+{
+ return XMBUF_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+}
+
+/* Initialize an empty leaf block as the btree root. */
+STATIC int
+xfbtree_init_leaf_block(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ const struct xfs_btree_ops *ops)
+{
+ struct xfs_buf *bp;
+ xfbno_t bno = xfbt->highest_bno++;
+ int error;
+
+ error = xfs_buf_get(xfbt->target, xfbno_to_daddr(bno), XFBNO_BBSIZE,
+ &bp);
+ if (error)
+ return error;
+
+ trace_xfbtree_create_root_buf(xfbt, bp);
+
+ bp->b_ops = ops->buf_ops;
+ xfs_btree_init_buf(mp, bp, ops, 0, 0, xfbt->owner);
+ xfs_buf_relse(bp);
+
+ xfbt->root.l = cpu_to_be64(bno);
+ return 0;
+}
+
+/*
+ * Create an in-memory btree root that can be used with the given xmbuf.
+ * Callers must set xfbt->owner.
+ */
+int
+xfbtree_init(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp,
+ const struct xfs_btree_ops *ops)
+{
+ unsigned int blocklen = xfbtree_rec_bytes(mp, ops);
+ unsigned int keyptr_len;
+ int error;
+
+ /* Requires a long-format CRC-format btree */
+ if (!xfs_has_crc(mp)) {
+ ASSERT(xfs_has_crc(mp));
+ return -EINVAL;
+ }
+ if (ops->ptr_len != XFS_BTREE_LONG_PTR_LEN) {
+ ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN);
+ return -EINVAL;
+ }
+
+ memset(xfbt, 0, sizeof(*xfbt));
+ xfbt->target = btp;
+
+ /* Set up min/maxrecs for this btree. */
+ keyptr_len = ops->key_len + sizeof(__be64);
+ xfbt->maxrecs[0] = blocklen / ops->rec_len;
+ xfbt->maxrecs[1] = blocklen / keyptr_len;
+ xfbt->minrecs[0] = xfbt->maxrecs[0] / 2;
+ xfbt->minrecs[1] = xfbt->maxrecs[1] / 2;
+ xfbt->highest_bno = 0;
+ xfbt->nlevels = 1;
+
+ /* Initialize the empty btree. */
+ error = xfbtree_init_leaf_block(mp, xfbt, ops);
+ if (error)
+ goto err_freesp;
+
+ trace_xfbtree_init(mp, xfbt, ops);
+
+ return 0;
+
+err_freesp:
+ xfs_buftarg_drain(xfbt->target);
+ return error;
+}
+
+/* Allocate a block to our in-memory btree. */
+int
+xfbtree_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+ xfbno_t bno = xfbt->highest_bno++;
+
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ trace_xfbtree_alloc_block(xfbt, cur, bno);
+
+ /* Fail if the block address exceeds the maximum for the buftarg. */
+ if (!xfbtree_verify_bno(xfbt, bno)) {
+ ASSERT(xfbtree_verify_bno(xfbt, bno));
+ *stat = 0;
+ return 0;
+ }
+
+ new->l = cpu_to_be64(bno);
+ *stat = 1;
+ return 0;
+}
+
+/* Free a block from our in-memory btree. */
+int
+xfbtree_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+ xfs_daddr_t daddr = xfs_buf_daddr(bp);
+ xfbno_t bno = xfs_daddr_to_xfbno(daddr);
+
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+ trace_xfbtree_free_block(xfbt, cur, bno);
+
+ if (bno + 1 == xfbt->highest_bno)
+ xfbt->highest_bno--;
+
+ return 0;
+}
+
+/* Return the minimum number of records for a btree block. */
+int
+xfbtree_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+
+ return xfbt->minrecs[level != 0];
+}
+
+/* Return the maximum number of records for a btree block. */
+int
+xfbtree_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ struct xfbtree *xfbt = cur->bc_mem.xfbtree;
+
+ return xfbt->maxrecs[level != 0];
+}
+
+/* If this log item is a buffer item that came from the xfbtree, return it. */
+static inline struct xfs_buf *
+xfbtree_buf_match(
+ struct xfbtree *xfbt,
+ const struct xfs_log_item *lip)
+{
+ const struct xfs_buf_log_item *bli;
+ struct xfs_buf *bp;
+
+ if (lip->li_type != XFS_LI_BUF)
+ return NULL;
+
+ bli = container_of(lip, struct xfs_buf_log_item, bli_item);
+ bp = bli->bli_buf;
+ if (bp->b_target != xfbt->target)
+ return NULL;
+
+ return bp;
+}
+
+/*
+ * Commit changes to the incore btree immediately by writing all dirty xfbtree
+ * buffers to the backing xfile. This detaches all xfbtree buffers from the
+ * transaction, even on failure. The buffer locks are dropped between the
+ * delwri queue and submit, so the caller must synchronize btree access.
+ *
+ * Normally we'd let the buffers commit with the transaction and get written to
+ * the xfile via the log, but online repair stages ephemeral btrees in memory
+ * and uses the btree_staging functions to write new btrees to disk atomically.
+ * The in-memory btree (and its backing store) are discarded at the end of the
+ * repair phase, which means that xfbtree buffers cannot commit with the rest
+ * of a transaction.
+ *
+ * In other words, online repair only needs the transaction to collect buffer
+ * pointers and to avoid buffer deadlocks, not to guarantee consistency of
+ * updates.
+ */
+int
+xfbtree_trans_commit(
+ struct xfbtree *xfbt,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *n;
+ bool tp_dirty = false;
+ int error = 0;
+
+ /*
+ * For each xfbtree buffer attached to the transaction, write the dirty
+ * buffers to the xfile and release them.
+ */
+ list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+ struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip);
+
+ if (!bp) {
+ if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ tp_dirty |= true;
+ continue;
+ }
+
+ trace_xfbtree_trans_commit_buf(xfbt, bp);
+
+ xmbuf_trans_bdetach(tp, bp);
+
+ /*
+ * If the buffer fails verification, note the failure but
+ * continue walking the transaction items so that we remove all
+ * ephemeral btree buffers.
+ */
+ if (!error)
+ error = xmbuf_finalize(bp);
+
+ xfs_buf_relse(bp);
+ }
+
+ /*
+ * Reset the transaction's dirty flag to reflect the dirty state of the
+ * log items that are still attached.
+ */
+ tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
+ (tp_dirty ? XFS_TRANS_DIRTY : 0);
+
+ return error;
+}
+
+/*
+ * Cancel changes to the incore btree by detaching all the xfbtree buffers.
+ * Changes are not undone, so callers must not access the btree ever again.
+ */
+void
+xfbtree_trans_cancel(
+ struct xfbtree *xfbt,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *n;
+ bool tp_dirty = false;
+
+ list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+ struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip);
+
+ if (!bp) {
+ if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ tp_dirty |= true;
+ continue;
+ }
+
+ trace_xfbtree_trans_cancel_buf(xfbt, bp);
+
+ xmbuf_trans_bdetach(tp, bp);
+ xfs_buf_relse(bp);
+ }
+
+ /*
+ * Reset the transaction's dirty flag to reflect the dirty state of the
+ * log items that are still attached.
+ */
+ tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
+ (tp_dirty ? XFS_TRANS_DIRTY : 0);
+}
diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h
new file mode 100644
index 000000000000..1c3825786ec8
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_mem.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_BTREE_MEM_H__
+#define __XFS_BTREE_MEM_H__
+
+typedef uint64_t xfbno_t;
+
+#define XFBNO_BLOCKSIZE (XMBUF_BLOCKSIZE)
+#define XFBNO_BBSHIFT (XMBUF_BLOCKSHIFT - BBSHIFT)
+#define XFBNO_BBSIZE (XFBNO_BLOCKSIZE >> BBSHIFT)
+
+static inline xfs_daddr_t xfbno_to_daddr(xfbno_t blkno)
+{
+ return blkno << XFBNO_BBSHIFT;
+}
+
+static inline xfbno_t xfs_daddr_to_xfbno(xfs_daddr_t daddr)
+{
+ return daddr >> XFBNO_BBSHIFT;
+}
+
+struct xfbtree {
+ /* buffer cache target for this in-memory btree */
+ struct xfs_buftarg *target;
+
+ /* Highest block number that has been written to. */
+ xfbno_t highest_bno;
+
+ /* Owner of this btree. */
+ unsigned long long owner;
+
+ /* Btree header */
+ union xfs_btree_ptr root;
+ unsigned int nlevels;
+
+ /* Minimum and maximum records per block. */
+ unsigned int maxrecs[2];
+ unsigned int minrecs[2];
+};
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline bool xfbtree_verify_bno(struct xfbtree *xfbt, xfbno_t bno)
+{
+ return xmbuf_verify_daddr(xfbt->target, xfbno_to_daddr(bno));
+}
+
+void xfbtree_set_root(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, int inc);
+void xfbtree_init_ptr_from_cur(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+struct xfs_btree_cur *xfbtree_dup_cursor(struct xfs_btree_cur *cur);
+
+int xfbtree_get_minrecs(struct xfs_btree_cur *cur, int level);
+int xfbtree_get_maxrecs(struct xfs_btree_cur *cur, int level);
+
+int xfbtree_alloc_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start, union xfs_btree_ptr *ptr,
+ int *stat);
+int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+/* Callers must set xfbt->target and xfbt->owner before calling this */
+int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt,
+ struct xfs_buftarg *btp, const struct xfs_btree_ops *ops);
+void xfbtree_destroy(struct xfbtree *xfbt);
+
+int xfbtree_trans_commit(struct xfbtree *xfbt, struct xfs_trans *tp);
+void xfbtree_trans_cancel(struct xfbtree *xfbt, struct xfs_trans *tp);
+#else
+# define xfbtree_verify_bno(...) (false)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_BTREE_MEM_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index dd75e208b543..694929703152 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -39,63 +39,6 @@
*/
/*
- * Don't allow staging cursors to be duplicated because they're supposed to be
- * kept private to a single thread.
- */
-STATIC struct xfs_btree_cur *
-xfs_btree_fakeroot_dup_cursor(
- struct xfs_btree_cur *cur)
-{
- ASSERT(0);
- return NULL;
-}
-
-/*
- * Don't allow block allocation for a staging cursor, because staging cursors
- * do not support regular btree modifications.
- *
- * Bulk loading uses a separate callback to obtain new blocks from a
- * preallocated list, which prevents ENOSPC failures during loading.
- */
-STATIC int
-xfs_btree_fakeroot_alloc_block(
- struct xfs_btree_cur *cur,
- const union xfs_btree_ptr *start_bno,
- union xfs_btree_ptr *new_bno,
- int *stat)
-{
- ASSERT(0);
- return -EFSCORRUPTED;
-}
-
-/*
- * Don't allow block freeing for a staging cursor, because staging cursors
- * do not support regular btree modifications.
- */
-STATIC int
-xfs_btree_fakeroot_free_block(
- struct xfs_btree_cur *cur,
- struct xfs_buf *bp)
-{
- ASSERT(0);
- return -EFSCORRUPTED;
-}
-
-/* Initialize a pointer to the root block from the fakeroot. */
-STATIC void
-xfs_btree_fakeroot_init_ptr_from_cur(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
-{
- struct xbtree_afakeroot *afake;
-
- ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
-
- afake = cur->bc_ag.afake;
- ptr->s = cpu_to_be32(afake->af_root);
-}
-
-/*
* Bulk Loading for AG Btrees
* ==========================
*
@@ -109,47 +52,20 @@ xfs_btree_fakeroot_init_ptr_from_cur(
* cursor into a regular btree cursor.
*/
-/* Update the btree root information for a per-AG fake root. */
-STATIC void
-xfs_btree_afakeroot_set_root(
- struct xfs_btree_cur *cur,
- const union xfs_btree_ptr *ptr,
- int inc)
-{
- struct xbtree_afakeroot *afake = cur->bc_ag.afake;
-
- ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- afake->af_root = be32_to_cpu(ptr->s);
- afake->af_levels += inc;
-}
-
/*
* Initialize a AG-rooted btree cursor with the given AG btree fake root.
- * The btree cursor's bc_ops will be overridden as needed to make the staging
- * functionality work.
*/
void
xfs_btree_stage_afakeroot(
struct xfs_btree_cur *cur,
struct xbtree_afakeroot *afake)
{
- struct xfs_btree_ops *nops;
-
ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
- ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+ ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
ASSERT(cur->bc_tp == NULL);
- nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
- memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
- nops->alloc_block = xfs_btree_fakeroot_alloc_block;
- nops->free_block = xfs_btree_fakeroot_free_block;
- nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
- nops->set_root = xfs_btree_afakeroot_set_root;
- nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
-
cur->bc_ag.afake = afake;
cur->bc_nlevels = afake->af_levels;
- cur->bc_ops = nops;
cur->bc_flags |= XFS_BTREE_STAGING;
}
@@ -163,17 +79,15 @@ void
xfs_btree_commit_afakeroot(
struct xfs_btree_cur *cur,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- const struct xfs_btree_ops *ops)
+ struct xfs_buf *agbp)
{
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
ASSERT(cur->bc_tp == NULL);
trace_xfs_btree_commit_afakeroot(cur);
- kmem_free((void *)cur->bc_ops);
+ cur->bc_ag.afake = NULL;
cur->bc_ag.agbp = agbp;
- cur->bc_ops = ops;
cur->bc_flags &= ~XFS_BTREE_STAGING;
cur->bc_tp = tp;
}
@@ -211,29 +125,16 @@ xfs_btree_commit_afakeroot(
void
xfs_btree_stage_ifakeroot(
struct xfs_btree_cur *cur,
- struct xbtree_ifakeroot *ifake,
- struct xfs_btree_ops **new_ops)
+ struct xbtree_ifakeroot *ifake)
{
- struct xfs_btree_ops *nops;
-
ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
ASSERT(cur->bc_tp == NULL);
- nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
- memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
- nops->alloc_block = xfs_btree_fakeroot_alloc_block;
- nops->free_block = xfs_btree_fakeroot_free_block;
- nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
- nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
-
cur->bc_ino.ifake = ifake;
cur->bc_nlevels = ifake->if_levels;
- cur->bc_ops = nops;
+ cur->bc_ino.forksize = ifake->if_fork_size;
cur->bc_flags |= XFS_BTREE_STAGING;
-
- if (new_ops)
- *new_ops = nops;
}
/*
@@ -246,18 +147,15 @@ void
xfs_btree_commit_ifakeroot(
struct xfs_btree_cur *cur,
struct xfs_trans *tp,
- int whichfork,
- const struct xfs_btree_ops *ops)
+ int whichfork)
{
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
ASSERT(cur->bc_tp == NULL);
trace_xfs_btree_commit_ifakeroot(cur);
- kmem_free((void *)cur->bc_ops);
cur->bc_ino.ifake = NULL;
cur->bc_ino.whichfork = whichfork;
- cur->bc_ops = ops;
cur->bc_flags &= ~XFS_BTREE_STAGING;
cur->bc_tp = tp;
}
@@ -333,20 +231,41 @@ xfs_btree_commit_ifakeroot(
/*
* Put a btree block that we're loading onto the ordered list and release it.
* The btree blocks will be written to disk when bulk loading is finished.
+ * If we reach the dirty buffer threshold, flush them to disk before
+ * continuing.
*/
-static void
+static int
xfs_btree_bload_drop_buf(
- struct list_head *buffers_list,
- struct xfs_buf **bpp)
+ struct xfs_btree_bload *bbl,
+ struct list_head *buffers_list,
+ struct xfs_buf **bpp)
{
- if (*bpp == NULL)
- return;
+ struct xfs_buf *bp = *bpp;
+ int error;
+
+ if (!bp)
+ return 0;
- if (!xfs_buf_delwri_queue(*bpp, buffers_list))
- ASSERT(0);
+ /*
+ * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
+ * xfs_buf_read will not pointlessly reread the contents from the disk.
+ */
+ bp->b_flags |= XBF_DONE;
- xfs_buf_relse(*bpp);
+ xfs_buf_delwri_queue_here(bp, buffers_list);
+ xfs_buf_relse(bp);
*bpp = NULL;
+ bbl->nr_dirty++;
+
+ if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
+ return 0;
+
+ error = xfs_buf_delwri_submit(buffers_list);
+ if (error)
+ return error;
+
+ bbl->nr_dirty = 0;
+ return 0;
}
/*
@@ -376,23 +295,20 @@ xfs_btree_bload_prep_block(
struct xfs_btree_block *new_block;
int ret;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
- level == cur->bc_nlevels - 1) {
+ if (xfs_btree_at_iroot(cur, level)) {
struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
size_t new_size;
ASSERT(*bpp == NULL);
/* Allocate a new incore btree root block. */
- new_size = bbl->iroot_size(cur, nr_this_block, priv);
- ifp->if_broot = kmem_zalloc(new_size, 0);
+ new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
+ ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
ifp->if_broot_bytes = (int)new_size;
/* Initialize it and send it out. */
- xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
- XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
- nr_this_block, cur->bc_ino.ip->i_ino,
- cur->bc_flags);
+ xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
+ level, nr_this_block, cur->bc_ino.ip->i_ino);
*bpp = NULL;
*blockp = ifp->if_broot;
@@ -418,7 +334,10 @@ xfs_btree_bload_prep_block(
*/
if (*blockp)
xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
- xfs_btree_bload_drop_buf(buffers_list, bpp);
+
+ ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
+ if (ret)
+ return ret;
/* Initialize the new btree block. */
xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
@@ -436,22 +355,19 @@ STATIC int
xfs_btree_bload_leaf(
struct xfs_btree_cur *cur,
unsigned int recs_this_block,
- xfs_btree_bload_get_record_fn get_record,
+ xfs_btree_bload_get_records_fn get_records,
struct xfs_btree_block *block,
void *priv)
{
- unsigned int j;
+ unsigned int j = 1;
int ret;
/* Fill the leaf block with records. */
- for (j = 1; j <= recs_this_block; j++) {
- union xfs_btree_rec *block_rec;
-
- ret = get_record(cur, priv);
- if (ret)
+ while (j <= recs_this_block) {
+ ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
+ if (ret < 0)
return ret;
- block_rec = xfs_btree_rec_addr(cur, j, block);
- cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ j += ret;
}
return 0;
@@ -485,7 +401,12 @@ xfs_btree_bload_node(
ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
- ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
+ /*
+ * Read the lower-level block in case the buffer for it has
+ * been reclaimed. LRU refs will be set on the block, which is
+ * desirable if the new btree commits.
+ */
+ ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
&child_bp);
if (ret)
return ret;
@@ -570,7 +491,14 @@ xfs_btree_bload_level_geometry(
unsigned int desired_npb;
unsigned int maxnr;
- maxnr = cur->bc_ops->get_maxrecs(cur, level);
+ /*
+ * Compute the absolute maximum number of records that we can store in
+ * the ondisk block or inode root.
+ */
+ if (cur->bc_ops->get_dmaxrecs)
+ maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
+ else
+ maxnr = cur->bc_ops->get_maxrecs(cur, level);
/*
* Compute the number of blocks we need to fill each block with the
@@ -671,7 +599,7 @@ xfs_btree_bload_compute_geometry(
xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
&avg_per_block, &level_blocks, &dontcare64);
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
/*
* If all the items we want to store at this level
* would fit in the inode root block, then we have our
@@ -730,7 +658,7 @@ xfs_btree_bload_compute_geometry(
return -EOVERFLOW;
bbl->btree_height = cur->bc_nlevels;
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
bbl->nr_blocks = nr_blocks - 1;
else
bbl->nr_blocks = nr_blocks;
@@ -764,6 +692,7 @@ xfs_btree_bload(
cur->bc_nlevels = bbl->btree_height;
xfs_btree_set_ptr_null(cur, &child_ptr);
xfs_btree_set_ptr_null(cur, &ptr);
+ bbl->nr_dirty = 0;
xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
&avg_per_block, &blocks, &blocks_with_extra);
@@ -789,7 +718,7 @@ xfs_btree_bload(
trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
nr_this_block);
- ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
+ ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
block, priv);
if (ret)
goto out;
@@ -802,7 +731,10 @@ xfs_btree_bload(
xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
}
total_blocks += blocks;
- xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
+ if (ret)
+ goto out;
/* Populate the internal btree nodes. */
for (level = 1; level < cur->bc_nlevels; level++) {
@@ -844,12 +776,16 @@ xfs_btree_bload(
xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
}
total_blocks += blocks;
- xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
+ if (ret)
+ goto out;
+
xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
}
/* Initialize the new root. */
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
cur->bc_ino.ifake->if_blocks = total_blocks - 1;
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
index f0d2976050ae..0c9c2ffb127a 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.h
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -22,7 +22,7 @@ struct xbtree_afakeroot {
void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
struct xbtree_afakeroot *afake);
void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
- struct xfs_buf *agbp, const struct xfs_btree_ops *ops);
+ struct xfs_buf *agbp);
/* Fake root for an inode-rooted btree. */
struct xbtree_ifakeroot {
@@ -37,35 +37,33 @@ struct xbtree_ifakeroot {
/* Number of bytes available for this fork in the inode. */
unsigned int if_fork_size;
-
- /* Fork format. */
- unsigned int if_format;
-
- /* Number of records. */
- unsigned int if_extents;
};
/* Cursor interactions with fake roots for inode-rooted btrees. */
void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
- struct xbtree_ifakeroot *ifake,
- struct xfs_btree_ops **new_ops);
+ struct xbtree_ifakeroot *ifake);
void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
- int whichfork, const struct xfs_btree_ops *ops);
+ int whichfork);
/* Bulk loading of staged btrees. */
-typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv);
+typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur,
+ unsigned int idx, struct xfs_btree_block *block,
+ unsigned int nr_wanted, void *priv);
typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr, void *priv);
typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,
- unsigned int nr_this_level, void *priv);
+ unsigned int level, unsigned int nr_this_level, void *priv);
struct xfs_btree_bload {
/*
- * This function will be called nr_records times to load records into
- * the btree. The function does this by setting the cursor's bc_rec
- * field in in-core format. Records must be returned in sort order.
+ * This function will be called to load @nr_wanted records into the
+ * btree. The implementation does this by setting the cursor's bc_rec
+ * field in in-core format and using init_rec_from_cur to set the
+ * records in the btree block. Records must be returned in sort order.
+ * The function must return the number of records loaded or the usual
+ * negative errno.
*/
- xfs_btree_bload_get_record_fn get_record;
+ xfs_btree_bload_get_records_fn get_records;
/*
* This function will be called nr_blocks times to obtain a pointer
@@ -77,8 +75,7 @@ struct xfs_btree_bload {
/*
* This function should return the size of the in-core btree root
- * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree
- * types.
+ * block. It is only necessary for XFS_BTREE_TYPE_INODE btrees.
*/
xfs_btree_bload_iroot_size_fn iroot_size;
@@ -113,6 +110,16 @@ struct xfs_btree_bload {
* height of the new btree.
*/
unsigned int btree_height;
+
+ /*
+ * Flush the new btree block buffer list to disk after this many blocks
+ * have been formatted. Zero prohibits writing any buffers until all
+ * blocks have been formatted.
+ */
+ uint16_t max_dirty;
+
+ /* Number of dirty buffers. */
+ uint16_t nr_dirty;
};
int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur,
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e576560b46e9..718d071bb21a 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -23,6 +23,7 @@
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_errortag.h"
+#include "xfs_health.h"
/*
* xfs_da_btree.c
@@ -85,7 +86,8 @@ xfs_da_state_alloc(
{
struct xfs_da_state *state;
- state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL);
+ state = kmem_cache_zalloc(xfs_da_state_cache,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
state->args = args;
state->mp = args->dp->i_mount;
return state;
@@ -352,6 +354,8 @@ const struct xfs_buf_ops xfs_da3_node_buf_ops = {
static int
xfs_da3_node_set_type(
struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ int whichfork,
struct xfs_buf *bp)
{
struct xfs_da_blkinfo *info = bp->b_addr;
@@ -373,6 +377,7 @@ xfs_da3_node_set_type(
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp,
info, sizeof(*info));
xfs_trans_brelse(tp, bp);
+ xfs_dirattr_mark_sick(dp, whichfork);
return -EFSCORRUPTED;
}
}
@@ -391,7 +396,7 @@ xfs_da3_node_read(
&xfs_da3_node_buf_ops);
if (error || !*bpp || !tp)
return error;
- return xfs_da3_node_set_type(tp, *bpp);
+ return xfs_da3_node_set_type(tp, dp, whichfork, *bpp);
}
int
@@ -408,6 +413,8 @@ xfs_da3_node_read_mapped(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno,
XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0,
bpp, &xfs_da3_node_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(dp, whichfork);
if (error || !*bpp)
return error;
@@ -418,7 +425,26 @@ xfs_da3_node_read_mapped(
if (!tp)
return 0;
- return xfs_da3_node_set_type(tp, *bpp);
+ return xfs_da3_node_set_type(tp, dp, whichfork, *bpp);
+}
+
+/*
+ * Copy src directory/attr leaf/node buffer to the dst.
+ * For v5 file systems make sure the right blkno is stamped in.
+ */
+void
+xfs_da_buf_copy(
+ struct xfs_buf *dst,
+ struct xfs_buf *src,
+ size_t size)
+{
+ struct xfs_da3_blkinfo *da3 = dst->b_addr;
+
+ memcpy(dst->b_addr, src->b_addr, size);
+ dst->b_ops = src->b_ops;
+ xfs_trans_buf_copy_type(dst, src);
+ if (xfs_has_crc(dst->b_mount))
+ da3->blkno = cpu_to_be64(xfs_buf_daddr(dst));
}
/*========================================================================
@@ -612,6 +638,7 @@ xfs_da3_split(
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
xfs_buf_mark_corrupt(oldblk->bp);
+ xfs_da_mark_sick(state->args);
error = -EFSCORRUPTED;
goto out;
}
@@ -625,6 +652,7 @@ xfs_da3_split(
if (node->hdr.info.back) {
if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
xfs_buf_mark_corrupt(oldblk->bp);
+ xfs_da_mark_sick(state->args);
error = -EFSCORRUPTED;
goto out;
}
@@ -690,12 +718,6 @@ xfs_da3_root_split(
btree = icnodehdr.btree;
size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
level = icnodehdr.level;
-
- /*
- * we are about to copy oldroot to bp, so set up the type
- * of bp while we know exactly what it will be.
- */
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
} else {
struct xfs_dir3_icleaf_hdr leafhdr;
@@ -707,31 +729,17 @@ xfs_da3_root_split(
size = (int)((char *)&leafhdr.ents[leafhdr.count] -
(char *)leaf);
level = 0;
-
- /*
- * we are about to copy oldroot to bp, so set up the type
- * of bp while we know exactly what it will be.
- */
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
}
/*
- * we can copy most of the information in the node from one block to
- * another, but for CRC enabled headers we have to make sure that the
- * block specific identifiers are kept intact. We update the buffer
- * directly for this.
+ * Copy old root to new buffer and log it.
*/
- memcpy(node, oldroot, size);
- if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
- oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
- struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
-
- node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
- }
+ xfs_da_buf_copy(bp, blk1->bp, size);
xfs_trans_log_buf(tp, bp, 0, size - 1);
- bp->b_ops = blk1->bp->b_ops;
- xfs_trans_buf_copy_type(bp, blk1->bp);
+ /*
+ * Update blk1 to point to new buffer.
+ */
blk1->bp = bp;
blk1->blkno = blkno;
@@ -1220,21 +1228,14 @@ xfs_da3_root_join(
xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
/*
- * This could be copying a leaf back into the root block in the case of
- * there only being a single leaf block left in the tree. Hence we have
- * to update the b_ops pointer as well to match the buffer type change
- * that could occur. For dir3 blocks we also need to update the block
- * number in the buffer header.
+ * Copy child to root buffer and log it.
*/
- memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
- root_blk->bp->b_ops = bp->b_ops;
- xfs_trans_buf_copy_type(root_blk->bp, bp);
- if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
- struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
- da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp));
- }
+ xfs_da_buf_copy(root_blk->bp, bp, args->geo->blksize);
xfs_trans_log_buf(args->trans, root_blk->bp, 0,
args->geo->blksize - 1);
+ /*
+ * Now we can drop the child buffer.
+ */
error = xfs_da_shrink_inode(args, child, bp);
return error;
}
@@ -1643,6 +1644,7 @@ xfs_da3_node_lookup_int(
if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
xfs_buf_mark_corrupt(blk->bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -1658,6 +1660,7 @@ xfs_da3_node_lookup_int(
/* Tree taller than we can handle; bail out! */
if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
xfs_buf_mark_corrupt(blk->bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -1666,6 +1669,7 @@ xfs_da3_node_lookup_int(
expected_level = nodehdr.level - 1;
else if (expected_level != nodehdr.level) {
xfs_buf_mark_corrupt(blk->bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
} else
expected_level--;
@@ -1717,12 +1721,16 @@ xfs_da3_node_lookup_int(
}
/* We can't point back to the root. */
- if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk))
+ if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
}
- if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0))
+ if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
/*
* A leaf block that ends in the hashval that we are interested in
@@ -1740,6 +1748,7 @@ xfs_da3_node_lookup_int(
args->blkno = blk->blkno;
} else {
ASSERT(0);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
@@ -2190,7 +2199,8 @@ xfs_da_grow_inode_int(
* If we didn't get it and the block might work if fragmented,
* try without the CONTIG flag. Loop until we get it all.
*/
- mapp = kmem_alloc(sizeof(*mapp) * count, 0);
+ mapp = kmalloc(sizeof(*mapp) * count,
+ GFP_KERNEL | __GFP_NOFAIL);
for (b = *bno, mapi = 0; b < *bno + count; ) {
c = (int)(*bno + count - b);
nmap = min(XFS_BMAP_MAX_NMAP, c);
@@ -2227,7 +2237,7 @@ xfs_da_grow_inode_int(
out_free_map:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
@@ -2305,8 +2315,10 @@ xfs_da3_swap_lastblock(
error = xfs_bmap_last_before(tp, dp, &lastoff, w);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, lastoff == 0))
+ if (XFS_IS_CORRUPT(mp, lastoff == 0)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
/*
* Read the last block in the btree space.
*/
@@ -2317,9 +2329,10 @@ xfs_da3_swap_lastblock(
/*
* Copy the last block into the dead buffer and log it.
*/
- memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+ xfs_da_buf_copy(dead_buf, last_buf, args->geo->blksize);
xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
dead_info = dead_buf->b_addr;
+
/*
* Get values from the moved block.
*/
@@ -2355,6 +2368,7 @@ xfs_da3_swap_lastblock(
if (XFS_IS_CORRUPT(mp,
be32_to_cpu(sib_info->forw) != last_blkno ||
sib_info->magic != dead_info->magic)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2375,6 +2389,7 @@ xfs_da3_swap_lastblock(
if (XFS_IS_CORRUPT(mp,
be32_to_cpu(sib_info->back) != last_blkno ||
sib_info->magic != dead_info->magic)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2397,6 +2412,7 @@ xfs_da3_swap_lastblock(
xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
if (XFS_IS_CORRUPT(mp,
level >= 0 && level != par_hdr.level + 1)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2408,6 +2424,7 @@ xfs_da3_swap_lastblock(
entno++)
continue;
if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2433,6 +2450,7 @@ xfs_da3_swap_lastblock(
xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
if (XFS_IS_CORRUPT(mp, par_blkno == 0)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2442,6 +2460,7 @@ xfs_da3_swap_lastblock(
par_node = par_buf->b_addr;
xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) {
+ xfs_da_mark_sick(args);
error = -EFSCORRUPTED;
goto done;
}
@@ -2525,7 +2544,8 @@ xfs_dabuf_map(
int error = 0, nirecs, i;
if (nfsb > 1)
- irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS);
+ irecs = kzalloc(sizeof(irec) * nfsb,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
nirecs = nfsb;
error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs,
@@ -2538,7 +2558,8 @@ xfs_dabuf_map(
* larger one that needs to be free by the caller.
*/
if (nirecs > 1) {
- map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS);
+ map = kzalloc(nirecs * sizeof(struct xfs_buf_map),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
if (!map) {
error = -ENOMEM;
goto out_free_irecs;
@@ -2564,12 +2585,13 @@ xfs_dabuf_map(
*nmaps = nirecs;
out_free_irecs:
if (irecs != &irec)
- kmem_free(irecs);
+ kfree(irecs);
return error;
invalid_mapping:
/* Caller ok with no mapping. */
if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) {
+ xfs_dirattr_mark_sick(dp, whichfork);
error = -EFSCORRUPTED;
if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
xfs_alert(mp, "%s: bno %u inode %llu",
@@ -2620,7 +2642,7 @@ xfs_da_get_buf(
out_free:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
@@ -2651,6 +2673,8 @@ xfs_da_read_buf(
error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0,
&bp, ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(dp, whichfork);
if (error)
goto out_free;
@@ -2661,7 +2685,7 @@ xfs_da_read_buf(
*bpp = bp;
out_free:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
@@ -2692,7 +2716,7 @@ xfs_da_reada_buf(
out_free:
if (mapp != &map)
- kmem_free(mapp);
+ kfree(mapp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ffa3df5b2893..706baf36e175 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -219,6 +219,8 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
struct xfs_buf *dead_buf);
+void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src,
+ size_t size);
uint xfs_da_hashname(const uint8_t *name_string, int name_length);
enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index f9015f88eca7..060e5c96b70f 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -159,6 +159,17 @@ struct xfs_da3_intnode {
#define XFS_DIR3_FT_MAX 9
+#define XFS_DIR3_FTYPE_STR \
+ { XFS_DIR3_FT_UNKNOWN, "unknown" }, \
+ { XFS_DIR3_FT_REG_FILE, "file" }, \
+ { XFS_DIR3_FT_DIR, "directory" }, \
+ { XFS_DIR3_FT_CHRDEV, "char" }, \
+ { XFS_DIR3_FT_BLKDEV, "block" }, \
+ { XFS_DIR3_FT_FIFO, "fifo" }, \
+ { XFS_DIR3_FT_SOCK, "sock" }, \
+ { XFS_DIR3_FT_SYMLINK, "symlink" }, \
+ { XFS_DIR3_FT_WHT, "whiteout" }
+
/*
* Byte offset in data block and shortform entry.
*/
@@ -578,20 +589,25 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
/*
- * Entries are packed toward the top as tight as possible.
- */
-struct xfs_attr_shortform {
- struct xfs_attr_sf_hdr { /* constant-structure header block */
- __be16 totsize; /* total bytes in shortform list */
- __u8 count; /* count of active entries */
- __u8 padding;
- } hdr;
- struct xfs_attr_sf_entry {
- uint8_t namelen; /* actual length of name (no NULL) */
- uint8_t valuelen; /* actual length of value (no NULL) */
- uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
- uint8_t nameval[]; /* name & value bytes concatenated */
- } list[]; /* variable sized array */
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as to fit into the
+ * literal area of the inode.
+ *
+ * These "shortform" attribute forks consist of a single xfs_attr_sf_hdr header
+ * followed by zero or more xfs_attr_sf_entry structures.
+ */
+struct xfs_attr_sf_hdr { /* constant-structure header block */
+ __be16 totsize; /* total bytes in shortform list */
+ __u8 count; /* count of active entries */
+ __u8 padding;
+};
+
+struct xfs_attr_sf_entry {
+ __u8 namelen; /* actual length of name (no NULL) */
+ __u8 valuelen; /* actual length of value (no NULL) */
+ __u8 flags; /* flags bits (XFS_ATTR_*) */
+ __u8 nameval[]; /* name & value bytes concatenated */
};
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index bcfb6a4203cd..c13276095cc0 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -26,6 +26,7 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
+#include "xfs_trans_priv.h"
static struct kmem_cache *xfs_defer_pending_cache;
@@ -181,16 +182,89 @@ static struct kmem_cache *xfs_defer_pending_cache;
* Note that the continuation requested between t2 and t3 is likely to
* reoccur.
*/
+STATIC struct xfs_log_item *
+xfs_defer_barrier_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return NULL;
+}
+
+STATIC void
+xfs_defer_barrier_abort_intent(
+ struct xfs_log_item *intent)
+{
+ /* empty */
+}
-static const struct xfs_defer_op_type *defer_op_types[] = {
- [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type,
- [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type,
- [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
- [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
- [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
- [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type,
+STATIC struct xfs_log_item *
+xfs_defer_barrier_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ return NULL;
+}
+
+STATIC int
+xfs_defer_barrier_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+STATIC void
+xfs_defer_barrier_cancel_item(
+ struct list_head *item)
+{
+ ASSERT(0);
+}
+
+static const struct xfs_defer_op_type xfs_barrier_defer_type = {
+ .max_items = 1,
+ .create_intent = xfs_defer_barrier_create_intent,
+ .abort_intent = xfs_defer_barrier_abort_intent,
+ .create_done = xfs_defer_barrier_create_done,
+ .finish_item = xfs_defer_barrier_finish_item,
+ .cancel_item = xfs_defer_barrier_cancel_item,
};
+/* Create a log intent done item for a log intent item. */
+static inline void
+xfs_defer_create_done(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ struct xfs_log_item *lip;
+
+ /* If there is no log intent item, there can be no log done item. */
+ if (!dfp->dfp_intent)
+ return;
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the log intent item and frees the log done item
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ if (!lip)
+ return;
+
+ tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE;
+ xfs_trans_add_item(tp, lip);
+ set_bit(XFS_LI_DIRTY, &lip->li_flags);
+ dfp->dfp_done = lip;
+}
+
/*
* Ensure there's a log intent item associated with this deferred work item if
* the operation must be restarted on crash. Returns 1 if there's a log item;
@@ -202,18 +276,21 @@ xfs_defer_create_intent(
struct xfs_defer_pending *dfp,
bool sort)
{
- const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
struct xfs_log_item *lip;
if (dfp->dfp_intent)
return 1;
- lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort);
+ lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count,
+ sort);
if (!lip)
return 0;
if (IS_ERR(lip))
return PTR_ERR(lip);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ xfs_trans_add_item(tp, lip);
+ set_bit(XFS_LI_DIRTY, &lip->li_flags);
dfp->dfp_intent = lip;
return 1;
}
@@ -245,26 +322,60 @@ xfs_defer_create_intents(
return ret;
}
+static inline void
+xfs_defer_pending_abort(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp)
+{
+ trace_xfs_defer_pending_abort(mp, dfp);
+
+ if (dfp->dfp_intent && !dfp->dfp_done) {
+ dfp->dfp_ops->abort_intent(dfp->dfp_intent);
+ dfp->dfp_intent = NULL;
+ }
+}
+
+static inline void
+xfs_defer_pending_cancel_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp)
+{
+ struct list_head *pwi;
+ struct list_head *n;
+
+ trace_xfs_defer_cancel_list(mp, dfp);
+
+ list_del(&dfp->dfp_list);
+ list_for_each_safe(pwi, n, &dfp->dfp_work) {
+ list_del(pwi);
+ dfp->dfp_count--;
+ trace_xfs_defer_cancel_item(mp, dfp, pwi);
+ dfp->dfp_ops->cancel_item(pwi);
+ }
+ ASSERT(dfp->dfp_count == 0);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
+}
+
+STATIC void
+xfs_defer_pending_abort_list(
+ struct xfs_mount *mp,
+ struct list_head *dop_list)
+{
+ struct xfs_defer_pending *dfp;
+
+ /* Abort intent items that don't have a done item. */
+ list_for_each_entry(dfp, dop_list, dfp_list)
+ xfs_defer_pending_abort(mp, dfp);
+}
+
/* Abort all the intents that were committed. */
STATIC void
xfs_defer_trans_abort(
struct xfs_trans *tp,
struct list_head *dop_pending)
{
- struct xfs_defer_pending *dfp;
- const struct xfs_defer_op_type *ops;
-
trace_xfs_defer_trans_abort(tp, _RET_IP_);
-
- /* Abort intent items that don't have a done item. */
- list_for_each_entry(dfp, dop_pending, dfp_list) {
- ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
- if (dfp->dfp_intent && !dfp->dfp_done) {
- ops->abort_intent(dfp->dfp_intent);
- dfp->dfp_intent = NULL;
- }
- }
+ xfs_defer_pending_abort_list(tp->t_mountp, dop_pending);
}
/*
@@ -382,27 +493,31 @@ xfs_defer_cancel_list(
{
struct xfs_defer_pending *dfp;
struct xfs_defer_pending *pli;
- struct list_head *pwi;
- struct list_head *n;
- const struct xfs_defer_op_type *ops;
/*
* Free the pending items. Caller should already have arranged
* for the intent items to be released.
*/
- list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
- ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_cancel_list(mp, dfp);
- list_del(&dfp->dfp_list);
- list_for_each_safe(pwi, n, &dfp->dfp_work) {
- list_del(pwi);
- dfp->dfp_count--;
- trace_xfs_defer_cancel_item(mp, dfp, pwi);
- ops->cancel_item(pwi);
- }
- ASSERT(dfp->dfp_count == 0);
- kmem_cache_free(xfs_defer_pending_cache, dfp);
+ list_for_each_entry_safe(dfp, pli, dop_list, dfp_list)
+ xfs_defer_pending_cancel_work(mp, dfp);
+}
+
+static inline void
+xfs_defer_relog_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ struct xfs_log_item *lip;
+
+ xfs_defer_create_done(tp, dfp);
+
+ lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done);
+ if (lip) {
+ xfs_trans_add_item(tp, lip);
+ set_bit(XFS_LI_DIRTY, &lip->li_flags);
}
+ dfp->dfp_done = NULL;
+ dfp->dfp_intent = lip;
}
/*
@@ -410,7 +525,7 @@ xfs_defer_cancel_list(
* done item to release the intent item; and then log a new intent item.
* The caller should provide a fresh transaction and roll it after we're done.
*/
-static int
+static void
xfs_defer_relog(
struct xfs_trans **tpp,
struct list_head *dfops)
@@ -449,31 +564,28 @@ xfs_defer_relog(
trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
- dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
- }
- if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
- return xfs_defer_trans_roll(tpp);
- return 0;
+ xfs_defer_relog_intent(*tpp, dfp);
+ }
}
/*
* Log an intent-done item for the first pending intent, and finish the work
* items.
*/
-static int
+int
xfs_defer_finish_one(
struct xfs_trans *tp,
struct xfs_defer_pending *dfp)
{
- const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ const struct xfs_defer_op_type *ops = dfp->dfp_ops;
struct xfs_btree_cur *state = NULL;
struct list_head *li, *n;
int error;
trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
- dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ xfs_defer_create_done(tp, dfp);
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
@@ -510,6 +622,24 @@ out:
return error;
}
+/* Move all paused deferred work from @tp to @paused_list. */
+static void
+xfs_defer_isolate_paused(
+ struct xfs_trans *tp,
+ struct list_head *paused_list)
+{
+ struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *pli;
+
+ list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) {
+ if (!(dfp->dfp_flags & XFS_DEFER_PAUSED))
+ continue;
+
+ list_move_tail(&dfp->dfp_list, paused_list);
+ trace_xfs_defer_isolate_paused(tp->t_mountp, dfp);
+ }
+}
+
/*
* Finish all the pending work. This involves logging intent items for
* any work items that wandered in since the last transaction roll (if
@@ -525,6 +655,7 @@ xfs_defer_finish_noroll(
struct xfs_defer_pending *dfp = NULL;
int error = 0;
LIST_HEAD(dop_pending);
+ LIST_HEAD(dop_paused);
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -543,6 +674,8 @@ xfs_defer_finish_noroll(
*/
int has_intents = xfs_defer_create_intents(*tp);
+ xfs_defer_isolate_paused(*tp, &dop_paused);
+
list_splice_init(&(*tp)->t_dfops, &dop_pending);
if (has_intents < 0) {
@@ -555,22 +688,33 @@ xfs_defer_finish_noroll(
goto out_shutdown;
/* Relog intent items to keep the log moving. */
- error = xfs_defer_relog(tp, &dop_pending);
- if (error)
- goto out_shutdown;
+ xfs_defer_relog(tp, &dop_pending);
+ xfs_defer_relog(tp, &dop_paused);
+
+ if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+ goto out_shutdown;
+ }
}
- dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
- dfp_list);
+ dfp = list_first_entry_or_null(&dop_pending,
+ struct xfs_defer_pending, dfp_list);
+ if (!dfp)
+ break;
error = xfs_defer_finish_one(*tp, dfp);
if (error && error != -EAGAIN)
goto out_shutdown;
}
+ /* Requeue the paused items in the outgoing transaction. */
+ list_splice_tail_init(&dop_paused, &(*tp)->t_dfops);
+
trace_xfs_defer_finish_done(*tp, _RET_IP_);
return 0;
out_shutdown:
+ list_splice_tail_init(&dop_paused, &dop_pending);
xfs_defer_trans_abort(*tp, &dop_pending);
xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
trace_xfs_defer_finish_error(*tp, error);
@@ -583,6 +727,9 @@ int
xfs_defer_finish(
struct xfs_trans **tp)
{
+#ifdef DEBUG
+ struct xfs_defer_pending *dfp;
+#endif
int error;
/*
@@ -602,7 +749,10 @@ xfs_defer_finish(
}
/* Reset LOWMODE now that we've finished all the dfops. */
- ASSERT(list_empty(&(*tp)->t_dfops));
+#ifdef DEBUG
+ list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list)
+ ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
+#endif
(*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
return 0;
}
@@ -614,48 +764,160 @@ xfs_defer_cancel(
struct xfs_mount *mp = tp->t_mountp;
trace_xfs_defer_cancel(tp, _RET_IP_);
+ xfs_defer_trans_abort(tp, &tp->t_dfops);
xfs_defer_cancel_list(mp, &tp->t_dfops);
}
+/*
+ * Return the last pending work item attached to this transaction if it matches
+ * the deferred op type.
+ */
+static inline struct xfs_defer_pending *
+xfs_defer_find_last(
+ struct xfs_trans *tp,
+ const struct xfs_defer_op_type *ops)
+{
+ struct xfs_defer_pending *dfp = NULL;
+
+ /* No dfops at all? */
+ if (list_empty(&tp->t_dfops))
+ return NULL;
+
+ dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending,
+ dfp_list);
+
+ /* Wrong type? */
+ if (dfp->dfp_ops != ops)
+ return NULL;
+ return dfp;
+}
+
+/*
+ * Decide if we can add a deferred work item to the last dfops item attached
+ * to the transaction.
+ */
+static inline bool
+xfs_defer_can_append(
+ struct xfs_defer_pending *dfp,
+ const struct xfs_defer_op_type *ops)
+{
+ /* Already logged? */
+ if (dfp->dfp_intent)
+ return false;
+
+ /* Paused items cannot absorb more work */
+ if (dfp->dfp_flags & XFS_DEFER_PAUSED)
+ return NULL;
+
+ /* Already full? */
+ if (ops->max_items && dfp->dfp_count >= ops->max_items)
+ return false;
+
+ return true;
+}
+
+/* Create a new pending item at the end of the transaction list. */
+static inline struct xfs_defer_pending *
+xfs_defer_alloc(
+ struct list_head *dfops,
+ const struct xfs_defer_op_type *ops)
+{
+ struct xfs_defer_pending *dfp;
+
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ dfp->dfp_ops = ops;
+ INIT_LIST_HEAD(&dfp->dfp_work);
+ list_add_tail(&dfp->dfp_list, dfops);
+
+ return dfp;
+}
+
/* Add an item for later deferred processing. */
-void
+struct xfs_defer_pending *
xfs_defer_add(
struct xfs_trans *tp,
- enum xfs_defer_ops_type type,
- struct list_head *li)
+ struct list_head *li,
+ const struct xfs_defer_op_type *ops)
{
struct xfs_defer_pending *dfp = NULL;
- const struct xfs_defer_op_type *ops = defer_op_types[type];
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
- /*
- * Add the item to a pending item at the end of the intake list.
- * If the last pending item has the same type, reuse it. Else,
- * create a new pending item at the end of the intake list.
- */
- if (!list_empty(&tp->t_dfops)) {
- dfp = list_last_entry(&tp->t_dfops,
- struct xfs_defer_pending, dfp_list);
- if (dfp->dfp_type != type ||
- (ops->max_items && dfp->dfp_count >= ops->max_items))
- dfp = NULL;
- }
- if (!dfp) {
- dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
- GFP_NOFS | __GFP_NOFAIL);
- dfp->dfp_type = type;
- dfp->dfp_intent = NULL;
- dfp->dfp_done = NULL;
- dfp->dfp_count = 0;
- INIT_LIST_HEAD(&dfp->dfp_work);
- list_add_tail(&dfp->dfp_list, &tp->t_dfops);
- }
+ dfp = xfs_defer_find_last(tp, ops);
+ if (!dfp || !xfs_defer_can_append(dfp, ops))
+ dfp = xfs_defer_alloc(&tp->t_dfops, ops);
- list_add_tail(li, &dfp->dfp_work);
+ xfs_defer_add_item(dfp, li);
trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
- dfp->dfp_count++;
+ return dfp;
+}
+
+/*
+ * Add a defer ops barrier to force two otherwise adjacent deferred work items
+ * to be tracked separately and have separate log items.
+ */
+void
+xfs_defer_add_barrier(
+ struct xfs_trans *tp)
+{
+ struct xfs_defer_pending *dfp;
+
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ /* If the last defer op added was a barrier, we're done. */
+ dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type);
+ if (dfp)
+ return;
+
+ xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type);
+
+ trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL);
+}
+
+/*
+ * Create a pending deferred work item to replay the recovered intent item
+ * and add it to the list.
+ */
+void
+xfs_defer_start_recovery(
+ struct xfs_log_item *lip,
+ struct list_head *r_dfops,
+ const struct xfs_defer_op_type *ops)
+{
+ struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops);
+
+ dfp->dfp_intent = lip;
+}
+
+/*
+ * Cancel a deferred work item created to recover a log intent item. @dfp
+ * will be freed after this function returns.
+ */
+void
+xfs_defer_cancel_recovery(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp)
+{
+ xfs_defer_pending_abort(mp, dfp);
+ xfs_defer_pending_cancel_work(mp, dfp);
+}
+
+/* Replay the deferred work item created from a recovered log intent item. */
+int
+xfs_defer_finish_recovery(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct list_head *capture_list)
+{
+ const struct xfs_defer_op_type *ops = dfp->dfp_ops;
+ int error;
+
+ /* dfp is freed by recover_work and must not be accessed afterwards */
+ error = ops->recover_work(dfp, capture_list);
+ if (error)
+ trace_xlog_intent_recovery_failed(mp, ops, error);
+ return error;
}
/*
@@ -712,7 +974,7 @@ xfs_defer_ops_capture(
return ERR_PTR(error);
/* Create an object to capture the defer ops. */
- dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&dfc->dfc_list);
INIT_LIST_HEAD(&dfc->dfc_dfops);
@@ -744,7 +1006,7 @@ xfs_defer_ops_capture(
* transaction.
*/
for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
- ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL);
ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
}
@@ -756,12 +1018,13 @@ xfs_defer_ops_capture(
/* Release all resources that we used to capture deferred ops. */
void
-xfs_defer_ops_capture_free(
+xfs_defer_ops_capture_abort(
struct xfs_mount *mp,
struct xfs_defer_capture *dfc)
{
unsigned short i;
+ xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops);
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
@@ -770,7 +1033,7 @@ xfs_defer_ops_capture_free(
for (i = 0; i < dfc->dfc_held.dr_inos; i++)
xfs_irele(dfc->dfc_held.dr_ip[i]);
- kmem_free(dfc);
+ kfree(dfc);
}
/*
@@ -802,7 +1065,7 @@ xfs_defer_ops_capture_and_commit(
/* Commit the transaction and add the capture structure to the list. */
error = xfs_trans_commit(tp);
if (error) {
- xfs_defer_ops_capture_free(mp, dfc);
+ xfs_defer_ops_capture_abort(mp, dfc);
return error;
}
@@ -846,7 +1109,7 @@ xfs_defer_ops_continue(
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
tp->t_flags |= dfc->dfc_tpflags;
- kmem_free(dfc);
+ kfree(dfc);
}
/* Release the resources captured and continued during recovery. */
@@ -930,3 +1193,36 @@ xfs_defer_destroy_item_caches(void)
xfs_rmap_intent_destroy_cache();
xfs_defer_destroy_cache();
}
+
+/*
+ * Mark a deferred work item so that it will be requeued indefinitely without
+ * being finished. Caller must ensure there are no data dependencies on this
+ * work item in the meantime.
+ */
+void
+xfs_defer_item_pause(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED));
+
+ dfp->dfp_flags |= XFS_DEFER_PAUSED;
+
+ trace_xfs_defer_item_pause(tp->t_mountp, dfp);
+}
+
+/*
+ * Release a paused deferred work item so that it will be finished during the
+ * next transaction roll.
+ */
+void
+xfs_defer_item_unpause(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
+
+ dfp->dfp_flags &= ~XFS_DEFER_PAUSED;
+
+ trace_xfs_defer_item_unpause(tp->t_mountp, dfp);
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 114a3a4930a3..18a9fb92dde8 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -11,19 +11,6 @@ struct xfs_defer_op_type;
struct xfs_defer_capture;
/*
- * Header for deferred operation list.
- */
-enum xfs_defer_ops_type {
- XFS_DEFER_OPS_TYPE_BMAP,
- XFS_DEFER_OPS_TYPE_REFCOUNT,
- XFS_DEFER_OPS_TYPE_RMAP,
- XFS_DEFER_OPS_TYPE_FREE,
- XFS_DEFER_OPS_TYPE_AGFL_FREE,
- XFS_DEFER_OPS_TYPE_ATTR,
- XFS_DEFER_OPS_TYPE_MAX,
-};
-
-/*
* Save a log intent item and a list of extents, so that we can replay
* whatever action had to happen to the extent list and file the log done
* item.
@@ -33,19 +20,35 @@ struct xfs_defer_pending {
struct list_head dfp_work; /* work items */
struct xfs_log_item *dfp_intent; /* log intent item */
struct xfs_log_item *dfp_done; /* log done item */
+ const struct xfs_defer_op_type *dfp_ops;
unsigned int dfp_count; /* # extent items */
- enum xfs_defer_ops_type dfp_type;
+ unsigned int dfp_flags;
};
-void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type,
- struct list_head *h);
+/*
+ * Create a log intent item for this deferred item, but don't actually finish
+ * the work. Caller must clear this before the final transaction commit.
+ */
+#define XFS_DEFER_PAUSED (1U << 0)
+
+#define XFS_DEFER_PENDING_STRINGS \
+ { XFS_DEFER_PAUSED, "paused" }
+
+void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp);
+void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp);
+
+struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, struct list_head *h,
+ const struct xfs_defer_op_type *ops);
int xfs_defer_finish_noroll(struct xfs_trans **tp);
int xfs_defer_finish(struct xfs_trans **tp);
+int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp);
void xfs_defer_cancel(struct xfs_trans *);
void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);
/* Description of a deferred type. */
struct xfs_defer_op_type {
+ const char *name;
+ unsigned int max_items;
struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
struct list_head *items, unsigned int count, bool sort);
void (*abort_intent)(struct xfs_log_item *intent);
@@ -56,7 +59,11 @@ struct xfs_defer_op_type {
void (*finish_cleanup)(struct xfs_trans *tp,
struct xfs_btree_cur *state, int error);
void (*cancel_item)(struct list_head *item);
- unsigned int max_items;
+ int (*recover_work)(struct xfs_defer_pending *dfp,
+ struct list_head *capture_list);
+ struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ struct xfs_log_item *done_item);
};
extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
@@ -121,11 +128,29 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
struct list_head *capture_list);
void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
struct xfs_defer_resources *dres);
-void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+void xfs_defer_ops_capture_abort(struct xfs_mount *mp,
struct xfs_defer_capture *d);
void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
+void xfs_defer_start_recovery(struct xfs_log_item *lip,
+ struct list_head *r_dfops, const struct xfs_defer_op_type *ops);
+void xfs_defer_cancel_recovery(struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp);
+int xfs_defer_finish_recovery(struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp, struct list_head *capture_list);
+
+static inline void
+xfs_defer_add_item(
+ struct xfs_defer_pending *dfp,
+ struct list_head *work)
+{
+ list_add_tail(work, &dfp->dfp_work);
+ dfp->dfp_count++;
+}
+
int __init xfs_defer_init_item_caches(void);
void xfs_defer_destroy_item_caches(void);
+void xfs_defer_add_barrier(struct xfs_trans *tp);
+
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index f5462fd582d5..4821519efad4 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -18,6 +18,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_health.h"
const struct xfs_name xfs_name_dotdot = {
.name = (const unsigned char *)"..",
@@ -25,6 +26,12 @@ const struct xfs_name xfs_name_dotdot = {
.type = XFS_DIR3_FT_DIR,
};
+const struct xfs_name xfs_name_dot = {
+ .name = (const unsigned char *)".",
+ .len = 1,
+ .type = XFS_DIR3_FT_DIR,
+};
+
/*
* Convert inode mode to directory entry filetype
*/
@@ -104,13 +111,13 @@ xfs_da_mount(
ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE);
- mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
- KM_MAYFAIL);
- mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
- KM_MAYFAIL);
+ mp->m_dir_geo = kzalloc(sizeof(struct xfs_da_geometry),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!mp->m_dir_geo || !mp->m_attr_geo) {
- kmem_free(mp->m_dir_geo);
- kmem_free(mp->m_attr_geo);
+ kfree(mp->m_dir_geo);
+ kfree(mp->m_attr_geo);
return -ENOMEM;
}
@@ -178,8 +185,8 @@ void
xfs_da_unmount(
struct xfs_mount *mp)
{
- kmem_free(mp->m_dir_geo);
- kmem_free(mp->m_attr_geo);
+ kfree(mp->m_dir_geo);
+ kfree(mp->m_attr_geo);
}
/*
@@ -196,7 +203,7 @@ xfs_dir_isempty(
return 1;
if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
return 0;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
return !sfp->count;
}
@@ -236,7 +243,7 @@ xfs_dir_init(
if (error)
return error;
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -244,7 +251,7 @@ xfs_dir_init(
args->dp = dp;
args->trans = tp;
error = xfs_dir2_sf_create(args, pdp->i_ino);
- kmem_free(args);
+ kfree(args);
return error;
}
@@ -273,7 +280,7 @@ xfs_dir_createname(
XFS_STATS_INC(dp->i_mount, xs_dir_create);
}
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -313,7 +320,7 @@ xfs_dir_createname(
rval = xfs_dir2_node_addname(args);
out_free:
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -333,7 +340,8 @@ xfs_dir_cilookup_result(
!(args->op_flags & XFS_DA_OP_CILOOKUP))
return -EEXIST;
- args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+ args->value = kmalloc(len,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL);
if (!args->value)
return -ENOMEM;
@@ -364,15 +372,8 @@ xfs_dir_lookup(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
- /*
- * We need to use KM_NOFS here so that lockdep will not throw false
- * positive deadlock warnings on a non-transactional lookup path. It is
- * safe to recurse into inode recalim in that case, but lockdep can't
- * easily be taught about it. Hence KM_NOFS avoids having to add more
- * lockdep Doing this avoids having to add a bunch of lockdep class
- * annotations into the reclaim path for the ilock.
- */
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
args->namelen = name->len;
@@ -419,7 +420,7 @@ out_check_rval:
}
out_free:
xfs_iunlock(dp, lock_mode);
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -441,7 +442,7 @@ xfs_dir_removename(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -477,7 +478,7 @@ xfs_dir_removename(
else
rval = xfs_dir2_node_removename(args);
out_free:
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -502,7 +503,7 @@ xfs_dir_replace(
if (rval)
return rval;
- args = kmem_zalloc(sizeof(*args), KM_NOFS);
+ args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
if (!args)
return -ENOMEM;
@@ -538,7 +539,7 @@ xfs_dir_replace(
else
rval = xfs_dir2_node_replace(args);
out_free:
- kmem_free(args);
+ kfree(args);
return rval;
}
@@ -626,8 +627,10 @@ xfs_dir2_isblock(
return 0;
*isblock = true;
- if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize))
+ if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) {
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
+ }
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 19af22a16c41..8497d041f316 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -22,6 +22,19 @@ struct xfs_dir3_icfree_hdr;
struct xfs_dir3_icleaf_hdr;
extern const struct xfs_name xfs_name_dotdot;
+extern const struct xfs_name xfs_name_dot;
+
+static inline bool
+xfs_dir2_samename(
+ const struct xfs_name *n1,
+ const struct xfs_name *n2)
+{
+ if (n1 == n2)
+ return true;
+ if (n1->len != n2->len)
+ return false;
+ return !memcmp(n1->name, n2->name, n1->len);
+}
/*
* Convert inode mode to directory entry filetype
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 00f960a703b2..a2da007adb46 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -20,6 +20,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_health.h"
/*
* Local function prototypes.
@@ -152,6 +153,7 @@ xfs_dir3_block_read(
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -1089,7 +1091,7 @@ xfs_dir2_sf_to_block(
int newoffset; /* offset from current entry */
unsigned int offset = geo->data_entry_offset;
xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
- xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */
+ struct xfs_dir2_sf_hdr *oldsfp = ifp->if_data;
xfs_dir2_sf_hdr_t *sfp; /* shortform header */
__be16 *tagp; /* end of data entry */
struct xfs_name name;
@@ -1099,10 +1101,8 @@ xfs_dir2_sf_to_block(
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
- oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
-
ASSERT(ifp->if_bytes == dp->i_disk_size);
- ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(oldsfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
ASSERT(dp->i_df.if_nextents == 0);
@@ -1110,7 +1110,7 @@ xfs_dir2_sf_to_block(
* Copy the directory into a temporary buffer.
* Then pitch the incore inode data so we can make extents.
*/
- sfp = kmem_alloc(ifp->if_bytes, 0);
+ sfp = kmalloc(ifp->if_bytes, GFP_KERNEL | __GFP_NOFAIL);
memcpy(sfp, oldsfp, ifp->if_bytes);
xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
@@ -1255,7 +1255,7 @@ xfs_dir2_sf_to_block(
sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
/* Done with the temporary buffer */
- kmem_free(sfp);
+ kfree(sfp);
/*
* Sort the leaf entries by hash value.
*/
@@ -1270,6 +1270,6 @@ xfs_dir2_sf_to_block(
xfs_dir3_data_check(dp, bp);
return 0;
out_free:
- kmem_free(sfp);
+ kfree(sfp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index dbcf58979a59..7a6d965bea71 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -18,6 +18,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_health.h"
static xfs_failaddr_t xfs_dir2_data_freefind_verify(
struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
@@ -433,6 +434,7 @@ xfs_dir3_data_read(
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -1198,6 +1200,7 @@ xfs_dir2_data_use_free(
corrupt:
xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount,
hdr, sizeof(*hdr), __FILE__, __LINE__, fa);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index cb9e950a911d..08dda5ce9d91 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -19,6 +19,7 @@
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_health.h"
/*
* Local function declarations.
@@ -1393,8 +1394,10 @@ xfs_dir2_leaf_removename(
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[db]) != oldbest) {
xfs_buf_mark_corrupt(lbp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
+
/*
* Mark the former data entry unused.
*/
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 7a03aeb9f4c9..be0b8834028c 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -20,6 +20,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_health.h"
/*
* Function declarations.
@@ -231,6 +232,7 @@ __xfs_dir3_free_read(
__xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -443,6 +445,7 @@ xfs_dir2_leaf_to_node(
if (be32_to_cpu(ltp->bestcount) >
(uint)dp->i_disk_size / args->geo->blksize) {
xfs_buf_mark_corrupt(lbp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -517,6 +520,7 @@ xfs_dir2_leafn_add(
*/
if (index < 0) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -736,6 +740,7 @@ xfs_dir2_leafn_lookup_for_addname(
cpu_to_be16(NULLDATAOFF))) {
if (curfdb != newfdb)
xfs_trans_brelse(tp, curbp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
curfdb = newfdb;
@@ -804,6 +809,7 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_dir3_leaf_check(dp, bp);
if (leafhdr.count <= 0) {
xfs_buf_mark_corrupt(bp);
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
@@ -1739,6 +1745,7 @@ xfs_dir2_node_add_datablk(
} else {
xfs_alert(mp, " ... fblk is NULL");
}
+ xfs_da_mark_sick(args);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 7404a9ff1a92..1db2e60ba827 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -175,7 +175,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
-extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip);
+xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *sfp, int64_t size);
int xfs_dir2_sf_entsize(struct xfs_mount *mp,
struct xfs_dir2_sf_hdr *hdr, int len);
void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 8cd37e6e9d38..17a20384c8b7 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -276,7 +276,7 @@ xfs_dir2_block_to_sf(
* format the data into. Once we have formatted the data, we can free
* the block and copy the formatted data into the inode literal area.
*/
- sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0);
+ sfp = kmalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | __GFP_NOFAIL);
memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
/*
@@ -350,7 +350,7 @@ xfs_dir2_block_to_sf(
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
- kmem_free(sfp);
+ kfree(sfp);
return error;
}
@@ -364,25 +364,23 @@ int /* error */
xfs_dir2_sf_addname(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int error; /* error return value */
int incr_isize; /* total change in size */
int new_isize; /* size after adding name */
int objchange; /* changing to 8-byte inodes */
xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
int pick; /* which algorithm to use */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
trace_xfs_dir2_sf_addname(args);
ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
- dp = args->dp;
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Compute entry (and change in) size.
@@ -462,20 +460,17 @@ xfs_dir2_sf_addname_easy(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
- int byteoff; /* byte offset in sf dir */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
+ int byteoff = (int)((char *)sfep - (char *)sfp);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- byteoff = (int)((char *)sfep - (char *)sfp);
/*
* Grow the in-inode space.
*/
- xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen),
+ sfp = xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen),
XFS_DATA_FORK);
/*
* Need to set up again due to realloc of the inode data.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
/*
* Fill in the new entry.
@@ -528,11 +523,10 @@ xfs_dir2_sf_addname_hard(
/*
* Copy the old directory to the stack buffer.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
old_isize = (int)dp->i_disk_size;
- buf = kmem_alloc(old_isize, 0);
+ buf = kmalloc(old_isize, GFP_KERNEL | __GFP_NOFAIL);
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
- memcpy(oldsfp, sfp, old_isize);
+ memcpy(oldsfp, dp->i_df.if_data, old_isize);
/*
* Loop over the old directory finding the place we're going
* to insert the new entry.
@@ -556,11 +550,8 @@ xfs_dir2_sf_addname_hard(
* the data.
*/
xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
- xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
- /*
- * Reset the pointer since the buffer was reallocated.
- */
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+
/*
* Copy the first part of the directory, including the header.
*/
@@ -585,7 +576,7 @@ xfs_dir2_sf_addname_hard(
sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
memcpy(sfep, oldsfep, old_isize - nbytes);
}
- kmem_free(buf);
+ kfree(buf);
dp->i_disk_size = new_isize;
xfs_dir2_sf_check(args);
}
@@ -610,11 +601,10 @@ xfs_dir2_sf_addname_pick(
int i; /* entry number */
xfs_dir2_data_aoff_t offset; /* data block offset */
xfs_dir2_sf_entry_t *sfep; /* shortform entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int size; /* entry's data size */
int used; /* data bytes used */
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
size = xfs_dir2_data_entsize(mp, args->namelen);
offset = args->geo->data_first_offset;
sfep = xfs_dir2_sf_firstentry(sfp);
@@ -673,14 +663,13 @@ xfs_dir2_sf_check(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int i; /* entry number */
int i8count; /* number of big inode#s */
xfs_ino_t ino; /* entry inode number */
int offset; /* data offset */
xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
offset = args->geo->data_first_offset;
ino = xfs_dir2_sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
@@ -707,11 +696,10 @@ xfs_dir2_sf_check(
/* Verify the consistency of an inline directory. */
xfs_failaddr_t
xfs_dir2_sf_verify(
- struct xfs_inode *ip)
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *sfp,
+ int64_t size)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
- struct xfs_dir2_sf_hdr *sfp;
struct xfs_dir2_sf_entry *sfep;
struct xfs_dir2_sf_entry *next_sfep;
char *endp;
@@ -719,15 +707,9 @@ xfs_dir2_sf_verify(
int i;
int i8count;
int offset;
- int64_t size;
int error;
uint8_t filetype;
- ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
-
- sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
- size = ifp->if_bytes;
-
/*
* Give up if the directory is way too short.
*/
@@ -834,15 +816,13 @@ xfs_dir2_sf_create(
ASSERT(dp->i_df.if_bytes == 0);
i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
size = xfs_dir2_sf_hdr_size(i8count);
+
/*
- * Make a buffer for the data.
- */
- xfs_idata_realloc(dp, size, XFS_DATA_FORK);
- /*
- * Fill in the header,
+ * Make a buffer for the data and fill in the header.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = xfs_idata_realloc(dp, size, XFS_DATA_FORK);
sfp->i8count = i8count;
+
/*
* Now can put in the inode number, since i8count is set.
*/
@@ -864,9 +844,9 @@ xfs_dir2_sf_lookup(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int i; /* entry index */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
enum xfs_dacmp cmp; /* comparison result */
xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
@@ -877,8 +857,7 @@ xfs_dir2_sf_lookup(
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Special case for .
@@ -940,13 +919,13 @@ xfs_dir2_sf_removename(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int byteoff; /* offset of removed entry */
int entsize; /* this entry's size */
int i; /* shortform entry index */
int newsize; /* new inode size */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
trace_xfs_dir2_sf_removename(args);
@@ -954,8 +933,7 @@ xfs_dir2_sf_removename(
oldsize = (int)dp->i_disk_size;
ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == oldsize);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Loop over the old directory entries.
@@ -992,11 +970,12 @@ xfs_dir2_sf_removename(
*/
sfp->count--;
dp->i_disk_size = newsize;
+
/*
* Reallocate, making it smaller.
*/
- xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+
/*
* Are we changing inode number size?
*/
@@ -1019,13 +998,12 @@ xfs_dir2_sf_replace_needblock(
struct xfs_inode *dp,
xfs_ino_t inum)
{
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int newsize;
- struct xfs_dir2_sf_hdr *sfp;
if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL)
return false;
- sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
return inum > XFS_DIR2_MAX_SHORT_INUM &&
@@ -1041,19 +1019,18 @@ xfs_dir2_sf_replace(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int i; /* entry index */
xfs_ino_t ino=0; /* entry old inode number */
int i8elevated; /* sf_toino8 set i8count=1 */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
trace_xfs_dir2_sf_replace(args);
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
@@ -1076,7 +1053,7 @@ xfs_dir2_sf_replace(
*/
xfs_dir2_sf_toino8(args);
i8elevated = 1;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
} else
i8elevated = 0;
@@ -1157,11 +1134,11 @@ xfs_dir2_sf_toino4(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data;
char *buf; /* old dir's buffer */
int i; /* entry index */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
- xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
@@ -1174,8 +1151,7 @@ xfs_dir2_sf_toino4(
* Don't want xfs_idata_realloc copying the data here.
*/
oldsize = dp->i_df.if_bytes;
- buf = kmem_alloc(oldsize, 0);
- oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL);
ASSERT(oldsfp->i8count == 1);
memcpy(buf, oldsfp, oldsize);
/*
@@ -1188,7 +1164,7 @@ xfs_dir2_sf_toino4(
* Reset our pointers, the data has moved.
*/
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
/*
* Fill in the new header.
*/
@@ -1214,7 +1190,7 @@ xfs_dir2_sf_toino4(
/*
* Clean up the inode.
*/
- kmem_free(buf);
+ kfree(buf);
dp->i_disk_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
@@ -1230,11 +1206,11 @@ xfs_dir2_sf_toino8(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data;
char *buf; /* old dir's buffer */
int i; /* entry index */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
- xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
@@ -1247,8 +1223,7 @@ xfs_dir2_sf_toino8(
* Don't want xfs_idata_realloc copying the data here.
*/
oldsize = dp->i_df.if_bytes;
- buf = kmem_alloc(oldsize, 0);
- oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL);
ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
/*
@@ -1261,7 +1236,7 @@ xfs_dir2_sf_toino8(
* Reset our pointers, the data has moved.
*/
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
/*
* Fill in the new header.
*/
@@ -1287,7 +1262,7 @@ xfs_dir2_sf_toino8(
/*
* Clean up the inode.
*/
- kmem_free(buf);
+ kfree(buf);
dp->i_disk_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 371dc07233e0..2b2f9050fbfb 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -98,7 +98,7 @@ typedef struct xfs_sb {
uint32_t sb_blocksize; /* logical block size, bytes */
xfs_rfsblock_t sb_dblocks; /* number of data blocks */
xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
- xfs_rtblock_t sb_rextents; /* number of realtime extents */
+ xfs_rtbxlen_t sb_rextents; /* number of realtime extents */
uuid_t sb_uuid; /* user-visible file system unique id */
xfs_fsblock_t sb_logstart; /* starting block of log if internal */
xfs_ino_t sb_rootino; /* root inode number */
@@ -477,15 +477,9 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
/*
- * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the
- * arrays below.
- */
-#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1)
-
-/*
- * The second word of agf_levels in the first a.g. overlaps the EFS
- * superblock's magic number. Since the magic numbers valid for EFS
- * are > 64k, our value cannot be confused for an EFS superblock's.
+ * agf_cnt_level in the first AGF overlaps the EFS superblock's magic number.
+ * Since the magic numbers valid for EFS are > 64k, our value cannot be confused
+ * for an EFS superblock.
*/
typedef struct xfs_agf {
@@ -499,8 +493,13 @@ typedef struct xfs_agf {
/*
* Freespace and rmap information
*/
- __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
- __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
+ __be32 agf_bno_root; /* bnobt root block */
+ __be32 agf_cnt_root; /* cntbt root block */
+ __be32 agf_rmap_root; /* rmapbt root block */
+
+ __be32 agf_bno_level; /* bnobt btree levels */
+ __be32 agf_cnt_level; /* cntbt btree levels */
+ __be32 agf_rmap_level; /* rmapbt btree levels */
__be32 agf_flfirst; /* first freelist block's index */
__be32 agf_fllast; /* last freelist block's index */
@@ -691,6 +690,22 @@ struct xfs_agfl {
xfs_daddr_to_agno(mp, (d) + (len) - 1)))
/*
+ * Realtime bitmap information is accessed by the word, which is currently
+ * stored in host-endian format.
+ */
+union xfs_rtword_raw {
+ __u32 old;
+};
+
+/*
+ * Realtime summary counts are accessed by the word, which is currently
+ * stored in host-endian format.
+ */
+union xfs_suminfo_raw {
+ __u32 old;
+};
+
+/*
* XFS Timestamps
* ==============
*
@@ -992,7 +1007,7 @@ enum xfs_dinode_fmt {
* Return pointers to the data or attribute forks.
*/
#define XFS_DFORK_DPTR(dip) \
- ((char *)dip + xfs_dinode_size(dip->di_version))
+ ((void *)dip + xfs_dinode_size(dip->di_version))
#define XFS_DFORK_APTR(dip) \
(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
#define XFS_DFORK_PTR(dip,w) \
@@ -1140,34 +1155,6 @@ static inline bool xfs_dinode_has_large_extent_counts(
#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
-#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
-#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
-#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
-#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
-
-/*
- * RT Summary and bit manipulation macros.
- */
-#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
-#define XFS_SUMOFFSTOBLOCK(mp,s) \
- (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
-#define XFS_SUMPTR(mp,bp,so) \
- ((xfs_suminfo_t *)((bp)->b_addr + \
- (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
-
-#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
-#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
-#define XFS_BITTOWORD(mp,bi) \
- ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
-
-#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
-#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
-
-#define XFS_RTLOBIT(w) xfs_lowbit32(w)
-#define XFS_RTHIBIT(w) xfs_highbit32(w)
-
-#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
-
/*
* Dquot and dquot block format definitions
*/
@@ -1270,6 +1257,9 @@ static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds)
#define XFS_DQ_GRACE_MIN ((int64_t)0)
#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX)
+/* Maximum id value for a quota record */
+#define XFS_DQ_ID_MAX (U32_MAX)
+
/*
* This is the main portion of the on-disk representation of quota information
* for a user. We pad this with some more expansion room to construct the on
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 6360073865db..ca1b17d01437 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -195,6 +195,8 @@ struct xfs_fsop_geom {
#define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */
#define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */
#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */
+#define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */
+#define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */
/* Output for XFS_FS_COUNTS */
typedef struct xfs_fsop_counts {
@@ -292,6 +294,7 @@ struct xfs_ag_geometry {
#define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */
#define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */
#define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */
+#define XFS_AG_GEOM_SICK_INODES (1 << 10) /* bad inodes were seen */
/*
* Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
@@ -709,9 +712,12 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */
#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */
#define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */
+#define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */
+#define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */
+#define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 25
+#define XFS_SCRUB_TYPE_NR 28
/* i: Repair this metadata. */
#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 99e796256c5d..3c64b5f9bd68 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -26,21 +26,40 @@
* and the "sick" field tells us if that piece was found to need repairs.
* Therefore we can conclude that for a given sick flag value:
*
- * - checked && sick => metadata needs repair
- * - checked && !sick => metadata is ok
- * - !checked => has not been examined since mount
+ * - checked && sick => metadata needs repair
+ * - checked && !sick => metadata is ok
+ * - !checked && sick => errors have been observed during normal operation,
+ * but the metadata has not been checked thoroughly
+ * - !checked && !sick => has not been examined since mount
+ *
+ * Evidence of health problems can be sorted into three basic categories:
+ *
+ * a) Primary evidence, which signals that something is defective within the
+ * general grouping of metadata.
+ *
+ * b) Secondary evidence, which are side effects of primary problem but are
+ * not themselves problems. These can be forgotten when the primary
+ * health problems are addressed.
+ *
+ * c) Indirect evidence, which points to something being wrong in another
+ * group, but we had to release resources and this is all that's left of
+ * that state.
*/
struct xfs_mount;
struct xfs_perag;
struct xfs_inode;
struct xfs_fsop_geom;
+struct xfs_btree_cur;
+struct xfs_da_args;
/* Observable health issues for metadata spanning the entire filesystem. */
#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */
#define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */
#define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */
#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */
+#define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */
+#define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */
/* Observable health issues for realtime volume metadata. */
#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */
@@ -57,6 +76,7 @@ struct xfs_fsop_geom;
#define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */
#define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */
#define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */
+#define XFS_SICK_AG_INODES (1 << 10) /* inactivated bad inodes */
/* Observable health issues for inode metadata. */
#define XFS_SICK_INO_CORE (1 << 0) /* inode core */
@@ -68,11 +88,21 @@ struct xfs_fsop_geom;
#define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */
#define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */
+#define XFS_SICK_INO_BMBTD_ZAPPED (1 << 8) /* data fork erased */
+#define XFS_SICK_INO_BMBTA_ZAPPED (1 << 9) /* attr fork erased */
+#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */
+#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */
+
+/* Don't propagate sick status to ag health summary during inactivation */
+#define XFS_SICK_INO_FORGET (1 << 12)
+
/* Primary evidence of health problems in a given group. */
#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
XFS_SICK_FS_UQUOTA | \
XFS_SICK_FS_GQUOTA | \
- XFS_SICK_FS_PQUOTA)
+ XFS_SICK_FS_PQUOTA | \
+ XFS_SICK_FS_QUOTACHECK | \
+ XFS_SICK_FS_NLINKS)
#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \
XFS_SICK_RT_SUMMARY)
@@ -97,29 +127,91 @@ struct xfs_fsop_geom;
XFS_SICK_INO_SYMLINK | \
XFS_SICK_INO_PARENT)
-/* These functions must be provided by the xfs implementation. */
+#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \
+ XFS_SICK_INO_BMBTA_ZAPPED | \
+ XFS_SICK_INO_DIR_ZAPPED | \
+ XFS_SICK_INO_SYMLINK_ZAPPED)
+
+/* Secondary state related to (but not primary evidence of) health problems. */
+#define XFS_SICK_FS_SECONDARY (0)
+#define XFS_SICK_RT_SECONDARY (0)
+#define XFS_SICK_AG_SECONDARY (0)
+#define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET)
+
+/* Evidence of health problems elsewhere. */
+#define XFS_SICK_FS_INDIRECT (0)
+#define XFS_SICK_RT_INDIRECT (0)
+#define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES)
+#define XFS_SICK_INO_INDIRECT (0)
+
+/* All health masks. */
+#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \
+ XFS_SICK_FS_SECONDARY | \
+ XFS_SICK_FS_INDIRECT)
+
+#define XFS_SICK_RT_ALL (XFS_SICK_RT_PRIMARY | \
+ XFS_SICK_RT_SECONDARY | \
+ XFS_SICK_RT_INDIRECT)
+
+#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \
+ XFS_SICK_AG_SECONDARY | \
+ XFS_SICK_AG_INDIRECT)
+
+#define XFS_SICK_INO_ALL (XFS_SICK_INO_PRIMARY | \
+ XFS_SICK_INO_SECONDARY | \
+ XFS_SICK_INO_INDIRECT | \
+ XFS_SICK_INO_ZAPPED)
+
+/*
+ * These functions must be provided by the xfs implementation. Function
+ * behavior with respect to the first argument should be as follows:
+ *
+ * xfs_*_mark_sick: Set the sick flags and do not set checked flags.
+ * Runtime code should call this upon encountering
+ * a corruption.
+ *
+ * xfs_*_mark_corrupt: Set the sick and checked flags simultaneously.
+ * Fsck tools should call this when corruption is
+ * found.
+ *
+ * xfs_*_mark_healthy: Clear the sick flags and set the checked flags.
+ * Fsck tools should call this after correcting errors.
+ *
+ * xfs_*_measure_sickness: Return the sick and check status in the provided
+ * out parameters.
+ */
void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_mark_corrupt(struct xfs_mount *mp, unsigned int mask);
void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
unsigned int *checked);
void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask);
void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
unsigned int *checked);
+void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno,
+ unsigned int mask);
void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask);
void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
unsigned int *checked);
void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_mark_corrupt(struct xfs_inode *ip, unsigned int mask);
void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
unsigned int *checked);
void xfs_health_unmount(struct xfs_mount *mp);
+void xfs_bmap_mark_sick(struct xfs_inode *ip, int whichfork);
+void xfs_btree_mark_sick(struct xfs_btree_cur *cur);
+void xfs_dirattr_mark_sick(struct xfs_inode *ip, int whichfork);
+void xfs_da_mark_sick(struct xfs_da_args *args);
/* Now some helpers. */
@@ -187,4 +279,7 @@ void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
+#define xfs_metadata_is_sick(error) \
+ (unlikely((error) == -EFSCORRUPTED || (error) == -EFSBADCRC))
+
#endif /* __XFS_HEALTH_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index b83e54c70906..e5ac3e5430c4 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -27,6 +27,7 @@
#include "xfs_log.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
/*
* Lookup a record by ino in the btree given by cur.
@@ -95,18 +96,28 @@ xfs_inobt_btrec_to_irec(
irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}
+/* Compute the freecount of an incore inode record. */
+uint8_t
+xfs_inobt_rec_freecount(
+ const struct xfs_inobt_rec_incore *irec)
+{
+ uint64_t realfree = irec->ir_free;
+
+ if (xfs_inobt_issparse(irec->ir_holemask))
+ realfree &= xfs_inobt_irec_to_allocmask(irec);
+ return hweight64(realfree);
+}
+
/* Simple checks for inode records. */
xfs_failaddr_t
xfs_inobt_check_irec(
- struct xfs_btree_cur *cur,
+ struct xfs_perag *pag,
const struct xfs_inobt_rec_incore *irec)
{
- uint64_t realfree;
-
/* Record has to be properly aligned within the AG. */
- if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
+ if (!xfs_verify_agino(pag, irec->ir_startino))
return __this_address;
- if (!xfs_verify_agino(cur->bc_ag.pag,
+ if (!xfs_verify_agino(pag,
irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
return __this_address;
if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
@@ -115,12 +126,7 @@ xfs_inobt_check_irec(
if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
return __this_address;
- /* if there are no holes, return the first available offset */
- if (!xfs_inobt_issparse(irec->ir_holemask))
- realfree = irec->ir_free;
- else
- realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec);
- if (hweight64(realfree) != irec->ir_freecount)
+ if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount)
return __this_address;
return NULL;
@@ -135,13 +141,13 @@ xfs_inobt_complain_bad_rec(
struct xfs_mount *mp = cur->bc_mp;
xfs_warn(mp,
- "%s Inode BTree record corruption in AG %d detected at %pS!",
- cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
- cur->bc_ag.pag->pag_agno, fa);
+ "%sbt record corruption in AG %d detected at %pS!",
+ cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
irec->ir_startino, irec->ir_count, irec->ir_freecount,
irec->ir_free, irec->ir_holemask);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -164,7 +170,7 @@ xfs_inobt_get_rec(
return error;
xfs_inobt_btrec_to_irec(mp, rec, irec);
- fa = xfs_inobt_check_irec(cur, irec);
+ fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, irec);
@@ -200,14 +206,17 @@ xfs_inobt_insert(
struct xfs_buf *agbp,
xfs_agino_t newino,
xfs_agino_t newlen,
- xfs_btnum_t btnum)
+ bool is_finobt)
{
struct xfs_btree_cur *cur;
xfs_agino_t thisino;
int i;
int error;
- cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
+ if (is_finobt)
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
+ else
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
for (thisino = newino;
thisino < newino + newlen;
@@ -523,16 +532,14 @@ __xfs_inobt_rec_merge(
}
/*
- * Insert a new sparse inode chunk into the associated inode btree. The inode
- * record for the sparse chunk is pre-aligned to a startino that should match
- * any pre-existing sparse inode record in the tree. This allows sparse chunks
- * to fill over time.
+ * Insert a new sparse inode chunk into the associated inode allocation btree.
+ * The inode record for the sparse chunk is pre-aligned to a startino that
+ * should match any pre-existing sparse inode record in the tree. This allows
+ * sparse chunks to fill over time.
*
- * This function supports two modes of handling preexisting records depending on
- * the merge flag. If merge is true, the provided record is merged with the
+ * If no preexisting record exists, the provided record is inserted.
+ * If there is a preexisting record, the provided record is merged with the
* existing record and updated in place. The merged record is returned in nrec.
- * If merge is false, an existing record is replaced with the provided record.
- * If no preexisting record exists, the provided record is always inserted.
*
* It is considered corruption if a merge is requested and not possible. Given
* the sparse inode alignment constraints, this should never happen.
@@ -542,9 +549,7 @@ xfs_inobt_insert_sprec(
struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- int btnum,
- struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
- bool merge) /* merge or replace */
+ struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */
{
struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
@@ -552,7 +557,7 @@ xfs_inobt_insert_sprec(
int i;
struct xfs_inobt_rec_incore rec;
- cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
/* the new record is pre-aligned so we know where to look */
error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
@@ -566,6 +571,7 @@ xfs_inobt_insert_sprec(
if (error)
goto error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -574,45 +580,45 @@ xfs_inobt_insert_sprec(
}
/*
- * A record exists at this startino. Merge or replace the record
- * depending on what we've been asked to do.
+ * A record exists at this startino. Merge the records.
*/
- if (merge) {
- error = xfs_inobt_get_rec(cur, &rec, &i);
- if (error)
- goto error;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto error;
- }
- if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
- error = -EFSCORRUPTED;
- goto error;
- }
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
- /*
- * This should never fail. If we have coexisting records that
- * cannot merge, something is seriously wrong.
- */
- if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
- error = -EFSCORRUPTED;
- goto error;
- }
+ /*
+ * This should never fail. If we have coexisting records that
+ * cannot merge, something is seriously wrong.
+ */
+ if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
- trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
- rec.ir_holemask, nrec->ir_startino,
- nrec->ir_holemask);
+ trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
+ rec.ir_holemask, nrec->ir_startino,
+ nrec->ir_holemask);
- /* merge to nrec to output the updated record */
- __xfs_inobt_rec_merge(nrec, &rec);
+ /* merge to nrec to output the updated record */
+ __xfs_inobt_rec_merge(nrec, &rec);
- trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
- nrec->ir_holemask);
+ trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
+ nrec->ir_holemask);
- error = xfs_inobt_rec_check_count(mp, nrec);
- if (error)
- goto error;
- }
+ error = xfs_inobt_rec_check_count(mp, nrec);
+ if (error)
+ goto error;
error = xfs_inobt_update(cur, nrec);
if (error)
@@ -627,6 +633,59 @@ error:
}
/*
+ * Insert a new sparse inode chunk into the free inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * The new record is always inserted, overwriting a pre-existing record if
+ * there is one.
+ */
+STATIC int
+xfs_finobt_insert_sprec(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_btree_cur *cur;
+ int error;
+ int i;
+
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
+
+ /* the new record is pre-aligned so we know where to look */
+ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ /* if nothing there, insert a new record and return */
+ if (i == 0) {
+ error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+ nrec->ir_count, nrec->ir_freecount,
+ nrec->ir_free, &i);
+ if (error)
+ goto error;
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ } else {
+ error = xfs_inobt_update(cur, nrec);
+ if (error)
+ goto error;
+ }
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+
+/*
* Allocate new inodes in the allocation group specified by agbp. Returns 0 if
* inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
* the caller knows it can try another AG, a hard -ENOSPC when over the maximum
@@ -852,8 +911,7 @@ sparse_alloc:
* if necessary. If a merge does occur, rec is updated to the
* merged record.
*/
- error = xfs_inobt_insert_sprec(pag, tp, agbp,
- XFS_BTNUM_INO, &rec, true);
+ error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec);
if (error == -EFSCORRUPTED) {
xfs_alert(args.mp,
"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
@@ -877,21 +935,19 @@ sparse_alloc:
* existing record with this one.
*/
if (xfs_has_finobt(args.mp)) {
- error = xfs_inobt_insert_sprec(pag, tp, agbp,
- XFS_BTNUM_FINO, &rec, false);
+ error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec);
if (error)
return error;
}
} else {
/* full chunk - insert new records to both btrees */
- error = xfs_inobt_insert(pag, tp, agbp, newino, newlen,
- XFS_BTNUM_INO);
+ error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false);
if (error)
return error;
if (xfs_has_finobt(args.mp)) {
error = xfs_inobt_insert(pag, tp, agbp, newino,
- newlen, XFS_BTNUM_FINO);
+ newlen, true);
if (error)
return error;
}
@@ -944,8 +1000,10 @@ xfs_ialloc_next_rec(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
}
return 0;
@@ -969,8 +1027,10 @@ xfs_ialloc_get_rec(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
}
return 0;
@@ -1025,7 +1085,7 @@ xfs_dialloc_ag_inobt(
ASSERT(pag->pagi_freecount > 0);
restart_pagno:
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -1048,6 +1108,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1056,6 +1117,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, j != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1214,6 +1276,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1223,6 +1286,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1232,6 +1296,7 @@ xfs_dialloc_ag_inobt(
if (error)
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1292,8 +1357,10 @@ xfs_dialloc_ag_finobt_near(
error = xfs_inobt_get_rec(lcur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(lcur);
return -EFSCORRUPTED;
+ }
/*
* See if we've landed in the parent inode record. The finobt
@@ -1317,12 +1384,14 @@ xfs_dialloc_ag_finobt_near(
if (error)
goto error_rcur;
if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
+ xfs_btree_mark_sick(lcur);
error = -EFSCORRUPTED;
goto error_rcur;
}
}
if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
+ xfs_btree_mark_sick(lcur);
error = -EFSCORRUPTED;
goto error_rcur;
}
@@ -1378,8 +1447,10 @@ xfs_dialloc_ag_finobt_newino(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return 0;
}
}
@@ -1390,14 +1461,18 @@ xfs_dialloc_ag_finobt_newino(
error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -1419,14 +1494,18 @@ xfs_dialloc_ag_update_inobt(
error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
return error;
- if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
@@ -1435,8 +1514,10 @@ xfs_dialloc_ag_update_inobt(
if (XFS_IS_CORRUPT(cur->bc_mp,
rec.ir_free != frec->ir_free ||
- rec.ir_freecount != frec->ir_freecount))
+ rec.ir_freecount != frec->ir_freecount)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return xfs_inobt_update(cur, &rec);
}
@@ -1478,7 +1559,7 @@ xfs_dialloc_ag(
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
error = xfs_check_agi_freecount(cur);
if (error)
@@ -1521,7 +1602,7 @@ xfs_dialloc_ag(
* the original freecount. If all is well, make the equivalent update to
* the inobt using the finobt record and offset information.
*/
- icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ icur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_check_agi_freecount(icur);
if (error)
@@ -1854,7 +1935,7 @@ xfs_difree_inode_chunk(
return xfs_free_extent_later(tp,
XFS_AGB_TO_FSB(mp, agno, sagbno),
M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
}
/* holemask is only 16-bits (fits in an unsigned long) */
@@ -1900,7 +1981,8 @@ xfs_difree_inode_chunk(
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
error = xfs_free_extent_later(tp,
XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
- &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE);
+ &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE,
+ false);
if (error)
return error;
@@ -1937,7 +2019,7 @@ xfs_difree_inobt(
/*
* Initialize the cursor.
*/
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_check_agi_freecount(cur);
if (error)
@@ -1952,6 +2034,7 @@ xfs_difree_inobt(
goto error0;
}
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -1962,6 +2045,7 @@ xfs_difree_inobt(
goto error0;
}
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error0;
}
@@ -2062,7 +2146,7 @@ xfs_difree_finobt(
int error;
int i;
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
@@ -2074,6 +2158,7 @@ xfs_difree_finobt(
* something is out of sync.
*/
if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -2100,6 +2185,7 @@ xfs_difree_finobt(
if (error)
goto error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -2110,6 +2196,7 @@ xfs_difree_finobt(
if (XFS_IS_CORRUPT(mp,
rec.ir_free != ibtrec->ir_free ||
rec.ir_freecount != ibtrec->ir_freecount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto error;
}
@@ -2259,7 +2346,7 @@ xfs_imap_lookup(
* we have a record, we need to ensure it contains the inode number
* we are looking up.
*/
- cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (!error) {
if (i)
@@ -2598,6 +2685,8 @@ xfs_read_agi(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
if (error)
return error;
if (tp)
@@ -2739,7 +2828,7 @@ xfs_ialloc_count_inodes_rec(
xfs_failaddr_t fa;
xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
- fa = xfs_inobt_check_irec(cur, &irec);
+ fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, &irec);
@@ -2759,7 +2848,7 @@ xfs_ialloc_count_inodes(
struct xfs_ialloc_count_inodes ci = {0};
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+ ASSERT(xfs_btree_is_ino(cur->bc_ops));
error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
if (error)
return error;
@@ -2976,7 +3065,7 @@ xfs_ialloc_check_shrink(
if (!xfs_has_sparseinodes(pag->pag_mount))
return 0;
- cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agibp);
/* Look up the inobt record that would correspond to the new EOFS. */
agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length);
@@ -2989,6 +3078,7 @@ xfs_ialloc_check_shrink(
goto out;
if (!has) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
error = -EFSCORRUPTED;
goto out;
}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index fe824bb04a09..f1412183bb44 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -79,6 +79,7 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
*/
int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
xfs_inobt_rec_incore_t *rec, int *stat);
+uint8_t xfs_inobt_rec_freecount(const struct xfs_inobt_rec_incore *irec);
/*
* Inode chunk initialisation routine
@@ -93,7 +94,7 @@ union xfs_btree_rec;
void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
const union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec);
-xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_inobt_check_irec(struct xfs_perag *pag,
const struct xfs_inobt_rec_incore *irec);
int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 9258f01c0015..cc661fca6ff5 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -17,6 +17,7 @@
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_rmap.h"
@@ -37,7 +38,15 @@ xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_btnum);
+ cur->bc_ag.agbp);
+}
+
+STATIC struct xfs_btree_cur *
+xfs_finobt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
+ cur->bc_ag.agbp);
}
STATIC void
@@ -81,9 +90,9 @@ xfs_inobt_mod_blockcount(
if (!xfs_has_inobtcounts(cur->bc_mp))
return;
- if (cur->bc_btnum == XFS_BTNUM_FINO)
+ if (xfs_btree_is_fino(cur->bc_ops))
be32_add_cpu(&agi->agi_fblocks, howmuch);
- else if (cur->bc_btnum == XFS_BTNUM_INO)
+ else
be32_add_cpu(&agi->agi_iblocks, howmuch);
xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
}
@@ -161,7 +170,7 @@ __xfs_inobt_free_block(
xfs_inobt_mod_blockcount(cur, -1);
fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
- &XFS_RMAP_OINFO_INOBT, resv);
+ &XFS_RMAP_OINFO_INOBT, resv, false);
}
STATIC int
@@ -300,7 +309,7 @@ xfs_inobt_verify(
* xfs_perag_initialised_agi(pag)) if we ever do.
*/
if (xfs_has_crc(mp)) {
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
}
@@ -310,7 +319,7 @@ xfs_inobt_verify(
if (level >= M_IGEO(mp)->inobt_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp,
+ return xfs_btree_agblock_verify(bp,
M_IGEO(mp)->inobt_mxr[level != 0]);
}
@@ -320,7 +329,7 @@ xfs_inobt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_inobt_verify(bp);
@@ -344,7 +353,7 @@ xfs_inobt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -398,9 +407,17 @@ xfs_inobt_keys_contiguous(
be32_to_cpu(key2->inobt.ir_startino));
}
-static const struct xfs_btree_ops xfs_inobt_ops = {
+const struct xfs_btree_ops xfs_inobt_ops = {
+ .name = "ino",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(xfs_inobt_rec_t),
.key_len = sizeof(xfs_inobt_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_INO_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_ibt_2),
+ .sick_mask = XFS_SICK_AG_INOBT,
.dup_cursor = xfs_inobt_dup_cursor,
.set_root = xfs_inobt_set_root,
@@ -420,11 +437,19 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.keys_contiguous = xfs_inobt_keys_contiguous,
};
-static const struct xfs_btree_ops xfs_finobt_ops = {
+const struct xfs_btree_ops xfs_finobt_ops = {
+ .name = "fino",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(xfs_inobt_rec_t),
.key_len = sizeof(xfs_inobt_key_t),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
- .dup_cursor = xfs_inobt_dup_cursor,
+ .lru_refs = XFS_INO_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_fibt_2),
+ .sick_mask = XFS_SICK_AG_FINOBT,
+
+ .dup_cursor = xfs_finobt_dup_cursor,
.set_root = xfs_finobt_set_root,
.alloc_block = xfs_finobt_alloc_block,
.free_block = xfs_finobt_free_block,
@@ -443,65 +468,54 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
};
/*
- * Initialize a new inode btree cursor.
+ * Create an inode btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
*/
-static struct xfs_btree_cur *
-xfs_inobt_init_common(
+struct xfs_btree_cur *
+xfs_inobt_init_cursor(
struct xfs_perag *pag,
- struct xfs_trans *tp, /* transaction pointer */
- xfs_btnum_t btnum) /* ialloc or free ino btree */
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
{
struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
- cur = xfs_btree_alloc_cursor(mp, tp, btnum,
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops,
M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
- if (btnum == XFS_BTNUM_INO) {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
- cur->bc_ops = &xfs_inobt_ops;
- } else {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
- cur->bc_ops = &xfs_finobt_ops;
- }
-
- if (xfs_has_crc(mp))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agi *agi = agbp->b_addr;
+
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ }
return cur;
}
-/* Create an inode btree cursor. */
+/*
+ * Create a free inode btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
+ */
struct xfs_btree_cur *
-xfs_inobt_init_cursor(
+xfs_finobt_init_cursor(
struct xfs_perag *pag,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- xfs_btnum_t btnum)
+ struct xfs_buf *agbp)
{
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = agbp->b_addr;
- cur = xfs_inobt_init_common(pag, tp, btnum);
- if (btnum == XFS_BTNUM_INO)
- cur->bc_nlevels = be32_to_cpu(agi->agi_level);
- else
- cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops,
+ M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
+ cur->bc_ag.pag = xfs_perag_hold(pag);
cur->bc_ag.agbp = agbp;
- return cur;
-}
+ if (agbp) {
+ struct xfs_agi *agi = agbp->b_addr;
-/* Create an inode btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_inobt_stage_cursor(
- struct xfs_perag *pag,
- struct xbtree_afakeroot *afake,
- xfs_btnum_t btnum)
-{
- struct xfs_btree_cur *cur;
-
- cur = xfs_inobt_init_common(pag, NULL, btnum);
- xfs_btree_stage_afakeroot(cur, afake);
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ }
return cur;
}
@@ -521,7 +535,7 @@ xfs_inobt_commit_staged_btree(
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- if (cur->bc_btnum == XFS_BTNUM_INO) {
+ if (xfs_btree_is_ino(cur->bc_ops)) {
fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
agi->agi_root = cpu_to_be32(afake->af_root);
agi->agi_level = cpu_to_be32(afake->af_levels);
@@ -530,7 +544,7 @@ xfs_inobt_commit_staged_btree(
fields |= XFS_AGI_IBLOCKS;
}
xfs_ialloc_log_agi(tp, agbp, fields);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
} else {
fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
agi->agi_free_root = cpu_to_be32(afake->af_root);
@@ -540,7 +554,7 @@ xfs_inobt_commit_staged_btree(
fields |= XFS_AGI_IBLOCKS;
}
xfs_ialloc_log_agi(tp, agbp, fields);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
}
@@ -721,45 +735,21 @@ xfs_inobt_max_size(
XFS_INODES_PER_CHUNK);
}
-/* Read AGI and create inobt cursor. */
-int
-xfs_inobt_cur(
- struct xfs_perag *pag,
- struct xfs_trans *tp,
- xfs_btnum_t which,
- struct xfs_btree_cur **curpp,
- struct xfs_buf **agi_bpp)
-{
- struct xfs_btree_cur *cur;
- int error;
-
- ASSERT(*agi_bpp == NULL);
- ASSERT(*curpp == NULL);
-
- error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
- if (error)
- return error;
-
- cur = xfs_inobt_init_cursor(pag, tp, *agi_bpp, which);
- *curpp = cur;
- return 0;
-}
-
static int
-xfs_inobt_count_blocks(
+xfs_finobt_count_blocks(
struct xfs_perag *pag,
struct xfs_trans *tp,
- xfs_btnum_t btnum,
xfs_extlen_t *tree_blocks)
{
struct xfs_buf *agbp = NULL;
- struct xfs_btree_cur *cur = NULL;
+ struct xfs_btree_cur *cur;
int error;
- error = xfs_inobt_cur(pag, tp, btnum, &cur, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error)
return error;
+ cur = xfs_inobt_init_cursor(pag, tp, agbp);
error = xfs_btree_count_blocks(cur, tree_blocks);
xfs_btree_del_cursor(cur, error);
xfs_trans_brelse(tp, agbp);
@@ -807,8 +797,7 @@ xfs_finobt_calc_reserves(
if (xfs_has_inobtcounts(pag->pag_mount))
error = xfs_finobt_read_blocks(pag, tp, &tree_len);
else
- error = xfs_inobt_count_blocks(pag, tp, XFS_BTNUM_FINO,
- &tree_len);
+ error = xfs_finobt_count_blocks(pag, tp, &tree_len);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 3262c3fe5ebe..6472ec1ecbb4 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -46,10 +46,10 @@ struct xfs_perag;
(maxrecs) * sizeof(xfs_inobt_key_t) + \
((index) - 1) * sizeof(xfs_inobt_ptr_t)))
-extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
- struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum);
-struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag,
- struct xbtree_afakeroot *afake, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+struct xfs_btree_cur *xfs_finobt_init_cursor(struct xfs_perag *pag,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
/* ir_holemask to inode allocation bitmap conversion */
@@ -66,9 +66,6 @@ int xfs_finobt_calc_reserves(struct xfs_perag *perag, struct xfs_trans *tp,
xfs_extlen_t *ask, xfs_extlen_t *used);
extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
-int xfs_inobt_cur(struct xfs_perag *pag, struct xfs_trans *tp,
- xfs_btnum_t btnum, struct xfs_btree_cur **curpp,
- struct xfs_buf **agi_bpp);
void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 773cf4349428..8796f2b3e534 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -158,7 +158,7 @@ static void *
xfs_iext_find_first_leaf(
struct xfs_ifork *ifp)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height;
if (!ifp->if_height)
@@ -176,7 +176,7 @@ static void *
xfs_iext_find_last_leaf(
struct xfs_ifork *ifp)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height, i;
if (!ifp->if_height)
@@ -306,7 +306,7 @@ xfs_iext_find_level(
xfs_fileoff_t offset,
int level)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height, i;
if (!ifp->if_height)
@@ -394,20 +394,27 @@ xfs_iext_leaf_key(
return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
}
+static inline void *
+xfs_iext_alloc_node(
+ int size)
+{
+ return kzalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+}
+
static void
xfs_iext_grow(
struct xfs_ifork *ifp)
{
- struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ struct xfs_iext_node *node = xfs_iext_alloc_node(NODE_SIZE);
int i;
if (ifp->if_height == 1) {
- struct xfs_iext_leaf *prev = ifp->if_u1.if_root;
+ struct xfs_iext_leaf *prev = ifp->if_data;
node->keys[0] = xfs_iext_leaf_key(prev, 0);
node->ptrs[0] = prev;
} else {
- struct xfs_iext_node *prev = ifp->if_u1.if_root;
+ struct xfs_iext_node *prev = ifp->if_data;
ASSERT(ifp->if_height > 1);
@@ -418,7 +425,7 @@ xfs_iext_grow(
for (i = 1; i < KEYS_PER_NODE; i++)
node->keys[i] = XFS_IEXT_KEY_INVALID;
- ifp->if_u1.if_root = node;
+ ifp->if_data = node;
ifp->if_height++;
}
@@ -430,7 +437,7 @@ xfs_iext_update_node(
int level,
void *ptr)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height, i;
for (height = ifp->if_height; height > level; height--) {
@@ -454,7 +461,7 @@ xfs_iext_split_node(
int *nr_entries)
{
struct xfs_iext_node *node = *nodep;
- struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ struct xfs_iext_node *new = xfs_iext_alloc_node(NODE_SIZE);
const int nr_move = KEYS_PER_NODE / 2;
int nr_keep = nr_move + (KEYS_PER_NODE & 1);
int i = 0;
@@ -542,7 +549,7 @@ xfs_iext_split_leaf(
int *nr_entries)
{
struct xfs_iext_leaf *leaf = cur->leaf;
- struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ struct xfs_iext_leaf *new = xfs_iext_alloc_node(NODE_SIZE);
const int nr_move = RECS_PER_LEAF / 2;
int nr_keep = nr_move + (RECS_PER_LEAF & 1);
int i;
@@ -583,11 +590,11 @@ xfs_iext_alloc_root(
{
ASSERT(ifp->if_bytes == 0);
- ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
+ ifp->if_data = xfs_iext_alloc_node(sizeof(struct xfs_iext_rec));
ifp->if_height = 1;
/* now that we have a node step into it */
- cur->leaf = ifp->if_u1.if_root;
+ cur->leaf = ifp->if_data;
cur->pos = 0;
}
@@ -603,9 +610,10 @@ xfs_iext_realloc_root(
if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
new_size = NODE_SIZE;
- new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL);
+ new = krealloc(ifp->if_data, new_size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
- ifp->if_u1.if_root = new;
+ ifp->if_data = new;
cur->leaf = new;
}
@@ -622,13 +630,11 @@ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
}
void
-xfs_iext_insert(
- struct xfs_inode *ip,
+xfs_iext_insert_raw(
+ struct xfs_ifork *ifp,
struct xfs_iext_cursor *cur,
- struct xfs_bmbt_irec *irec,
- int state)
+ struct xfs_bmbt_irec *irec)
{
- struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
xfs_fileoff_t offset = irec->br_startoff;
struct xfs_iext_leaf *new = NULL;
int nr_entries, i;
@@ -662,12 +668,23 @@ xfs_iext_insert(
xfs_iext_set(cur_rec(cur), irec);
ifp->if_bytes += sizeof(struct xfs_iext_rec);
- trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
-
if (new)
xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
}
+void
+xfs_iext_insert(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *irec,
+ int state)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+
+ xfs_iext_insert_raw(ifp, cur, irec);
+ trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
+}
+
static struct xfs_iext_node *
xfs_iext_rebalance_node(
struct xfs_iext_node *parent,
@@ -734,7 +751,7 @@ xfs_iext_remove_node(
again:
ASSERT(node->ptrs[pos]);
ASSERT(node->ptrs[pos] == victim);
- kmem_free(victim);
+ kfree(victim);
nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
offset = node->keys[0];
@@ -777,10 +794,10 @@ again:
* If we are at the root and only one entry is left we can just
* free this node and update the root pointer.
*/
- ASSERT(node == ifp->if_u1.if_root);
- ifp->if_u1.if_root = node->ptrs[0];
+ ASSERT(node == ifp->if_data);
+ ifp->if_data = node->ptrs[0];
ifp->if_height--;
- kmem_free(node);
+ kfree(node);
}
}
@@ -854,8 +871,8 @@ xfs_iext_free_last_leaf(
struct xfs_ifork *ifp)
{
ifp->if_height--;
- kmem_free(ifp->if_u1.if_root);
- ifp->if_u1.if_root = NULL;
+ kfree(ifp->if_data);
+ ifp->if_data = NULL;
}
void
@@ -872,7 +889,7 @@ xfs_iext_remove(
trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
ASSERT(ifp->if_height > 0);
- ASSERT(ifp->if_u1.if_root != NULL);
+ ASSERT(ifp->if_data != NULL);
ASSERT(xfs_iext_valid(ifp, cur));
xfs_iext_inc_seq(ifp);
@@ -1035,16 +1052,16 @@ xfs_iext_destroy_node(
}
}
- kmem_free(node);
+ kfree(node);
}
void
xfs_iext_destroy(
struct xfs_ifork *ifp)
{
- xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height);
+ xfs_iext_destroy_node(ifp->if_data, ifp->if_height);
ifp->if_bytes = 0;
ifp->if_height = 0;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 543f3748c2a3..d0dcce462bf4 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -18,6 +18,7 @@
#include "xfs_trans.h"
#include "xfs_ialloc.h"
#include "xfs_dir2.h"
+#include "xfs_health.h"
#include <linux/iversion.h>
@@ -132,9 +133,14 @@ xfs_imap_to_bp(
struct xfs_imap *imap,
struct xfs_buf **bpp)
{
- return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
- imap->im_len, XBF_UNMAPPED, bpp,
- &xfs_inode_buf_ops);
+ int error;
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+ imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno),
+ XFS_SICK_AG_INODES);
+ return error;
}
static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts)
@@ -510,6 +516,9 @@ xfs_dinode_verify(
if (mode && nextents + naextents > nblocks)
return __this_address;
+ if (nextents + naextents == 0 && nblocks != 0)
+ return __this_address;
+
if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 5a2e7ddfa76d..7d660a973909 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -25,6 +25,8 @@
#include "xfs_attr_leaf.h"
#include "xfs_types.h"
#include "xfs_errortag.h"
+#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
struct kmem_cache *xfs_ifork_cache;
@@ -50,12 +52,16 @@ xfs_init_local_fork(
mem_size++;
if (size) {
- ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
- memcpy(ifp->if_u1.if_data, data, size);
+ char *new_data = kmalloc(mem_size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+
+ memcpy(new_data, data, size);
if (zero_terminate)
- ifp->if_u1.if_data[size] = '\0';
+ new_data[size] = '\0';
+
+ ifp->if_data = new_data;
} else {
- ifp->if_u1.if_data = NULL;
+ ifp->if_data = NULL;
}
ifp->if_bytes = size;
@@ -74,7 +80,7 @@ xfs_iformat_local(
/*
* If the size is unreasonable, then something
* is wrong and we just bail out rather than crash in
- * kmem_alloc() or memcpy() below.
+ * kmalloc() or memcpy() below.
*/
if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
xfs_warn(ip->i_mount,
@@ -84,6 +90,7 @@ xfs_iformat_local(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_local", dip, sizeof(*dip),
__this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
@@ -113,7 +120,7 @@ xfs_iformat_extents(
/*
* If the number of extents is unreasonable, then something is wrong and
- * we just bail out rather than crash in kmem_alloc() or memcpy() below.
+ * we just bail out rather than crash in kmalloc() or memcpy() below.
*/
if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
@@ -121,11 +128,12 @@ xfs_iformat_extents(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(1)", dip, sizeof(*dip),
__this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
ifp->if_bytes = 0;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
if (size) {
dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
@@ -140,6 +148,7 @@ xfs_iformat_extents(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(2)",
dp, sizeof(*dp), fa);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return xfs_bmap_complain_bad_rec(ip, whichfork,
fa, &new);
}
@@ -198,11 +207,13 @@ xfs_iformat_btree(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_btree", dfp, size,
__this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
ifp->if_broot_bytes = size;
- ifp->if_broot = kmem_alloc(size, KM_NOFS);
+ ifp->if_broot = kmalloc(size,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
ASSERT(ifp->if_broot != NULL);
/*
* Copy and convert from the on-disk structure
@@ -212,7 +223,7 @@ xfs_iformat_btree(
ifp->if_broot, size);
ifp->if_bytes = 0;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
return 0;
}
@@ -262,12 +273,14 @@ xfs_iformat_data_fork(
default:
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
dip, sizeof(*dip), __this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
break;
default:
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
sizeof(*dip), __this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
}
}
@@ -276,10 +289,9 @@ static uint16_t
xfs_dfork_attr_shortform_size(
struct xfs_dinode *dip)
{
- struct xfs_attr_shortform *atp =
- (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip);
+ struct xfs_attr_sf_hdr *sf = XFS_DFORK_APTR(dip);
- return be16_to_cpu(atp->hdr.totsize);
+ return be16_to_cpu(sf->totsize);
}
void
@@ -340,6 +352,7 @@ xfs_iformat_attr_fork(
default:
xfs_inode_verifier_error(ip, error, __func__, dip,
sizeof(*dip), __this_address);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
error = -EFSCORRUPTED;
break;
}
@@ -397,7 +410,8 @@ xfs_iroot_realloc(
*/
if (ifp->if_broot_bytes == 0) {
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
- ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
+ ifp->if_broot = kmalloc(new_size,
+ GFP_KERNEL | __GFP_NOFAIL);
ifp->if_broot_bytes = (int)new_size;
return;
}
@@ -412,7 +426,7 @@ xfs_iroot_realloc(
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
ifp->if_broot = krealloc(ifp->if_broot, new_size,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -438,7 +452,7 @@ xfs_iroot_realloc(
else
new_size = 0;
if (new_size > 0) {
- new_broot = kmem_alloc(new_size, KM_NOFS);
+ new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
/*
* First copy over the btree block header.
*/
@@ -468,7 +482,7 @@ xfs_iroot_realloc(
(int)new_size);
memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
}
- kmem_free(ifp->if_broot);
+ kfree(ifp->if_broot);
ifp->if_broot = new_broot;
ifp->if_broot_bytes = (int)new_size;
if (ifp->if_broot)
@@ -486,14 +500,14 @@ xfs_iroot_realloc(
*
* If the amount of space needed has decreased below the size of the
* inline buffer, then switch to using the inline buffer. Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * use krealloc() or kmalloc() to adjust the size of the buffer
* to what is needed.
*
* ip -- the inode whose if_data area is changing
* byte_diff -- the change in the number of bytes, positive or negative,
* requested for the if_data array.
*/
-void
+void *
xfs_idata_realloc(
struct xfs_inode *ip,
int64_t byte_diff,
@@ -505,34 +519,31 @@ xfs_idata_realloc(
ASSERT(new_size >= 0);
ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork));
- if (byte_diff == 0)
- return;
-
- if (new_size == 0) {
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = NULL;
- ifp->if_bytes = 0;
- return;
+ if (byte_diff) {
+ ifp->if_data = krealloc(ifp->if_data, new_size,
+ GFP_KERNEL | __GFP_NOFAIL);
+ if (new_size == 0)
+ ifp->if_data = NULL;
+ ifp->if_bytes = new_size;
}
- ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
- GFP_NOFS | __GFP_NOFAIL);
- ifp->if_bytes = new_size;
+ return ifp->if_data;
}
+/* Free all memory and reset a fork back to its initial state. */
void
xfs_idestroy_fork(
struct xfs_ifork *ifp)
{
if (ifp->if_broot != NULL) {
- kmem_free(ifp->if_broot);
+ kfree(ifp->if_broot);
ifp->if_broot = NULL;
}
switch (ifp->if_format) {
case XFS_DINODE_FMT_LOCAL:
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = NULL;
+ kfree(ifp->if_data);
+ ifp->if_data = NULL;
break;
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
@@ -563,7 +574,7 @@ xfs_iextents_copy(
struct xfs_bmbt_irec rec;
int64_t copied = 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
ASSERT(ifp->if_bytes > 0);
for_each_xfs_iext(ifp, &icur, &rec) {
@@ -625,9 +636,9 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & dataflag[whichfork]) &&
(ifp->if_bytes > 0)) {
- ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(ifp->if_data != NULL);
ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork));
- memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+ memcpy(cp, ifp->if_data, ifp->if_bytes);
}
break;
@@ -690,7 +701,7 @@ xfs_ifork_init_cow(
return;
ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
}
@@ -702,19 +713,27 @@ xfs_ifork_verify_local_data(
xfs_failaddr_t fa = NULL;
switch (VFS_I(ip)->i_mode & S_IFMT) {
- case S_IFDIR:
- fa = xfs_dir2_sf_verify(ip);
+ case S_IFDIR: {
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_dir2_sf_hdr *sfp = ifp->if_data;
+
+ fa = xfs_dir2_sf_verify(mp, sfp, ifp->if_bytes);
break;
- case S_IFLNK:
- fa = xfs_symlink_shortform_verify(ip);
+ }
+ case S_IFLNK: {
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+ fa = xfs_symlink_shortform_verify(ifp->if_data, ifp->if_bytes);
break;
+ }
default:
break;
}
if (fa) {
xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
- ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa);
+ ip->i_df.if_data, ip->i_df.if_bytes, fa);
return -EFSCORRUPTED;
}
@@ -729,14 +748,17 @@ xfs_ifork_verify_local_attr(
struct xfs_ifork *ifp = &ip->i_af;
xfs_failaddr_t fa;
- if (!xfs_inode_has_attr_fork(ip))
+ if (!xfs_inode_has_attr_fork(ip)) {
fa = __this_address;
- else
- fa = xfs_attr_shortform_verify(ip);
+ } else {
+ struct xfs_ifork *ifp = &ip->i_af;
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ fa = xfs_attr_shortform_verify(ifp->if_data, ifp->if_bytes);
+ }
if (fa) {
xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
- ifp->if_u1.if_data, ifp->if_bytes, fa);
+ ifp->if_data, ifp->if_bytes, fa);
return -EFSCORRUPTED;
}
@@ -792,3 +814,12 @@ xfs_iext_count_upgrade(
return 0;
}
+
+/* Decide if a file mapping is on the realtime device or not. */
+bool
+xfs_ifork_is_realtime(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ return XFS_IS_REALTIME_INODE(ip) && whichfork != XFS_ATTR_FORK;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 96d307784c85..bd53eb951b65 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -13,14 +13,12 @@ struct xfs_dinode;
* File incore extent information, present for each of data & attr forks.
*/
struct xfs_ifork {
- int64_t if_bytes; /* bytes in if_u1 */
+ int64_t if_bytes; /* bytes in if_data */
struct xfs_btree_block *if_broot; /* file's incore btree root */
unsigned int if_seq; /* fork mod counter */
int if_height; /* height of the extent tree */
- union {
- void *if_root; /* extent tree root */
- char *if_data; /* inline file data */
- } if_u1;
+ void *if_data; /* extent tree root or
+ inline data */
xfs_extnum_t if_nextents; /* # of extents in this fork */
short if_broot_bytes; /* bytes allocated for root */
int8_t if_format; /* format of this fork */
@@ -170,7 +168,7 @@ int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *);
void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
struct xfs_inode_log_item *, int);
void xfs_idestroy_fork(struct xfs_ifork *ifp);
-void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
+void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
int whichfork);
void xfs_iroot_realloc(struct xfs_inode *, int, int);
int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
@@ -180,6 +178,9 @@ void xfs_init_local_fork(struct xfs_inode *ip, int whichfork,
const void *data, int64_t size);
xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp);
+void xfs_iext_insert_raw(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *irec);
void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur,
struct xfs_bmbt_irec *, int);
void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *,
@@ -259,6 +260,7 @@ int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
int nr_to_add);
int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
uint nr_to_add);
+bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork);
/* returns true if the fork has extents but they are not read in yet. */
static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 269573c82808..16872972e1e9 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -838,10 +838,12 @@ struct xfs_cud_log_format {
#define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31)
#define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30)
+#define XFS_BMAP_EXTENT_REALTIME (1U << 29)
#define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \
XFS_BMAP_EXTENT_ATTR_FORK | \
- XFS_BMAP_EXTENT_UNWRITTEN)
+ XFS_BMAP_EXTENT_UNWRITTEN | \
+ XFS_BMAP_EXTENT_REALTIME)
/*
* This is the structure used to lay out an bui log item in the
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index a5100a11faf9..9fe7a9564bca 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -11,6 +11,7 @@
* define how recovery should work for that type of log item.
*/
struct xlog_recover_item;
+struct xfs_defer_op_type;
/* Sorting hat for log items as they're read in. */
enum xlog_recover_reorder {
@@ -153,4 +154,11 @@ xlog_recover_resv(const struct xfs_trans_res *r)
return ret;
}
+struct xfs_defer_pending;
+
+void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip,
+ xfs_lsn_t lsn, const struct xfs_defer_op_type *ops);
+int xlog_recover_finish_intent(struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp);
+
#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index c4cc99b70dd3..81885a6a028e 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -7,16 +7,16 @@
#define __XFS_ONDISK_H
#define XFS_CHECK_STRUCT_SIZE(structname, size) \
- BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
- #structname ") is wrong, expected " #size)
+ static_assert(sizeof(structname) == (size), \
+ "XFS: sizeof(" #structname ") is wrong, expected " #size)
#define XFS_CHECK_OFFSET(structname, member, off) \
- BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
+ static_assert(offsetof(structname, member) == (off), \
"XFS: offsetof(" #structname ", " #member ") is wrong, " \
"expected " #off)
#define XFS_CHECK_VALUE(value, expected) \
- BUILD_BUG_ON_MSG((value) != (expected), \
+ static_assert((value) == (expected), \
"XFS: value of " #value " is wrong, expected " #expected)
static inline void __init
@@ -72,6 +72,10 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
+ /* realtime structures */
+ XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4);
+ XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4);
+
/*
* m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
* 4 bytes anyway so it's not obviously a problem. Hence for the moment
@@ -89,13 +93,13 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32);
- XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, namelen, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3);
XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 646b3fa362ad..511c912d515c 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -23,6 +23,7 @@
#include "xfs_refcount.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_refcount_intent_cache;
@@ -123,11 +124,9 @@ xfs_refcount_btrec_to_irec(
/* Simple checks for refcount records. */
xfs_failaddr_t
xfs_refcount_check_irec(
- struct xfs_btree_cur *cur,
+ struct xfs_perag *pag,
const struct xfs_refcount_irec *irec)
{
- struct xfs_perag *pag = cur->bc_ag.pag;
-
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
return __this_address;
@@ -158,6 +157,7 @@ xfs_refcount_complain_bad_rec(
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -179,7 +179,7 @@ xfs_refcount_get_rec(
return error;
xfs_refcount_btrec_to_irec(rec, irec);
- fa = xfs_refcount_check_irec(cur, irec);
+ fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, irec);
@@ -240,6 +240,7 @@ xfs_refcount_insert(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -270,12 +271,14 @@ xfs_refcount_delete(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
error = xfs_btree_delete(cur, i);
if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -400,6 +403,7 @@ xfs_refcount_split_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -427,6 +431,7 @@ xfs_refcount_split_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -472,6 +477,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -480,6 +486,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -489,6 +496,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -500,6 +508,7 @@ xfs_refcount_merge_center_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -544,6 +553,7 @@ xfs_refcount_merge_left_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -552,6 +562,7 @@ xfs_refcount_merge_left_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -563,6 +574,7 @@ xfs_refcount_merge_left_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -610,6 +622,7 @@ xfs_refcount_merge_right_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -618,6 +631,7 @@ xfs_refcount_merge_right_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -629,6 +643,7 @@ xfs_refcount_merge_right_extent(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -676,6 +691,7 @@ xfs_refcount_find_left_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -695,6 +711,7 @@ xfs_refcount_find_left_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -769,6 +786,7 @@ xfs_refcount_find_right_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -788,6 +806,7 @@ xfs_refcount_find_right_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1058,7 +1077,7 @@ xfs_refcount_still_have_space(
* to handle each of the shape changes to the refcount btree.
*/
overhead = xfs_allocfree_block_count(cur->bc_mp,
- cur->bc_ag.refc.shape_changes);
+ cur->bc_refc.shape_changes);
overhead += cur->bc_mp->m_refc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
@@ -1066,17 +1085,17 @@ xfs_refcount_still_have_space(
* Only allow 2 refcount extent updates per transaction if the
* refcount continue update "error" has been injected.
*/
- if (cur->bc_ag.refc.nr_ops > 2 &&
+ if (cur->bc_refc.nr_ops > 2 &&
XFS_TEST_ERROR(false, cur->bc_mp,
XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
- if (cur->bc_ag.refc.nr_ops == 0)
+ if (cur->bc_refc.nr_ops == 0)
return true;
else if (overhead > cur->bc_tp->t_log_res)
return false;
- return cur->bc_tp->t_log_res - overhead >
- cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+ return cur->bc_tp->t_log_res - overhead >
+ cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
/*
@@ -1136,7 +1155,7 @@ xfs_refcount_adjust_extents(
* Either cover the hole (increment) or
* delete the range (decrement).
*/
- cur->bc_ag.refc.nr_ops++;
+ cur->bc_refc.nr_ops++;
if (tmp.rc_refcount) {
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -1144,6 +1163,7 @@ xfs_refcount_adjust_extents(
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp,
found_tmp != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1153,7 +1173,7 @@ xfs_refcount_adjust_extents(
tmp.rc_startblock);
error = xfs_free_extent_later(cur->bc_tp, fsbno,
tmp.rc_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
goto out_error;
}
@@ -1182,6 +1202,7 @@ xfs_refcount_adjust_extents(
*/
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) ||
XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1195,7 +1216,7 @@ xfs_refcount_adjust_extents(
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &ext);
- cur->bc_ag.refc.nr_ops++;
+ cur->bc_refc.nr_ops++;
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
@@ -1205,6 +1226,7 @@ xfs_refcount_adjust_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1215,7 +1237,7 @@ xfs_refcount_adjust_extents(
ext.rc_startblock);
error = xfs_free_extent_later(cur->bc_tp, fsbno,
ext.rc_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
goto out_error;
}
@@ -1283,7 +1305,7 @@ xfs_refcount_adjust(
if (shape_changed)
shape_changes++;
if (shape_changes)
- cur->bc_ag.refc.shape_changes++;
+ cur->bc_refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj);
@@ -1329,8 +1351,10 @@ xfs_refcount_continue_op(
struct xfs_perag *pag = cur->bc_ag.pag;
if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno,
- ri->ri_blockcount)))
+ ri->ri_blockcount))) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
@@ -1376,8 +1400,8 @@ xfs_refcount_finish_one(
*/
rcur = *pcur;
if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
- nr_ops = rcur->bc_ag.refc.nr_ops;
- shape_changes = rcur->bc_ag.refc.shape_changes;
+ nr_ops = rcur->bc_refc.nr_ops;
+ shape_changes = rcur->bc_refc.shape_changes;
xfs_refcount_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
@@ -1389,8 +1413,8 @@ xfs_refcount_finish_one(
return error;
rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag);
- rcur->bc_ag.refc.nr_ops = nr_ops;
- rcur->bc_ag.refc.shape_changes = shape_changes;
+ rcur->bc_refc.nr_ops = nr_ops;
+ rcur->bc_refc.shape_changes = shape_changes;
}
*pcur = rcur;
@@ -1451,14 +1475,14 @@ __xfs_refcount_add(
blockcount);
ri = kmem_cache_alloc(xfs_refcount_intent_cache,
- GFP_NOFS | __GFP_NOFAIL);
+ GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_startblock = startblock;
ri->ri_blockcount = blockcount;
xfs_refcount_update_get_group(tp->t_mountp, ri);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
+ xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
}
/*
@@ -1537,6 +1561,7 @@ xfs_refcount_find_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1554,6 +1579,7 @@ xfs_refcount_find_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1587,6 +1613,7 @@ xfs_refcount_find_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1684,6 +1711,7 @@ xfs_refcount_adjust_cow_extents(
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec &&
ext.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1699,6 +1727,7 @@ xfs_refcount_adjust_cow_extents(
/* Adding a CoW reservation, there should be nothing here. */
if (XFS_IS_CORRUPT(cur->bc_mp,
agbno + aglen > ext.rc_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1716,6 +1745,7 @@ xfs_refcount_adjust_cow_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1723,14 +1753,17 @@ xfs_refcount_adjust_cow_extents(
case XFS_REFCOUNT_ADJUST_COW_FREE:
/* Removing a CoW reservation, there should be one extent. */
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1742,6 +1775,7 @@ xfs_refcount_adjust_cow_extents(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1891,17 +1925,20 @@ xfs_refcount_recover_extent(
struct xfs_refcount_recovery *rr;
if (XFS_IS_CORRUPT(cur->bc_mp,
- be32_to_cpu(rec->refc.rc_refcount) != 1))
+ be32_to_cpu(rec->refc.rc_refcount) != 1)) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
rr = kmalloc(sizeof(struct xfs_refcount_recovery),
GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL ||
+ if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ xfs_btree_mark_sick(cur);
kfree(rr);
return -EFSCORRUPTED;
}
@@ -1985,7 +2022,7 @@ xfs_refcount_recover_cow_leftovers(
/* Free the block. */
error = xfs_free_extent_later(tp, fsb,
rr->rr_rrec.rc_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
goto out_trans;
@@ -2033,6 +2070,47 @@ xfs_refcount_has_records(
return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
}
+struct xfs_refcount_query_range_info {
+ xfs_refcount_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_refcount_query_range_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_refcount_query_range_info *query = priv;
+ struct xfs_refcount_irec irec;
+ xfs_failaddr_t fa;
+
+ xfs_refcount_btrec_to_irec(rec, &irec);
+ fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec);
+ if (fa)
+ return xfs_refcount_complain_bad_rec(cur, fa, &irec);
+
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all refcount records between two keys. */
+int
+xfs_refcount_query_range(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *low_rec,
+ const struct xfs_refcount_irec *high_rec,
+ xfs_refcount_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec = { .rc = *low_rec };
+ union xfs_btree_irec high_brec = { .rc = *high_rec };
+ struct xfs_refcount_query_range_info query = { .priv = priv, .fn = fn };
+
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_refcount_query_range_helper, &query);
+}
+
int __init
xfs_refcount_intent_init_cache(void)
{
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 783cd89ca195..9b56768a590c 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -117,7 +117,7 @@ extern int xfs_refcount_has_records(struct xfs_btree_cur *cur,
union xfs_btree_rec;
extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
-xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag,
const struct xfs_refcount_irec *irec);
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
@@ -127,4 +127,14 @@ extern struct kmem_cache *xfs_refcount_intent_cache;
int __init xfs_refcount_intent_init_cache(void);
void xfs_refcount_intent_destroy_cache(void);
+typedef int (*xfs_refcount_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *rec,
+ void *priv);
+
+int xfs_refcount_query_range(struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *low_rec,
+ const struct xfs_refcount_irec *high_rec,
+ xfs_refcount_query_range_fn fn, void *priv);
+
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 5c3987d8dc24..ca59f6c89f3e 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -16,6 +16,7 @@
#include "xfs_refcount.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_bit.h"
@@ -77,8 +78,6 @@ xfs_refcountbt_alloc_block(
xfs_refc_block(args.mp)));
if (error)
goto out_error;
- trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- args.agbno, 1);
if (args.fsbno == NULLFSBLOCK) {
*stat = 0;
return 0;
@@ -107,12 +106,10 @@ xfs_refcountbt_free_block(
struct xfs_agf *agf = agbp->b_addr;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
- trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
- &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
+ &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false);
}
STATIC int
@@ -220,18 +217,29 @@ xfs_refcountbt_verify(
if (!xfs_has_reflink(mp))
return __this_address;
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
level = be16_to_cpu(block->bb_level);
if (pag && xfs_perag_initialised_agf(pag)) {
- if (level >= pag->pagf_refcount_level)
+ unsigned int maxlevel = pag->pagf_refcount_level;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Online repair could be rewriting the refcount btree, so
+ * we'll validate against the larger of either tree while this
+ * is going on.
+ */
+ maxlevel = max_t(unsigned int, maxlevel,
+ pag->pagf_repair_refcount_level);
+#endif
+ if (level >= maxlevel)
return __this_address;
} else if (level >= mp->m_refc_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]);
+ return xfs_btree_agblock_verify(bp, mp->m_refc_mxr[level != 0]);
}
STATIC void
@@ -240,7 +248,7 @@ xfs_refcountbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_refcountbt_verify(bp);
@@ -264,7 +272,7 @@ xfs_refcountbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -310,9 +318,17 @@ xfs_refcountbt_keys_contiguous(
be32_to_cpu(key2->refc.rc_startblock));
}
-static const struct xfs_btree_ops xfs_refcountbt_ops = {
+const struct xfs_btree_ops xfs_refcountbt_ops = {
+ .name = "refcount",
+ .type = XFS_BTREE_TYPE_AG,
+
.rec_len = sizeof(struct xfs_refcount_rec),
.key_len = sizeof(struct xfs_refcount_key),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_REFC_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2),
+ .sick_mask = XFS_SICK_AG_REFCNTBT,
.dup_cursor = xfs_refcountbt_dup_cursor,
.set_root = xfs_refcountbt_set_root,
@@ -333,59 +349,32 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = {
};
/*
- * Initialize a new refcount btree cursor.
+ * Create a new refcount btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
*/
-static struct xfs_btree_cur *
-xfs_refcountbt_init_common(
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
+ struct xfs_buf *agbp,
struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
- cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC,
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops,
mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
-
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
cur->bc_ag.pag = xfs_perag_hold(pag);
- cur->bc_ag.refc.nr_ops = 0;
- cur->bc_ag.refc.shape_changes = 0;
- cur->bc_ops = &xfs_refcountbt_ops;
- return cur;
-}
-
-/* Create a btree cursor. */
-struct xfs_btree_cur *
-xfs_refcountbt_init_cursor(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_buf *agbp,
- struct xfs_perag *pag)
-{
- struct xfs_agf *agf = agbp->b_addr;
- struct xfs_btree_cur *cur;
-
- cur = xfs_refcountbt_init_common(mp, tp, pag);
- cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+ cur->bc_refc.nr_ops = 0;
+ cur->bc_refc.shape_changes = 0;
cur->bc_ag.agbp = agbp;
- return cur;
-}
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
-/* Create a btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_refcountbt_stage_cursor(
- struct xfs_mount *mp,
- struct xbtree_afakeroot *afake,
- struct xfs_perag *pag)
-{
- struct xfs_btree_cur *cur;
-
- cur = xfs_refcountbt_init_common(mp, NULL, pag);
- xfs_btree_stage_afakeroot(cur, afake);
+ cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+ }
return cur;
}
@@ -410,7 +399,7 @@ xfs_refcountbt_commit_staged_btree(
xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS |
XFS_AGF_REFCOUNT_ROOT |
XFS_AGF_REFCOUNT_LEVEL);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
/* Calculate number of records in a refcount btree block. */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index d66b37259bed..1e0ab25f6c68 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -48,8 +48,6 @@ struct xbtree_afakeroot;
extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *agbp,
struct xfs_perag *pag);
-struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, struct xfs_perag *pag);
extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index fbb0b2637463..ef16f6f9cef6 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -23,6 +23,7 @@
#include "xfs_error.h"
#include "xfs_inode.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_rmap_intent_cache;
@@ -56,8 +57,10 @@ xfs_rmap_lookup_le(
error = xfs_rmap_get_rec(cur, irec, &get_stat);
if (error)
return error;
- if (!get_stat)
+ if (!get_stat) {
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
+ }
return 0;
}
@@ -132,6 +135,7 @@ xfs_rmap_insert(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -145,6 +149,7 @@ xfs_rmap_insert(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -174,6 +179,7 @@ xfs_rmap_delete(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -182,6 +188,7 @@ xfs_rmap_delete(
if (error)
goto done;
if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ xfs_btree_mark_sick(rcur);
error = -EFSCORRUPTED;
goto done;
}
@@ -208,10 +215,10 @@ xfs_rmap_btrec_to_irec(
/* Simple checks for rmap records. */
xfs_failaddr_t
xfs_rmap_check_irec(
- struct xfs_btree_cur *cur,
+ struct xfs_perag *pag,
const struct xfs_rmap_irec *irec)
{
- struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_mount *mp = pag->pag_mount;
bool is_inode;
bool is_unwritten;
bool is_bmbt;
@@ -226,8 +233,8 @@ xfs_rmap_check_irec(
return __this_address;
} else {
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock,
- irec->rm_blockcount))
+ if (!xfs_verify_agbext(pag, irec->rm_startblock,
+ irec->rm_blockcount))
return __this_address;
}
@@ -262,6 +269,16 @@ xfs_rmap_check_irec(
return NULL;
}
+static inline xfs_failaddr_t
+xfs_rmap_check_btrec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *irec)
+{
+ if (xfs_btree_is_mem_rmap(cur->bc_ops))
+ return xfs_rmap_check_irec(cur->bc_mem.pag, irec);
+ return xfs_rmap_check_irec(cur->bc_ag.pag, irec);
+}
+
static inline int
xfs_rmap_complain_bad_rec(
struct xfs_btree_cur *cur,
@@ -270,13 +287,18 @@ xfs_rmap_complain_bad_rec(
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_warn(mp,
- "Reverse Mapping BTree record corruption in AG %d detected at %pS!",
- cur->bc_ag.pag->pag_agno, fa);
+ if (xfs_btree_is_mem_rmap(cur->bc_ops))
+ xfs_warn(mp,
+ "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa);
+ else
+ xfs_warn(mp,
+ "Reverse Mapping BTree record corruption in AG %d detected at %pS!",
+ cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
irec->rm_owner, irec->rm_flags, irec->rm_startblock,
irec->rm_blockcount);
+ xfs_btree_mark_sick(cur);
return -EFSCORRUPTED;
}
@@ -299,7 +321,7 @@ xfs_rmap_get_rec(
fa = xfs_rmap_btrec_to_irec(rec, irec);
if (!fa)
- fa = xfs_rmap_check_irec(cur, irec);
+ fa = xfs_rmap_check_btrec(cur, irec);
if (fa)
return xfs_rmap_complain_bad_rec(cur, fa, irec);
@@ -512,7 +534,7 @@ xfs_rmap_lookup_le_range(
*/
static int
xfs_rmap_free_check_owner(
- struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
uint64_t ltoff,
struct xfs_rmap_irec *rec,
xfs_filblks_t len,
@@ -520,6 +542,7 @@ xfs_rmap_free_check_owner(
uint64_t offset,
unsigned int flags)
{
+ struct xfs_mount *mp = cur->bc_mp;
int error = 0;
if (owner == XFS_RMAP_OWN_UNKNOWN)
@@ -529,12 +552,14 @@ xfs_rmap_free_check_owner(
if (XFS_IS_CORRUPT(mp,
(flags & XFS_RMAP_UNWRITTEN) !=
(rec->rm_flags & XFS_RMAP_UNWRITTEN))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
/* Make sure the owner matches what we expect to find in the tree. */
if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -546,16 +571,19 @@ xfs_rmap_free_check_owner(
if (flags & XFS_RMAP_BMBT_BLOCK) {
if (XFS_IS_CORRUPT(mp,
!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
} else {
if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
if (XFS_IS_CORRUPT(mp,
offset + len > ltoff + rec->rm_blockcount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -618,6 +646,7 @@ xfs_rmap_unmap(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -639,6 +668,7 @@ xfs_rmap_unmap(
if (XFS_IS_CORRUPT(mp,
bno <
ltrec.rm_startblock + ltrec.rm_blockcount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -665,6 +695,7 @@ xfs_rmap_unmap(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -677,12 +708,13 @@ xfs_rmap_unmap(
ltrec.rm_startblock > bno ||
ltrec.rm_startblock + ltrec.rm_blockcount <
bno + len)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
/* Check owner information. */
- error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner,
+ error = xfs_rmap_free_check_owner(cur, ltoff, &ltrec, len, owner,
offset, flags);
if (error)
goto out_error;
@@ -697,6 +729,7 @@ xfs_rmap_unmap(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -788,6 +821,86 @@ out_error:
return error;
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of rmapbt live updates. If
+ * the compiler supports jump labels, the static branch will be replaced by a
+ * nop sled when there are no hook users. Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_rmap_hooks_switch);
+
+void
+xfs_rmap_hook_disable(void)
+{
+ xfs_hooks_switch_off(&xfs_rmap_hooks_switch);
+}
+
+void
+xfs_rmap_hook_enable(void)
+{
+ xfs_hooks_switch_on(&xfs_rmap_hooks_switch);
+}
+
+/* Call downstream hooks for a reverse mapping update. */
+static inline void
+xfs_rmap_update_hook(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ enum xfs_rmap_intent_type op,
+ xfs_agblock_t startblock,
+ xfs_extlen_t blockcount,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
+{
+ if (xfs_hooks_switched_on(&xfs_rmap_hooks_switch)) {
+ struct xfs_rmap_update_params p = {
+ .startblock = startblock,
+ .blockcount = blockcount,
+ .unwritten = unwritten,
+ .oinfo = *oinfo, /* struct copy */
+ };
+
+ if (pag)
+ xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p);
+ }
+}
+
+/* Call the specified function during a reverse mapping update. */
+int
+xfs_rmap_hook_add(
+ struct xfs_perag *pag,
+ struct xfs_rmap_hook *hook)
+{
+ return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+}
+
+/* Stop calling the specified function during a reverse mapping update. */
+void
+xfs_rmap_hook_del(
+ struct xfs_perag *pag,
+ struct xfs_rmap_hook *hook)
+{
+ xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+}
+
+/* Configure rmap update hook functions. */
+void
+xfs_rmap_hook_setup(
+ struct xfs_rmap_hook *hook,
+ notifier_fn_t mod_fn)
+{
+ xfs_hook_setup(&hook->rmap_hook, mod_fn);
+}
+#else
+# define xfs_rmap_update_hook(t, p, o, s, b, u, oi) do { } while (0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
/*
* Remove a reference to an extent in the rmap btree.
*/
@@ -808,7 +921,7 @@ xfs_rmap_free(
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
-
+ xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo);
error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -900,6 +1013,7 @@ xfs_rmap_map(
if (XFS_IS_CORRUPT(mp,
have_lt != 0 &&
ltrec.rm_startblock + ltrec.rm_blockcount > bno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -917,10 +1031,12 @@ xfs_rmap_map(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -974,6 +1090,7 @@ xfs_rmap_map(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1021,6 +1138,7 @@ xfs_rmap_map(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -1055,6 +1173,7 @@ xfs_rmap_alloc(
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+ xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo);
error = xfs_rmap_map(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -1116,6 +1235,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1153,12 +1273,14 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if (XFS_IS_CORRUPT(mp,
LEFT.rm_startblock + LEFT.rm_blockcount >
bno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1181,6 +1303,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1193,10 +1316,12 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1227,6 +1352,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1246,6 +1372,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1257,6 +1384,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1264,6 +1392,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1275,6 +1404,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1282,6 +1412,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1305,6 +1436,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1312,6 +1444,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1331,6 +1464,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1342,6 +1476,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1349,6 +1484,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1419,6 +1555,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1461,6 +1598,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1476,6 +1614,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1509,6 +1648,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1522,6 +1662,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 0)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1534,6 +1675,7 @@ xfs_rmap_convert(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1606,6 +1748,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1634,6 +1777,7 @@ xfs_rmap_convert_shared(
if (XFS_IS_CORRUPT(mp,
LEFT.rm_startblock + LEFT.rm_blockcount >
bno)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1652,10 +1796,12 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1706,6 +1852,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1732,6 +1879,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1758,6 +1906,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1781,6 +1930,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1816,6 +1966,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1861,6 +2012,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1896,6 +2048,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -1934,6 +2087,7 @@ xfs_rmap_convert_shared(
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto done;
}
@@ -2023,6 +2177,7 @@ xfs_rmap_unmap_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2033,12 +2188,14 @@ xfs_rmap_unmap_shared(
ltrec.rm_startblock > bno ||
ltrec.rm_startblock + ltrec.rm_blockcount <
bno + len)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
/* Make sure the owner matches what we expect to find in the tree. */
if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2047,16 +2204,19 @@ xfs_rmap_unmap_shared(
if (XFS_IS_CORRUPT(mp,
(flags & XFS_RMAP_UNWRITTEN) !=
(ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
/* Check the offset. */
if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2113,6 +2273,7 @@ xfs_rmap_unmap_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2142,6 +2303,7 @@ xfs_rmap_unmap_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2221,6 +2383,7 @@ xfs_rmap_map_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2273,6 +2436,7 @@ xfs_rmap_map_shared(
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out_error;
}
@@ -2335,15 +2499,12 @@ xfs_rmap_map_raw(
{
struct xfs_owner_info oinfo;
- oinfo.oi_owner = rmap->rm_owner;
- oinfo.oi_offset = rmap->rm_offset;
- oinfo.oi_flags = 0;
- if (rmap->rm_flags & XFS_RMAP_ATTR_FORK)
- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
- if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK)
- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+ xfs_owner_info_pack(&oinfo, rmap->rm_owner, rmap->rm_offset,
+ rmap->rm_flags);
- if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ if ((rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+ XFS_RMAP_UNWRITTEN)) ||
+ XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
return xfs_rmap_map(cur, rmap->rm_startblock,
rmap->rm_blockcount,
rmap->rm_flags & XFS_RMAP_UNWRITTEN,
@@ -2373,7 +2534,7 @@ xfs_rmap_query_range_helper(
fa = xfs_rmap_btrec_to_irec(rec, &irec);
if (!fa)
- fa = xfs_rmap_check_irec(cur, &irec);
+ fa = xfs_rmap_check_btrec(cur, &irec);
if (fa)
return xfs_rmap_complain_bad_rec(cur, fa, &irec);
@@ -2428,6 +2589,38 @@ xfs_rmap_finish_one_cleanup(
xfs_trans_brelse(tp, agbp);
}
+/* Commit an rmap operation into the ondisk tree. */
+int
+__xfs_rmap_finish_intent(
+ struct xfs_btree_cur *rcur,
+ enum xfs_rmap_intent_type op,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool unwritten)
+{
+ switch (op) {
+ case XFS_RMAP_ALLOC:
+ case XFS_RMAP_MAP:
+ return xfs_rmap_map(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_MAP_SHARED:
+ return xfs_rmap_map_shared(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_FREE:
+ case XFS_RMAP_UNMAP:
+ return xfs_rmap_unmap(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_UNMAP_SHARED:
+ return xfs_rmap_unmap_shared(rcur, bno, len, unwritten, oinfo);
+ case XFS_RMAP_CONVERT:
+ return xfs_rmap_convert(rcur, bno, len, !unwritten, oinfo);
+ case XFS_RMAP_CONVERT_SHARED:
+ return xfs_rmap_convert_shared(rcur, bno, len, !unwritten,
+ oinfo);
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+}
+
/*
* Process one of the deferred rmap operations. We pass back the
* btree cursor to maintain our lock on the rmapbt between calls.
@@ -2476,10 +2669,14 @@ xfs_rmap_finish_one(
* allocate blocks.
*/
error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp);
- if (error)
+ if (error) {
+ xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
return error;
- if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
+ }
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
+ xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
return -EFSCORRUPTED;
+ }
rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
}
@@ -2490,39 +2687,14 @@ xfs_rmap_finish_one(
unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock);
- switch (ri->ri_type) {
- case XFS_RMAP_ALLOC:
- case XFS_RMAP_MAP:
- error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount,
- unwritten, &oinfo);
- break;
- case XFS_RMAP_MAP_SHARED:
- error = xfs_rmap_map_shared(rcur, bno,
- ri->ri_bmap.br_blockcount, unwritten, &oinfo);
- break;
- case XFS_RMAP_FREE:
- case XFS_RMAP_UNMAP:
- error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount,
- unwritten, &oinfo);
- break;
- case XFS_RMAP_UNMAP_SHARED:
- error = xfs_rmap_unmap_shared(rcur, bno,
- ri->ri_bmap.br_blockcount, unwritten, &oinfo);
- break;
- case XFS_RMAP_CONVERT:
- error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount,
- !unwritten, &oinfo);
- break;
- case XFS_RMAP_CONVERT_SHARED:
- error = xfs_rmap_convert_shared(rcur, bno,
- ri->ri_bmap.br_blockcount, !unwritten, &oinfo);
- break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- }
+ error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
+ ri->ri_bmap.br_blockcount, &oinfo, unwritten);
+ if (error)
+ return error;
- return error;
+ xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno,
+ ri->ri_bmap.br_blockcount, unwritten, &oinfo);
+ return 0;
}
/*
@@ -2559,7 +2731,7 @@ __xfs_rmap_add(
bmap->br_blockcount,
bmap->br_state);
- ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_owner = owner;
@@ -2567,7 +2739,7 @@ __xfs_rmap_add(
ri->ri_bmap = *bmap;
xfs_rmap_update_get_group(tp->t_mountp, ri);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
+ xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
}
/* Map an extent into a file. */
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 3c98d9d50afb..9d01fe689497 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -186,6 +186,10 @@ void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
struct xfs_btree_cur *rcur, int error);
int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
struct xfs_btree_cur **pcur);
+int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur,
+ enum xfs_rmap_intent_type op, xfs_agblock_t bno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
+ bool unwritten);
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
@@ -195,7 +199,7 @@ int xfs_rmap_compare(const struct xfs_rmap_irec *a,
union xfs_btree_rec;
xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec);
-xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag,
const struct xfs_rmap_irec *irec);
int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
@@ -235,4 +239,29 @@ extern struct kmem_cache *xfs_rmap_intent_cache;
int __init xfs_rmap_intent_init_cache(void);
void xfs_rmap_intent_destroy_cache(void);
+/*
+ * Parameters for tracking reverse mapping changes. The hook function arg
+ * parameter is enum xfs_rmap_intent_type, and the rest is below.
+ */
+struct xfs_rmap_update_params {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+ struct xfs_owner_info oinfo;
+ bool unwritten;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+
+struct xfs_rmap_hook {
+ struct xfs_hook rmap_hook;
+};
+
+void xfs_rmap_hook_disable(void);
+void xfs_rmap_hook_enable(void);
+
+int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn);
+#endif
+
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 6c81b20e97d2..9e759efa81cc 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -16,11 +16,14 @@
#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
static struct kmem_cache *xfs_rmapbt_cur_cache;
@@ -65,13 +68,12 @@ xfs_rmapbt_set_root(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- int btnum = cur->bc_btnum;
ASSERT(ptr->s != 0);
- agf->agf_roots[btnum] = ptr->s;
- be32_add_cpu(&agf->agf_levels[btnum], inc);
- cur->bc_ag.pag->pagf_levels[btnum] += inc;
+ agf->agf_rmap_root = ptr->s;
+ be32_add_cpu(&agf->agf_rmap_level, inc);
+ cur->bc_ag.pag->pagf_rmap_level += inc;
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -94,8 +96,6 @@ xfs_rmapbt_alloc_block(
&bno, 1);
if (error)
return error;
-
- trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1);
if (bno == NULLAGBLOCK) {
*stat = 0;
return 0;
@@ -125,8 +125,6 @@ xfs_rmapbt_free_block(
int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
- trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno,
- bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
@@ -226,7 +224,7 @@ xfs_rmapbt_init_ptr_from_cur(
ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
- ptr->s = agf->agf_roots[cur->bc_btnum];
+ ptr->s = agf->agf_rmap_root;
}
/*
@@ -340,18 +338,29 @@ xfs_rmapbt_verify(
if (!xfs_has_rmapbt(mp))
return __this_address;
- fa = xfs_btree_sblock_v5hdr_verify(bp);
+ fa = xfs_btree_agblock_v5hdr_verify(bp);
if (fa)
return fa;
level = be16_to_cpu(block->bb_level);
if (pag && xfs_perag_initialised_agf(pag)) {
- if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
+ unsigned int maxlevel = pag->pagf_rmap_level;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Online repair could be rewriting the free space btrees, so
+ * we'll validate against the larger of either tree while this
+ * is going on.
+ */
+ maxlevel = max_t(unsigned int, maxlevel,
+ pag->pagf_repair_rmap_level);
+#endif
+ if (level >= maxlevel)
return __this_address;
} else if (level >= mp->m_rmap_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
+ return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}
static void
@@ -360,7 +369,7 @@ xfs_rmapbt_read_verify(
{
xfs_failaddr_t fa;
- if (!xfs_btree_sblock_verify_crc(bp))
+ if (!xfs_btree_agblock_verify_crc(bp))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_rmapbt_verify(bp);
@@ -384,7 +393,7 @@ xfs_rmapbt_write_verify(
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
}
- xfs_btree_sblock_calc_crc(bp);
+ xfs_btree_agblock_calc_crc(bp);
}
@@ -476,9 +485,19 @@ xfs_rmapbt_keys_contiguous(
be32_to_cpu(key2->rmap.rm_startblock));
}
-static const struct xfs_btree_ops xfs_rmapbt_ops = {
+const struct xfs_btree_ops xfs_rmapbt_ops = {
+ .name = "rmap",
+ .type = XFS_BTREE_TYPE_AG,
+ .geom_flags = XFS_BTGEO_OVERLAPPING,
+
.rec_len = sizeof(struct xfs_rmap_rec),
+ /* Overlapping btree; 2 keys per pointer. */
.key_len = 2 * sizeof(struct xfs_rmap_key),
+ .ptr_len = XFS_BTREE_SHORT_PTR_LEN,
+
+ .lru_refs = XFS_RMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rmap_2),
+ .sick_mask = XFS_SICK_AG_RMAPBT,
.dup_cursor = xfs_rmapbt_dup_cursor,
.set_root = xfs_rmapbt_set_root,
@@ -498,55 +517,176 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = {
.keys_contiguous = xfs_rmapbt_keys_contiguous,
};
-static struct xfs_btree_cur *
-xfs_rmapbt_init_common(
+/*
+ * Create a new reverse mapping btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
+ */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
+ struct xfs_buf *agbp,
struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- /* Overlapping btree; 2 keys per pointer. */
- cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
- cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
- cur->bc_ops = &xfs_rmapbt_ops;
-
cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_ag.agbp = agbp;
+ if (agbp) {
+ struct xfs_agf *agf = agbp->b_addr;
+
+ cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
+ }
return cur;
}
-/* Create a new reverse mapping btree cursor. */
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline unsigned int
+xfs_rmapbt_mem_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
+}
+
+/*
+ * Validate an in-memory rmap btree block. Callers are allowed to generate an
+ * in-memory btree even if the ondisk feature is not enabled.
+ */
+static xfs_failaddr_t
+xfs_rmapbt_mem_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+ unsigned int maxrecs;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (level >= xfs_rmapbt_maxlevels_ondisk())
+ return __this_address;
+
+ maxrecs = xfs_rmapbt_mem_block_maxrecs(
+ XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
+ return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+xfs_rmapbt_mem_rw_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa = xfs_rmapbt_mem_verify(bp);
+
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
+ .name = "xfs_rmapbt_mem",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
+ .verify_read = xfs_rmapbt_mem_rw_verify,
+ .verify_write = xfs_rmapbt_mem_rw_verify,
+ .verify_struct = xfs_rmapbt_mem_verify,
+};
+
+const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
+ .name = "mem_rmap",
+ .type = XFS_BTREE_TYPE_MEM,
+ .geom_flags = XFS_BTGEO_OVERLAPPING,
+
+ .rec_len = sizeof(struct xfs_rmap_rec),
+ /* Overlapping btree; 2 keys per pointer. */
+ .key_len = 2 * sizeof(struct xfs_rmap_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = XFS_RMAP_BTREE_REF,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rmap_mem_2),
+
+ .dup_cursor = xfbtree_dup_cursor,
+ .set_root = xfbtree_set_root,
+ .alloc_block = xfbtree_alloc_block,
+ .free_block = xfbtree_free_block,
+ .get_minrecs = xfbtree_get_minrecs,
+ .get_maxrecs = xfbtree_get_maxrecs,
+ .init_key_from_rec = xfs_rmapbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfbtree_init_ptr_from_cur,
+ .key_diff = xfs_rmapbt_key_diff,
+ .buf_ops = &xfs_rmapbt_mem_buf_ops,
+ .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .keys_inorder = xfs_rmapbt_keys_inorder,
+ .recs_inorder = xfs_rmapbt_recs_inorder,
+ .keys_contiguous = xfs_rmapbt_keys_contiguous,
+};
+
+/* Create a cursor for an in-memory btree. */
struct xfs_btree_cur *
-xfs_rmapbt_init_cursor(
- struct xfs_mount *mp,
+xfs_rmapbt_mem_cursor(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- struct xfs_perag *pag)
+ struct xfbtree *xfbt)
{
- struct xfs_agf *agf = agbp->b_addr;
struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = pag->pag_mount;
- cur = xfs_rmapbt_init_common(mp, tp, pag);
- cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
- cur->bc_ag.agbp = agbp;
+ cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
+ xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
+ cur->bc_mem.xfbtree = xfbt;
+ cur->bc_nlevels = xfbt->nlevels;
+
+ cur->bc_mem.pag = xfs_perag_hold(pag);
return cur;
}
-/* Create a new reverse mapping btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_rmapbt_stage_cursor(
+/* Create an in-memory rmap btree. */
+int
+xfs_rmapbt_mem_init(
struct xfs_mount *mp,
- struct xbtree_afakeroot *afake,
- struct xfs_perag *pag)
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp,
+ xfs_agnumber_t agno)
{
- struct xfs_btree_cur *cur;
+ xfbt->owner = agno;
+ return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
+}
- cur = xfs_rmapbt_init_common(mp, NULL, pag);
- xfs_btree_stage_afakeroot(cur, afake);
- return cur;
+/* Compute the max possible height for reverse mapping btrees in memory. */
+static unsigned int
+xfs_rmapbt_mem_maxlevels(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * How tall can an in-memory rmap btree become if we filled the entire
+ * AG with rmap records?
+ */
+ return xfs_btree_compute_maxlevels(minrecs,
+ XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
}
+#else
+# define xfs_rmapbt_mem_maxlevels() (0)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
/*
* Install a new reverse mapping btree root. Caller is responsible for
@@ -563,12 +703,12 @@ xfs_rmapbt_commit_staged_btree(
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
- agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
- agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ agf->agf_rmap_root = cpu_to_be32(afake->af_root);
+ agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
XFS_AGF_RMAP_BLOCKS);
- xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
+ xfs_btree_commit_afakeroot(cur, tp, agbp);
}
/* Calculate number of records in a reverse mapping btree block. */
@@ -618,7 +758,8 @@ xfs_rmapbt_maxlevels_ondisk(void)
* like if it consumes almost all the blocks in the AG due to maximal
* sharing factor.
*/
- return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
+ return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
+ xfs_rmapbt_mem_maxlevels());
}
/* Compute the maximum height of an rmap btree. */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 3244715dd111..eb90d89e8086 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -10,6 +10,7 @@ struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
struct xbtree_afakeroot;
+struct xfbtree;
/* rmaps only exist on crc enabled filesystems */
#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
@@ -44,8 +45,6 @@ struct xbtree_afakeroot;
struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_perag *pag);
-struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, struct xfs_perag *pag);
void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
int xfs_rmapbt_maxrecs(int blocklen, int leaf);
@@ -64,4 +63,9 @@ unsigned int xfs_rmapbt_maxlevels_ondisk(void);
int __init xfs_rmapbt_init_cur_cache(void);
void xfs_rmapbt_destroy_cur_cache(void);
+struct xfs_btree_cur *xfs_rmapbt_mem_cursor(struct xfs_perag *pag,
+ struct xfs_trans *tp, struct xfbtree *xfbtree);
+int xfs_rmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+ struct xfs_buftarg *btp, xfs_agnumber_t agno);
+
#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 396648acb5be..f246d6dbf4ec 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -16,6 +16,8 @@
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_health.h"
/*
* Realtime allocator bitmap functions shared with userspace.
@@ -46,43 +48,93 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
.verify_write = xfs_rtbuf_verify_write,
};
+/* Release cached rt bitmap and summary buffers. */
+void
+xfs_rtbuf_cache_relse(
+ struct xfs_rtalloc_args *args)
+{
+ if (args->rbmbp) {
+ xfs_trans_brelse(args->tp, args->rbmbp);
+ args->rbmbp = NULL;
+ args->rbmoff = NULLFILEOFF;
+ }
+ if (args->sumbp) {
+ xfs_trans_brelse(args->tp, args->sumbp);
+ args->sumbp = NULL;
+ args->sumoff = NULLFILEOFF;
+ }
+}
+
/*
* Get a buffer for the bitmap or summary file block specified.
* The buffer is returned read and locked.
*/
int
xfs_rtbuf_get(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t block, /* block number in bitmap or summary */
- int issum, /* is summary not bitmap */
- struct xfs_buf **bpp) /* output: buffer for the block */
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block, /* block number in bitmap or summary */
+ int issum) /* is summary not bitmap */
{
- struct xfs_buf *bp; /* block buffer, result */
- xfs_inode_t *ip; /* bitmap or summary inode */
- xfs_bmbt_irec_t map;
- int nmap = 1;
- int error; /* error value */
+ struct xfs_mount *mp = args->mp;
+ struct xfs_buf **cbpp; /* cached block buffer */
+ xfs_fileoff_t *coffp; /* cached block number */
+ struct xfs_buf *bp; /* block buffer, result */
+ struct xfs_inode *ip; /* bitmap or summary inode */
+ struct xfs_bmbt_irec map;
+ enum xfs_blft type;
+ int nmap = 1;
+ int error;
- ip = issum ? mp->m_rsumip : mp->m_rbmip;
+ if (issum) {
+ cbpp = &args->sumbp;
+ coffp = &args->sumoff;
+ ip = mp->m_rsumip;
+ type = XFS_BLFT_RTSUMMARY_BUF;
+ } else {
+ cbpp = &args->rbmbp;
+ coffp = &args->rbmoff;
+ ip = mp->m_rbmip;
+ type = XFS_BLFT_RTBITMAP_BUF;
+ }
+
+ /*
+ * If we have a cached buffer, and the block number matches, use that.
+ */
+ if (*cbpp && *coffp == block)
+ return 0;
+
+ /*
+ * Otherwise we have to have to get the buffer. If there was an old
+ * one, get rid of it first.
+ */
+ if (*cbpp) {
+ xfs_trans_brelse(args->tp, *cbpp);
+ *cbpp = NULL;
+ }
error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map)))
+ if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) {
+ xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
+ XFS_SICK_RT_BITMAP);
return -EFSCORRUPTED;
+ }
ASSERT(map.br_startblock != NULLFSBLOCK);
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
+ XFS_SICK_RT_BITMAP);
if (error)
return error;
- xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
- : XFS_BLFT_RTBITMAP_BUF);
- *bpp = bp;
+ xfs_trans_buf_set_type(args->tp, bp, type);
+ *cbpp = bp;
+ *coffp = block;
return 0;
}
@@ -92,47 +144,44 @@ xfs_rtbuf_get(
*/
int
xfs_rtfind_back(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to look at */
+ xfs_rtxnum_t limit, /* last rtext to look at */
+ xfs_rtxnum_t *rtx) /* out: start rtext found */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t firstbit; /* first useful bit in the word */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error; /* error value */
+ xfs_rtxnum_t firstbit; /* first useful bit in the word */
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute and read in starting bitmap block for starting block.
*/
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ block = xfs_rtx_to_rbmblock(mp, start);
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Get the first word's index & point to it.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
len = start - limit + 1;
/*
* Compute match value, based on the bit at start: if 1 (free)
* then all-ones, else all-zeroes.
*/
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ incore = xfs_rtbitmap_getword(args, word);
+ want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
/*
* If the starting position is not word-aligned, deal with the
* partial word.
@@ -142,20 +191,19 @@ xfs_rtfind_back(
* Calculate first (leftmost) bit number to look at,
* and mask for all the relevant bits in this word.
*/
- firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+ firstbit = max_t(xfs_srtblock_t, bit - len + 1, 0);
mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
firstbit;
/*
* Calculate the difference between the value there
* and what we're looking for.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different. Mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
- i = bit - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ i = bit - xfs_highbit32(wdiff);
+ *rtx = start - i + 1;
return 0;
}
i = bit - firstbit + 1;
@@ -167,19 +215,11 @@ xfs_rtfind_back(
/*
* If done with this block, get the previous one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, --block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
+
+ word = mp->m_blockwsize - 1;
}
} else {
/*
@@ -195,13 +235,13 @@ xfs_rtfind_back(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ want)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ want)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
- i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ i += XFS_NBWORD - 1 - xfs_highbit32(wdiff);
+ *rtx = start - i + 1;
return 0;
}
i += XFS_NBWORD;
@@ -213,19 +253,11 @@ xfs_rtfind_back(
/*
* If done with this block, get the previous one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, --block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
+
+ word = mp->m_blockwsize - 1;
}
}
/*
@@ -242,13 +274,13 @@ xfs_rtfind_back(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
- i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ i += XFS_NBWORD - 1 - xfs_highbit32(wdiff);
+ *rtx = start - i + 1;
return 0;
} else
i = len;
@@ -256,8 +288,7 @@ xfs_rtfind_back(
/*
* No match, return that we scanned the whole area.
*/
- xfs_trans_brelse(tp, bp);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
}
@@ -267,47 +298,44 @@ xfs_rtfind_back(
*/
int
xfs_rtfind_forw(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to look at */
+ xfs_rtxnum_t limit, /* last rtext to look at */
+ xfs_rtxnum_t *rtx) /* out: start rtext found */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in the word */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t lastbit;/* last useful bit in the word */
+ xfs_rtxnum_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute and read in starting bitmap block for starting block.
*/
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ block = xfs_rtx_to_rbmblock(mp, start);
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Get the first word's index & point to it.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
len = limit - start + 1;
/*
* Compute match value, based on the bit at start: if 1 (free)
* then all-ones, else all-zeroes.
*/
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ incore = xfs_rtbitmap_getword(args, word);
+ want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
/*
* If the starting position is not word-aligned, deal with the
* partial word.
@@ -317,19 +345,18 @@ xfs_rtfind_forw(
* Calculate last (rightmost) bit number to look at,
* and mask for all the relevant bits in this word.
*/
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ lastbit = min(bit + len, XFS_NBWORD);
mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
/*
* Calculate the difference between the value there
* and what we're looking for.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different. Mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
- i = XFS_RTLOBIT(wdiff) - bit;
- *rtblock = start + i - 1;
+ i = xfs_lowbit32(wdiff) - bit;
+ *rtx = start + i - 1;
return 0;
}
i = lastbit - bit;
@@ -337,22 +364,15 @@ xfs_rtfind_forw(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the previous one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b++;
}
} else {
/*
@@ -368,13 +388,13 @@ xfs_rtfind_forw(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ want)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ want)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
+ i += xfs_lowbit32(wdiff);
+ *rtx = start + i - 1;
return 0;
}
i += XFS_NBWORD;
@@ -382,22 +402,15 @@ xfs_rtfind_forw(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
}
/*
@@ -412,13 +425,13 @@ xfs_rtfind_forw(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
+ i += xfs_lowbit32(wdiff);
+ *rtx = start + i - 1;
return 0;
} else
i = len;
@@ -426,102 +439,95 @@ xfs_rtfind_forw(
/*
* No match, return that we scanned the whole area.
*/
- xfs_trans_brelse(tp, bp);
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
}
+/* Log rtsummary counter at @infoword. */
+static inline void
+xfs_trans_log_rtsummary(
+ struct xfs_rtalloc_args *args,
+ unsigned int infoword)
+{
+ struct xfs_buf *bp = args->sumbp;
+ size_t first, last;
+
+ first = (void *)xfs_rsumblock_infoptr(args, infoword) - bp->b_addr;
+ last = first + sizeof(xfs_suminfo_t) - 1;
+
+ xfs_trans_log_buf(args->tp, bp, first, last);
+}
+
/*
- * Read and/or modify the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- *
- * Summary information is returned in *sum if specified.
- * If no delta is specified, returns summary only.
+ * Modify the summary information for a given extent size, bitmap block
+ * combination.
*/
int
-xfs_rtmodify_summary_int(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
+xfs_rtmodify_summary(
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int delta) /* in/out: summary block number */
{
- struct xfs_buf *bp; /* buffer for the summary block */
- int error; /* error value */
- xfs_fsblock_t sb; /* summary fsblock */
- int so; /* index into the summary file */
- xfs_suminfo_t *sp; /* pointer to returned data */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno);
+ unsigned int infoword;
+ xfs_suminfo_t val;
+ int error;
- /*
- * Compute entry number in the summary file.
- */
- so = XFS_SUMOFFS(mp, log, bbno);
- /*
- * Compute the block number in the summary file.
- */
- sb = XFS_SUMOFFSTOBLOCK(mp, so);
- /*
- * If we have an old buffer, and the block number matches, use that.
- */
- if (*rbpp && *rsb == sb)
- bp = *rbpp;
- /*
- * Otherwise we have to get the buffer.
- */
- else {
- /*
- * If there was an old one, get rid of it first.
- */
- if (*rbpp)
- xfs_trans_brelse(tp, *rbpp);
- error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
- if (error) {
- return error;
- }
- /*
- * Remember this buffer and block for the next call.
- */
- *rbpp = bp;
- *rsb = sb;
- }
- /*
- * Point to the summary information, modify/log it, and/or copy it out.
- */
- sp = XFS_SUMPTR(mp, bp, so);
- if (delta) {
- uint first = (uint)((char *)sp - (char *)bp->b_addr);
-
- *sp += delta;
- if (mp->m_rsum_cache) {
- if (*sp == 0 && log == mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno]++;
- if (*sp != 0 && log < mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log;
- }
- xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+ error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so));
+ if (error)
+ return error;
+
+ infoword = xfs_rtsumoffs_to_infoword(mp, so);
+ val = xfs_suminfo_add(args, infoword, delta);
+
+ if (mp->m_rsum_cache) {
+ if (val == 0 && log + 1 == mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log;
+ if (val != 0 && log >= mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log + 1;
}
- if (sum)
- *sum = *sp;
+
+ xfs_trans_log_rtsummary(args, infoword);
return 0;
}
+/*
+ * Read and return the summary information for a given extent size, bitmap block
+ * combination.
+ */
int
-xfs_rtmodify_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+xfs_rtget_summary(
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
{
- return xfs_rtmodify_summary_int(mp, tp, log, bbno,
- delta, rbpp, rsb, NULL);
+ struct xfs_mount *mp = args->mp;
+ xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno);
+ int error;
+
+ error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so));
+ if (!error)
+ *sum = xfs_suminfo_get(args, xfs_rtsumoffs_to_infoword(mp, so));
+ return error;
+}
+
+/* Log rtbitmap block from the word @from to the byte before @next. */
+static inline void
+xfs_trans_log_rtbitmap(
+ struct xfs_rtalloc_args *args,
+ unsigned int from,
+ unsigned int next)
+{
+ struct xfs_buf *bp = args->rbmbp;
+ size_t first, last;
+
+ first = (void *)xfs_rbmblock_wordptr(args, from) - bp->b_addr;
+ last = ((void *)xfs_rbmblock_wordptr(args, next) - 1) - bp->b_addr;
+
+ xfs_trans_log_buf(args->tp, bp, first, last);
}
/*
@@ -530,41 +536,37 @@ xfs_rtmodify_summary(
*/
int
xfs_rtmodify_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to modify */
- xfs_extlen_t len, /* length of extent to modify */
- int val) /* 1 for free, 0 for allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to modify */
+ xfs_rtxlen_t len, /* length of extent to modify */
+ int val) /* 1 for free, 0 for allocated */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtword_t *first; /* first used word in the buffer */
- int i; /* current bit number rel. to start */
- int lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask o frelevant bits for value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ int i; /* current bit number rel. to start */
+ int lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t incore;
+ unsigned int firstword; /* first word used in the buffer */
+ unsigned int word; /* word number in the buffer */
/*
* Compute starting bitmap block number.
*/
- block = XFS_BITTOBLOCK(mp, start);
+ block = xfs_rtx_to_rbmblock(mp, start);
/*
* Read the bitmap block, and point to its data.
*/
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Compute the starting word's address, and starting bit.
*/
- word = XFS_BITTOWORD(mp, start);
- first = b = &bufp[word];
+ firstword = word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
/*
* 0 (allocated) => all zeroes; 1 (free) => all ones.
@@ -578,39 +580,33 @@ xfs_rtmodify_range(
/*
* Compute first bit not changed and mask of relevant bits.
*/
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ lastbit = min(bit + len, XFS_NBWORD);
mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
/*
* Set/clear the active bits.
*/
+ incore = xfs_rtbitmap_getword(args, word);
if (val)
- *b |= mask;
+ incore |= mask;
else
- *b &= ~mask;
+ incore &= ~mask;
+ xfs_rtbitmap_setword(args, word, incore);
i = lastbit - bit;
/*
* Go on to the next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* Log the changed part of this block.
* Get the next one.
*/
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ xfs_trans_log_rtbitmap(args, firstword, word);
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
+
+ firstword = word = 0;
}
} else {
/*
@@ -626,31 +622,23 @@ xfs_rtmodify_range(
/*
* Set the word value correctly.
*/
- *b = val;
+ xfs_rtbitmap_setword(args, word, val);
i += XFS_NBWORD;
/*
* Go on to the next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* Log the changed part of this block.
* Get the next one.
*/
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ xfs_trans_log_rtbitmap(args, firstword, word);
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
+
+ firstword = word = 0;
}
}
/*
@@ -665,18 +653,19 @@ xfs_rtmodify_range(
/*
* Set/clear the active bits.
*/
+ incore = xfs_rtbitmap_getword(args, word);
if (val)
- *b |= mask;
+ incore |= mask;
else
- *b &= ~mask;
- b++;
+ incore &= ~mask;
+ xfs_rtbitmap_setword(args, word, incore);
+ word++;
}
/*
* Log any remaining changed bytes.
*/
- if (b > first)
- xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp - 1));
+ if (word > firstword)
+ xfs_trans_log_rtbitmap(args, firstword, word);
return 0;
}
@@ -686,23 +675,21 @@ xfs_rtmodify_range(
*/
int
xfs_rtfree_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to free */
- xfs_extlen_t len, /* length to free */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to free */
+ xfs_rtxlen_t len) /* in/out: summary block number */
{
- xfs_rtblock_t end; /* end of the freed extent */
- int error; /* error value */
- xfs_rtblock_t postblock; /* first block freed > end */
- xfs_rtblock_t preblock; /* first block freed < start */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t end; /* end of the freed extent */
+ int error; /* error value */
+ xfs_rtxnum_t postblock; /* first rtext freed > end */
+ xfs_rtxnum_t preblock; /* first rtext freed < start */
end = start + len - 1;
/*
* Modify the bitmap to mark this extent freed.
*/
- error = xfs_rtmodify_range(mp, tp, start, len, 1);
+ error = xfs_rtmodify_range(args, start, len, 1);
if (error) {
return error;
}
@@ -711,15 +698,15 @@ xfs_rtfree_range(
* We need to find the beginning and end of the extent so we can
* properly update the summary.
*/
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ error = xfs_rtfind_back(args, start, 0, &preblock);
if (error) {
return error;
}
/*
* Find the next allocated block (end of allocated extent).
*/
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
+ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
if (error)
return error;
/*
@@ -727,9 +714,9 @@ xfs_rtfree_range(
* old extent, add summary data for them to be allocated.
*/
if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ xfs_highbit64(start - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), -1);
if (error) {
return error;
}
@@ -739,9 +726,9 @@ xfs_rtfree_range(
* old extent, add summary data for them to be allocated.
*/
if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ xfs_highbit64(postblock - end),
+ xfs_rtx_to_rbmblock(mp, end + 1), -1);
if (error) {
return error;
}
@@ -750,10 +737,9 @@ xfs_rtfree_range(
* Increment the summary information corresponding to the entire
* (new) free extent.
*/
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
- return error;
+ return xfs_rtmodify_summary(args,
+ xfs_highbit64(postblock + 1 - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), 1);
}
/*
@@ -762,43 +748,39 @@ xfs_rtfree_range(
*/
int
xfs_rtcheck_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block number of extent */
- xfs_extlen_t len, /* length of extent */
- int val, /* 1 for free, 0 for allocated */
- xfs_rtblock_t *new, /* out: first block not matching */
- int *stat) /* out: 1 for matches, 0 for not */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number of extent */
+ xfs_rtxlen_t len, /* length of extent */
+ int val, /* 1 for free, 0 for allocated */
+ xfs_rtxnum_t *new, /* out: first rtext not matching */
+ int *stat) /* out: 1 for matches, 0 for not */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute starting bitmap block number
*/
- block = XFS_BITTOBLOCK(mp, start);
+ block = xfs_rtx_to_rbmblock(mp, start);
/*
* Read the bitmap block.
*/
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Compute the starting word's address, and starting bit.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
/*
* 0 (allocated) => all zero's; 1 (free) => all one's.
@@ -812,7 +794,7 @@ xfs_rtcheck_range(
/*
* Compute first bit not examined.
*/
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ lastbit = min(bit + len, XFS_NBWORD);
/*
* Mask of relevant bits.
*/
@@ -820,12 +802,12 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ val) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ val) & mask)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
- i = XFS_RTLOBIT(wdiff) - bit;
+ i = xfs_lowbit32(wdiff) - bit;
*new = start + i;
*stat = 0;
return 0;
@@ -835,22 +817,15 @@ xfs_rtcheck_range(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
} else {
/*
@@ -866,12 +841,12 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ val)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ val)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
+ i += xfs_lowbit32(wdiff);
*new = start + i;
*stat = 0;
return 0;
@@ -881,22 +856,15 @@ xfs_rtcheck_range(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
}
/*
@@ -911,12 +879,12 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ val) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ val) & mask)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
+ i += xfs_lowbit32(wdiff);
*new = start + i;
*stat = 0;
return 0;
@@ -926,7 +894,6 @@ xfs_rtcheck_range(
/*
* Successful, return.
*/
- xfs_trans_brelse(tp, bp);
*new = start + i;
*stat = 1;
return 0;
@@ -936,58 +903,57 @@ xfs_rtcheck_range(
/*
* Check that the given extent (block range) is allocated already.
*/
-STATIC int /* error */
+STATIC int
xfs_rtcheck_alloc_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number of extent */
- xfs_extlen_t len) /* length of extent */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number of extent */
+ xfs_rtxlen_t len) /* length of extent */
{
- xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
- int stat;
- int error;
+ xfs_rtxnum_t new; /* dummy for xfs_rtcheck_range */
+ int stat;
+ int error;
- error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+ error = xfs_rtcheck_range(args, start, len, 0, &new, &stat);
if (error)
return error;
ASSERT(stat);
return 0;
}
#else
-#define xfs_rtcheck_alloc_range(m,t,b,l) (0)
+#define xfs_rtcheck_alloc_range(a,b,l) (0)
#endif
/*
* Free an extent in the realtime subvolume. Length is expressed in
* realtime extents, as is the block number.
*/
-int /* error */
+int
xfs_rtfree_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len) /* length of extent freed */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtxnum_t start, /* starting rtext number to free */
+ xfs_rtxlen_t len) /* length of extent freed */
{
- int error; /* error value */
- xfs_mount_t *mp; /* file system mount structure */
- xfs_fsblock_t sb; /* summary file block number */
- struct xfs_buf *sumbp = NULL; /* summary file block buffer */
- struct timespec64 atime;
-
- mp = tp->t_mountp;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
+ int error;
+ struct timespec64 atime;
ASSERT(mp->m_rbmip->i_itemp != NULL);
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
- error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+ error = xfs_rtcheck_alloc_range(&args, start, len);
if (error)
return error;
/*
* Free the range of realtime blocks.
*/
- error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
- if (error) {
- return error;
- }
+ error = xfs_rtfree_range(&args, start, len);
+ if (error)
+ goto out;
+
/*
* Mark more blocks free in the superblock.
*/
@@ -1002,11 +968,47 @@ xfs_rtfree_extent(
mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
atime = inode_get_atime(VFS_I(mp->m_rbmip));
- *((uint64_t *)&atime) = 0;
+ atime.tv_sec = 0;
inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
}
- return 0;
+ error = 0;
+out:
+ xfs_rtbuf_cache_relse(&args);
+ return error;
+}
+
+/*
+ * Free some blocks in the realtime subvolume. rtbno and rtlen are in units of
+ * rt blocks, not rt extents; must be aligned to the rt extent size; and rtlen
+ * cannot exceed XFS_MAX_BMBT_EXTLEN.
+ */
+int
+xfs_rtfree_blocks(
+ struct xfs_trans *tp,
+ xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rtxnum_t start;
+ xfs_filblks_t len;
+ xfs_extlen_t mod;
+
+ ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
+
+ len = xfs_rtb_to_rtxrem(mp, rtlen, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EIO;
+ }
+
+ start = xfs_rtb_to_rtxrem(mp, rtbno, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EIO;
+ }
+
+ return xfs_rtfree_extent(tp, start, len);
}
/* Find all the free records within a given range. */
@@ -1019,10 +1021,14 @@ xfs_rtalloc_query_range(
xfs_rtalloc_query_range_fn fn,
void *priv)
{
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
struct xfs_rtalloc_rec rec;
- xfs_rtblock_t rtstart;
- xfs_rtblock_t rtend;
- xfs_rtblock_t high_key;
+ xfs_rtxnum_t rtstart;
+ xfs_rtxnum_t rtend;
+ xfs_rtxnum_t high_key;
int is_free;
int error = 0;
@@ -1038,13 +1044,13 @@ xfs_rtalloc_query_range(
rtstart = low_rec->ar_startext;
while (rtstart <= high_key) {
/* Is the first block free? */
- error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
+ error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend,
&is_free);
if (error)
break;
/* How long does the extent go for? */
- error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend);
+ error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend);
if (error)
break;
@@ -1060,6 +1066,7 @@ xfs_rtalloc_query_range(
rtstart = rtend + 1;
}
+ xfs_rtbuf_cache_relse(&args);
return error;
}
@@ -1085,18 +1092,79 @@ int
xfs_rtalloc_extent_is_free(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_rtblock_t start,
- xfs_extlen_t len,
+ xfs_rtxnum_t start,
+ xfs_rtxlen_t len,
bool *is_free)
{
- xfs_rtblock_t end;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
+ xfs_rtxnum_t end;
int matches;
int error;
- error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches);
+ error = xfs_rtcheck_range(&args, start, len, 1, &end, &matches);
+ xfs_rtbuf_cache_relse(&args);
if (error)
return error;
*is_free = matches;
return 0;
}
+
+/*
+ * Compute the number of rtbitmap blocks needed to track the given number of rt
+ * extents.
+ */
+xfs_filblks_t
+xfs_rtbitmap_blockcount(
+ struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents)
+{
+ return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Compute the number of rtbitmap words needed to populate every block of a
+ * bitmap that is large enough to track the given number of rt extents.
+ */
+unsigned long long
+xfs_rtbitmap_wordcount(
+ struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents)
+{
+ xfs_filblks_t blocks;
+
+ blocks = xfs_rtbitmap_blockcount(mp, rtextents);
+ return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
+}
+
+/* Compute the number of rtsummary blocks needed to track the given rt space. */
+xfs_filblks_t
+xfs_rtsummary_blockcount(
+ struct xfs_mount *mp,
+ unsigned int rsumlevels,
+ xfs_extlen_t rbmblocks)
+{
+ unsigned long long rsumwords;
+
+ rsumwords = (unsigned long long)rsumlevels * rbmblocks;
+ return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG);
+}
+
+/*
+ * Compute the number of rtsummary info words needed to populate every block of
+ * a summary file that is large enough to track the given rt space.
+ */
+unsigned long long
+xfs_rtsummary_wordcount(
+ struct xfs_mount *mp,
+ unsigned int rsumlevels,
+ xfs_extlen_t rbmblocks)
+{
+ xfs_filblks_t blocks;
+
+ blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks);
+ return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
+}
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
new file mode 100644
index 000000000000..152a66750af5
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_RTBITMAP_H__
+#define __XFS_RTBITMAP_H__
+
+struct xfs_rtalloc_args {
+ struct xfs_mount *mp;
+ struct xfs_trans *tp;
+
+ struct xfs_buf *rbmbp; /* bitmap block buffer */
+ struct xfs_buf *sumbp; /* summary block buffer */
+
+ xfs_fileoff_t rbmoff; /* bitmap block number */
+ xfs_fileoff_t sumoff; /* summary block number */
+};
+
+static inline xfs_rtblock_t
+xfs_rtx_to_rtb(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ if (mp->m_rtxblklog >= 0)
+ return rtx << mp->m_rtxblklog;
+
+ return rtx * mp->m_sb.sb_rextsize;
+}
+
+static inline xfs_extlen_t
+xfs_rtxlen_to_extlen(
+ struct xfs_mount *mp,
+ xfs_rtxlen_t rtxlen)
+{
+ if (mp->m_rtxblklog >= 0)
+ return rtxlen << mp->m_rtxblklog;
+
+ return rtxlen * mp->m_sb.sb_rextsize;
+}
+
+/* Compute the misalignment between an extent length and a realtime extent .*/
+static inline unsigned int
+xfs_extlen_to_rtxmod(
+ struct xfs_mount *mp,
+ xfs_extlen_t len)
+{
+ if (mp->m_rtxblklog >= 0)
+ return len & mp->m_rtxblkmask;
+
+ return len % mp->m_sb.sb_rextsize;
+}
+
+static inline xfs_rtxlen_t
+xfs_extlen_to_rtxlen(
+ struct xfs_mount *mp,
+ xfs_extlen_t len)
+{
+ if (mp->m_rtxblklog >= 0)
+ return len >> mp->m_rtxblklog;
+
+ return len / mp->m_sb.sb_rextsize;
+}
+
+/* Convert an rt block number into an rt extent number. */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return rtbno >> mp->m_rtxblklog;
+
+ return div_u64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Return the offset of an rt block number within an rt extent. */
+static inline xfs_extlen_t
+xfs_rtb_to_rtxoff(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return rtbno & mp->m_rtxblkmask;
+
+ return do_div(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/*
+ * Crack an rt block number into an rt extent number and an offset within that
+ * rt extent. Returns the rt extent number directly and the offset in @off.
+ */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtxrem(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno,
+ xfs_extlen_t *off)
+{
+ if (likely(mp->m_rtxblklog >= 0)) {
+ *off = rtbno & mp->m_rtxblkmask;
+ return rtbno >> mp->m_rtxblklog;
+ }
+
+ return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off);
+}
+
+/*
+ * Convert an rt block number into an rt extent number, rounding up to the next
+ * rt extent if the rt block is not aligned to an rt extent boundary.
+ */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtxup(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0)) {
+ if (rtbno & mp->m_rtxblkmask)
+ return (rtbno >> mp->m_rtxblklog) + 1;
+ return rtbno >> mp->m_rtxblklog;
+ }
+
+ if (do_div(rtbno, mp->m_sb.sb_rextsize))
+ rtbno++;
+ return rtbno;
+}
+
+/* Round this rtblock up to the nearest rt extent size. */
+static inline xfs_rtblock_t
+xfs_rtb_roundup_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return roundup_64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Round this rtblock down to the nearest rt extent size. */
+static inline xfs_rtblock_t
+xfs_rtb_rounddown_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return rounddown_64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Convert an rt extent number to a file block offset in the rt bitmap file. */
+static inline xfs_fileoff_t
+xfs_rtx_to_rbmblock(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ return rtx >> mp->m_blkbit_log;
+}
+
+/* Convert an rt extent number to a word offset within an rt bitmap block. */
+static inline unsigned int
+xfs_rtx_to_rbmword(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1);
+}
+
+/* Convert a file block offset in the rt bitmap file to an rt extent number. */
+static inline xfs_rtxnum_t
+xfs_rbmblock_to_rtx(
+ struct xfs_mount *mp,
+ xfs_fileoff_t rbmoff)
+{
+ return rbmoff << mp->m_blkbit_log;
+}
+
+/* Return a pointer to a bitmap word within a rt bitmap block. */
+static inline union xfs_rtword_raw *
+xfs_rbmblock_wordptr(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_rtword_raw *words = args->rbmbp->b_addr;
+
+ return words + index;
+}
+
+/* Convert an ondisk bitmap word to its incore representation. */
+static inline xfs_rtword_t
+xfs_rtbitmap_getword(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
+
+ return word->old;
+}
+
+/* Set an ondisk bitmap word from an incore representation. */
+static inline void
+xfs_rtbitmap_setword(
+ struct xfs_rtalloc_args *args,
+ unsigned int index,
+ xfs_rtword_t value)
+{
+ union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
+
+ word->old = value;
+}
+
+/*
+ * Convert a rt extent length and rt bitmap block number to a xfs_suminfo_t
+ * offset within the rt summary file.
+ */
+static inline xfs_rtsumoff_t
+xfs_rtsumoffs(
+ struct xfs_mount *mp,
+ int log2_len,
+ xfs_fileoff_t rbmoff)
+{
+ return log2_len * mp->m_sb.sb_rbmblocks + rbmoff;
+}
+
+/*
+ * Convert an xfs_suminfo_t offset to a file block offset within the rt summary
+ * file.
+ */
+static inline xfs_fileoff_t
+xfs_rtsumoffs_to_block(
+ struct xfs_mount *mp,
+ xfs_rtsumoff_t rsumoff)
+{
+ return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t));
+}
+
+/*
+ * Convert an xfs_suminfo_t offset to an info word offset within an rt summary
+ * block.
+ */
+static inline unsigned int
+xfs_rtsumoffs_to_infoword(
+ struct xfs_mount *mp,
+ xfs_rtsumoff_t rsumoff)
+{
+ unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG;
+
+ return rsumoff & mask;
+}
+
+/* Return a pointer to a summary info word within a rt summary block. */
+static inline union xfs_suminfo_raw *
+xfs_rsumblock_infoptr(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_suminfo_raw *info = args->sumbp->b_addr;
+
+ return info + index;
+}
+
+/* Get the current value of a summary counter. */
+static inline xfs_suminfo_t
+xfs_suminfo_get(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+
+ return info->old;
+}
+
+/* Add to the current value of a summary counter and return the new value. */
+static inline xfs_suminfo_t
+xfs_suminfo_add(
+ struct xfs_rtalloc_args *args,
+ unsigned int index,
+ int delta)
+{
+ union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+
+ info->old += delta;
+ return info->old;
+}
+
+/*
+ * Functions for walking free space rtextents in the realtime bitmap.
+ */
+struct xfs_rtalloc_rec {
+ xfs_rtxnum_t ar_startext;
+ xfs_rtbxlen_t ar_extcount;
+};
+
+typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv);
+
+#ifdef CONFIG_XFS_RT
+void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args);
+
+int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block,
+ int issum);
+
+static inline int
+xfs_rtbitmap_read_buf(
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block)
+{
+ return xfs_rtbuf_get(args, block, 0);
+}
+
+static inline int
+xfs_rtsummary_read_buf(
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block)
+{
+ return xfs_rtbuf_get(args, block, 1);
+}
+
+int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat);
+int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
+int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
+int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len, int val);
+int xfs_rtget_summary(struct xfs_rtalloc_args *args, int log,
+ xfs_fileoff_t bbno, xfs_suminfo_t *sum);
+int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log,
+ xfs_fileoff_t bbno, int delta);
+int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len);
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn, void *priv);
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
+int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtxnum_t start, xfs_rtxlen_t len,
+ bool *is_free);
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtxnum_t start, /* starting rtext number to free */
+ xfs_rtxlen_t len); /* length of extent freed */
+
+/* Same as above, but in units of rt blocks. */
+int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen);
+
+xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
+ rtextents);
+unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents);
+
+xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
+ unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
+ unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+#else /* CONFIG_XFS_RT */
+# define xfs_rtfree_extent(t,b,l) (-ENOSYS)
+# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS)
+# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS)
+# define xfs_rtbitmap_read_buf(a,b) (-ENOSYS)
+# define xfs_rtsummary_read_buf(a,b) (-ENOSYS)
+# define xfs_rtbuf_cache_relse(a) (0)
+# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
+static inline xfs_filblks_t
+xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
+{
+ /* shut up gcc */
+ return 0;
+}
+# define xfs_rtbitmap_wordcount(mp, r) (0)
+# define xfs_rtsummary_blockcount(mp, l, b) (0)
+# define xfs_rtsummary_wordcount(mp, l, b) (0)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __XFS_RTBITMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 6264daaab37b..73a4b895de67 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -25,6 +25,7 @@
#include "xfs_da_format.h"
#include "xfs_health.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -508,8 +509,9 @@ xfs_validate_sb_common(
rbmblocks = howmany_64(sbp->sb_rextents,
NBBY * sbp->sb_blocksize);
- if (sbp->sb_rextents != rexts ||
- sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ if (!xfs_validate_rtextents(rexts) ||
+ sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_compute_rextslog(rexts) ||
sbp->sb_rbmblocks != rbmblocks) {
xfs_notice(mp,
"realtime geometry sanity check failed");
@@ -528,7 +530,8 @@ xfs_validate_sb_common(
}
if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit),
- XFS_FSB_TO_B(mp, sbp->sb_width), 0, false))
+ XFS_FSB_TO_B(mp, sbp->sb_width), 0,
+ xfs_buf_daddr(bp) == XFS_SB_DADDR, false))
return -EFSCORRUPTED;
/*
@@ -975,6 +978,8 @@ xfs_sb_mount_common(
mp->m_blockmask = sbp->sb_blocksize - 1;
mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
mp->m_blockwmask = mp->m_blockwsize - 1;
+ mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
+ mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
@@ -1286,6 +1291,8 @@ xfs_sb_read_secondary(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_agno_mark_sick(mp, agno, XFS_SICK_AG_SB);
if (error)
return error;
xfs_buf_set_ref(bp, XFS_SSB_REF);
@@ -1317,8 +1324,10 @@ xfs_sb_get_secondary(
}
/*
- * sunit, swidth, sectorsize(optional with 0) should be all in bytes,
- * so users won't be confused by values in error messages.
+ * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users
+ * won't be confused by values in error messages. This function returns false
+ * if the stripe geometry is invalid and the caller is unable to repair the
+ * stripe configuration later in the mount process.
*/
bool
xfs_validate_stripe_geometry(
@@ -1326,20 +1335,21 @@ xfs_validate_stripe_geometry(
__s64 sunit,
__s64 swidth,
int sectorsize,
+ bool may_repair,
bool silent)
{
if (swidth > INT_MAX) {
if (!silent)
xfs_notice(mp,
"stripe width (%lld) is too large", swidth);
- return false;
+ goto check_override;
}
if (sunit > swidth) {
if (!silent)
xfs_notice(mp,
"stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth);
- return false;
+ goto check_override;
}
if (sectorsize && (int)sunit % sectorsize) {
@@ -1347,21 +1357,21 @@ xfs_validate_stripe_geometry(
xfs_notice(mp,
"stripe unit (%lld) must be a multiple of the sector size (%d)",
sunit, sectorsize);
- return false;
+ goto check_override;
}
if (sunit && !swidth) {
if (!silent)
xfs_notice(mp,
"invalid stripe unit (%lld) and stripe width of 0", sunit);
- return false;
+ goto check_override;
}
if (!sunit && swidth) {
if (!silent)
xfs_notice(mp,
"invalid stripe width (%lld) and stripe unit of 0", swidth);
- return false;
+ goto check_override;
}
if (sunit && (int)swidth % (int)sunit) {
@@ -1369,7 +1379,39 @@ xfs_validate_stripe_geometry(
xfs_notice(mp,
"stripe width (%lld) must be a multiple of the stripe unit (%lld)",
swidth, sunit);
- return false;
+ goto check_override;
}
return true;
+
+check_override:
+ if (!may_repair)
+ return false;
+ /*
+ * During mount, mp->m_dalign will not be set unless the sunit mount
+ * option was set. If it was set, ignore the bad stripe alignment values
+ * and allow the validation and overwrite later in the mount process to
+ * attempt to overwrite the bad stripe alignment values with the values
+ * supplied by mount options.
+ */
+ if (!mp->m_dalign)
+ return false;
+ if (!silent)
+ xfs_notice(mp,
+"Will try to correct with specified mount options sunit (%d) and swidth (%d)",
+ BBTOB(mp->m_dalign), BBTOB(mp->m_swidth));
+ return true;
+}
+
+/*
+ * Compute the maximum level number of the realtime summary file, as defined by
+ * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
+ * use of rt volumes with more than 2^32 extents.
+ */
+uint8_t
+xfs_compute_rextslog(
+ xfs_rtbxlen_t rtextents)
+{
+ if (!rtextents)
+ return 0;
+ return xfs_highbit64(rtextents);
}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index a5e14740ec9a..37b1ed1bc209 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -25,7 +25,7 @@ extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp);
extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
-#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
+#define XFS_FS_GEOM_MAX_STRUCT_VER (5)
extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo,
int struct_version);
extern int xfs_sb_read_secondary(struct xfs_mount *mp,
@@ -35,7 +35,10 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp,
struct xfs_trans *tp, xfs_agnumber_t agno,
struct xfs_buf **bpp);
-extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
- __s64 sunit, __s64 swidth, int sectorsize, bool silent);
+bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
+ __s64 sunit, __s64 swidth, int sectorsize, bool may_repair,
+ bool silent);
+
+uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index c4381388c0c1..dfd61fa8332e 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -43,6 +43,60 @@ extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+/* btree ops */
+extern const struct xfs_btree_ops xfs_bnobt_ops;
+extern const struct xfs_btree_ops xfs_cntbt_ops;
+extern const struct xfs_btree_ops xfs_inobt_ops;
+extern const struct xfs_btree_ops xfs_finobt_ops;
+extern const struct xfs_btree_ops xfs_bmbt_ops;
+extern const struct xfs_btree_ops xfs_refcountbt_ops;
+extern const struct xfs_btree_ops xfs_rmapbt_ops;
+extern const struct xfs_btree_ops xfs_rmapbt_mem_ops;
+
+static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_bnobt_ops;
+}
+
+static inline bool xfs_btree_is_cnt(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_cntbt_ops;
+}
+
+static inline bool xfs_btree_is_bmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_bmbt_ops;
+}
+
+static inline bool xfs_btree_is_ino(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_inobt_ops;
+}
+
+static inline bool xfs_btree_is_fino(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_finobt_ops;
+}
+
+static inline bool xfs_btree_is_refcount(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_refcountbt_ops;
+}
+
+static inline bool xfs_btree_is_rmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rmapbt_ops;
+}
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops)
+{
+ return ops == &xfs_rmapbt_mem_ops;
+}
+#else
+# define xfs_btree_is_mem_rmap(...) (false)
+#endif
+
/* log size calculation functions */
int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
int xfs_log_calc_minimum_size(struct xfs_mount *);
@@ -128,19 +182,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
-
-/*
- * Symlink decoding/encoding functions
- */
-int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
-int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
- uint32_t size, struct xfs_buf *bp);
-bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
- uint32_t size, struct xfs_buf *bp);
-void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_inode *ip, struct xfs_ifork *ifp);
-xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip);
-
/* Computed inode geometry for the filesystem. */
struct xfs_ino_geometry {
/* Maximum inode count in this filesystem. */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index bdc777b9ec4a..ffb1317a9212 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -16,7 +16,10 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
-
+#include "xfs_symlink_remote.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_health.h"
/*
* Each contiguous block has a header, so it is not just a simple pathlen
@@ -175,7 +178,7 @@ xfs_symlink_local_to_remote(
if (!xfs_has_crc(mp)) {
bp->b_ops = NULL;
- memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ memcpy(bp->b_addr, ifp->if_data, ifp->if_bytes);
xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
return;
}
@@ -191,7 +194,7 @@ xfs_symlink_local_to_remote(
buf = bp->b_addr;
buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
- memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+ memcpy(buf, ifp->if_data, ifp->if_bytes);
xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
ifp->if_bytes - 1);
}
@@ -202,15 +205,11 @@ xfs_symlink_local_to_remote(
*/
xfs_failaddr_t
xfs_symlink_shortform_verify(
- struct xfs_inode *ip)
+ void *sfp,
+ int64_t size)
{
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
- char *sfp = (char *)ifp->if_u1.if_data;
- int size = ifp->if_bytes;
char *endp = sfp + size;
- ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
-
/*
* Zero length symlinks should never occur in memory as they are
* never allowed to exist on disk.
@@ -231,3 +230,153 @@ xfs_symlink_shortform_verify(
return __this_address;
return NULL;
}
+
+/* Read a remote symlink target into the buffer. */
+int
+xfs_symlink_remote_read(
+ struct xfs_inode *ip,
+ char *link)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_buf *bp;
+ xfs_daddr_t d;
+ char *cur_chunk;
+ int pathlen = ip->i_disk_size;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int byte_cnt;
+ int n;
+ int error = 0;
+ int fsblocks = 0;
+ int offset;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+ fsblocks = xfs_symlink_blocks(mp, pathlen);
+ error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+ if (error)
+ goto out;
+
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+
+ error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+ &bp, &xfs_symlink_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ if (error)
+ return error;
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ if (pathlen < byte_cnt)
+ byte_cnt = pathlen;
+
+ cur_chunk = bp->b_addr;
+ if (xfs_has_crc(mp)) {
+ if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
+ byte_cnt, bp)) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ error = -EFSCORRUPTED;
+ xfs_alert(mp,
+"symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)",
+ offset, byte_cnt, ip->i_ino);
+ xfs_buf_relse(bp);
+ goto out;
+
+ }
+
+ cur_chunk += sizeof(struct xfs_dsymlink_hdr);
+ }
+
+ memcpy(link + offset, cur_chunk, byte_cnt);
+
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ xfs_buf_relse(bp);
+ }
+ ASSERT(pathlen == 0);
+
+ link[ip->i_disk_size] = '\0';
+ error = 0;
+
+ out:
+ return error;
+}
+
+/* Write the symlink target into the inode. */
+int
+xfs_symlink_write_target(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ const char *target_path,
+ int pathlen,
+ xfs_fsblock_t fs_blocks,
+ uint resblks)
+{
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_mount *mp = tp->t_mountp;
+ const char *cur_chunk;
+ struct xfs_buf *bp;
+ xfs_daddr_t d;
+ int byte_cnt;
+ int nmaps;
+ int offset = 0;
+ int n;
+ int error;
+
+ /*
+ * If the symlink will fit into the inode, write it inline.
+ */
+ if (pathlen <= xfs_inode_data_fork_size(ip)) {
+ xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
+
+ ip->i_disk_size = pathlen;
+ ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+ return 0;
+ }
+
+ nmaps = XFS_SYMLINK_MAPS;
+ error = xfs_bmapi_write(tp, ip, 0, fs_blocks, XFS_BMAPI_METADATA,
+ resblks, mval, &nmaps);
+ if (error)
+ return error;
+
+ ip->i_disk_size = pathlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ cur_chunk = target_path;
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ char *buf;
+
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ BTOBB(byte_cnt), 0, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ byte_cnt = min(byte_cnt, pathlen);
+
+ buf = bp->b_addr;
+ buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt,
+ bp);
+
+ memcpy(buf, cur_chunk, byte_cnt);
+
+ cur_chunk += byte_cnt;
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+ xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
+ (char *)bp->b_addr);
+ }
+ ASSERT(pathlen == 0);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h
new file mode 100644
index 000000000000..a63bd38ae4fa
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_symlink_remote.h
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SYMLINK_REMOTE_H
+#define __XFS_SYMLINK_REMOTE_H
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_inode *ip, struct xfs_ifork *ifp);
+xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
+int xfs_symlink_remote_read(struct xfs_inode *ip, char *link);
+int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip,
+ const char *target_path, int pathlen, xfs_fsblock_t fs_blocks,
+ uint resblks);
+
+#endif /* __XFS_SYMLINK_REMOTE_H */
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 70e97ea6eee7..69fc5b981352 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -31,7 +31,7 @@ xfs_trans_ijoin(
{
struct xfs_inode_log_item *iip;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (ip->i_itemp == NULL)
xfs_inode_item_init(ip, ip->i_mount);
iip = ip->i_itemp;
@@ -60,7 +60,7 @@ xfs_trans_ichgtime(
struct timespec64 tv;
ASSERT(tp);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
tv = current_time(inode);
@@ -90,7 +90,7 @@ xfs_trans_log_inode(
struct inode *inode = VFS_I(ip);
ASSERT(iip);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
tp->t_flags |= XFS_TRANS_DIRTY;
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 5b2f27cbdb80..6cd45e8c118d 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -19,6 +19,7 @@
#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_trans_space.h"
+#include "xfs_rtbitmap.h"
#define _ALLOC true
#define _FREE false
@@ -217,11 +218,12 @@ xfs_rtalloc_block_count(
struct xfs_mount *mp,
unsigned int num_ops)
{
- unsigned int blksz = XFS_FSB_TO_B(mp, 1);
- unsigned int rtbmp_bytes;
+ unsigned int rtbmp_blocks;
+ xfs_rtxlen_t rtxlen;
- rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
- return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
+ rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN);
+ rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen);
+ return (rtbmp_blocks + 1) * num_ops;
}
/*
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 5c2765934732..c299b16c9365 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -148,10 +148,10 @@ xfs_verify_rtbno(
/* Verify that a realtime device extent is fully contained inside the volume. */
bool
-xfs_verify_rtext(
+xfs_verify_rtbext(
struct xfs_mount *mp,
xfs_rtblock_t rtbno,
- xfs_rtblock_t len)
+ xfs_filblks_t len)
{
if (rtbno + len <= rtbno)
return false;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 851220021484..76eb9e328835 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -11,6 +11,7 @@ typedef uint32_t prid_t; /* project ID */
typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
+typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
typedef uint64_t xfs_extnum_t; /* # of extents in a file */
typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
@@ -18,6 +19,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
+typedef uint32_t xfs_rtsumoff_t; /* offset of an rtsummary info word */
typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
@@ -31,6 +33,8 @@ typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
typedef uint64_t xfs_fileoff_t; /* block number in a file */
typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
+typedef uint64_t xfs_rtxnum_t; /* rtextent number */
+typedef uint64_t xfs_rtbxlen_t; /* rtbitmap extent length in rtextents */
typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
@@ -76,11 +80,13 @@ typedef void * xfs_failaddr_t;
/*
* Inode fork identifiers.
*/
-#define XFS_DATA_FORK 0
-#define XFS_ATTR_FORK 1
-#define XFS_COW_FORK 2
+#define XFS_STAGING_FORK (-1) /* fake fork for staging a btree */
+#define XFS_DATA_FORK (0)
+#define XFS_ATTR_FORK (1)
+#define XFS_COW_FORK (2)
#define XFS_WHICHFORK_STRINGS \
+ { XFS_STAGING_FORK, "staging" }, \
{ XFS_DATA_FORK, "data" }, \
{ XFS_ATTR_FORK, "attr" }, \
{ XFS_COW_FORK, "cow" }
@@ -110,24 +116,6 @@ typedef enum {
{ XFS_LOOKUP_LEi, "le" }, \
{ XFS_LOOKUP_GEi, "ge" }
-/*
- * This enum is used in string mapping in xfs_trace.h and scrub/trace.h;
- * please keep the TRACE_DEFINE_ENUMs for it up to date.
- */
-typedef enum {
- XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
- XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
-} xfs_btnum_t;
-
-#define XFS_BTNUM_STRINGS \
- { XFS_BTNUM_BNOi, "bnobt" }, \
- { XFS_BTNUM_CNTi, "cntbt" }, \
- { XFS_BTNUM_RMAPi, "rmapbt" }, \
- { XFS_BTNUM_BMAPi, "bmbt" }, \
- { XFS_BTNUM_INOi, "inobt" }, \
- { XFS_BTNUM_FINOi, "finobt" }, \
- { XFS_BTNUM_REFCi, "refcbt" }
-
struct xfs_name {
const unsigned char *name;
int len;
@@ -145,6 +133,7 @@ typedef uint32_t xfs_dqid_t;
*/
#define XFS_NBBYLOG 3 /* log2(NBBY) */
#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
+#define XFS_SUMINFOLOG 2 /* log2(sizeof(xfs_suminfo_t)) */
#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
#define XFS_NBWORD (1 << XFS_NBWORDLOG)
#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
@@ -202,6 +191,13 @@ enum xfs_ag_resv_type {
XFS_AG_RESV_AGFL,
XFS_AG_RESV_METADATA,
XFS_AG_RESV_RMAPBT,
+
+ /*
+ * Don't increase fdblocks when freeing extent. This is a pony for
+ * the bnobt repair functions to re-free the free space without
+ * altering fdblocks. If you think you need this you're wrong.
+ */
+ XFS_AG_RESV_IGNORE,
};
/* Results of scanning a btree keyspace to check occupancy. */
@@ -229,8 +225,8 @@ bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
-bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
- xfs_rtblock_t len);
+bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
+ xfs_filblks_t len);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
@@ -239,4 +235,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
xfs_fileoff_t len);
+/* Do we support an rt volume having this number of rtextents? */
+static inline bool
+xfs_validate_rtextents(
+ xfs_rtbxlen_t rtextents)
+{
+ /* No runt rt volumes */
+ if (rtextents == 0)
+ return false;
+
+ return true;
+}
+
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
deleted file mode 100644
index 79155eec341b..000000000000
--- a/fs/xfs/mrlock.h
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#ifndef __XFS_SUPPORT_MRLOCK_H__
-#define __XFS_SUPPORT_MRLOCK_H__
-
-#include <linux/rwsem.h>
-
-typedef struct {
- struct rw_semaphore mr_lock;
-#if defined(DEBUG) || defined(XFS_WARN)
- int mr_writer;
-#endif
-} mrlock_t;
-
-#if defined(DEBUG) || defined(XFS_WARN)
-#define mrinit(mrp, name) \
- do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
-#else
-#define mrinit(mrp, name) \
- do { init_rwsem(&(mrp)->mr_lock); } while (0)
-#endif
-
-#define mrlock_init(mrp, t,n,s) mrinit(mrp, n)
-#define mrfree(mrp) do { } while (0)
-
-static inline void mraccess_nested(mrlock_t *mrp, int subclass)
-{
- down_read_nested(&mrp->mr_lock, subclass);
-}
-
-static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
-{
- down_write_nested(&mrp->mr_lock, subclass);
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 1;
-#endif
-}
-
-static inline int mrtryaccess(mrlock_t *mrp)
-{
- return down_read_trylock(&mrp->mr_lock);
-}
-
-static inline int mrtryupdate(mrlock_t *mrp)
-{
- if (!down_write_trylock(&mrp->mr_lock))
- return 0;
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 1;
-#endif
- return 1;
-}
-
-static inline void mrunlock_excl(mrlock_t *mrp)
-{
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 0;
-#endif
- up_write(&mrp->mr_lock);
-}
-
-static inline void mrunlock_shared(mrlock_t *mrp)
-{
- up_read(&mrp->mr_lock);
-}
-
-static inline void mrdemote(mrlock_t *mrp)
-{
-#if defined(DEBUG) || defined(XFS_WARN)
- mrp->mr_writer = 0;
-#endif
- downgrade_write(&mrp->mr_lock);
-}
-
-#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/scrub/agb_bitmap.c b/fs/xfs/scrub/agb_bitmap.c
new file mode 100644
index 000000000000..573e4e062754
--- /dev/null
+++ b/fs/xfs/scrub/agb_bitmap.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "bitmap.h"
+#include "scrub/agb_bitmap.h"
+
+/*
+ * Record all btree blocks seen while iterating all records of a btree.
+ *
+ * We know that the btree query_all function starts at the left edge and walks
+ * towards the right edge of the tree. Therefore, we know that we can walk up
+ * the btree cursor towards the root; if the pointer for a given level points
+ * to the first record/key in that block, we haven't seen this block before;
+ * and therefore we need to remember that we saw this block in the btree.
+ *
+ * So if our btree is:
+ *
+ * 4
+ * / | \
+ * 1 2 3
+ *
+ * Pretend for this example that each leaf block has 100 btree records. For
+ * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
+ * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
+ * we record block 4. The list is [1, 4].
+ *
+ * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
+ * the loop. The list remains [1, 4].
+ *
+ * For the 101st btree record, we've moved onto leaf block 2. Now
+ * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
+ * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
+ *
+ * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
+ *
+ * For the 201st record, we've moved on to leaf block 3.
+ * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
+ *
+ * For the 300th record we just exit, with the list being [1, 4, 2, 3].
+ */
+
+/* Mark a btree block to the agblock bitmap. */
+STATIC int
+xagb_bitmap_visit_btblock(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xagb_bitmap *bitmap = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t agbno;
+
+ xfs_btree_get_block(cur, level, &bp);
+ if (!bp)
+ return 0;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+
+ return xagb_bitmap_set(bitmap, agbno, 1);
+}
+
+/* Mark all (per-AG) btree blocks in the agblock bitmap. */
+int
+xagb_bitmap_set_btblocks(
+ struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur)
+{
+ return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock,
+ XFS_BTREE_VISIT_ALL, bitmap);
+}
+
+/*
+ * Record all the buffers pointed to by the btree cursor. Callers already
+ * engaged in a btree walk should call this function to capture the list of
+ * blocks going from the leaf towards the root.
+ */
+int
+xagb_bitmap_set_btcur_path(
+ struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur)
+{
+ int i;
+ int error;
+
+ for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
+ error = xagb_bitmap_visit_btblock(cur, i, bitmap);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h
new file mode 100644
index 000000000000..e488e1f4f63d
--- /dev/null
+++ b/fs/xfs/scrub/agb_bitmap.h
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_AGB_BITMAP_H__
+#define __XFS_SCRUB_AGB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_agblock_t */
+
+struct xagb_bitmap {
+ struct xbitmap32 agbitmap;
+};
+
+static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->agbitmap);
+}
+
+static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->agbitmap);
+}
+
+static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t len)
+{
+ return xbitmap32_clear(&bitmap->agbitmap, start, len);
+}
+static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t len)
+{
+ return xbitmap32_set(&bitmap->agbitmap, start, len);
+}
+
+static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t *len)
+{
+ return xbitmap32_test(&bitmap->agbitmap, start, len);
+}
+
+static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
+ struct xagb_bitmap *sub)
+{
+ return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap);
+}
+
+static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
+{
+ return xbitmap32_hweight(&bitmap->agbitmap);
+}
+static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
+{
+ return xbitmap32_empty(&bitmap->agbitmap);
+}
+
+static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
+ xbitmap32_walk_fn fn, void *priv)
+{
+ return xbitmap32_walk(&bitmap->agbitmap, fn, priv);
+}
+
+int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur);
+int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur);
+
+static inline uint32_t xagb_bitmap_count_set_regions(struct xagb_bitmap *b)
+{
+ return xbitmap32_count_set_regions(&b->agbitmap);
+}
+
+#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 6c6e5eba42c8..e954f07679dd 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -556,28 +556,28 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
/* Check the AGF btree roots and levels */
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
+ agbno = be32_to_cpu(agf->agf_bno_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
+ agbno = be32_to_cpu(agf->agf_cnt_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+ level = be32_to_cpu(agf->agf_bno_level);
if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ level = be32_to_cpu(agf->agf_cnt_level);
if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
if (xfs_has_rmapbt(mp)) {
- agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
+ agbno = be32_to_cpu(agf->agf_rmap_root);
if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ level = be32_to_cpu(agf->agf_rmap_level);
if (level <= 0 || level > mp->m_rmap_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 876a2f41b063..427054b65b23 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -26,6 +26,7 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
#include "scrub/reap.h"
/* Superblock */
@@ -72,7 +73,7 @@ xrep_superblock(
/* Write this to disk. */
xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
- return error;
+ return 0;
}
/* AGF */
@@ -173,8 +174,7 @@ xrep_agf_find_btrees(
* We relied on the rmapbt to reconstruct the AGF. If we get a
* different root then something's seriously wrong.
*/
- if (fab[XREP_AGF_RMAPBT].root !=
- be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi]))
+ if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root))
return -EFSCORRUPTED;
/* We must find the refcountbt root if that feature is enabled. */
@@ -223,20 +223,14 @@ xrep_agf_set_roots(
struct xfs_agf *agf,
struct xrep_find_ag_btree *fab)
{
- agf->agf_roots[XFS_BTNUM_BNOi] =
- cpu_to_be32(fab[XREP_AGF_BNOBT].root);
- agf->agf_levels[XFS_BTNUM_BNOi] =
- cpu_to_be32(fab[XREP_AGF_BNOBT].height);
+ agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root);
+ agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height);
- agf->agf_roots[XFS_BTNUM_CNTi] =
- cpu_to_be32(fab[XREP_AGF_CNTBT].root);
- agf->agf_levels[XFS_BTNUM_CNTi] =
- cpu_to_be32(fab[XREP_AGF_CNTBT].height);
+ agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root);
+ agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height);
- agf->agf_roots[XFS_BTNUM_RMAPi] =
- cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
- agf->agf_levels[XFS_BTNUM_RMAPi] =
- cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
+ agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
+ agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
if (xfs_has_reflink(sc->mp)) {
agf->agf_refcount_root =
@@ -261,8 +255,7 @@ xrep_agf_calc_from_btrees(
int error;
/* Update the AGF counters from the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
if (error)
goto err;
@@ -275,8 +268,7 @@ xrep_agf_calc_from_btrees(
agf->agf_longest = cpu_to_be32(raa.longest);
/* Update the AGF counters from the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -332,16 +324,13 @@ xrep_agf_commit_new(
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_longest = be32_to_cpu(agf->agf_longest);
- pag->pagf_levels[XFS_BTNUM_BNOi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
- pag->pagf_levels[XFS_BTNUM_CNTi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
- pag->pagf_levels[XFS_BTNUM_RMAPi] =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
+ pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
+ pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
- return 0;
+ return xrep_roll_ag_trans(sc);
}
/* Repair the AGF. v5 filesystems only. */
@@ -494,12 +483,11 @@ xrep_agfl_walk_rmap(
/* Strike out the blocks that are cross-linked according to the rmapbt. */
STATIC int
xrep_agfl_check_extent(
- uint64_t start,
- uint64_t len,
+ uint32_t agbno,
+ uint32_t len,
void *priv)
{
struct xrep_agfl *ra = priv;
- xfs_agblock_t agbno = start;
xfs_agblock_t last_agbno = agbno + len - 1;
int error;
@@ -559,16 +547,14 @@ xrep_agfl_collect_blocks(
goto out_bmp;
/* Find all blocks currently being used by the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_bmp;
/* Find all blocks currently being used by the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
@@ -647,8 +633,8 @@ struct xrep_agfl_fill {
/* Fill the AGFL with whatever blocks are in this extent. */
static int
xrep_agfl_fill(
- uint64_t start,
- uint64_t len,
+ uint32_t start,
+ uint32_t len,
void *priv)
{
struct xrep_agfl_fill *af = priv;
@@ -789,6 +775,9 @@ xrep_agfl(
/* Dump any AGFL overflow. */
error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
XFS_AG_RESV_AGFL);
+ if (error)
+ goto err;
+
err:
xagb_bitmap_destroy(&agfl_extents);
return error;
@@ -905,7 +894,7 @@ xrep_agi_calc_from_btrees(
xfs_agino_t freecount;
int error;
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
@@ -925,8 +914,7 @@ xrep_agi_calc_from_btrees(
if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
xfs_agblock_t blocks;
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp,
- XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -962,7 +950,7 @@ xrep_agi_commit_new(
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
- return 0;
+ return xrep_roll_ag_trans(sc);
}
/* Repair the AGI. */
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 279af72b1671..d1b8a4997dd2 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -9,13 +9,16 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "xfs_ag.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub free space btrees.
@@ -24,10 +27,19 @@ int
xchk_setup_ag_allocbt(
struct xfs_scrub *sc)
{
+ int error;
+
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
- return xchk_setup_ag_btree(sc, false);
+ error = xchk_setup_ag_btree(sc, false);
+ if (error)
+ return error;
+
+ if (xchk_could_repair(sc))
+ return xrep_setup_ag_allocbt(sc);
+
+ return 0;
}
/* Free space btree scrubber. */
@@ -127,7 +139,7 @@ xchk_allocbt_rec(
struct xchk_alloc *ca = bs->private;
xfs_alloc_btrec_to_irec(rec, &irec);
- if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) {
+ if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -138,31 +150,27 @@ xchk_allocbt_rec(
return 0;
}
-/* Scrub the freespace btrees for some AG. */
-STATIC int
+/* Scrub one of the freespace btrees for some AG. */
+int
xchk_allocbt(
- struct xfs_scrub *sc,
- xfs_btnum_t which)
+ struct xfs_scrub *sc)
{
struct xchk_alloc ca = { };
struct xfs_btree_cur *cur;
- cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
- return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca);
-}
-
-int
-xchk_bnobt(
- struct xfs_scrub *sc)
-{
- return xchk_allocbt(sc, XFS_BTNUM_BNO);
-}
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_BNOBT:
+ cur = sc->sa.bno_cur;
+ break;
+ case XFS_SCRUB_TYPE_CNTBT:
+ cur = sc->sa.cnt_cur;
+ break;
+ default:
+ ASSERT(0);
+ return -EIO;
+ }
-int
-xchk_cntbt(
- struct xfs_scrub *sc)
-{
- return xchk_allocbt(sc, XFS_BTNUM_CNT);
+ return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca);
}
/* xref check that the extent is not free */
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
new file mode 100644
index 000000000000..d421b253923e
--- /dev/null
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -0,0 +1,933 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_refcount.h"
+#include "xfs_extent_busy.h"
+#include "xfs_health.h"
+#include "xfs_bmap.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Free Space Btree Repair
+ * =======================
+ *
+ * The reverse mappings are supposed to record all space usage for the entire
+ * AG. Therefore, we can recreate the free extent records in an AG by looking
+ * for gaps in the physical extents recorded in the rmapbt. These records are
+ * staged in @free_records. Identifying the gaps is more difficult on a
+ * reflink filesystem because rmap records are allowed to overlap.
+ *
+ * Because the final step of building a new index is to free the space used by
+ * the old index, repair needs to find that space. Unfortunately, all
+ * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
+ * the same rmapbt owner code (OWN_AG), so this is not straightforward.
+ *
+ * The scan of the reverse mapping information records the space used by OWN_AG
+ * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While
+ * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
+ * record all visited rmap btree blocks and all blocks owned by the AGFL.
+ *
+ * After that is where the definitions of old_allocbt_blocks shifts. This
+ * expression identifies possible former bnobt/cntbt blocks:
+ *
+ * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
+ *
+ * Substituting from above definitions, that becomes:
+ *
+ * old_allocbt_blocks & ~not_allocbt_blocks
+ *
+ * The OWN_AG bitmap itself isn't needed after this point, so what we really do
+ * instead is:
+ *
+ * old_allocbt_blocks &= ~not_allocbt_blocks;
+ *
+ * After this point, @old_allocbt_blocks is a bitmap of alleged former
+ * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first
+ * parameter in place to avoid copying records around.
+ *
+ * Next, some of the space described by @free_records are diverted to the newbt
+ * reservation and used to format new btree blocks. The remaining records are
+ * written to the new btree indices. We reconstruct both bnobt and cntbt at
+ * the same time since we've already done all the work.
+ *
+ * We use the prefix 'xrep_abt' here because we regenerate both free space
+ * allocation btrees at the same time.
+ */
+
+struct xrep_abt {
+ /* Blocks owned by the rmapbt or the agfl. */
+ struct xagb_bitmap not_allocbt_blocks;
+
+ /* All OWN_AG blocks. */
+ struct xagb_bitmap old_allocbt_blocks;
+
+ /*
+ * New bnobt information. All btree block reservations are added to
+ * the reservation list in new_bnobt.
+ */
+ struct xrep_newbt new_bnobt;
+
+ /* new cntbt information */
+ struct xrep_newbt new_cntbt;
+
+ /* Free space extents. */
+ struct xfarray *free_records;
+
+ struct xfs_scrub *sc;
+
+ /* Number of non-null records in @free_records. */
+ uint64_t nr_real_records;
+
+ /* get_records()'s position in the free space record array. */
+ xfarray_idx_t array_cur;
+
+ /*
+ * Next block we anticipate seeing in the rmap records. If the next
+ * rmap record is greater than next_agbno, we have found unused space.
+ */
+ xfs_agblock_t next_agbno;
+
+ /* Number of free blocks in this AG. */
+ xfs_agblock_t nr_blocks;
+
+ /* Longest free extent we found in the AG. */
+ xfs_agblock_t longest;
+};
+
+/* Set up to repair AG free space btrees. */
+int
+xrep_setup_ag_allocbt(
+ struct xfs_scrub *sc)
+{
+ unsigned int busy_gen;
+
+ /*
+ * Make sure the busy extent list is clear because we can't put extents
+ * on there twice.
+ */
+ busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
+ if (xfs_extent_busy_list_empty(sc->sa.pag))
+ return 0;
+
+ return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
+}
+
+/* Check for any obvious conflicts in the free extent. */
+STATIC int
+xrep_abt_check_free_ext(
+ struct xfs_scrub *sc,
+ const struct xfs_alloc_rec_incore *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Must not be an inode chunk. */
+ error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
+ rec->ar_startblock, rec->ar_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ /* Must not be shared or CoW staging. */
+ if (sc->sa.refc_cur) {
+ error = xfs_refcount_has_records(sc->sa.refc_cur,
+ XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
+ rec->ar_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ error = xfs_refcount_has_records(sc->sa.refc_cur,
+ XFS_REFC_DOMAIN_COW, rec->ar_startblock,
+ rec->ar_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Stash a free space record for all the space since the last bno we found
+ * all the way up to @end.
+ */
+static int
+xrep_abt_stash(
+ struct xrep_abt *ra,
+ xfs_agblock_t end)
+{
+ struct xfs_alloc_rec_incore arec = {
+ .ar_startblock = ra->next_agbno,
+ .ar_blockcount = end - ra->next_agbno,
+ };
+ struct xfs_scrub *sc = ra->sc;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ error = xrep_abt_check_free_ext(ra->sc, &arec);
+ if (error)
+ return error;
+
+ trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
+
+ error = xfarray_append(ra->free_records, &arec);
+ if (error)
+ return error;
+
+ ra->nr_blocks += arec.ar_blockcount;
+ return 0;
+}
+
+/* Record extents that aren't in use from gaps in the rmap records. */
+STATIC int
+xrep_abt_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+ int error;
+
+ /* Record all the OWN_AG blocks... */
+ if (rec->rm_owner == XFS_RMAP_OWN_AG) {
+ error = xagb_bitmap_set(&ra->old_allocbt_blocks,
+ rec->rm_startblock, rec->rm_blockcount);
+ if (error)
+ return error;
+ }
+
+ /* ...and all the rmapbt blocks... */
+ error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
+ if (error)
+ return error;
+
+ /* ...and all the free space. */
+ if (rec->rm_startblock > ra->next_agbno) {
+ error = xrep_abt_stash(ra, rec->rm_startblock);
+ if (error)
+ return error;
+ }
+
+ /*
+ * rmap records can overlap on reflink filesystems, so project
+ * next_agbno as far out into the AG space as we currently know about.
+ */
+ ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
+ rec->rm_startblock + rec->rm_blockcount);
+ return 0;
+}
+
+/* Collect an AGFL block for the not-to-release list. */
+static int
+xrep_abt_walk_agfl(
+ struct xfs_mount *mp,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+
+ return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
+}
+
+/*
+ * Compare two free space extents by block number. We want to sort in order of
+ * increasing block number.
+ */
+static int
+xrep_bnobt_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_alloc_rec_incore *ap = a;
+ const struct xfs_alloc_rec_incore *bp = b;
+
+ if (ap->ar_startblock > bp->ar_startblock)
+ return 1;
+ else if (ap->ar_startblock < bp->ar_startblock)
+ return -1;
+ return 0;
+}
+
+/*
+ * Re-sort the free extents by block number so that we can put the records into
+ * the bnobt in the correct order. Make sure the records do not overlap in
+ * physical space.
+ */
+STATIC int
+xrep_bnobt_sort_records(
+ struct xrep_abt *ra)
+{
+ struct xfs_alloc_rec_incore arec;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ xfs_agblock_t next_agbno = 0;
+ int error;
+
+ error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
+ if (error)
+ return error;
+
+ while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
+ if (arec.ar_startblock < next_agbno)
+ return -EFSCORRUPTED;
+
+ next_agbno = arec.ar_startblock + arec.ar_blockcount;
+ }
+
+ return error;
+}
+
+/*
+ * Compare two free space extents by length and then block number. We want
+ * to sort first in order of increasing length and then in order of increasing
+ * block number.
+ */
+static int
+xrep_cntbt_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_alloc_rec_incore *ap = a;
+ const struct xfs_alloc_rec_incore *bp = b;
+
+ if (ap->ar_blockcount > bp->ar_blockcount)
+ return 1;
+ else if (ap->ar_blockcount < bp->ar_blockcount)
+ return -1;
+ return xrep_bnobt_extent_cmp(a, b);
+}
+
+/*
+ * Sort the free extents by length so so that we can put the records into the
+ * cntbt in the correct order. Don't let userspace kill us if we're resorting
+ * after allocating btree blocks.
+ */
+STATIC int
+xrep_cntbt_sort_records(
+ struct xrep_abt *ra,
+ bool is_resort)
+{
+ return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
+ is_resort ? 0 : XFARRAY_SORT_KILLABLE);
+}
+
+/*
+ * Iterate all reverse mappings to find (1) the gaps between rmap records (all
+ * unowned space), (2) the OWN_AG extents (which encompass the free space
+ * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
+ * blocks. The free space is (1) + (2) - (3) - (4).
+ */
+STATIC int
+xrep_abt_find_freespace(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_buf *agfl_bp;
+ xfs_agblock_t agend;
+ int error;
+
+ xagb_bitmap_init(&ra->not_allocbt_blocks);
+
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ /*
+ * Iterate all the reverse mappings to find gaps in the physical
+ * mappings, all the OWN_AG blocks, and all the rmapbt extents.
+ */
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
+ if (error)
+ goto err;
+
+ /* Insert a record for space between the last rmap and EOAG. */
+ agend = be32_to_cpu(agf->agf_length);
+ if (ra->next_agbno < agend) {
+ error = xrep_abt_stash(ra, agend);
+ if (error)
+ goto err;
+ }
+
+ /* Collect all the AGFL blocks. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ goto err;
+
+ error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
+ if (error)
+ goto err_agfl;
+
+ /* Compute the old bnobt/cntbt blocks. */
+ error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
+ &ra->not_allocbt_blocks);
+ if (error)
+ goto err_agfl;
+
+ ra->nr_real_records = xfarray_length(ra->free_records);
+err_agfl:
+ xfs_trans_brelse(sc->tp, agfl_bp);
+err:
+ xchk_ag_btcur_free(&sc->sa);
+ xagb_bitmap_destroy(&ra->not_allocbt_blocks);
+ return error;
+}
+
+/*
+ * We're going to use the observed free space records to reserve blocks for the
+ * new free space btrees, so we play an iterative game where we try to converge
+ * on the number of blocks we need:
+ *
+ * 1. Estimate how many blocks we'll need to store the records.
+ * 2. If the first free record has more blocks than we need, we're done.
+ * We will have to re-sort the records prior to building the cntbt.
+ * 3. If that record has exactly the number of blocks we need, null out the
+ * record. We're done.
+ * 4. Otherwise, we still need more blocks. Null out the record, subtract its
+ * length from the number of blocks we need, and go back to step 1.
+ *
+ * Fortunately, we don't have to do any transaction work to play this game, so
+ * we don't have to tear down the staging cursors.
+ */
+STATIC int
+xrep_abt_reserve_space(
+ struct xrep_abt *ra,
+ struct xfs_btree_cur *bno_cur,
+ struct xfs_btree_cur *cnt_cur,
+ bool *needs_resort)
+{
+ struct xfs_scrub *sc = ra->sc;
+ xfarray_idx_t record_nr;
+ unsigned int allocated = 0;
+ int error = 0;
+
+ record_nr = xfarray_length(ra->free_records) - 1;
+ do {
+ struct xfs_alloc_rec_incore arec;
+ uint64_t required;
+ unsigned int desired;
+ unsigned int len;
+
+ /* Compute how many blocks we'll need. */
+ error = xfs_btree_bload_compute_geometry(cnt_cur,
+ &ra->new_cntbt.bload, ra->nr_real_records);
+ if (error)
+ break;
+
+ error = xfs_btree_bload_compute_geometry(bno_cur,
+ &ra->new_bnobt.bload, ra->nr_real_records);
+ if (error)
+ break;
+
+ /* How many btree blocks do we need to store all records? */
+ required = ra->new_bnobt.bload.nr_blocks +
+ ra->new_cntbt.bload.nr_blocks;
+ ASSERT(required < INT_MAX);
+
+ /* If we've reserved enough blocks, we're done. */
+ if (allocated >= required)
+ break;
+
+ desired = required - allocated;
+
+ /* We need space but there's none left; bye! */
+ if (ra->nr_real_records == 0) {
+ error = -ENOSPC;
+ break;
+ }
+
+ /* Grab the first record from the list. */
+ error = xfarray_load(ra->free_records, record_nr, &arec);
+ if (error)
+ break;
+
+ ASSERT(arec.ar_blockcount <= UINT_MAX);
+ len = min_t(unsigned int, arec.ar_blockcount, desired);
+
+ trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
+ arec.ar_startblock, len, XFS_RMAP_OWN_AG);
+
+ error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
+ arec.ar_startblock, len);
+ if (error)
+ break;
+ allocated += len;
+ ra->nr_blocks -= len;
+
+ if (arec.ar_blockcount > desired) {
+ /*
+ * Record has more space than we need. The number of
+ * free records doesn't change, so shrink the free
+ * record, inform the caller that the records are no
+ * longer sorted by length, and exit.
+ */
+ arec.ar_startblock += desired;
+ arec.ar_blockcount -= desired;
+ error = xfarray_store(ra->free_records, record_nr,
+ &arec);
+ if (error)
+ break;
+
+ *needs_resort = true;
+ return 0;
+ }
+
+ /*
+ * We're going to use up the entire record, so unset it and
+ * move on to the next one. This changes the number of free
+ * records (but doesn't break the sorting order), so we must
+ * go around the loop once more to re-run _bload_init.
+ */
+ error = xfarray_unset(ra->free_records, record_nr);
+ if (error)
+ break;
+ ra->nr_real_records--;
+ record_nr--;
+ } while (1);
+
+ return error;
+}
+
+STATIC int
+xrep_abt_dispose_one(
+ struct xrep_abt *ra,
+ struct xrep_newbt_resv *resv)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ xfs_agblock_t free_agbno = resv->agbno + resv->used;
+ xfs_extlen_t free_aglen = resv->len - resv->used;
+ int error;
+
+ ASSERT(pag == resv->pag);
+
+ /* Add a deferred rmap for each extent we used. */
+ if (resv->used > 0)
+ xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
+ resv->used, XFS_RMAP_OWN_AG);
+
+ /*
+ * For each reserved btree block we didn't use, add it to the free
+ * space btree. We didn't touch fdblocks when we reserved them, so
+ * we don't touch it now.
+ */
+ if (free_aglen == 0)
+ return 0;
+
+ trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
+ free_aglen, ra->new_bnobt.oinfo.oi_owner);
+
+ error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
+ &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
+ if (error)
+ return error;
+
+ return xrep_defer_finish(sc);
+}
+
+/*
+ * Deal with all the space we reserved. Blocks that were allocated for the
+ * free space btrees need to have a (deferred) rmap added for the OWN_AG
+ * allocation, and blocks that didn't get used can be freed via the usual
+ * (deferred) means.
+ */
+STATIC void
+xrep_abt_dispose_reservations(
+ struct xrep_abt *ra,
+ int error)
+{
+ struct xrep_newbt_resv *resv, *n;
+
+ if (error)
+ goto junkit;
+
+ list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
+ error = xrep_abt_dispose_one(ra, resv);
+ if (error)
+ goto junkit;
+ }
+
+junkit:
+ list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
+ xfs_perag_put(resv->pag);
+ list_del(&resv->list);
+ kfree(resv);
+ }
+
+ xrep_newbt_cancel(&ra->new_bnobt);
+ xrep_newbt_cancel(&ra->new_cntbt);
+}
+
+/* Retrieve free space data for bulk load. */
+STATIC int
+xrep_abt_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a;
+ struct xrep_abt *ra = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load_next(ra->free_records, &ra->array_cur,
+ arec);
+ if (error)
+ return error;
+
+ ra->longest = max(ra->longest, arec->ar_blockcount);
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_abt_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+
+ return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
+}
+
+/*
+ * Reset the AGF counters to reflect the free space btrees that we just
+ * rebuilt, then reinitialize the per-AG data.
+ */
+STATIC int
+xrep_abt_reset_counters(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ unsigned int freesp_btreeblks = 0;
+
+ /*
+ * Compute the contribution to agf_btreeblks for the new free space
+ * btrees. This is the computed btree size minus anything we didn't
+ * use.
+ */
+ freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
+ freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;
+
+ freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
+ freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);
+
+ /*
+ * The AGF header contains extra information related to the free space
+ * btrees, so we must update those fields here.
+ */
+ agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
+ (be32_to_cpu(agf->agf_rmap_blocks) - 1));
+ agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
+ agf->agf_longest = cpu_to_be32(ra->longest);
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
+ XFS_AGF_LONGEST |
+ XFS_AGF_FREEBLKS);
+
+ /*
+ * After we commit the new btree to disk, it is possible that the
+ * process to reap the old btree blocks will race with the AIL trying
+ * to checkpoint the old btree blocks into the filesystem. If the new
+ * tree is shorter than the old one, the allocbt write verifier will
+ * fail and the AIL will shut down the filesystem.
+ *
+ * To avoid this, save the old incore btree height values as the alt
+ * height values before re-initializing the perag info from the updated
+ * AGF to capture all the new values.
+ */
+ pag->pagf_repair_bno_level = pag->pagf_bno_level;
+ pag->pagf_repair_cnt_level = pag->pagf_cnt_level;
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagf(sc);
+}
+
+/*
+ * Use the collected free space information to stage new free space btrees.
+ * If this is successful we'll return with the new btree root
+ * information logged to the repair transaction but not yet committed.
+ */
+STATIC int
+xrep_abt_build_new_trees(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ struct xfs_perag *pag = sc->sa.pag;
+ bool needs_resort = false;
+ int error;
+
+ /*
+ * Sort the free extents by length so that we can set up the free space
+ * btrees in as few extents as possible. This reduces the amount of
+ * deferred rmap / free work we have to do at the end.
+ */
+ error = xrep_cntbt_sort_records(ra, false);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header.
+ */
+ xrep_newbt_init_bare(&ra->new_bnobt, sc);
+ xrep_newbt_init_bare(&ra->new_cntbt, sc);
+
+ ra->new_bnobt.bload.get_records = xrep_abt_get_records;
+ ra->new_cntbt.bload.get_records = xrep_abt_get_records;
+
+ ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
+ ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
+
+ /* Allocate cursors for the staged btrees. */
+ bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);
+
+ cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btrees. */
+ error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
+ if (error)
+ goto err_cur;
+
+ /*
+ * If we need to re-sort the free extents by length, do so so that we
+ * can put the records into the cntbt in the correct order.
+ */
+ if (needs_resort) {
+ error = xrep_cntbt_sort_records(ra, needs_resort);
+ if (error)
+ goto err_cur;
+ }
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the alternate incore btree
+ * height so that we don't trip the verifiers when writing the new
+ * btree blocks to disk.
+ */
+ pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
+ pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;
+
+ /* Load the free space by length tree. */
+ ra->array_cur = XFARRAY_CURSOR_INIT;
+ ra->longest = 0;
+ error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
+ if (error)
+ goto err_levels;
+
+ error = xrep_bnobt_sort_records(ra);
+ if (error)
+ return error;
+
+ /* Load the free space by block number tree. */
+ ra->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
+ if (error)
+ goto err_levels;
+
+ /*
+ * Install the new btrees in the AG header. After this point the old
+ * btrees are no longer accessible and the new trees are live.
+ */
+ xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(bno_cur, 0);
+ xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(cnt_cur, 0);
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_abt_reset_counters(ra);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ xrep_abt_dispose_reservations(ra, error);
+
+ return xrep_roll_ag_trans(sc);
+
+err_levels:
+ pag->pagf_repair_bno_level = 0;
+ pag->pagf_repair_cnt_level = 0;
+err_cur:
+ xfs_btree_del_cursor(cnt_cur, error);
+ xfs_btree_del_cursor(bno_cur, error);
+err_newbt:
+ xrep_abt_dispose_reservations(ra, error);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_abt_remove_old_trees(
+ struct xrep_abt *ra)
+{
+ struct xfs_perag *pag = ra->sc->sa.pag;
+ int error;
+
+ /* Free the old btree blocks if they're not in use. */
+ error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
+ &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've zapped all the old allocbt blocks we can turn off
+ * the alternate height mechanism.
+ */
+ pag->pagf_repair_bno_level = 0;
+ pag->pagf_repair_cnt_level = 0;
+ return 0;
+}
+
+/* Repair the freespace btrees for some AG. */
+int
+xrep_allocbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_abt *ra;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
+ if (!ra)
+ return -ENOMEM;
+ ra->sc = sc;
+
+ /* We rebuild both data structures. */
+ sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;
+
+ /*
+ * Make sure the busy extent list is clear because we can't put extents
+ * on there twice. In theory we cleared this before we started, but
+ * let's not risk the filesystem.
+ */
+ if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
+ error = -EDEADLOCK;
+ goto out_ra;
+ }
+
+ /* Set up enough storage to handle maximally fragmented free space. */
+ descr = xchk_xfile_ag_descr(sc, "free space records");
+ error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
+ sizeof(struct xfs_alloc_rec_incore),
+ &ra->free_records);
+ kfree(descr);
+ if (error)
+ goto out_ra;
+
+ /* Collect the free space data and find the old btree blocks. */
+ xagb_bitmap_init(&ra->old_allocbt_blocks);
+ error = xrep_abt_find_freespace(ra);
+ if (error)
+ goto out_bitmap;
+
+ /* Rebuild the free space information. */
+ error = xrep_abt_build_new_trees(ra);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old trees. */
+ error = xrep_abt_remove_old_trees(ra);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xagb_bitmap_destroy(&ra->old_allocbt_blocks);
+ xfarray_destroy(ra->free_records);
+out_ra:
+ kfree(ra);
+ return error;
+}
+
+/* Make sure both btrees are ok after we've rebuilt them. */
+int
+xrep_revalidate_allocbt(
+ struct xfs_scrub *sc)
+{
+ __u32 old_type = sc->sm->sm_type;
+ int error;
+
+ /*
+ * We must update sm_type temporarily so that the tree-to-tree cross
+ * reference checks will work in the correct direction, and also so
+ * that tracing will report correctly if there are more errors.
+ */
+ sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
+ error = xchk_allocbt(sc);
+ if (error)
+ goto out;
+
+ sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
+ error = xchk_allocbt(sc);
+out:
+ sc->sm->sm_type = old_type;
+ return error;
+}
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 6c16d9530cca..83c7feb38714 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -527,28 +527,23 @@ xchk_xattr_check_sf(
struct xfs_scrub *sc)
{
struct xchk_xattr_buf *ab = sc->buf;
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
+ struct xfs_ifork *ifp = &sc->ip->i_af;
+ struct xfs_attr_sf_hdr *sf = ifp->if_data;
+ struct xfs_attr_sf_entry *sfe = xfs_attr_sf_firstentry(sf);
struct xfs_attr_sf_entry *next;
- struct xfs_ifork *ifp;
- unsigned char *end;
+ unsigned char *end = ifp->if_data + ifp->if_bytes;
int i;
int error = 0;
- ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
-
bitmap_zero(ab->usedmap, ifp->if_bytes);
- sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data;
- end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes;
- xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr));
+ xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*sf));
- sfe = &sf->list[0];
if ((unsigned char *)sfe > end) {
xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
return 0;
}
- for (i = 0; i < sf->hdr.count; i++) {
+ for (i = 0; i < sf->count; i++) {
unsigned char *name = sfe->nameval;
unsigned char *value = &sfe->nameval[sfe->namelen];
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index e0c89a9a0ca0..0cb8d43912a8 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -16,7 +16,9 @@
#include <linux/interval_tree_generic.h>
-struct xbitmap_node {
+/* u64 bitmap */
+
+struct xbitmap64_node {
struct rb_node bn_rbnode;
/* First set bit of this interval and subtree. */
@@ -39,72 +41,72 @@ struct xbitmap_node {
* forward-declare them anyway for clarity.
*/
static inline void
-xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root);
+xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root);
static inline void
-xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root);
+xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root);
-static inline struct xbitmap_node *
-xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start,
+static inline struct xbitmap64_node *
+xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start,
uint64_t last);
-static inline struct xbitmap_node *
-xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start,
+static inline struct xbitmap64_node *
+xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start,
uint64_t last);
-INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t,
- __bn_subtree_last, START, LAST, static inline, xbitmap_tree)
+INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t,
+ __bn_subtree_last, START, LAST, static inline, xbitmap64_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
-#define for_each_xbitmap_extent(bn, bitmap) \
+#define for_each_xbitmap64_extent(bn, bitmap) \
for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \
- struct xbitmap_node, bn_rbnode); \
+ struct xbitmap64_node, bn_rbnode); \
(bn) != NULL; \
(bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \
- struct xbitmap_node, bn_rbnode))
+ struct xbitmap64_node, bn_rbnode))
/* Clear a range of this bitmap. */
int
-xbitmap_clear(
- struct xbitmap *bitmap,
+xbitmap64_clear(
+ struct xbitmap64 *bitmap,
uint64_t start,
uint64_t len)
{
- struct xbitmap_node *bn;
- struct xbitmap_node *new_bn;
+ struct xbitmap64_node *bn;
+ struct xbitmap64_node *new_bn;
uint64_t last = start + len - 1;
- while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) {
+ while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) {
if (bn->bn_start < start && bn->bn_last > last) {
uint64_t old_last = bn->bn_last;
/* overlaps with the entire clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_last = start - 1;
- xbitmap_tree_insert(bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(bn, &bitmap->xb_root);
/* add an extent */
- new_bn = kmalloc(sizeof(struct xbitmap_node),
+ new_bn = kmalloc(sizeof(struct xbitmap64_node),
XCHK_GFP_FLAGS);
if (!new_bn)
return -ENOMEM;
new_bn->bn_start = last + 1;
new_bn->bn_last = old_last;
- xbitmap_tree_insert(new_bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(new_bn, &bitmap->xb_root);
} else if (bn->bn_start < start) {
/* overlaps with the left side of the clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_last = start - 1;
- xbitmap_tree_insert(bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(bn, &bitmap->xb_root);
} else if (bn->bn_last > last) {
/* overlaps with the right side of the clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_start = last + 1;
- xbitmap_tree_insert(bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(bn, &bitmap->xb_root);
break;
} else {
/* in the middle of the clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
kfree(bn);
}
}
@@ -114,59 +116,59 @@ xbitmap_clear(
/* Set a range of this bitmap. */
int
-xbitmap_set(
- struct xbitmap *bitmap,
+xbitmap64_set(
+ struct xbitmap64 *bitmap,
uint64_t start,
uint64_t len)
{
- struct xbitmap_node *left;
- struct xbitmap_node *right;
+ struct xbitmap64_node *left;
+ struct xbitmap64_node *right;
uint64_t last = start + len - 1;
int error;
/* Is this whole range already set? */
- left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
+ left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last);
if (left && left->bn_start <= start && left->bn_last >= last)
return 0;
/* Clear out everything in the range we want to set. */
- error = xbitmap_clear(bitmap, start, len);
+ error = xbitmap64_clear(bitmap, start, len);
if (error)
return error;
/* Do we have a left-adjacent extent? */
- left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
+ left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
ASSERT(!left || left->bn_last + 1 == start);
/* Do we have a right-adjacent extent? */
- right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
+ right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
ASSERT(!right || right->bn_start == last + 1);
if (left && right) {
/* combine left and right adjacent extent */
- xbitmap_tree_remove(left, &bitmap->xb_root);
- xbitmap_tree_remove(right, &bitmap->xb_root);
+ xbitmap64_tree_remove(left, &bitmap->xb_root);
+ xbitmap64_tree_remove(right, &bitmap->xb_root);
left->bn_last = right->bn_last;
- xbitmap_tree_insert(left, &bitmap->xb_root);
+ xbitmap64_tree_insert(left, &bitmap->xb_root);
kfree(right);
} else if (left) {
/* combine with left extent */
- xbitmap_tree_remove(left, &bitmap->xb_root);
+ xbitmap64_tree_remove(left, &bitmap->xb_root);
left->bn_last = last;
- xbitmap_tree_insert(left, &bitmap->xb_root);
+ xbitmap64_tree_insert(left, &bitmap->xb_root);
} else if (right) {
/* combine with right extent */
- xbitmap_tree_remove(right, &bitmap->xb_root);
+ xbitmap64_tree_remove(right, &bitmap->xb_root);
right->bn_start = start;
- xbitmap_tree_insert(right, &bitmap->xb_root);
+ xbitmap64_tree_insert(right, &bitmap->xb_root);
} else {
/* add an extent */
- left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS);
+ left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS);
if (!left)
return -ENOMEM;
left->bn_start = start;
left->bn_last = last;
- xbitmap_tree_insert(left, &bitmap->xb_root);
+ xbitmap64_tree_insert(left, &bitmap->xb_root);
}
return 0;
@@ -174,21 +176,21 @@ xbitmap_set(
/* Free everything related to this bitmap. */
void
-xbitmap_destroy(
- struct xbitmap *bitmap)
+xbitmap64_destroy(
+ struct xbitmap64 *bitmap)
{
- struct xbitmap_node *bn;
+ struct xbitmap64_node *bn;
- while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) {
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) {
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
kfree(bn);
}
}
/* Set up a per-AG block bitmap. */
void
-xbitmap_init(
- struct xbitmap *bitmap)
+xbitmap64_init(
+ struct xbitmap64 *bitmap)
{
bitmap->xb_root = RB_ROOT_CACHED;
}
@@ -208,18 +210,18 @@ xbitmap_init(
* This is the logical equivalent of bitmap &= ~sub.
*/
int
-xbitmap_disunion(
- struct xbitmap *bitmap,
- struct xbitmap *sub)
+xbitmap64_disunion(
+ struct xbitmap64 *bitmap,
+ struct xbitmap64 *sub)
{
- struct xbitmap_node *bn;
+ struct xbitmap64_node *bn;
int error;
- if (xbitmap_empty(bitmap) || xbitmap_empty(sub))
+ if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub))
return 0;
- for_each_xbitmap_extent(bn, sub) {
- error = xbitmap_clear(bitmap, bn->bn_start,
+ for_each_xbitmap64_extent(bn, sub) {
+ error = xbitmap64_clear(bitmap, bn->bn_start,
bn->bn_last - bn->bn_start + 1);
if (error)
return error;
@@ -228,88 +230,273 @@ xbitmap_disunion(
return 0;
}
+/* How many bits are set in this bitmap? */
+uint64_t
+xbitmap64_hweight(
+ struct xbitmap64 *bitmap)
+{
+ struct xbitmap64_node *bn;
+ uint64_t ret = 0;
+
+ for_each_xbitmap64_extent(bn, bitmap)
+ ret += bn->bn_last - bn->bn_start + 1;
+
+ return ret;
+}
+
+/* Call a function for every run of set bits in this bitmap. */
+int
+xbitmap64_walk(
+ struct xbitmap64 *bitmap,
+ xbitmap64_walk_fn fn,
+ void *priv)
+{
+ struct xbitmap64_node *bn;
+ int error = 0;
+
+ for_each_xbitmap64_extent(bn, bitmap) {
+ error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/* Does this bitmap have no bits set at all? */
+bool
+xbitmap64_empty(
+ struct xbitmap64 *bitmap)
+{
+ return bitmap->xb_root.rb_root.rb_node == NULL;
+}
+
+/* Is the start of the range set or clear? And for how long? */
+bool
+xbitmap64_test(
+ struct xbitmap64 *bitmap,
+ uint64_t start,
+ uint64_t *len)
+{
+ struct xbitmap64_node *bn;
+ uint64_t last = start + *len - 1;
+
+ bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last);
+ if (!bn)
+ return false;
+ if (bn->bn_start <= start) {
+ if (bn->bn_last < last)
+ *len = bn->bn_last - start + 1;
+ return true;
+ }
+ *len = bn->bn_start - start;
+ return false;
+}
+
+/* u32 bitmap */
+
+struct xbitmap32_node {
+ struct rb_node bn_rbnode;
+
+ /* First set bit of this interval and subtree. */
+ uint32_t bn_start;
+
+ /* Last set bit of this interval. */
+ uint32_t bn_last;
+
+ /* Last set bit of this subtree. Do not touch this. */
+ uint32_t __bn_subtree_last;
+};
+
+/* Define our own interval tree type with uint32_t parameters. */
+
/*
- * Record all btree blocks seen while iterating all records of a btree.
- *
- * We know that the btree query_all function starts at the left edge and walks
- * towards the right edge of the tree. Therefore, we know that we can walk up
- * the btree cursor towards the root; if the pointer for a given level points
- * to the first record/key in that block, we haven't seen this block before;
- * and therefore we need to remember that we saw this block in the btree.
- *
- * So if our btree is:
- *
- * 4
- * / | \
- * 1 2 3
- *
- * Pretend for this example that each leaf block has 100 btree records. For
- * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
- * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
- * we record block 4. The list is [1, 4].
- *
- * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
- * the loop. The list remains [1, 4].
- *
- * For the 101st btree record, we've moved onto leaf block 2. Now
- * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
- * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
- *
- * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
- *
- * For the 201st record, we've moved on to leaf block 3.
- * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
- *
- * For the 300th record we just exit, with the list being [1, 4, 2, 3].
+ * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
+ * forward-declare them anyway for clarity.
*/
+static inline void
+xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root);
-/* Mark a btree block to the agblock bitmap. */
-STATIC int
-xagb_bitmap_visit_btblock(
- struct xfs_btree_cur *cur,
- int level,
- void *priv)
+static inline void
+xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root);
+
+static inline struct xbitmap32_node *
+xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start,
+ uint32_t last);
+
+static inline struct xbitmap32_node *
+xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start,
+ uint32_t last);
+
+INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t,
+ __bn_subtree_last, START, LAST, static inline, xbitmap32_tree)
+
+/* Iterate each interval of a bitmap. Do not change the bitmap. */
+#define for_each_xbitmap32_extent(bn, bitmap) \
+ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \
+ struct xbitmap32_node, bn_rbnode); \
+ (bn) != NULL; \
+ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \
+ struct xbitmap32_node, bn_rbnode))
+
+/* Clear a range of this bitmap. */
+int
+xbitmap32_clear(
+ struct xbitmap32 *bitmap,
+ uint32_t start,
+ uint32_t len)
{
- struct xagb_bitmap *bitmap = priv;
- struct xfs_buf *bp;
- xfs_fsblock_t fsbno;
- xfs_agblock_t agbno;
+ struct xbitmap32_node *bn;
+ struct xbitmap32_node *new_bn;
+ uint32_t last = start + len - 1;
- xfs_btree_get_block(cur, level, &bp);
- if (!bp)
- return 0;
+ while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) {
+ if (bn->bn_start < start && bn->bn_last > last) {
+ uint32_t old_last = bn->bn_last;
- fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
- agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ /* overlaps with the entire clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_last = start - 1;
+ xbitmap32_tree_insert(bn, &bitmap->xb_root);
+
+ /* add an extent */
+ new_bn = kmalloc(sizeof(struct xbitmap32_node),
+ XCHK_GFP_FLAGS);
+ if (!new_bn)
+ return -ENOMEM;
+ new_bn->bn_start = last + 1;
+ new_bn->bn_last = old_last;
+ xbitmap32_tree_insert(new_bn, &bitmap->xb_root);
+ } else if (bn->bn_start < start) {
+ /* overlaps with the left side of the clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_last = start - 1;
+ xbitmap32_tree_insert(bn, &bitmap->xb_root);
+ } else if (bn->bn_last > last) {
+ /* overlaps with the right side of the clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_start = last + 1;
+ xbitmap32_tree_insert(bn, &bitmap->xb_root);
+ break;
+ } else {
+ /* in the middle of the clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ kfree(bn);
+ }
+ }
- return xagb_bitmap_set(bitmap, agbno, 1);
+ return 0;
}
-/* Mark all (per-AG) btree blocks in the agblock bitmap. */
+/* Set a range of this bitmap. */
int
-xagb_bitmap_set_btblocks(
- struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur)
+xbitmap32_set(
+ struct xbitmap32 *bitmap,
+ uint32_t start,
+ uint32_t len)
+{
+ struct xbitmap32_node *left;
+ struct xbitmap32_node *right;
+ uint32_t last = start + len - 1;
+ int error;
+
+ /* Is this whole range already set? */
+ left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last);
+ if (left && left->bn_start <= start && left->bn_last >= last)
+ return 0;
+
+ /* Clear out everything in the range we want to set. */
+ error = xbitmap32_clear(bitmap, start, len);
+ if (error)
+ return error;
+
+ /* Do we have a left-adjacent extent? */
+ left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
+ ASSERT(!left || left->bn_last + 1 == start);
+
+ /* Do we have a right-adjacent extent? */
+ right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
+ ASSERT(!right || right->bn_start == last + 1);
+
+ if (left && right) {
+ /* combine left and right adjacent extent */
+ xbitmap32_tree_remove(left, &bitmap->xb_root);
+ xbitmap32_tree_remove(right, &bitmap->xb_root);
+ left->bn_last = right->bn_last;
+ xbitmap32_tree_insert(left, &bitmap->xb_root);
+ kfree(right);
+ } else if (left) {
+ /* combine with left extent */
+ xbitmap32_tree_remove(left, &bitmap->xb_root);
+ left->bn_last = last;
+ xbitmap32_tree_insert(left, &bitmap->xb_root);
+ } else if (right) {
+ /* combine with right extent */
+ xbitmap32_tree_remove(right, &bitmap->xb_root);
+ right->bn_start = start;
+ xbitmap32_tree_insert(right, &bitmap->xb_root);
+ } else {
+ /* add an extent */
+ left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS);
+ if (!left)
+ return -ENOMEM;
+ left->bn_start = start;
+ left->bn_last = last;
+ xbitmap32_tree_insert(left, &bitmap->xb_root);
+ }
+
+ return 0;
+}
+
+/* Free everything related to this bitmap. */
+void
+xbitmap32_destroy(
+ struct xbitmap32 *bitmap)
{
- return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock,
- XFS_BTREE_VISIT_ALL, bitmap);
+ struct xbitmap32_node *bn;
+
+ while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U))) {
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ kfree(bn);
+ }
+}
+
+/* Set up a per-AG block bitmap. */
+void
+xbitmap32_init(
+ struct xbitmap32 *bitmap)
+{
+ bitmap->xb_root = RB_ROOT_CACHED;
}
/*
- * Record all the buffers pointed to by the btree cursor. Callers already
- * engaged in a btree walk should call this function to capture the list of
- * blocks going from the leaf towards the root.
+ * Remove all the blocks mentioned in @sub from the extents in @bitmap.
+ *
+ * The intent is that callers will iterate the rmapbt for all of its records
+ * for a given owner to generate @bitmap; and iterate all the blocks of the
+ * metadata structures that are not being rebuilt and have the same rmapbt
+ * owner to generate @sub. This routine subtracts all the extents
+ * mentioned in sub from all the extents linked in @bitmap, which leaves
+ * @bitmap as the list of blocks that are not accounted for, which we assume
+ * are the dead blocks of the old metadata structure. The blocks mentioned in
+ * @bitmap can be reaped.
+ *
+ * This is the logical equivalent of bitmap &= ~sub.
*/
int
-xagb_bitmap_set_btcur_path(
- struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur)
+xbitmap32_disunion(
+ struct xbitmap32 *bitmap,
+ struct xbitmap32 *sub)
{
- int i;
+ struct xbitmap32_node *bn;
int error;
- for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
- error = xagb_bitmap_visit_btblock(cur, i, bitmap);
+ if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub))
+ return 0;
+
+ for_each_xbitmap32_extent(bn, sub) {
+ error = xbitmap32_clear(bitmap, bn->bn_start,
+ bn->bn_last - bn->bn_start + 1);
if (error)
return error;
}
@@ -318,14 +505,14 @@ xagb_bitmap_set_btcur_path(
}
/* How many bits are set in this bitmap? */
-uint64_t
-xbitmap_hweight(
- struct xbitmap *bitmap)
+uint32_t
+xbitmap32_hweight(
+ struct xbitmap32 *bitmap)
{
- struct xbitmap_node *bn;
- uint64_t ret = 0;
+ struct xbitmap32_node *bn;
+ uint32_t ret = 0;
- for_each_xbitmap_extent(bn, bitmap)
+ for_each_xbitmap32_extent(bn, bitmap)
ret += bn->bn_last - bn->bn_start + 1;
return ret;
@@ -333,15 +520,15 @@ xbitmap_hweight(
/* Call a function for every run of set bits in this bitmap. */
int
-xbitmap_walk(
- struct xbitmap *bitmap,
- xbitmap_walk_fn fn,
+xbitmap32_walk(
+ struct xbitmap32 *bitmap,
+ xbitmap32_walk_fn fn,
void *priv)
{
- struct xbitmap_node *bn;
+ struct xbitmap32_node *bn;
int error = 0;
- for_each_xbitmap_extent(bn, bitmap) {
+ for_each_xbitmap32_extent(bn, bitmap) {
error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv);
if (error)
break;
@@ -352,23 +539,23 @@ xbitmap_walk(
/* Does this bitmap have no bits set at all? */
bool
-xbitmap_empty(
- struct xbitmap *bitmap)
+xbitmap32_empty(
+ struct xbitmap32 *bitmap)
{
return bitmap->xb_root.rb_root.rb_node == NULL;
}
/* Is the start of the range set or clear? And for how long? */
bool
-xbitmap_test(
- struct xbitmap *bitmap,
- uint64_t start,
- uint64_t *len)
+xbitmap32_test(
+ struct xbitmap32 *bitmap,
+ uint32_t start,
+ uint32_t *len)
{
- struct xbitmap_node *bn;
- uint64_t last = start + *len - 1;
+ struct xbitmap32_node *bn;
+ uint32_t last = start + *len - 1;
- bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
+ bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last);
if (!bn)
return false;
if (bn->bn_start <= start) {
@@ -379,3 +566,17 @@ xbitmap_test(
*len = bn->bn_start - start;
return false;
}
+
+/* Count the number of set regions in this bitmap. */
+uint32_t
+xbitmap32_count_set_regions(
+ struct xbitmap32 *bitmap)
+{
+ struct xbitmap32_node *bn;
+ uint32_t nr = 0;
+
+ for_each_xbitmap32_extent(bn, bitmap)
+ nr++;
+
+ return nr;
+}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 4fe58bad6734..710c1ac5e323 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -6,17 +6,19 @@
#ifndef __XFS_SCRUB_BITMAP_H__
#define __XFS_SCRUB_BITMAP_H__
-struct xbitmap {
+/* u64 bitmap */
+
+struct xbitmap64 {
struct rb_root_cached xb_root;
};
-void xbitmap_init(struct xbitmap *bitmap);
-void xbitmap_destroy(struct xbitmap *bitmap);
+void xbitmap64_init(struct xbitmap64 *bitmap);
+void xbitmap64_destroy(struct xbitmap64 *bitmap);
-int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len);
-int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
-int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
-uint64_t xbitmap_hweight(struct xbitmap *bitmap);
+int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len);
+int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len);
+int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub);
+uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap);
/*
* Return codes for the bitmap iterator functions are 0 to continue iterating,
@@ -25,84 +27,41 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap);
* iteration, because neither bitmap iterator ever generates that error code on
* its own. Callers must not modify the bitmap while walking it.
*/
-typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv);
-int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn,
+typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv);
+int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn,
void *priv);
-bool xbitmap_empty(struct xbitmap *bitmap);
-bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len);
+bool xbitmap64_empty(struct xbitmap64 *bitmap);
+bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len);
-/* Bitmaps, but for type-checked for xfs_agblock_t */
+/* u32 bitmap */
-struct xagb_bitmap {
- struct xbitmap agbitmap;
+struct xbitmap32 {
+ struct rb_root_cached xb_root;
};
-static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
-{
- xbitmap_init(&bitmap->agbitmap);
-}
-
-static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
-{
- xbitmap_destroy(&bitmap->agbitmap);
-}
-
-static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
- xfs_agblock_t start, xfs_extlen_t len)
-{
- return xbitmap_clear(&bitmap->agbitmap, start, len);
-}
-static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
- xfs_agblock_t start, xfs_extlen_t len)
-{
- return xbitmap_set(&bitmap->agbitmap, start, len);
-}
-
-static inline bool
-xagb_bitmap_test(
- struct xagb_bitmap *bitmap,
- xfs_agblock_t start,
- xfs_extlen_t *len)
-{
- uint64_t biglen = *len;
- bool ret;
-
- ret = xbitmap_test(&bitmap->agbitmap, start, &biglen);
-
- if (start + biglen >= UINT_MAX) {
- ASSERT(0);
- biglen = UINT_MAX - start;
- }
-
- *len = biglen;
- return ret;
-}
+void xbitmap32_init(struct xbitmap32 *bitmap);
+void xbitmap32_destroy(struct xbitmap32 *bitmap);
-static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
- struct xagb_bitmap *sub)
-{
- return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap);
-}
+int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len);
+int xbitmap32_set(struct xbitmap32 *bitmap, uint32_t start, uint32_t len);
+int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub);
+uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap);
-static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
-{
- return xbitmap_hweight(&bitmap->agbitmap);
-}
-static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
-{
- return xbitmap_empty(&bitmap->agbitmap);
-}
+/*
+ * Return codes for the bitmap iterator functions are 0 to continue iterating,
+ * and non-zero to stop iterating. Any non-zero value will be passed up to the
+ * iteration caller. The special value -ECANCELED can be used to stop
+ * iteration, because neither bitmap iterator ever generates that error code on
+ * its own. Callers must not modify the bitmap while walking it.
+ */
+typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv);
+int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn,
+ void *priv);
-static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
- xbitmap_walk_fn fn, void *priv)
-{
- return xbitmap_walk(&bitmap->agbitmap, fn, priv);
-}
+bool xbitmap32_empty(struct xbitmap32 *bitmap);
+bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len);
-int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur);
-int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur);
+uint32_t xbitmap32_count_set_regions(struct xbitmap32 *bitmap);
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 75588915572e..24a15bf784f1 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -19,9 +19,11 @@
#include "xfs_bmap_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "scrub/health.h"
#include "xfs_ag.h"
/* Set us up with an inode's bmap. */
@@ -48,9 +50,18 @@ xchk_setup_inode_bmap(
if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) {
struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
+ bool is_repair = xchk_could_repair(sc);
xchk_ilock(sc, XFS_MMAPLOCK_EXCL);
+ /* Break all our leases, we're going to mess with things. */
+ if (is_repair) {
+ error = xfs_break_layouts(VFS_I(sc->ip),
+ &sc->ilock_flags, BREAK_WRITE);
+ if (error)
+ goto out;
+ }
+
inode_dio_wait(VFS_I(sc->ip));
/*
@@ -71,6 +82,15 @@ xchk_setup_inode_bmap(
error = filemap_fdatawait_keep_errors(mapping);
if (error && (error != -ENOSPC && error != -EIO))
goto out;
+
+ /* Drop the page cache if we're repairing block mappings. */
+ if (is_repair) {
+ error = invalidate_inode_pages2(
+ VFS_I(sc->ip)->i_mapping);
+ if (error)
+ goto out;
+ }
+
}
/* Got the inode, lock it and we're ready to go. */
@@ -78,6 +98,10 @@ xchk_setup_inode_bmap(
if (error)
goto out;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ goto out;
+
xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode */
@@ -410,7 +434,7 @@ xchk_bmap_iextent(
/* Make sure the extent points to a valid place. */
if (info->is_rt &&
- !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount))
+ !xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount))
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (!info->is_rt &&
@@ -633,6 +657,82 @@ xchk_bmap_check_ag_rmaps(
}
/*
+ * Decide if we want to scan the reverse mappings to determine if the attr
+ * fork /really/ has zero space mappings.
+ */
+STATIC bool
+xchk_bmap_check_empty_attrfork(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_af;
+
+ /*
+ * If the dinode repair found a bad attr fork, it will reset the fork
+ * to extents format with zero records and wait for the this scrubber
+ * to reconstruct the block mappings. If the fork is not in this
+ * state, then the fork cannot have been zapped.
+ */
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0)
+ return false;
+
+ /*
+ * Files can have an attr fork in EXTENTS format with zero records for
+ * several reasons:
+ *
+ * a) an attr set created a fork but ran out of space
+ * b) attr replace deleted an old attr but failed during the set step
+ * c) the data fork was in btree format when all attrs were deleted, so
+ * the fork was left in place
+ * d) the inode repair code zapped the fork
+ *
+ * Only in case (d) do we want to scan the rmapbt to see if we need to
+ * rebuild the attr fork. The fork zap code clears all DAC permission
+ * bits and zeroes the uid and gid, so avoid the scan if any of those
+ * three conditions are not met.
+ */
+ if ((VFS_I(ip)->i_mode & 0777) != 0)
+ return false;
+ if (!uid_eq(VFS_I(ip)->i_uid, GLOBAL_ROOT_UID))
+ return false;
+ if (!gid_eq(VFS_I(ip)->i_gid, GLOBAL_ROOT_GID))
+ return false;
+
+ return true;
+}
+
+/*
+ * Decide if we want to scan the reverse mappings to determine if the data
+ * fork /really/ has zero space mappings.
+ */
+STATIC bool
+xchk_bmap_check_empty_datafork(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_df;
+
+ /* Don't support realtime rmap checks yet. */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return false;
+
+ /*
+ * If the dinode repair found a bad data fork, it will reset the fork
+ * to extents format with zero records and wait for the this scrubber
+ * to reconstruct the block mappings. If the fork is not in this
+ * state, then the fork cannot have been zapped.
+ */
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0)
+ return false;
+
+ /*
+ * If we encounter an empty data fork along with evidence that the fork
+ * might not really be empty, we need to scan the reverse mappings to
+ * decide if we're going to rebuild the fork. Data forks with nonzero
+ * file size are scanned.
+ */
+ return i_size_read(VFS_I(ip)) != 0;
+}
+
+/*
* Decide if we want to walk every rmap btree in the fs to make sure that each
* rmap for this file fork has corresponding bmbt entries.
*/
@@ -641,7 +741,6 @@ xchk_bmap_want_check_rmaps(
struct xchk_bmap_info *info)
{
struct xfs_scrub *sc = info->sc;
- struct xfs_ifork *ifp;
if (!xfs_has_rmapbt(sc->mp))
return false;
@@ -650,28 +749,10 @@ xchk_bmap_want_check_rmaps(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return false;
- /* Don't support realtime rmap checks yet. */
- if (info->is_rt)
- return false;
-
- /*
- * The inode repair code zaps broken inode forks by resetting them back
- * to EXTENTS format and zero extent records. If we encounter a fork
- * in this state along with evidence that the fork isn't supposed to be
- * empty, we need to scan the reverse mappings to decide if we're going
- * to rebuild the fork. Data forks with nonzero file size are scanned.
- * xattr forks are never empty of content, so they are always scanned.
- */
- ifp = xfs_ifork_ptr(sc->ip, info->whichfork);
- if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) {
- if (info->whichfork == XFS_DATA_FORK &&
- i_size_read(VFS_I(sc->ip)) == 0)
- return false;
-
- return true;
- }
+ if (info->whichfork == XFS_ATTR_FORK)
+ return xchk_bmap_check_empty_attrfork(sc->ip);
- return false;
+ return xchk_bmap_check_empty_datafork(sc->ip);
}
/* Make sure each rmap has a corresponding bmbt entry. */
@@ -843,7 +924,7 @@ xchk_bmap(
if (!ifp)
return -ENOENT;
- info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
+ info.is_rt = xfs_ifork_is_realtime(ip, whichfork);
info.whichfork = whichfork;
info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip);
info.sc = sc;
@@ -939,7 +1020,20 @@ int
xchk_bmap_data(
struct xfs_scrub *sc)
{
- return xchk_bmap(sc, XFS_DATA_FORK);
+ int error;
+
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTD_ZAPPED)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ error = xchk_bmap(sc, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ /* If the data fork is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTD_ZAPPED);
+ return 0;
}
/* Scrub an inode's attr fork. */
@@ -947,7 +1041,27 @@ int
xchk_bmap_attr(
struct xfs_scrub *sc)
{
- return xchk_bmap(sc, XFS_ATTR_FORK);
+ int error;
+
+ /*
+ * If the attr fork has been zapped, it's possible that forkoff was
+ * reset to zero and hence sc->ip->i_afp is NULL. We don't want the
+ * NULL ifp check in xchk_bmap to conclude that the attr fork is ok,
+ * so short circuit that logic by setting the corruption flag and
+ * returning immediately.
+ */
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTA_ZAPPED)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ error = xchk_bmap(sc, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ /* If the attr fork is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTA_ZAPPED);
+ return 0;
}
/* Scrub an inode's CoW fork. */
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
new file mode 100644
index 000000000000..1e656fab5e41
--- /dev/null
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -0,0 +1,873 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_quota.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "xfs_reflink.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Inode Fork Block Mapping (BMBT) Repair
+ * ======================================
+ *
+ * Gather all the rmap records for the inode and fork we're fixing, reset the
+ * incore fork, then recreate the btree.
+ */
+
+enum reflink_scan_state {
+ RLS_IRRELEVANT = -1, /* not applicable to this file */
+ RLS_UNKNOWN, /* shared extent scans required */
+ RLS_SET_IFLAG, /* iflag must be set */
+};
+
+struct xrep_bmap {
+ /* Old bmbt blocks */
+ struct xfsb_bitmap old_bmbt_blocks;
+
+ /* New fork. */
+ struct xrep_newbt new_bmapbt;
+
+ /* List of new bmap records. */
+ struct xfarray *bmap_records;
+
+ struct xfs_scrub *sc;
+
+ /* How many blocks did we find allocated to this file? */
+ xfs_rfsblock_t nblocks;
+
+ /* How many bmbt blocks did we find for this fork? */
+ xfs_rfsblock_t old_bmbt_block_count;
+
+ /* get_records()'s position in the free space record array. */
+ xfarray_idx_t array_cur;
+
+ /* How many real (non-hole, non-delalloc) mappings do we have? */
+ uint64_t real_mappings;
+
+ /* Which fork are we fixing? */
+ int whichfork;
+
+ /* What d the REFLINK flag be set when the repair is over? */
+ enum reflink_scan_state reflink_scan;
+
+ /* Do we allow unwritten extents? */
+ bool allow_unwritten;
+};
+
+/* Is this space extent shared? Flag the inode if it is. */
+STATIC int
+xrep_bmap_discover_shared(
+ struct xrep_bmap *rb,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount)
+{
+ struct xfs_scrub *sc = rb->sc;
+ xfs_agblock_t agbno;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error;
+
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
+ error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount,
+ &fbno, &flen, false);
+ if (error)
+ return error;
+
+ if (fbno != NULLAGBLOCK)
+ rb->reflink_scan = RLS_SET_IFLAG;
+
+ return 0;
+}
+
+/* Remember this reverse-mapping as a series of bmap records. */
+STATIC int
+xrep_bmap_from_rmap(
+ struct xrep_bmap *rb,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ bool unwritten)
+{
+ struct xfs_bmbt_irec irec = {
+ .br_startoff = startoff,
+ .br_startblock = startblock,
+ .br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM,
+ };
+ struct xfs_bmbt_rec rbe;
+ struct xfs_scrub *sc = rb->sc;
+ int error = 0;
+
+ /*
+ * If we're repairing the data fork of a non-reflinked regular file on
+ * a reflink filesystem, we need to figure out if this space extent is
+ * shared.
+ */
+ if (rb->reflink_scan == RLS_UNKNOWN && !unwritten) {
+ error = xrep_bmap_discover_shared(rb, startblock, blockcount);
+ if (error)
+ return error;
+ }
+
+ do {
+ xfs_failaddr_t fa;
+
+ irec.br_blockcount = min_t(xfs_filblks_t, blockcount,
+ XFS_MAX_BMBT_EXTLEN);
+
+ fa = xfs_bmap_validate_extent(sc->ip, rb->whichfork, &irec);
+ if (fa)
+ return -EFSCORRUPTED;
+
+ xfs_bmbt_disk_set_all(&rbe, &irec);
+
+ trace_xrep_bmap_found(sc->ip, rb->whichfork, &irec);
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ error = xfarray_append(rb->bmap_records, &rbe);
+ if (error)
+ return error;
+
+ rb->real_mappings++;
+
+ irec.br_startblock += irec.br_blockcount;
+ irec.br_startoff += irec.br_blockcount;
+ blockcount -= irec.br_blockcount;
+ } while (blockcount > 0);
+
+ return 0;
+}
+
+/* Check for any obvious errors or conflicts in the file mapping. */
+STATIC int
+xrep_bmap_check_fork_rmap(
+ struct xrep_bmap *rb,
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec)
+{
+ struct xfs_scrub *sc = rb->sc;
+ enum xbtree_recpacking outcome;
+ int error;
+
+ /*
+ * Data extents for rt files are never stored on the data device, but
+ * everything else (xattrs, bmbt blocks) can be.
+ */
+ if (XFS_IS_REALTIME_INODE(sc->ip) &&
+ !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))
+ return -EFSCORRUPTED;
+
+ /* Check that this is within the AG. */
+ if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock,
+ rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* Check the file offset range. */
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* No contradictory flags. */
+ if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) &&
+ (rec->rm_flags & XFS_RMAP_UNWRITTEN))
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+ rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ /* Must not be an inode chunk. */
+ error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
+ rec->rm_startblock, rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Record extents that belong to this inode's fork. */
+STATIC int
+xrep_bmap_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_bmap *rb = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_fsblock_t fsbno;
+ int error = 0;
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ if (rec->rm_owner != rb->sc->ip->i_ino)
+ return 0;
+
+ error = xrep_bmap_check_fork_rmap(rb, cur, rec);
+ if (error)
+ return error;
+
+ /*
+ * Record all blocks allocated to this file even if the extent isn't
+ * for the fork we're rebuilding so that we can reset di_nblocks later.
+ */
+ rb->nblocks += rec->rm_blockcount;
+
+ /* If this rmap isn't for the fork we want, we're done. */
+ if (rb->whichfork == XFS_DATA_FORK &&
+ (rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+ if (rb->whichfork == XFS_ATTR_FORK &&
+ !(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+
+ /* Reject unwritten extents if we don't allow those. */
+ if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten)
+ return -EFSCORRUPTED;
+
+ fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno,
+ rec->rm_startblock);
+
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+ rb->old_bmbt_block_count += rec->rm_blockcount;
+ return xfsb_bitmap_set(&rb->old_bmbt_blocks, fsbno,
+ rec->rm_blockcount);
+ }
+
+ return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno,
+ rec->rm_blockcount,
+ rec->rm_flags & XFS_RMAP_UNWRITTEN);
+}
+
+/*
+ * Compare two block mapping records. We want to sort in order of increasing
+ * file offset.
+ */
+static int
+xrep_bmap_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_bmbt_rec *ba = a;
+ const struct xfs_bmbt_rec *bb = b;
+ xfs_fileoff_t ao = xfs_bmbt_disk_get_startoff(ba);
+ xfs_fileoff_t bo = xfs_bmbt_disk_get_startoff(bb);
+
+ if (ao > bo)
+ return 1;
+ else if (ao < bo)
+ return -1;
+ return 0;
+}
+
+/*
+ * Sort the bmap extents by fork offset or else the records will be in the
+ * wrong order. Ensure there are no overlaps in the file offset ranges.
+ */
+STATIC int
+xrep_bmap_sort_records(
+ struct xrep_bmap *rb)
+{
+ struct xfs_bmbt_irec irec;
+ xfs_fileoff_t next_off = 0;
+ xfarray_idx_t array_cur;
+ int error;
+
+ error = xfarray_sort(rb->bmap_records, xrep_bmap_extent_cmp,
+ XFARRAY_SORT_KILLABLE);
+ if (error)
+ return error;
+
+ foreach_xfarray_idx(rb->bmap_records, array_cur) {
+ struct xfs_bmbt_rec rec;
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ error = xfarray_load(rb->bmap_records, array_cur, &rec);
+ if (error)
+ return error;
+
+ xfs_bmbt_disk_get_all(&rec, &irec);
+
+ if (irec.br_startoff < next_off)
+ return -EFSCORRUPTED;
+
+ next_off = irec.br_startoff + irec.br_blockcount;
+ }
+
+ return 0;
+}
+
+/* Scan one AG for reverse mappings that we can turn into extent maps. */
+STATIC int
+xrep_bmap_scan_ag(
+ struct xrep_bmap *rb,
+ struct xfs_perag *pag)
+{
+ struct xfs_scrub *sc = rb->sc;
+ int error;
+
+ error = xrep_ag_init(sc, pag, &sc->sa);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_bmap_walk_rmap, rb);
+ xchk_ag_free(sc, &sc->sa);
+ return error;
+}
+
+/* Find the delalloc extents from the old incore extent tree. */
+STATIC int
+xrep_bmap_find_delalloc(
+ struct xrep_bmap *rb)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_rec rbe;
+ struct xfs_inode *ip = rb->sc->ip;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, rb->whichfork);
+ int error = 0;
+
+ /*
+ * Skip this scan if we don't expect to find delayed allocation
+ * reservations in this fork.
+ */
+ if (rb->whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0)
+ return 0;
+
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ if (!isnullstartblock(irec.br_startblock))
+ continue;
+
+ xfs_bmbt_disk_set_all(&rbe, &irec);
+
+ trace_xrep_bmap_found(ip, rb->whichfork, &irec);
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ error = xfarray_append(rb->bmap_records, &rbe);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Collect block mappings for this fork of this inode and decide if we have
+ * enough space to rebuild. Caller is responsible for cleaning up the list if
+ * anything goes wrong.
+ */
+STATIC int
+xrep_bmap_find_mappings(
+ struct xrep_bmap *rb)
+{
+ struct xfs_scrub *sc = rb->sc;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ int error = 0;
+
+ /* Iterate the rmaps for extents. */
+ for_each_perag(sc->mp, agno, pag) {
+ error = xrep_bmap_scan_ag(rb, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ return xrep_bmap_find_delalloc(rb);
+}
+
+/* Retrieve real extent mappings for bulk loading the bmap btree. */
+STATIC int
+xrep_bmap_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_bmbt_rec rec;
+ struct xfs_bmbt_irec *irec = &cur->bc_rec.b;
+ struct xrep_bmap *rb = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ do {
+ error = xfarray_load(rb->bmap_records, rb->array_cur++,
+ &rec);
+ if (error)
+ return error;
+
+ xfs_bmbt_disk_get_all(&rec, irec);
+ } while (isnullstartblock(irec->br_startblock));
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_bmap_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_bmap *rb = priv;
+
+ return xrep_newbt_claim_block(cur, &rb->new_bmapbt, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_bmap_iroot_size(
+ struct xfs_btree_cur *cur,
+ unsigned int level,
+ unsigned int nr_this_level,
+ void *priv)
+{
+ ASSERT(level > 0);
+
+ return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level);
+}
+
+/* Update the inode counters. */
+STATIC int
+xrep_bmap_reset_counters(
+ struct xrep_bmap *rb)
+{
+ struct xfs_scrub *sc = rb->sc;
+ struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake;
+ int64_t delta;
+
+ if (rb->reflink_scan == RLS_SET_IFLAG)
+ sc->ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+
+ /*
+ * Update the inode block counts to reflect the extents we found in the
+ * rmapbt.
+ */
+ delta = ifake->if_blocks - rb->old_bmbt_block_count;
+ sc->ip->i_nblocks = rb->nblocks + delta;
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+
+ /*
+ * Adjust the quota counts by the difference in size between the old
+ * and new bmbt.
+ */
+ xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, delta);
+ return 0;
+}
+
+/*
+ * Create a new iext tree and load it with block mappings. If the inode is
+ * in extents format, that's all we need to do to commit the new mappings.
+ * If it is in btree format, this takes care of preloading the incore tree.
+ */
+STATIC int
+xrep_bmap_extents_load(
+ struct xrep_bmap *rb)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec irec;
+ struct xfs_ifork *ifp = rb->new_bmapbt.ifake.if_fork;
+ xfarray_idx_t array_cur;
+ int error;
+
+ ASSERT(ifp->if_bytes == 0);
+
+ /* Add all the mappings (incl. delalloc) to the incore extent tree. */
+ xfs_iext_first(ifp, &icur);
+ foreach_xfarray_idx(rb->bmap_records, array_cur) {
+ struct xfs_bmbt_rec rec;
+
+ error = xfarray_load(rb->bmap_records, array_cur, &rec);
+ if (error)
+ return error;
+
+ xfs_bmbt_disk_get_all(&rec, &irec);
+
+ xfs_iext_insert_raw(ifp, &icur, &irec);
+ if (!isnullstartblock(irec.br_startblock))
+ ifp->if_nextents++;
+
+ xfs_iext_next(ifp, &icur);
+ }
+
+ return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork,
+ ifp->if_nextents);
+}
+
+/*
+ * Reserve new btree blocks, bulk load the bmap records into the ondisk btree,
+ * and load the incore extent tree.
+ */
+STATIC int
+xrep_bmap_btree_load(
+ struct xrep_bmap *rb,
+ struct xfs_btree_cur *bmap_cur)
+{
+ struct xfs_scrub *sc = rb->sc;
+ int error;
+
+ /* Compute how many blocks we'll need. */
+ error = xfs_btree_bload_compute_geometry(bmap_cur,
+ &rb->new_bmapbt.bload, rb->real_mappings);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Guess how many blocks we're going to need to rebuild an entire bmap
+ * from the number of extents we found, and pump up our transaction to
+ * have sufficient block reservation. We're allowed to exceed file
+ * quota to repair inconsistent metadata.
+ */
+ error = xfs_trans_reserve_more_inode(sc->tp, sc->ip,
+ rb->new_bmapbt.bload.nr_blocks, 0, true);
+ if (error)
+ return error;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rb->new_bmapbt,
+ rb->new_bmapbt.bload.nr_blocks);
+ if (error)
+ return error;
+
+ /* Add all observed bmap records. */
+ rb->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(bmap_cur, &rb->new_bmapbt.bload, rb);
+ if (error)
+ return error;
+
+ /*
+ * Load the new bmap records into the new incore extent tree to
+ * preserve delalloc reservations for regular files. The directory
+ * code loads the extent tree during xfs_dir_open and assumes
+ * thereafter that it remains loaded, so we must not violate that
+ * assumption.
+ */
+ return xrep_bmap_extents_load(rb);
+}
+
+/*
+ * Use the collected bmap information to stage a new bmap fork. If this is
+ * successful we'll return with the new fork information logged to the repair
+ * transaction but not yet committed. The caller must ensure that the inode
+ * is joined to the transaction; the inode will be joined to a clean
+ * transaction when the function returns.
+ */
+STATIC int
+xrep_bmap_build_new_fork(
+ struct xrep_bmap *rb)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_scrub *sc = rb->sc;
+ struct xfs_btree_cur *bmap_cur;
+ struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake;
+ int error;
+
+ error = xrep_bmap_sort_records(rb);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new fork by initializing the new btree
+ * structure and creating a fake ifork in the ifakeroot structure.
+ */
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork);
+ error = xrep_newbt_init_inode(&rb->new_bmapbt, sc, rb->whichfork,
+ &oinfo);
+ if (error)
+ return error;
+
+ rb->new_bmapbt.bload.get_records = xrep_bmap_get_records;
+ rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block;
+ rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size;
+
+ /*
+ * Allocate a new bmap btree cursor for reloading an inode block mapping
+ * data structure.
+ */
+ bmap_cur = xfs_bmbt_init_cursor(sc->mp, NULL, sc->ip, XFS_STAGING_FORK);
+ xfs_btree_stage_ifakeroot(bmap_cur, ifake);
+
+ /*
+ * Figure out the size and format of the new fork, then fill it with
+ * all the bmap records we've found. Join the inode to the transaction
+ * so that we can roll the transaction while holding the inode locked.
+ */
+ if (rb->real_mappings <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) {
+ ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS;
+ error = xrep_bmap_extents_load(rb);
+ } else {
+ ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE;
+ error = xrep_bmap_btree_load(rb, bmap_cur);
+ }
+ if (error)
+ goto err_cur;
+
+ /*
+ * Install the new fork in the inode. After this point the old mapping
+ * data are no longer accessible and the new tree is live. We delete
+ * the cursor immediately after committing the staged root because the
+ * staged fork might be in extents format.
+ */
+ xfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork);
+ xfs_btree_del_cursor(bmap_cur, 0);
+
+ /* Reset the inode counters now that we've changed the fork. */
+ error = xrep_bmap_reset_counters(rb);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rb->new_bmapbt);
+ if (error)
+ return error;
+
+ return xrep_roll_trans(sc);
+
+err_cur:
+ if (bmap_cur)
+ xfs_btree_del_cursor(bmap_cur, error);
+err_newbt:
+ xrep_newbt_cancel(&rb->new_bmapbt);
+ return error;
+}
+
+/*
+ * Now that we've logged the new inode btree, invalidate all of the old blocks
+ * and free them, if there were any.
+ */
+STATIC int
+xrep_bmap_remove_old_tree(
+ struct xrep_bmap *rb)
+{
+ struct xfs_scrub *sc = rb->sc;
+ struct xfs_owner_info oinfo;
+
+ /* Free the old bmbt blocks if they're not in use. */
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork);
+ return xrep_reap_fsblocks(sc, &rb->old_bmbt_blocks, &oinfo);
+}
+
+/* Check for garbage inputs. Returns -ECANCELED if there's nothing to do. */
+STATIC int
+xrep_bmap_check_inputs(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
+
+ ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);
+
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+
+ /* No fork means nothing to rebuild. */
+ if (!ifp)
+ return -ECANCELED;
+
+ /*
+ * We only know how to repair extent mappings, which is to say that we
+ * only support extents and btree fork format. Repairs to a local
+ * format fork require a higher level repair function, so we do not
+ * have any work to do here.
+ */
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_DEV:
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_UUID:
+ return -ECANCELED;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ break;
+ default:
+ return -EFSCORRUPTED;
+ }
+
+ if (whichfork == XFS_ATTR_FORK)
+ return 0;
+
+ /* Only files, symlinks, and directories get to have data forks. */
+ switch (VFS_I(sc->ip)->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ /* ok */
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Don't know how to rebuild realtime data forks. */
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+/* Set up the initial state of the reflink scan. */
+static inline enum reflink_scan_state
+xrep_bmap_init_reflink_scan(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ /* cannot share on non-reflink filesystem */
+ if (!xfs_has_reflink(sc->mp))
+ return RLS_IRRELEVANT;
+
+ /* preserve flag if it's already set */
+ if (xfs_is_reflink_inode(sc->ip))
+ return RLS_SET_IFLAG;
+
+ /* can only share regular files */
+ if (!S_ISREG(VFS_I(sc->ip)->i_mode))
+ return RLS_IRRELEVANT;
+
+ /* cannot share attr fork extents */
+ if (whichfork != XFS_DATA_FORK)
+ return RLS_IRRELEVANT;
+
+ /* cannot share realtime extents */
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ return RLS_IRRELEVANT;
+
+ return RLS_UNKNOWN;
+}
+
+/* Repair an inode fork. */
+int
+xrep_bmap(
+ struct xfs_scrub *sc,
+ int whichfork,
+ bool allow_unwritten)
+{
+ struct xrep_bmap *rb;
+ char *descr;
+ unsigned int max_bmbt_recs;
+ bool large_extcount;
+ int error = 0;
+
+ error = xrep_bmap_check_inputs(sc, whichfork);
+ if (error == -ECANCELED)
+ return 0;
+ if (error)
+ return error;
+
+ rb = kzalloc(sizeof(struct xrep_bmap), XCHK_GFP_FLAGS);
+ if (!rb)
+ return -ENOMEM;
+ rb->sc = sc;
+ rb->whichfork = whichfork;
+ rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork);
+ rb->allow_unwritten = allow_unwritten;
+
+ /* Set up enough storage to handle the max records for this fork. */
+ large_extcount = xfs_has_large_extent_counts(sc->mp);
+ max_bmbt_recs = xfs_iext_max_nextents(large_extcount, whichfork);
+ descr = xchk_xfile_ino_descr(sc, "%s fork mapping records",
+ whichfork == XFS_DATA_FORK ? "data" : "attr");
+ error = xfarray_create(descr, max_bmbt_recs,
+ sizeof(struct xfs_bmbt_rec), &rb->bmap_records);
+ kfree(descr);
+ if (error)
+ goto out_rb;
+
+ /* Collect all reverse mappings for this fork's extents. */
+ xfsb_bitmap_init(&rb->old_bmbt_blocks);
+ error = xrep_bmap_find_mappings(rb);
+ if (error)
+ goto out_bitmap;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Rebuild the bmap information. */
+ error = xrep_bmap_build_new_fork(rb);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old tree. */
+ error = xrep_bmap_remove_old_tree(rb);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xfsb_bitmap_destroy(&rb->old_bmbt_blocks);
+ xfarray_destroy(rb->bmap_records);
+out_rb:
+ kfree(rb);
+ return error;
+}
+
+/* Repair an inode's data fork. */
+int
+xrep_bmap_data(
+ struct xfs_scrub *sc)
+{
+ return xrep_bmap(sc, XFS_DATA_FORK, true);
+}
+
+/* Repair an inode's attr fork. */
+int
+xrep_bmap_attr(
+ struct xfs_scrub *sc)
+{
+ return xrep_bmap(sc, XFS_ATTR_FORK, false);
+}
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 1935b9ce1885..fe678a0438bc 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -47,7 +47,7 @@ __xchk_btree_process_error(
*error = 0;
fallthrough;
default:
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
trace_xchk_ifork_btree_op_error(sc, cur, level,
*error, ret_ip);
else
@@ -91,7 +91,7 @@ __xchk_btree_set_corrupt(
{
sc->sm->sm_flags |= errflag;
- if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
trace_xchk_ifork_btree_error(sc, cur, level,
ret_ip);
else
@@ -168,7 +168,7 @@ xchk_btree_rec(
if (xfs_btree_keycmp_lt(cur, &key, keyp))
xchk_btree_set_corrupt(bs->sc, cur, 1);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Is high_key(rec) no larger than the parent high key? */
@@ -215,7 +215,7 @@ xchk_btree_key(
if (xfs_btree_keycmp_lt(cur, key, keyp))
xchk_btree_set_corrupt(bs->sc, cur, level);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Is this block's high key no larger than the parent high key? */
@@ -236,22 +236,18 @@ xchk_btree_ptr_ok(
int level,
union xfs_btree_ptr *ptr)
{
- bool res;
-
/* A btree rooted in an inode has no block pointer to the root. */
- if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
level == bs->cur->bc_nlevels)
return true;
/* Otherwise, check the pointers. */
- if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
- res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
- else
- res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
- if (!res)
+ if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+ return false;
+ }
- return res;
+ return true;
}
/* Check that a btree block's sibling matches what we expect it. */
@@ -374,18 +370,21 @@ xchk_btree_check_block_owner(
{
xfs_agnumber_t agno;
xfs_agblock_t agbno;
- xfs_btnum_t btnum;
bool init_sa;
int error = 0;
if (!bs->cur)
return 0;
- btnum = bs->cur->bc_btnum;
agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
- init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
+ /*
+ * If the btree being examined is not itself a per-AG btree, initialize
+ * sc->sa so that we can check for the presence of an ownership record
+ * in the rmap btree for the AG containing the block.
+ */
+ init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG;
if (init_sa) {
error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
@@ -399,11 +398,11 @@ xchk_btree_check_block_owner(
* have to nullify it (to shut down further block owner checks) if
* self-xref encounters problems.
*/
- if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
+ if (!bs->sc->sa.bno_cur && xfs_btree_is_bno(bs->cur->bc_ops))
bs->cur = NULL;
xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
- if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
+ if (!bs->sc->sa.rmap_cur && xfs_btree_is_rmap(bs->cur->bc_ops))
bs->cur = NULL;
out_free:
@@ -429,7 +428,7 @@ xchk_btree_check_owner(
* up.
*/
if (bp == NULL) {
- if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE)
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
}
@@ -442,7 +441,7 @@ xchk_btree_check_owner(
* duplicate cursors. Therefore, save the buffer daddr for
* later scanning.
*/
- if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
+ if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) {
struct check_owner *co;
co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
@@ -475,7 +474,7 @@ xchk_btree_check_iroot_minrecs(
* existing filesystems, so instead we disable the check for data fork
* bmap btrees when there's an attr fork.
*/
- if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
+ if (xfs_btree_is_bmap(bs->cur->bc_ops) &&
bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
xfs_inode_has_attr_fork(bs->sc->ip))
return false;
@@ -508,7 +507,7 @@ xchk_btree_check_minrecs(
* child block might be less than the standard minrecs, but that's ok
* provided that there's only one direct child of the root.
*/
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
level == cur->bc_nlevels - 2) {
struct xfs_btree_block *root_block;
struct xfs_buf *root_bp;
@@ -562,7 +561,7 @@ xchk_btree_block_check_keys(
return;
}
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Make sure the high key of this block matches the parent. */
@@ -585,7 +584,6 @@ xchk_btree_get_block(
struct xfs_btree_block **pblock,
struct xfs_buf **pbp)
{
- xfs_failaddr_t failed_at;
int error;
*pblock = NULL;
@@ -597,13 +595,7 @@ xchk_btree_get_block(
return error;
xfs_btree_get_block(bs->cur, level, pbp);
- if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
- failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
- level, *pbp);
- else
- failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
- level, *pbp);
- if (failed_at) {
+ if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
}
@@ -664,7 +656,7 @@ xchk_btree_block_keys(
if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
xchk_btree_set_corrupt(bs->sc, cur, 1);
- if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
return;
/* Get high keys */
@@ -728,7 +720,7 @@ xchk_btree(
* error codes for us.
*/
level = cur->bc_nlevels - 1;
- cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ xfs_btree_init_ptr_from_cur(cur, &ptr);
if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
goto out;
error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index de24532fe083..47a20cf5205f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -25,9 +25,12 @@
#include "xfs_trans_priv.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
+#include "xfs_dir2_priv.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -81,6 +84,15 @@ __xchk_process_error(
sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
sc->sm, *error);
break;
+ case -ECANCELED:
+ /*
+ * ECANCELED here means that the caller set one of the scrub
+ * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
+ * quickly. Set error to zero and do not continue.
+ */
+ trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
+ *error = 0;
+ break;
case -EFSBADCRC:
case -EFSCORRUPTED:
/* Note the badness but don't abort. */
@@ -88,8 +100,7 @@ __xchk_process_error(
*error = 0;
fallthrough;
default:
- trace_xchk_op_error(sc, agno, bno, *error,
- ret_ip);
+ trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
break;
}
return false;
@@ -135,6 +146,16 @@ __xchk_fblock_process_error(
/* Used to restart an op with deadlock avoidance. */
trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
break;
+ case -ECANCELED:
+ /*
+ * ECANCELED here means that the caller set one of the scrub
+ * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
+ * quickly. Set error to zero and do not continue.
+ */
+ trace_xchk_file_op_error(sc, whichfork, offset, *error,
+ ret_ip);
+ *error = 0;
+ break;
case -EFSBADCRC:
case -EFSCORRUPTED:
/* Note the badness but don't abort. */
@@ -226,6 +247,19 @@ xchk_block_set_corrupt(
trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}
+#ifdef CONFIG_XFS_QUOTA
+/* Record a corrupt quota counter. */
+void
+xchk_qcheck_set_corrupt(
+ struct xfs_scrub *sc,
+ unsigned int dqtype,
+ xfs_dqid_t id)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
+}
+#endif
+
/* Record a corruption while cross-referencing. */
void
xchk_block_xref_set_corrupt(
@@ -426,7 +460,7 @@ xchk_perag_read_headers(
* Grab the AG headers for the attached perag structure and wait for pending
* intents to drain.
*/
-static int
+int
xchk_perag_drain_and_lock(
struct xfs_scrub *sc)
{
@@ -554,46 +588,50 @@ xchk_ag_btcur_init(
{
struct xfs_mount *mp = sc->mp;
- if (sa->agf_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
+ if (sa->agf_bp) {
/* Set up a bnobt cursor for cross-referencing. */
- sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sa->pag, XFS_BTNUM_BNO);
- }
+ sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
+ XFS_SCRUB_TYPE_BNOBT);
- if (sa->agf_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
/* Set up a cntbt cursor for cross-referencing. */
- sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- sa->pag, XFS_BTNUM_CNT);
- }
-
- /* Set up a inobt cursor for cross-referencing. */
- if (sa->agi_bp &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
- sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
- XFS_BTNUM_INO);
- }
-
- /* Set up a finobt cursor for cross-referencing. */
- if (sa->agi_bp && xfs_has_finobt(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
- sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
- XFS_BTNUM_FINO);
- }
-
- /* Set up a rmapbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_has_rmapbt(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
- sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
+ XFS_SCRUB_TYPE_CNTBT);
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (xfs_has_rmapbt(mp)) {
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
+ XFS_SCRUB_TYPE_RMAPBT);
+ }
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (xfs_has_reflink(mp)) {
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sa->pag);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
+ XFS_SCRUB_TYPE_REFCNTBT);
+ }
}
- /* Set up a refcountbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_has_reflink(mp) &&
- xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
- sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
- sa->agf_bp, sa->pag);
+ if (sa->agi_bp) {
+ /* Set up a inobt cursor for cross-referencing. */
+ sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
+ sa->agi_bp);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
+ XFS_SCRUB_TYPE_INOBT);
+
+ /* Set up a finobt cursor for cross-referencing. */
+ if (xfs_has_finobt(mp)) {
+ sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
+ sa->agi_bp);
+ xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
+ XFS_SCRUB_TYPE_FINOBT);
+ }
}
}
@@ -604,6 +642,7 @@ xchk_ag_free(
struct xchk_ag *sa)
{
xchk_ag_btcur_free(sa);
+ xrep_reset_perag_resv(sc);
if (sa->agf_bp) {
xfs_trans_brelse(sc->tp, sa->agf_bp);
sa->agf_bp = NULL;
@@ -651,6 +690,13 @@ xchk_trans_cancel(
sc->tp = NULL;
}
+int
+xchk_trans_alloc_empty(
+ struct xfs_scrub *sc)
+{
+ return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+}
+
/*
* Grab an empty transaction so that we can re-grab locked buffers if
* one of our btrees turns out to be cyclic.
@@ -670,7 +716,7 @@ xchk_trans_alloc(
return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
resblks, 0, 0, &sc->tp);
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ return xchk_trans_alloc_empty(sc);
}
/* Set us up with a transaction and an empty context. */
@@ -733,6 +779,8 @@ xchk_iget(
xfs_ino_t inum,
struct xfs_inode **ipp)
{
+ ASSERT(sc->tp != NULL);
+
return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
}
@@ -816,6 +864,26 @@ again:
return 0;
}
+#ifdef CONFIG_XFS_QUOTA
+/*
+ * Try to attach dquots to this inode if we think we might want to repair it.
+ * Callers must not hold any ILOCKs. If the dquots are broken and cannot be
+ * attached, a quotacheck will be scheduled.
+ */
+int
+xchk_ino_dqattach(
+ struct xfs_scrub *sc)
+{
+ ASSERT(sc->tp != NULL);
+ ASSERT(sc->ip != NULL);
+
+ if (!xchk_could_repair(sc))
+ return 0;
+
+ return xrep_ino_dqattach(sc);
+}
+#endif
+
/* Install an inode that we opened by handle for scrubbing. */
int
xchk_install_handle_inode(
@@ -882,8 +950,8 @@ xchk_iget_for_scrubbing(
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
- /* Try a regular untrusted iget. */
- error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+ /* Try a safe untrusted iget. */
+ error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
if (!error)
return xchk_install_handle_inode(sc, ip);
if (error == -ENOENT)
@@ -976,9 +1044,7 @@ xchk_irele(
struct xfs_scrub *sc,
struct xfs_inode *ip)
{
- if (current->journal_info != NULL) {
- ASSERT(current->journal_info == sc->tp);
-
+ if (sc->tp) {
/*
* If we are in a transaction, we /cannot/ drop the inode
* ourselves, because the VFS will trigger writeback, which
@@ -1027,6 +1093,11 @@ xchk_setup_inode_contents(
error = xchk_trans_alloc(sc, resblks);
if (error)
goto out;
+
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ goto out;
+
xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode for us */
@@ -1132,6 +1203,7 @@ xchk_metadata_inode_subtype(
unsigned int scrub_type)
{
__u32 smtype = sc->sm->sm_type;
+ unsigned int sick_mask = sc->sick_mask;
int error;
sc->sm->sm_type = scrub_type;
@@ -1149,6 +1221,7 @@ xchk_metadata_inode_subtype(
break;
}
+ sc->sick_mask = sick_mask;
sc->sm->sm_type = smtype;
return error;
}
@@ -1228,6 +1301,15 @@ xchk_fsgates_enable(
if (scrub_fsgates & XCHK_FSGATES_DRAIN)
xfs_drain_wait_enable();
+ if (scrub_fsgates & XCHK_FSGATES_QUOTA)
+ xfs_dqtrx_hook_enable();
+
+ if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
+ xfs_dir_hook_enable();
+
+ if (scrub_fsgates & XCHK_FSGATES_RMAP)
+ xfs_rmap_hook_enable();
+
sc->flags |= scrub_fsgates;
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index cabdc0e16838..89f7bbec887e 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -32,6 +32,7 @@ xchk_should_terminate(
}
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
+int xchk_trans_alloc_empty(struct xfs_scrub *sc);
void xchk_trans_cancel(struct xfs_scrub *sc);
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
@@ -54,6 +55,10 @@ void xchk_block_set_corrupt(struct xfs_scrub *sc,
void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino);
void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork,
xfs_fileoff_t offset);
+#ifdef CONFIG_XFS_QUOTA
+void xchk_qcheck_set_corrupt(struct xfs_scrub *sc, unsigned int dqtype,
+ xfs_dqid_t id);
+#endif
void xchk_block_xref_set_corrupt(struct xfs_scrub *sc,
struct xfs_buf *bp);
@@ -103,19 +108,33 @@ xchk_setup_rtsummary(struct xfs_scrub *sc)
}
#endif
#ifdef CONFIG_XFS_QUOTA
+int xchk_ino_dqattach(struct xfs_scrub *sc);
int xchk_setup_quota(struct xfs_scrub *sc);
+int xchk_setup_quotacheck(struct xfs_scrub *sc);
#else
static inline int
+xchk_ino_dqattach(struct xfs_scrub *sc)
+{
+ return 0;
+}
+static inline int
xchk_setup_quota(struct xfs_scrub *sc)
{
return -ENOENT;
}
+static inline int
+xchk_setup_quotacheck(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
#endif
int xchk_setup_fscounters(struct xfs_scrub *sc);
+int xchk_setup_nlinks(struct xfs_scrub *sc);
void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
+int xchk_perag_drain_and_lock(struct xfs_scrub *sc);
/*
* Grab all AG resources, treating the inability to grab the perag structure as
@@ -151,6 +170,11 @@ void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags);
void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
+/*
+ * Grab the inode at @inum. The caller must have created a scrub transaction
+ * so that we can confirm the inumber by walking the inobt and not deadlock on
+ * a loop in the inobt.
+ */
int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp);
int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum,
struct xfs_buf **agi_bpp, struct xfs_inode **ipp);
@@ -158,6 +182,26 @@ void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip);
int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip);
/*
+ * Safe version of (untrusted) xchk_iget that uses an empty transaction to
+ * avoid deadlocking on loops in the inobt. This should only be used in a
+ * scrub or repair setup routine, and only prior to grabbing a transaction.
+ */
+static inline int
+xchk_iget_safe(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp)
+{
+ int error;
+
+ ASSERT(sc->tp == NULL);
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+ error = xchk_iget(sc, inum, ipp);
+ xchk_trans_cancel(sc);
+ return error;
+}
+
+/*
* Don't bother cross-referencing if we already found corruption or cross
* referencing discrepancies.
*/
@@ -167,6 +211,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT);
}
+bool xchk_dir_looks_zapped(struct xfs_inode *dp);
+
#ifdef CONFIG_XFS_ONLINE_REPAIR
/* Decide if a repair is required. */
static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
@@ -175,8 +221,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT |
XFS_SCRUB_OFLAG_PREEN);
}
+
+/*
+ * "Should we prepare for a repair?"
+ *
+ * Return true if the caller permits us to repair metadata and we're not
+ * setting up for a post-repair evaluation.
+ */
+static inline bool xchk_could_repair(const struct xfs_scrub *sc)
+{
+ return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ !(sc->flags & XREP_ALREADY_FIXED);
+}
#else
# define xchk_needs_repair(sc) (false)
+# define xchk_could_repair(sc) (false)
#endif /* CONFIG_XFS_ONLINE_REPAIR */
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
@@ -188,6 +247,16 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
#define xchk_xfile_descr(sc, fmt, ...) \
kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \
(sc)->mp->m_super->s_id, ##__VA_ARGS__)
+#define xchk_xfile_ag_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \
+ (sc)->mp->m_super->s_id, \
+ (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \
+ ##__VA_ARGS__)
+#define xchk_xfile_ino_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \
+ (sc)->mp->m_super->s_id, \
+ (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \
+ ##__VA_ARGS__)
/*
* Setting up a hook to wait for intents to drain is costly -- we have to take
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
new file mode 100644
index 000000000000..4de3f0f40f48
--- /dev/null
+++ b/fs/xfs/scrub/cow_repair.c
@@ -0,0 +1,614 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_quota.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
+#include "xfs_icache.h"
+#include "xfs_refcount_btree.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/off_bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/reap.h"
+
+/*
+ * CoW Fork Mapping Repair
+ * =======================
+ *
+ * Although CoW staging extents are owned by incore CoW inode forks, on disk
+ * they are owned by the refcount btree. The ondisk metadata does not record
+ * any ownership information, which limits what we can do to repair the
+ * mappings in the CoW fork. At most, we can replace ifork mappings that lack
+ * an entry in the refcount btree or are described by a reverse mapping record
+ * whose owner is not OWN_COW.
+ *
+ * Replacing extents is also tricky -- we can't touch written CoW fork extents
+ * since they are undergoing writeback, and delalloc extents do not require
+ * repair since they only exist incore. Hence the most we can do is find the
+ * bad parts of unwritten mappings, allocate a replacement set of blocks, and
+ * replace the incore mapping. We use the regular reaping process to unmap
+ * or free the discarded blocks, as appropriate.
+ */
+struct xrep_cow {
+ struct xfs_scrub *sc;
+
+ /* Bitmap of file offset ranges that need replacing. */
+ struct xoff_bitmap bad_fileoffs;
+
+ /* Bitmap of fsblocks that were removed from the CoW fork. */
+ struct xfsb_bitmap old_cowfork_fsblocks;
+
+ /* CoW fork mappings used to scan for bad CoW staging extents. */
+ struct xfs_bmbt_irec irec;
+
+ /* refcount btree block number of irec.br_startblock */
+ unsigned int irec_startbno;
+
+ /* refcount btree block number of the next refcount record we expect */
+ unsigned int next_bno;
+};
+
+/* CoW staging extent. */
+struct xrep_cow_extent {
+ xfs_fsblock_t fsbno;
+ xfs_extlen_t len;
+};
+
+/*
+ * Mark the part of the file range that corresponds to the given physical
+ * space. Caller must ensure that the physical range is within xc->irec.
+ */
+STATIC int
+xrep_cow_mark_file_range(
+ struct xrep_cow *xc,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount)
+{
+ xfs_fileoff_t startoff;
+
+ startoff = xc->irec.br_startoff +
+ (startblock - xc->irec.br_startblock);
+
+ trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff,
+ blockcount);
+
+ return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount);
+}
+
+/*
+ * Trim @src to fit within the CoW fork mapping being examined, and put the
+ * result in @dst.
+ */
+static inline void
+xrep_cow_trim_refcount(
+ struct xrep_cow *xc,
+ struct xfs_refcount_irec *dst,
+ const struct xfs_refcount_irec *src)
+{
+ unsigned int adj;
+
+ memcpy(dst, src, sizeof(*dst));
+
+ if (dst->rc_startblock < xc->irec_startbno) {
+ adj = xc->irec_startbno - dst->rc_startblock;
+ dst->rc_blockcount -= adj;
+ dst->rc_startblock += adj;
+ }
+
+ if (dst->rc_startblock + dst->rc_blockcount >
+ xc->irec_startbno + xc->irec.br_blockcount) {
+ adj = (dst->rc_startblock + dst->rc_blockcount) -
+ (xc->irec_startbno + xc->irec.br_blockcount);
+ dst->rc_blockcount -= adj;
+ }
+}
+
+/* Mark any shared CoW staging extents. */
+STATIC int
+xrep_cow_mark_shared_staging(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *rec,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ struct xfs_refcount_irec rrec;
+ xfs_fsblock_t fsbno;
+
+ if (!xfs_refcount_check_domain(rec) ||
+ rec->rc_domain != XFS_REFC_DOMAIN_SHARED)
+ return -EFSCORRUPTED;
+
+ xrep_cow_trim_refcount(xc, &rrec, rec);
+
+ fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
+ rrec.rc_startblock);
+ return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount);
+}
+
+/*
+ * Mark any portion of the CoW fork file offset range where there is not a CoW
+ * staging extent record in the refcountbt, and keep a record of where we did
+ * find correct refcountbt records. Staging records are always cleaned out at
+ * mount time, so any two inodes trying to map the same staging area would have
+ * already taken the fs down due to refcount btree verifier errors. Hence this
+ * inode should be the sole creator of the staging extent records ondisk.
+ */
+STATIC int
+xrep_cow_mark_missing_staging(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *rec,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ struct xfs_refcount_irec rrec;
+ int error;
+
+ if (!xfs_refcount_check_domain(rec) ||
+ rec->rc_domain != XFS_REFC_DOMAIN_COW)
+ return -EFSCORRUPTED;
+
+ xrep_cow_trim_refcount(xc, &rrec, rec);
+
+ if (xc->next_bno >= rrec.rc_startblock)
+ goto next;
+
+ error = xrep_cow_mark_file_range(xc,
+ XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
+ xc->next_bno),
+ rrec.rc_startblock - xc->next_bno);
+ if (error)
+ return error;
+
+next:
+ xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount;
+ return 0;
+}
+
+/*
+ * Mark any area that does not correspond to a CoW staging rmap. These are
+ * cross-linked areas that must be avoided.
+ */
+STATIC int
+xrep_cow_mark_missing_staging_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t rec_bno;
+ xfs_extlen_t rec_len;
+ unsigned int adj;
+
+ if (rec->rm_owner == XFS_RMAP_OWN_COW)
+ return 0;
+
+ rec_bno = rec->rm_startblock;
+ rec_len = rec->rm_blockcount;
+ if (rec_bno < xc->irec_startbno) {
+ adj = xc->irec_startbno - rec_bno;
+ rec_len -= adj;
+ rec_bno += adj;
+ }
+
+ if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) {
+ adj = (rec_bno + rec_len) -
+ (xc->irec_startbno + xc->irec.br_blockcount);
+ rec_len -= adj;
+ }
+
+ fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
+ return xrep_cow_mark_file_range(xc, fsbno, rec_len);
+}
+
+/*
+ * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
+ * extent and mark the corresponding part of the file range in the bitmap.
+ */
+STATIC int
+xrep_cow_find_bad(
+ struct xrep_cow *xc)
+{
+ struct xfs_refcount_irec rc_low = { 0 };
+ struct xfs_refcount_irec rc_high = { 0 };
+ struct xfs_rmap_irec rm_low = { 0 };
+ struct xfs_rmap_irec rm_high = { 0 };
+ struct xfs_perag *pag;
+ struct xfs_scrub *sc = xc->sc;
+ xfs_agnumber_t agno;
+ int error;
+
+ agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock);
+ xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock);
+
+ pag = xfs_perag_get(sc->mp, agno);
+ if (!pag)
+ return -EFSCORRUPTED;
+
+ error = xrep_ag_init(sc, pag, &sc->sa);
+ if (error)
+ goto out_pag;
+
+ /* Mark any CoW fork extents that are shared. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
+ error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_shared_staging, xc);
+ if (error)
+ goto out_sa;
+
+ /* Make sure there are CoW staging extents for the whole mapping. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
+ xc->next_bno = xc->irec_startbno;
+ error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_missing_staging, xc);
+ if (error)
+ goto out_sa;
+
+ if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
+ error = xrep_cow_mark_file_range(xc,
+ XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
+ xc->next_bno),
+ xc->irec_startbno + xc->irec.br_blockcount -
+ xc->next_bno);
+ if (error)
+ goto out_sa;
+ }
+
+ /* Mark any area has an rmap that isn't a COW staging extent. */
+ rm_low.rm_startblock = xc->irec_startbno;
+ memset(&rm_high, 0xFF, sizeof(rm_high));
+ rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high,
+ xrep_cow_mark_missing_staging_rmap, xc);
+ if (error)
+ goto out_sa;
+
+ /*
+ * If userspace is forcing us to rebuild the CoW fork or someone turned
+ * on the debugging knob, replace everything in the CoW fork.
+ */
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+ XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
+ xc->irec.br_blockcount);
+ if (error)
+ return error;
+ }
+
+out_sa:
+ xchk_ag_free(sc, &sc->sa);
+out_pag:
+ xfs_perag_put(pag);
+ return 0;
+}
+
+/*
+ * Allocate a replacement CoW staging extent of up to the given number of
+ * blocks, and fill out the mapping.
+ */
+STATIC int
+xrep_cow_alloc(
+ struct xfs_scrub *sc,
+ xfs_extlen_t maxlen,
+ struct xrep_cow_extent *repl)
+{
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = sc->mp,
+ .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
+ .minlen = 1,
+ .maxlen = maxlen,
+ .prod = 1,
+ .resv = XFS_AG_RESV_NONE,
+ .datatype = XFS_ALLOC_USERDATA,
+ };
+ int error;
+
+ error = xfs_trans_reserve_more(sc->tp, maxlen, 0);
+ if (error)
+ return error;
+
+ error = xfs_alloc_vextent_start_ag(&args,
+ XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino));
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
+
+ repl->fsbno = args.fsbno;
+ repl->len = args.len;
+ return 0;
+}
+
+/*
+ * Look up the current CoW fork mapping so that we only allocate enough to
+ * replace a single mapping. If we don't find a mapping that covers the start
+ * of the file range, or we find a delalloc or written extent, something is
+ * seriously wrong, since we didn't drop the ILOCK.
+ */
+static inline int
+xrep_cow_find_mapping(
+ struct xrep_cow *xc,
+ struct xfs_iext_cursor *icur,
+ xfs_fileoff_t startoff,
+ struct xfs_bmbt_irec *got)
+{
+ struct xfs_inode *ip = xc->sc->ip;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got))
+ goto bad;
+
+ if (got->br_startoff > startoff)
+ goto bad;
+
+ if (got->br_blockcount == 0)
+ goto bad;
+
+ if (isnullstartblock(got->br_startblock))
+ goto bad;
+
+ if (xfs_bmap_is_written_extent(got))
+ goto bad;
+
+ return 0;
+bad:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+#define REPLACE_LEFT_SIDE (1U << 0)
+#define REPLACE_RIGHT_SIDE (1U << 1)
+
+/*
+ * Given a CoW fork mapping @got and a replacement mapping @repl, remap the
+ * beginning of @got with the space described by @rep.
+ */
+static inline void
+xrep_cow_replace_mapping(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *icur,
+ const struct xfs_bmbt_irec *got,
+ const struct xrep_cow_extent *repl)
+{
+ struct xfs_bmbt_irec new = *got; /* struct copy */
+
+ ASSERT(repl->len > 0);
+ ASSERT(!isnullstartblock(got->br_startblock));
+
+ trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len);
+
+ if (got->br_blockcount == repl->len) {
+ /*
+ * The new extent is a complete replacement for the existing
+ * extent. Update the COW fork record.
+ */
+ new.br_startblock = repl->fsbno;
+ xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
+ return;
+ }
+
+ /*
+ * The new extent can replace the beginning of the COW fork record.
+ * Move the left side of @got upwards, then insert the new record.
+ */
+ new.br_startoff += repl->len;
+ new.br_startblock += repl->len;
+ new.br_blockcount -= repl->len;
+ xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
+
+ new.br_startoff = got->br_startoff;
+ new.br_startblock = repl->fsbno;
+ new.br_blockcount = repl->len;
+ xfs_iext_insert(ip, icur, &new, BMAP_COWFORK);
+}
+
+/*
+ * Replace the unwritten CoW staging extent backing the given file range with a
+ * new space extent that isn't as problematic.
+ */
+STATIC int
+xrep_cow_replace_range(
+ struct xrep_cow *xc,
+ xfs_fileoff_t startoff,
+ xfs_extlen_t *blockcount)
+{
+ struct xfs_iext_cursor icur;
+ struct xrep_cow_extent repl;
+ struct xfs_bmbt_irec got;
+ struct xfs_scrub *sc = xc->sc;
+ xfs_fileoff_t nextoff;
+ xfs_extlen_t alloc_len;
+ int error;
+
+ /*
+ * Put the existing CoW fork mapping in @got. If @got ends before
+ * @rep, truncate @rep so we only replace one extent mapping at a time.
+ */
+ error = xrep_cow_find_mapping(xc, &icur, startoff, &got);
+ if (error)
+ return error;
+ nextoff = min(startoff + *blockcount,
+ got.br_startoff + got.br_blockcount);
+
+ /*
+ * Allocate a replacement extent. If we don't fill all the blocks,
+ * shorten the quantity that will be deleted in this step.
+ */
+ alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
+ nextoff - startoff);
+ error = xrep_cow_alloc(sc, alloc_len, &repl);
+ if (error)
+ return error;
+
+ /*
+ * Replace the old mapping with the new one, and commit the metadata
+ * changes made so far.
+ */
+ xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl);
+
+ xfs_inode_set_cowblocks_tag(sc->ip);
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+
+ /* Note the old CoW staging extents; we'll reap them all later. */
+ error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
+ repl.len);
+ if (error)
+ return error;
+
+ *blockcount = repl.len;
+ return 0;
+}
+
+/*
+ * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc
+ * reservation.
+ */
+STATIC int
+xrep_cow_replace(
+ uint64_t startoff,
+ uint64_t blockcount,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ int error = 0;
+
+ while (blockcount > 0) {
+ xfs_extlen_t len = min_t(xfs_filblks_t, blockcount,
+ XFS_MAX_BMBT_EXTLEN);
+
+ error = xrep_cow_replace_range(xc, startoff, &len);
+ if (error)
+ break;
+
+ blockcount -= len;
+ startoff += len;
+ }
+
+ return error;
+}
+
+/*
+ * Repair an inode's CoW fork. The CoW fork is an in-core structure, so
+ * there's no btree to rebuid. Instead, we replace any mappings that are
+ * cross-linked or lack ondisk CoW fork records in the refcount btree.
+ */
+int
+xrep_bmap_cow(
+ struct xfs_scrub *sc)
+{
+ struct xrep_cow *xc;
+ struct xfs_iext_cursor icur;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK);
+ int error;
+
+ if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp))
+ return -EOPNOTSUPP;
+
+ if (!ifp)
+ return 0;
+
+ /* realtime files aren't supported yet */
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ return -EOPNOTSUPP;
+
+ /*
+ * If we're somehow not in extents format, then reinitialize it to
+ * an empty extent mapping fork and exit.
+ */
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ return 0;
+ }
+
+ xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS);
+ if (!xc)
+ return -ENOMEM;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ xc->sc = sc;
+ xoff_bitmap_init(&xc->bad_fileoffs);
+ xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
+
+ for_each_xfs_iext(ifp, &icur, &xc->irec) {
+ if (xchk_should_terminate(sc, &error))
+ goto out_bitmap;
+
+ /*
+ * delalloc reservations only exist incore, so there is no
+ * ondisk metadata that we can examine. Hence we leave them
+ * alone.
+ */
+ if (isnullstartblock(xc->irec.br_startblock))
+ continue;
+
+ /*
+ * COW fork extents are only in the written state if writeback
+ * is actively writing to disk. We cannot restart the write
+ * at a different disk address since we've already issued the
+ * IO, so we leave these alone and hope for the best.
+ */
+ if (xfs_bmap_is_written_extent(&xc->irec))
+ continue;
+
+ error = xrep_cow_find_bad(xc);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Replace any bad unwritten mappings with fresh reservations. */
+ error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Reap as many of the old CoW blocks as we can. They are owned ondisk
+ * by the refcount btree, not the inode, so it is correct to treat them
+ * like inode metadata.
+ */
+ error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
+ &XFS_RMAP_OINFO_COW);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
+ xoff_bitmap_destroy(&xc->bad_fileoffs);
+ kfree(xc);
+ return error;
+}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 0b491784b759..076a310b8eb0 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -15,10 +15,12 @@
#include "xfs_icache.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
+#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
#include "scrub/readdir.h"
+#include "scrub/health.h"
/* Set us up to scrub directories. */
int
@@ -91,11 +93,11 @@ xchk_dir_actor(
return -ECANCELED;
}
- if (!strncmp(".", name->name, name->len)) {
+ if (xfs_dir2_samename(name, &xfs_name_dot)) {
/* If this is "." then check that the inum matches the dir. */
if (ino != dp->i_ino)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- } else if (!strncmp("..", name->name, name->len)) {
+ } else if (xfs_dir2_samename(name, &xfs_name_dotdot)) {
/*
* If this is ".." in the root inode, check that the inum
* matches this dir.
@@ -760,6 +762,11 @@ xchk_directory(
if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
return -ENOENT;
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
/* Plausible size? */
if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
@@ -784,7 +791,36 @@ xchk_directory(
/* Look up every name in this directory by hash. */
error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL);
- if (error == -ECANCELED)
- error = 0;
- return error;
+ if (error && error != -ECANCELED)
+ return error;
+
+ /* If the dir is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED);
+ return 0;
+}
+
+/*
+ * Decide if this directory has been zapped to satisfy the inode and ifork
+ * verifiers. Checking and repairing should be postponed until the directory
+ * is fixed.
+ */
+bool
+xchk_dir_looks_zapped(
+ struct xfs_inode *dp)
+{
+ /* Repair zapped this dir's data fork a short time ago */
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return true;
+
+ /*
+ * If the dinode repair found a bad data fork, it will reset the fork
+ * to extents format with zero records and wait for the bmapbtd
+ * scrubber to reconstruct the block mappings. Directories always
+ * contain some content, so this is a clear sign of a zapped directory.
+ * The state checked by xfs_ifork_zapped is not persisted, so this is
+ * the secondary strategy if repairs are interrupted by a crash or an
+ * unmount.
+ */
+ return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS &&
+ dp->i_df.if_nextents == 0;
}
diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c
new file mode 100644
index 000000000000..20c4daedd48d
--- /dev/null
+++ b/fs/xfs/scrub/dqiterate.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_bmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/quota.h"
+#include "scrub/trace.h"
+
+/* Initialize a dquot iteration cursor. */
+void
+xchk_dqiter_init(
+ struct xchk_dqiter *cursor,
+ struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype)
+{
+ cursor->sc = sc;
+ cursor->bmap.br_startoff = NULLFILEOFF;
+ cursor->dqtype = dqtype & XFS_DQTYPE_REC_MASK;
+ cursor->quota_ip = xfs_quota_inode(sc->mp, cursor->dqtype);
+ cursor->id = 0;
+}
+
+/*
+ * Ensure that the cached data fork mapping for the dqiter cursor is fresh and
+ * covers the dquot pointed to by the scan cursor.
+ */
+STATIC int
+xchk_dquot_iter_revalidate_bmap(
+ struct xchk_dqiter *cursor)
+{
+ struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip,
+ XFS_DATA_FORK);
+ xfs_fileoff_t fileoff;
+ xfs_dqid_t this_id = cursor->id;
+ int nmaps = 1;
+ int error;
+
+ fileoff = this_id / qi->qi_dqperchunk;
+
+ /*
+ * If we have a mapping for cursor->id and it's still fresh, there's
+ * no need to reread the bmbt.
+ */
+ if (cursor->bmap.br_startoff != NULLFILEOFF &&
+ cursor->if_seq == ifp->if_seq &&
+ cursor->bmap.br_startoff + cursor->bmap.br_blockcount > fileoff)
+ return 0;
+
+ /* Look up the data fork mapping for the dquot id of interest. */
+ error = xfs_bmapi_read(cursor->quota_ip, fileoff,
+ XFS_MAX_FILEOFF - fileoff, &cursor->bmap, &nmaps, 0);
+ if (error)
+ return error;
+ if (!nmaps) {
+ ASSERT(nmaps > 0);
+ return -EFSCORRUPTED;
+ }
+ if (cursor->bmap.br_startoff > fileoff) {
+ ASSERT(cursor->bmap.br_startoff == fileoff);
+ return -EFSCORRUPTED;
+ }
+
+ cursor->if_seq = ifp->if_seq;
+ trace_xchk_dquot_iter_revalidate_bmap(cursor, cursor->id);
+ return 0;
+}
+
+/* Advance the dqiter cursor to the next non-sparse region of the quota file. */
+STATIC int
+xchk_dquot_iter_advance_bmap(
+ struct xchk_dqiter *cursor,
+ uint64_t *next_ondisk_id)
+{
+ struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip,
+ XFS_DATA_FORK);
+ xfs_fileoff_t fileoff;
+ uint64_t next_id;
+ int nmaps = 1;
+ int error;
+
+ /* Find the dquot id for the next non-hole mapping. */
+ do {
+ fileoff = cursor->bmap.br_startoff + cursor->bmap.br_blockcount;
+ if (fileoff > XFS_DQ_ID_MAX / qi->qi_dqperchunk) {
+ /* The hole goes beyond the max dquot id, we're done */
+ *next_ondisk_id = -1ULL;
+ return 0;
+ }
+
+ error = xfs_bmapi_read(cursor->quota_ip, fileoff,
+ XFS_MAX_FILEOFF - fileoff, &cursor->bmap,
+ &nmaps, 0);
+ if (error)
+ return error;
+ if (!nmaps) {
+ /* Must have reached the end of the mappings. */
+ *next_ondisk_id = -1ULL;
+ return 0;
+ }
+ if (cursor->bmap.br_startoff > fileoff) {
+ ASSERT(cursor->bmap.br_startoff == fileoff);
+ return -EFSCORRUPTED;
+ }
+ } while (!xfs_bmap_is_real_extent(&cursor->bmap));
+
+ next_id = cursor->bmap.br_startoff * qi->qi_dqperchunk;
+ if (next_id > XFS_DQ_ID_MAX) {
+ /* The hole goes beyond the max dquot id, we're done */
+ *next_ondisk_id = -1ULL;
+ return 0;
+ }
+
+ /* Propose jumping forward to the dquot in the next allocated block. */
+ *next_ondisk_id = next_id;
+ cursor->if_seq = ifp->if_seq;
+ trace_xchk_dquot_iter_advance_bmap(cursor, *next_ondisk_id);
+ return 0;
+}
+
+/*
+ * Find the id of the next highest incore dquot. Normally this will correspond
+ * exactly with the quota file block mappings, but repair might have erased a
+ * mapping because it was crosslinked; in that case, we need to re-allocate the
+ * space so that we can reset q_blkno.
+ */
+STATIC void
+xchk_dquot_iter_advance_incore(
+ struct xchk_dqiter *cursor,
+ uint64_t *next_incore_id)
+{
+ struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, cursor->dqtype);
+ struct xfs_dquot *dq;
+ unsigned int nr_found;
+
+ *next_incore_id = -1ULL;
+
+ mutex_lock(&qi->qi_tree_lock);
+ nr_found = radix_tree_gang_lookup(tree, (void **)&dq, cursor->id, 1);
+ if (nr_found)
+ *next_incore_id = dq->q_id;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ trace_xchk_dquot_iter_advance_incore(cursor, *next_incore_id);
+}
+
+/*
+ * Walk all incore dquots of this filesystem. Caller must set *@cursorp to
+ * zero before the first call, and must not hold the quota file ILOCK.
+ * Returns 1 and a valid *@dqpp; 0 and *@dqpp == NULL when there are no more
+ * dquots to iterate; or a negative errno.
+ */
+int
+xchk_dquot_iter(
+ struct xchk_dqiter *cursor,
+ struct xfs_dquot **dqpp)
+{
+ struct xfs_mount *mp = cursor->sc->mp;
+ struct xfs_dquot *dq = NULL;
+ uint64_t next_ondisk, next_incore = -1ULL;
+ unsigned int lock_mode;
+ int error = 0;
+
+ if (cursor->id > XFS_DQ_ID_MAX)
+ return 0;
+ next_ondisk = cursor->id;
+
+ /* Revalidate and/or advance the cursor. */
+ lock_mode = xfs_ilock_data_map_shared(cursor->quota_ip);
+ error = xchk_dquot_iter_revalidate_bmap(cursor);
+ if (!error && !xfs_bmap_is_real_extent(&cursor->bmap))
+ error = xchk_dquot_iter_advance_bmap(cursor, &next_ondisk);
+ xfs_iunlock(cursor->quota_ip, lock_mode);
+ if (error)
+ return error;
+
+ if (next_ondisk > cursor->id)
+ xchk_dquot_iter_advance_incore(cursor, &next_incore);
+
+ /* Pick the next dquot in the sequence and return it. */
+ cursor->id = min(next_ondisk, next_incore);
+ if (cursor->id > XFS_DQ_ID_MAX)
+ return 0;
+
+ trace_xchk_dquot_iter(cursor, cursor->id);
+
+ error = xfs_qm_dqget(mp, cursor->id, cursor->dqtype, false, &dq);
+ if (error)
+ return error;
+
+ cursor->id = dq->q_id + 1;
+ *dqpp = dq;
+ return 1;
+}
diff --git a/fs/xfs/scrub/fsb_bitmap.h b/fs/xfs/scrub/fsb_bitmap.h
new file mode 100644
index 000000000000..40b462c1dd0d
--- /dev/null
+++ b/fs/xfs/scrub/fsb_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FSB_BITMAP_H__
+#define __XFS_SCRUB_FSB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_fsblock_t */
+
+struct xfsb_bitmap {
+ struct xbitmap64 fsbitmap;
+};
+
+static inline void xfsb_bitmap_init(struct xfsb_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->fsbitmap);
+}
+
+static inline void xfsb_bitmap_destroy(struct xfsb_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->fsbitmap);
+}
+
+static inline int xfsb_bitmap_set(struct xfsb_bitmap *bitmap,
+ xfs_fsblock_t start, xfs_filblks_t len)
+{
+ return xbitmap64_set(&bitmap->fsbitmap, start, len);
+}
+
+static inline int xfsb_bitmap_walk(struct xfsb_bitmap *bitmap,
+ xbitmap64_walk_fn fn, void *priv)
+{
+ return xbitmap64_walk(&bitmap->fsbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_FSB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 05be757668bb..d310737c8823 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -16,12 +16,13 @@
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
+#include "scrub/fscounters.h"
/*
* FS Summary Counters
@@ -48,17 +49,6 @@
* our tolerance for mismatch between expected and actual counter values.
*/
-struct xchk_fscounters {
- struct xfs_scrub *sc;
- uint64_t icount;
- uint64_t ifree;
- uint64_t fdblocks;
- uint64_t frextents;
- unsigned long long icount_min;
- unsigned long long icount_max;
- bool frozen;
-};
-
/*
* Since the expected value computation is lockless but only browses incore
* values, the percpu counters should be fairly close to each other. However,
@@ -235,14 +225,19 @@ xchk_setup_fscounters(
* Pause all writer activity in the filesystem while we're scrubbing to
* reduce the likelihood of background perturbations to the counters
* throwing off our calculations.
+ *
+ * If we're repairing, we need to prevent any other thread from
+ * changing the global fs summary counters while we're repairing them.
+ * This requires the fs to be frozen, which will disable background
+ * reclaim and purge all inactive inodes.
*/
- if (sc->flags & XCHK_TRY_HARDER) {
+ if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
error = xchk_fscounters_freeze(sc);
if (error)
return error;
}
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ return xchk_trans_alloc_empty(sc);
}
/*
@@ -254,7 +249,9 @@ xchk_setup_fscounters(
* set the INCOMPLETE flag even when a negative errno is returned. This care
* must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
* ECANCELED) that are absorbed into a scrub state flag update by
- * xchk_*_process_error.
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
*/
/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
@@ -482,6 +479,10 @@ xchk_fscount_within_range(
if (curr_value == expected)
return true;
+ /* We require exact matches when repair is running. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return false;
+
min_value = min(old_value, curr_value);
max_value = max(old_value, curr_value);
diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h
new file mode 100644
index 000000000000..461a13d25f4b
--- /dev/null
+++ b/fs/xfs/scrub/fscounters.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FSCOUNTERS_H__
+#define __XFS_SCRUB_FSCOUNTERS_H__
+
+struct xchk_fscounters {
+ struct xfs_scrub *sc;
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ uint64_t frextents;
+ unsigned long long icount_min;
+ unsigned long long icount_max;
+ bool frozen;
+};
+
+#endif /* __XFS_SCRUB_FSCOUNTERS_H__ */
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
new file mode 100644
index 000000000000..94cdb852bee4
--- /dev/null
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/fscounters.h"
+
+/*
+ * FS Summary Counters
+ * ===================
+ *
+ * We correct errors in the filesystem summary counters by setting them to the
+ * values computed during the obligatory scrub phase. However, we must be
+ * careful not to allow any other thread to change the counters while we're
+ * computing and setting new values. To achieve this, we freeze the
+ * filesystem for the whole operation if the REPAIR flag is set. The checking
+ * function is stricter when we've frozen the fs.
+ */
+
+/*
+ * Reset the superblock counters. Caller is responsible for freezing the
+ * filesystem during the calculation and reset phases.
+ */
+int
+xrep_fscounters(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_fscounters *fsc = sc->buf;
+
+ /*
+ * Reinitialize the in-core counters from what we computed. We froze
+ * the filesystem, so there shouldn't be anyone else trying to modify
+ * these counters.
+ */
+ if (!fsc->frozen) {
+ ASSERT(fsc->frozen);
+ return -EFSCORRUPTED;
+ }
+
+ trace_xrep_reset_counters(mp, fsc);
+
+ percpu_counter_set(&mp->m_icount, fsc->icount);
+ percpu_counter_set(&mp->m_ifree, fsc->ifree);
+ percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
+ percpu_counter_set(&mp->m_frextents, fsc->frextents);
+ mp->m_sb.sb_frextents = fsc->frextents;
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 5e2b09ed6e29..9020a6bef7f1 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -10,12 +10,11 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/health.h"
+#include "scrub/common.h"
/*
* Scrub and In-Core Filesystem Health Assessments
@@ -107,6 +106,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA },
[XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA },
[XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS },
+ [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK },
+ [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS },
};
/* Return the health status mask for this scrub type. */
@@ -118,6 +119,56 @@ xchk_health_mask_for_scrub_type(
}
/*
+ * If the scrub state is clean, add @mask to the scrub sick mask to clear
+ * additional sick flags from the metadata object's sick state.
+ */
+void
+xchk_mark_healthy_if_clean(
+ struct xfs_scrub *sc,
+ unsigned int mask)
+{
+ if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT)))
+ sc->sick_mask |= mask;
+}
+
+/*
+ * If we're scrubbing a piece of file metadata for the first time, does it look
+ * like it has been zapped? Skip the check if we just repaired the metadata
+ * and are revalidating it.
+ */
+bool
+xchk_file_looks_zapped(
+ struct xfs_scrub *sc,
+ unsigned int mask)
+{
+ ASSERT((mask & ~XFS_SICK_INO_ZAPPED) == 0);
+
+ if (sc->flags & XREP_ALREADY_FIXED)
+ return false;
+
+ return xfs_inode_has_sickness(sc->ip, mask);
+}
+
+/*
+ * Scrub gave the filesystem a clean bill of health, so clear all the indirect
+ * markers of past problems (at least for the fs and ags) so that we can be
+ * healthy again.
+ */
+STATIC void
+xchk_mark_all_healthy(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT);
+ xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT);
+ for_each_perag(mp, agno, pag)
+ xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT);
+}
+
+/*
* Update filesystem health assessments based on what we found and did.
*
* If the scrubber finds errors, we mark sick whatever's mentioned in
@@ -134,6 +185,18 @@ xchk_update_health(
struct xfs_perag *pag;
bool bad;
+ /*
+ * The HEALTHY scrub type is a request from userspace to clear all the
+ * indirect flags after a clean scan of the entire filesystem. As such
+ * there's no sick flag defined for it, so we branch here ahead of the
+ * mask check.
+ */
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_HEALTHY &&
+ !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ xchk_mark_all_healthy(sc->mp);
+ return;
+ }
+
if (!sc->sick_mask)
return;
@@ -143,7 +206,7 @@ xchk_update_health(
case XHG_AG:
pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
if (bad)
- xfs_ag_mark_sick(pag, sc->sick_mask);
+ xfs_ag_mark_corrupt(pag, sc->sick_mask);
else
xfs_ag_mark_healthy(pag, sc->sick_mask);
xfs_perag_put(pag);
@@ -151,20 +214,30 @@ xchk_update_health(
case XHG_INO:
if (!sc->ip)
return;
- if (bad)
- xfs_inode_mark_sick(sc->ip, sc->sick_mask);
- else
+ if (bad) {
+ unsigned int mask = sc->sick_mask;
+
+ /*
+ * If we're coming in for repairs then we don't want
+ * sickness flags to propagate to the incore health
+ * status if the inode gets inactivated before we can
+ * fix it.
+ */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ mask |= XFS_SICK_INO_FORGET;
+ xfs_inode_mark_corrupt(sc->ip, mask);
+ } else
xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
break;
case XHG_FS:
if (bad)
- xfs_fs_mark_sick(sc->mp, sc->sick_mask);
+ xfs_fs_mark_corrupt(sc->mp, sc->sick_mask);
else
xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
break;
case XHG_RT:
if (bad)
- xfs_rt_mark_sick(sc->mp, sc->sick_mask);
+ xfs_rt_mark_corrupt(sc->mp, sc->sick_mask);
else
xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
break;
@@ -175,13 +248,13 @@ xchk_update_health(
}
/* Is the given per-AG btree healthy enough for scanning? */
-bool
-xchk_ag_btree_healthy_enough(
+void
+xchk_ag_btree_del_cursor_if_sick(
struct xfs_scrub *sc,
- struct xfs_perag *pag,
- xfs_btnum_t btnum)
+ struct xfs_btree_cur **curp,
+ unsigned int sm_type)
{
- unsigned int mask = 0;
+ unsigned int mask = (*curp)->bc_ops->sick_mask;
/*
* We always want the cursor if it's the same type as whatever we're
@@ -190,41 +263,8 @@ xchk_ag_btree_healthy_enough(
* Otherwise, we're only interested in the btree for cross-referencing.
* If we know the btree is bad then don't bother, just set XFAIL.
*/
- switch (btnum) {
- case XFS_BTNUM_BNO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
- return true;
- mask = XFS_SICK_AG_BNOBT;
- break;
- case XFS_BTNUM_CNT:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT)
- return true;
- mask = XFS_SICK_AG_CNTBT;
- break;
- case XFS_BTNUM_INO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
- return true;
- mask = XFS_SICK_AG_INOBT;
- break;
- case XFS_BTNUM_FINO:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
- return true;
- mask = XFS_SICK_AG_FINOBT;
- break;
- case XFS_BTNUM_RMAP:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT)
- return true;
- mask = XFS_SICK_AG_RMAPBT;
- break;
- case XFS_BTNUM_REFC:
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT)
- return true;
- mask = XFS_SICK_AG_REFCNTBT;
- break;
- default:
- ASSERT(0);
- return true;
- }
+ if (sc->sm->sm_type == sm_type)
+ return;
/*
* If we just repaired some AG metadata, sc->sick_mask will reflect all
@@ -236,10 +276,42 @@ xchk_ag_btree_healthy_enough(
type_to_health_flag[sc->sm->sm_type].group == XHG_AG)
mask &= ~sc->sick_mask;
- if (xfs_ag_has_sickness(pag, mask)) {
+ if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
- return false;
+ xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR);
+ *curp = NULL;
+ }
+}
+
+/*
+ * Quick scan to double-check that there isn't any evidence of lingering
+ * primary health problems. If we're still clear, then the health update will
+ * take care of clearing the indirect evidence.
+ */
+int
+xchk_health_record(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ unsigned int sick;
+ unsigned int checked;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ if (sick & XFS_SICK_FS_PRIMARY)
+ xchk_set_corrupt(sc);
+
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ if (sick & XFS_SICK_RT_PRIMARY)
+ xchk_set_corrupt(sc);
+
+ for_each_perag(mp, agno, pag) {
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ if (sick & XFS_SICK_AG_PRIMARY)
+ xchk_set_corrupt(sc);
}
- return true;
+ return 0;
}
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
index 66a273f8585b..63fc426eb5ae 100644
--- a/fs/xfs/scrub/health.h
+++ b/fs/xfs/scrub/health.h
@@ -8,7 +8,10 @@
unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
void xchk_update_health(struct xfs_scrub *sc);
-bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
- xfs_btnum_t btnum);
+void xchk_ag_btree_del_cursor_if_sick(struct xfs_scrub *sc,
+ struct xfs_btree_cur **curp, unsigned int sm_type);
+void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask);
+bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask);
+int xchk_health_record(struct xfs_scrub *sc);
#endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index fb7bbf47ae5d..750d7b0cd25a 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -76,7 +76,7 @@ xchk_inobt_xref_finobt(
int has_record;
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_FINO);
+ ASSERT(xfs_btree_is_fino(cur->bc_ops));
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
if (error)
@@ -179,7 +179,7 @@ xchk_finobt_xref_inobt(
int has_record;
int error;
- ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+ ASSERT(xfs_btree_is_ino(cur->bc_ops));
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
if (error)
@@ -514,7 +514,7 @@ xchk_iallocbt_rec_alignment(
* Otherwise, we expect that the finobt record is aligned to the
* cluster alignment as told by the superblock.
*/
- if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ if (xfs_btree_is_fino(bs->cur->bc_ops)) {
unsigned int imask;
imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
@@ -585,7 +585,7 @@ xchk_iallocbt_rec(
uint16_t holemask;
xfs_inobt_btrec_to_irec(mp, rec, &irec);
- if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) {
+ if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -649,8 +649,7 @@ out:
*/
STATIC void
xchk_iallocbt_xref_rmap_btreeblks(
- struct xfs_scrub *sc,
- int which)
+ struct xfs_scrub *sc)
{
xfs_filblks_t blocks;
xfs_extlen_t inobt_blocks = 0;
@@ -688,7 +687,6 @@ xchk_iallocbt_xref_rmap_btreeblks(
STATIC void
xchk_iallocbt_xref_rmap_inodes(
struct xfs_scrub *sc,
- int which,
unsigned long long inodes)
{
xfs_filblks_t blocks;
@@ -708,11 +706,10 @@ xchk_iallocbt_xref_rmap_inodes(
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}
-/* Scrub the inode btrees for some AG. */
-STATIC int
+/* Scrub one of the inode btrees for some AG. */
+int
xchk_iallocbt(
- struct xfs_scrub *sc,
- xfs_btnum_t which)
+ struct xfs_scrub *sc)
{
struct xfs_btree_cur *cur;
struct xchk_iallocbt iabt = {
@@ -722,13 +719,24 @@ xchk_iallocbt(
};
int error;
- cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_INOBT:
+ cur = sc->sa.ino_cur;
+ break;
+ case XFS_SCRUB_TYPE_FINOBT:
+ cur = sc->sa.fino_cur;
+ break;
+ default:
+ ASSERT(0);
+ return -EIO;
+ }
+
error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT,
&iabt);
if (error)
return error;
- xchk_iallocbt_xref_rmap_btreeblks(sc, which);
+ xchk_iallocbt_xref_rmap_btreeblks(sc);
/*
* If we're scrubbing the inode btree, inode_blocks is the number of
@@ -737,26 +745,11 @@ xchk_iallocbt(
* knows about. We can't do this for the finobt since it only points
* to inode chunks with free inodes.
*/
- if (which == XFS_BTNUM_INO)
- xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes);
-
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
+ xchk_iallocbt_xref_rmap_inodes(sc, iabt.inodes);
return error;
}
-int
-xchk_inobt(
- struct xfs_scrub *sc)
-{
- return xchk_iallocbt(sc, XFS_BTNUM_INO);
-}
-
-int
-xchk_finobt(
- struct xfs_scrub *sc)
-{
- return xchk_iallocbt(sc, XFS_BTNUM_FINO);
-}
-
/* See if an inode btree has (or doesn't have) an inode chunk record. */
static inline void
xchk_xref_inode_check(
diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c
new file mode 100644
index 000000000000..a00ec7ae1792
--- /dev/null
+++ b/fs/xfs/scrub/ialloc_repair.c
@@ -0,0 +1,884 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "xfs_health.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Inode Btree Repair
+ * ==================
+ *
+ * A quick refresher of inode btrees on a v5 filesystem:
+ *
+ * - Inode records are read into memory in units of 'inode clusters'. However
+ * many inodes fit in a cluster buffer is the smallest number of inodes that
+ * can be allocated or freed. Clusters are never smaller than one fs block
+ * though they can span multiple blocks. The size (in fs blocks) is
+ * computed with xfs_icluster_size_fsb(). The fs block alignment of a
+ * cluster is computed with xfs_ialloc_cluster_alignment().
+ *
+ * - Each inode btree record can describe a single 'inode chunk'. The chunk
+ * size is defined to be 64 inodes. If sparse inodes are enabled, every
+ * inobt record must be aligned to the chunk size; if not, every record must
+ * be aligned to the start of a cluster. It is possible to construct an XFS
+ * geometry where one inobt record maps to multiple inode clusters; it is
+ * also possible to construct a geometry where multiple inobt records map to
+ * different parts of one inode cluster.
+ *
+ * - If sparse inodes are not enabled, the smallest unit of allocation for
+ * inode records is enough to contain one inode chunk's worth of inodes.
+ *
+ * - If sparse inodes are enabled, the holemask field will be active. Each
+ * bit of the holemask represents 4 potential inodes; if set, the
+ * corresponding space does *not* contain inodes and must be left alone.
+ * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation
+ * of inode records is one inode cluster.
+ *
+ * So what's the rebuild algorithm?
+ *
+ * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
+ * records. The OWN_INOBT records are the old inode btree blocks and will be
+ * cleared out after we've rebuilt the tree. Each possible inode cluster
+ * within an OWN_INODES record will be read in; for each possible inobt record
+ * associated with that cluster, compute the freemask calculated from the
+ * i_mode data in the inode chunk. For sparse inodes the holemask will be
+ * calculated by creating the properly aligned inobt record and punching out
+ * any chunk that's missing. Inode allocations and frees grab the AGI first,
+ * so repair protects itself from concurrent access by locking the AGI.
+ *
+ * Once we've reconstructed all the inode records, we can create new inode
+ * btree roots and reload the btrees. We rebuild both inode trees at the same
+ * time because they have the same rmap owner and it would be more complex to
+ * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT
+ * blocks it owns. We have all the data we need to build both, so dump
+ * everything and start over.
+ *
+ * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
+ */
+
+struct xrep_ibt {
+ /* Record under construction. */
+ struct xfs_inobt_rec_incore rie;
+
+ /* new inobt information */
+ struct xrep_newbt new_inobt;
+
+ /* new finobt information */
+ struct xrep_newbt new_finobt;
+
+ /* Old inode btree blocks we found in the rmap. */
+ struct xagb_bitmap old_iallocbt_blocks;
+
+ /* Reconstructed inode records. */
+ struct xfarray *inode_records;
+
+ struct xfs_scrub *sc;
+
+ /* Number of inodes assigned disk space. */
+ unsigned int icount;
+
+ /* Number of inodes in use. */
+ unsigned int iused;
+
+ /* Number of finobt records needed. */
+ unsigned int finobt_recs;
+
+ /* get_records()'s position in the inode record array. */
+ xfarray_idx_t array_cur;
+};
+
+/*
+ * Is this inode in use? If the inode is in memory we can tell from i_mode,
+ * otherwise we have to check di_mode in the on-disk buffer. We only care
+ * that the high (i.e. non-permission) bits of _mode are zero. This should be
+ * safe because repair keeps all AG headers locked until the end, and process
+ * trying to perform an inode allocation/free must lock the AGI.
+ *
+ * @cluster_ag_base is the inode offset of the cluster within the AG.
+ * @cluster_bp is the cluster buffer.
+ * @cluster_index is the inode offset within the inode cluster.
+ */
+STATIC int
+xrep_ibt_check_ifree(
+ struct xrep_ibt *ri,
+ xfs_agino_t cluster_ag_base,
+ struct xfs_buf *cluster_bp,
+ unsigned int cluster_index,
+ bool *inuse)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_dinode *dip;
+ xfs_ino_t fsino;
+ xfs_agino_t agino;
+ xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno;
+ unsigned int cluster_buf_base;
+ unsigned int offset;
+ int error;
+
+ agino = cluster_ag_base + cluster_index;
+ fsino = XFS_AGINO_TO_INO(mp, agno, agino);
+
+ /* Inode uncached or half assembled, read disk buffer */
+ cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
+ offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
+ if (offset >= BBTOB(cluster_bp->b_length))
+ return -EFSCORRUPTED;
+ dip = xfs_buf_offset(cluster_bp, offset);
+ if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
+ return -EFSCORRUPTED;
+
+ if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
+ return -EFSCORRUPTED;
+
+ /* Will the in-core inode tell us if it's in use? */
+ error = xchk_inode_is_allocated(sc, agino, inuse);
+ if (!error)
+ return 0;
+
+ *inuse = dip->di_mode != 0;
+ return 0;
+}
+
+/* Stash the accumulated inobt record for rebuilding. */
+STATIC int
+xrep_ibt_stash(
+ struct xrep_ibt *ri)
+{
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
+ if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
+ return -EFSCORRUPTED;
+
+ if (ri->rie.ir_freecount > 0)
+ ri->finobt_recs++;
+
+ trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie);
+
+ error = xfarray_append(ri->inode_records, &ri->rie);
+ if (error)
+ return error;
+
+ ri->rie.ir_startino = NULLAGINO;
+ return 0;
+}
+
+/*
+ * Given an extent of inodes and an inode cluster buffer, calculate the
+ * location of the corresponding inobt record (creating it if necessary),
+ * then update the parts of the holemask and freemask of that record that
+ * correspond to the inode extent we were given.
+ *
+ * @cluster_ir_startino is the AG inode number of an inobt record that we're
+ * proposing to create for this inode cluster. If sparse inodes are enabled,
+ * we must round down to a chunk boundary to find the actual sparse record.
+ * @cluster_bp is the buffer of the inode cluster.
+ * @nr_inodes is the number of inodes to check from the cluster.
+ */
+STATIC int
+xrep_ibt_cluster_record(
+ struct xrep_ibt *ri,
+ xfs_agino_t cluster_ir_startino,
+ struct xfs_buf *cluster_bp,
+ unsigned int nr_inodes)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agino_t ir_startino;
+ unsigned int cluster_base;
+ unsigned int cluster_index;
+ int error = 0;
+
+ ir_startino = cluster_ir_startino;
+ if (xfs_has_sparseinodes(mp))
+ ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK);
+ cluster_base = cluster_ir_startino - ir_startino;
+
+ /*
+ * If the accumulated inobt record doesn't map this cluster, add it to
+ * the list and reset it.
+ */
+ if (ri->rie.ir_startino != NULLAGINO &&
+ ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
+ error = xrep_ibt_stash(ri);
+ if (error)
+ return error;
+ }
+
+ if (ri->rie.ir_startino == NULLAGINO) {
+ ri->rie.ir_startino = ir_startino;
+ ri->rie.ir_free = XFS_INOBT_ALL_FREE;
+ ri->rie.ir_holemask = 0xFFFF;
+ ri->rie.ir_count = 0;
+ }
+
+ /* Record the whole cluster. */
+ ri->icount += nr_inodes;
+ ri->rie.ir_count += nr_inodes;
+ ri->rie.ir_holemask &= ~xfs_inobt_maskn(
+ cluster_base / XFS_INODES_PER_HOLEMASK_BIT,
+ nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);
+
+ /* Which inodes within this cluster are free? */
+ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+ bool inuse = false;
+
+ error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
+ cluster_bp, cluster_index, &inuse);
+ if (error)
+ return error;
+ if (!inuse)
+ continue;
+ ri->iused++;
+ ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base +
+ cluster_index);
+ }
+ return 0;
+}
+
+/*
+ * For each inode cluster covering the physical extent recorded by the rmapbt,
+ * we must calculate the properly aligned startino of that cluster, then
+ * iterate each cluster to fill in used and filled masks appropriately. We
+ * then use the (startino, used, filled) information to construct the
+ * appropriate inode records.
+ */
+STATIC int
+xrep_ibt_process_cluster(
+ struct xrep_ibt *ri,
+ xfs_agblock_t cluster_bno)
+{
+ struct xfs_imap imap;
+ struct xfs_buf *cluster_bp;
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agino_t cluster_ag_base;
+ xfs_agino_t irec_index;
+ unsigned int nr_inodes;
+ int error;
+
+ nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster,
+ XFS_INODES_PER_CHUNK);
+
+ /*
+ * Grab the inode cluster buffer. This is safe to do with a broken
+ * inobt because imap_to_bp directly maps the buffer without touching
+ * either inode btree.
+ */
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno);
+ imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
+ imap.im_boffset = 0;
+ error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
+ if (error)
+ return error;
+
+ /*
+ * Record the contents of each possible inobt record mapping this
+ * cluster.
+ */
+ cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno);
+ for (irec_index = 0;
+ irec_index < igeo->inodes_per_cluster;
+ irec_index += XFS_INODES_PER_CHUNK) {
+ error = xrep_ibt_cluster_record(ri,
+ cluster_ag_base + irec_index, cluster_bp,
+ nr_inodes);
+ if (error)
+ break;
+
+ }
+
+ xfs_trans_brelse(sc->tp, cluster_bp);
+ return error;
+}
+
+/* Check for any obvious conflicts in the inode chunk extent. */
+STATIC int
+xrep_ibt_check_inode_ext(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agino_t agino;
+ enum xbtree_recpacking outcome;
+ int error;
+
+ /* Inode records must be within the AG. */
+ if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
+ return -EFSCORRUPTED;
+
+ /* The entire record must align to the inode cluster size. */
+ if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
+ !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster))
+ return -EFSCORRUPTED;
+
+ /*
+ * The entire record must also adhere to the inode cluster alignment
+ * size if sparse inodes are not enabled.
+ */
+ if (!xfs_has_sparseinodes(mp) &&
+ (!IS_ALIGNED(agbno, igeo->cluster_align) ||
+ !IS_ALIGNED(agbno + len, igeo->cluster_align)))
+ return -EFSCORRUPTED;
+
+ /*
+ * On a sparse inode fs, this cluster could be part of a sparse chunk.
+ * Sparse clusters must be aligned to sparse chunk alignment.
+ */
+ if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
+ (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
+ !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
+ return -EFSCORRUPTED;
+
+ /* Make sure the entire range of blocks are valid AG inodes. */
+ agino = XFS_AGB_TO_AGINO(mp, agbno);
+ if (!xfs_verify_agino(sc->sa.pag, agino))
+ return -EFSCORRUPTED;
+
+ agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1;
+ if (!xfs_verify_agino(sc->sa.pag, agino))
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Found a fragment of the old inode btrees; dispose of them later. */
+STATIC int
+xrep_ibt_record_old_btree_blocks(
+ struct xrep_ibt *ri,
+ const struct xfs_rmap_irec *rec)
+{
+ if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
+ rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
+ rec->rm_blockcount);
+}
+
+/* Record extents that belong to inode cluster blocks. */
+STATIC int
+xrep_ibt_record_inode_blocks(
+ struct xrep_ibt *ri,
+ const struct xfs_rmap_irec *rec)
+{
+ struct xfs_mount *mp = ri->sc->mp;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agblock_t cluster_base;
+ int error;
+
+ error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
+ rec->rm_blockcount);
+ if (error)
+ return error;
+
+ trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno,
+ rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
+ rec->rm_offset, rec->rm_flags);
+
+ /*
+ * Record the free/hole masks for each inode cluster that could be
+ * mapped by this rmap record.
+ */
+ for (cluster_base = 0;
+ cluster_base < rec->rm_blockcount;
+ cluster_base += igeo->blocks_per_cluster) {
+ error = xrep_ibt_process_cluster(ri,
+ rec->rm_startblock + cluster_base);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+STATIC int
+xrep_ibt_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_ibt *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ switch (rec->rm_owner) {
+ case XFS_RMAP_OWN_INOBT:
+ return xrep_ibt_record_old_btree_blocks(ri, rec);
+ case XFS_RMAP_OWN_INODES:
+ return xrep_ibt_record_inode_blocks(ri, rec);
+ }
+ return 0;
+}
+
+/*
+ * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode
+ * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct
+ * the inode btrees. The caller must clean up the lists if anything goes
+ * wrong.
+ */
+STATIC int
+xrep_ibt_find_inodes(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ ri->rie.ir_startino = NULLAGINO;
+
+ /* Collect all reverse mappings for inode blocks. */
+ xrep_ag_btcur_init(sc, &sc->sa);
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
+ xchk_ag_btcur_free(&sc->sa);
+ if (error)
+ return error;
+
+ /* If we have a record ready to go, add it to the array. */
+ if (ri->rie.ir_startino != NULLAGINO)
+ return xrep_ibt_stash(ri);
+
+ return 0;
+}
+
+/* Update the AGI counters. */
+STATIC int
+xrep_ibt_reset_counters(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ unsigned int freecount = ri->icount - ri->iused;
+
+ /* Trigger inode count recalculation */
+ xfs_force_summary_recalc(sc->mp);
+
+ /*
+ * The AGI header contains extra information related to the inode
+ * btrees, so we must update those fields here.
+ */
+ agi->agi_count = cpu_to_be32(ri->icount);
+ agi->agi_freecount = cpu_to_be32(freecount);
+ xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
+ XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagi(sc);
+}
+
+/* Retrieve finobt data for bulk load. */
+STATIC int
+xrep_fibt_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i;
+ struct xrep_ibt *ri = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ do {
+ error = xfarray_load(ri->inode_records,
+ ri->array_cur++, irec);
+ } while (error == 0 && xfs_inobt_rec_freecount(irec) == 0);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Retrieve inobt data for bulk load. */
+STATIC int
+xrep_ibt_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i;
+ struct xrep_ibt *ri = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new inobt blocks to the bulk loader. */
+STATIC int
+xrep_ibt_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_ibt *ri = priv;
+
+ return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr);
+}
+
+/* Feed one of the new finobt blocks to the bulk loader. */
+STATIC int
+xrep_fibt_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_ibt *ri = priv;
+
+ return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr);
+}
+
+/* Make sure the records do not overlap in inumber address space. */
+STATIC int
+xrep_ibt_check_overlap(
+ struct xrep_ibt *ri)
+{
+ struct xfs_inobt_rec_incore irec;
+ xfarray_idx_t cur;
+ xfs_agino_t next_agino = 0;
+ int error = 0;
+
+ foreach_xfarray_idx(ri->inode_records, cur) {
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ error = xfarray_load(ri->inode_records, cur, &irec);
+ if (error)
+ return error;
+
+ if (irec.ir_startino < next_agino)
+ return -EFSCORRUPTED;
+
+ next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
+ }
+
+ return error;
+}
+
+/* Build new inode btrees and dispose of the old one. */
+STATIC int
+xrep_ibt_build_new_trees(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_btree_cur *ino_cur;
+ struct xfs_btree_cur *fino_cur = NULL;
+ xfs_fsblock_t fsbno;
+ bool need_finobt;
+ int error;
+
+ need_finobt = xfs_has_finobt(sc->mp);
+
+ /*
+ * Create new btrees for staging all the inobt records we collected
+ * earlier. The records were collected in order of increasing agino,
+ * so we do not have to sort them. Ensure there are no overlapping
+ * records.
+ */
+ error = xrep_ibt_check_overlap(ri);
+ if (error)
+ return error;
+
+ /*
+ * The new inode btrees will not be rooted in the AGI until we've
+ * successfully rebuilt the tree.
+ *
+ * Start by setting up the inobt staging cursor.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ XFS_IBT_BLOCK(sc->mp)),
+ xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno,
+ XFS_AG_RESV_NONE);
+ ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
+ ri->new_inobt.bload.get_records = xrep_ibt_get_records;
+
+ ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
+ xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
+ error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
+ xfarray_length(ri->inode_records));
+ if (error)
+ goto err_inocur;
+
+ /* Set up finobt staging cursor. */
+ if (need_finobt) {
+ enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA;
+
+ if (sc->mp->m_finobt_nores)
+ resv = XFS_AG_RESV_NONE;
+
+ fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ XFS_FIBT_BLOCK(sc->mp)),
+ xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
+ fsbno, resv);
+ ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
+ ri->new_finobt.bload.get_records = xrep_fibt_get_records;
+
+ fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
+ xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
+ error = xfs_btree_bload_compute_geometry(fino_cur,
+ &ri->new_finobt.bload, ri->finobt_recs);
+ if (error)
+ goto err_finocur;
+ }
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_finocur;
+
+ /* Reserve all the space we need to build the new btrees. */
+ error = xrep_newbt_alloc_blocks(&ri->new_inobt,
+ ri->new_inobt.bload.nr_blocks);
+ if (error)
+ goto err_finocur;
+
+ if (need_finobt) {
+ error = xrep_newbt_alloc_blocks(&ri->new_finobt,
+ ri->new_finobt.bload.nr_blocks);
+ if (error)
+ goto err_finocur;
+ }
+
+ /* Add all inobt records. */
+ ri->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
+ if (error)
+ goto err_finocur;
+
+ /* Add all finobt records. */
+ if (need_finobt) {
+ ri->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
+ if (error)
+ goto err_finocur;
+ }
+
+ /*
+ * Install the new btrees in the AG header. After this point the old
+ * btrees are no longer accessible and the new trees are live.
+ */
+ xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
+ xfs_btree_del_cursor(ino_cur, 0);
+
+ if (fino_cur) {
+ xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
+ xfs_btree_del_cursor(fino_cur, 0);
+ }
+
+ /* Reset the AGI counters now that we've changed the inode roots. */
+ error = xrep_ibt_reset_counters(ri);
+ if (error)
+ goto err_finobt;
+
+ /* Free unused blocks and bitmap. */
+ if (need_finobt) {
+ error = xrep_newbt_commit(&ri->new_finobt);
+ if (error)
+ goto err_inobt;
+ }
+ error = xrep_newbt_commit(&ri->new_inobt);
+ if (error)
+ return error;
+
+ return xrep_roll_ag_trans(sc);
+
+err_finocur:
+ if (need_finobt)
+ xfs_btree_del_cursor(fino_cur, error);
+err_inocur:
+ xfs_btree_del_cursor(ino_cur, error);
+err_finobt:
+ if (need_finobt)
+ xrep_newbt_cancel(&ri->new_finobt);
+err_inobt:
+ xrep_newbt_cancel(&ri->new_inobt);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_ibt_remove_old_trees(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ /*
+ * Free the old inode btree blocks if they're not in use. It's ok to
+ * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
+ * reservation because we reset the reservation before releasing the
+ * AGI and AGF header buffer locks.
+ */
+ error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
+ &XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
+ if (error)
+ return error;
+
+ /*
+ * If the finobt is enabled and has a per-AG reservation, make sure we
+ * reinitialize the per-AG reservations.
+ */
+ if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
+ sc->flags |= XREP_RESET_PERAG_RESV;
+
+ return 0;
+}
+
+/* Repair both inode btrees. */
+int
+xrep_iallocbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_ibt *ri;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ xfs_agino_t first_agino, last_agino;
+ int error = 0;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS);
+ if (!ri)
+ return -ENOMEM;
+ ri->sc = sc;
+
+ /* We rebuild both inode btrees. */
+ sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;
+
+ /* Set up enough storage to handle an AG with nothing but inodes. */
+ xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino);
+ last_agino /= XFS_INODES_PER_CHUNK;
+ descr = xchk_xfile_ag_descr(sc, "inode index records");
+ error = xfarray_create(descr, last_agino,
+ sizeof(struct xfs_inobt_rec_incore),
+ &ri->inode_records);
+ kfree(descr);
+ if (error)
+ goto out_ri;
+
+ /* Collect the inode data and find the old btree blocks. */
+ xagb_bitmap_init(&ri->old_iallocbt_blocks);
+ error = xrep_ibt_find_inodes(ri);
+ if (error)
+ goto out_bitmap;
+
+ /* Rebuild the inode indexes. */
+ error = xrep_ibt_build_new_trees(ri);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old tree. */
+ error = xrep_ibt_remove_old_trees(ri);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
+ xfarray_destroy(ri->inode_records);
+out_ri:
+ kfree(ri);
+ return error;
+}
+
+/* Make sure both btrees are ok after we've rebuilt them. */
+int
+xrep_revalidate_iallocbt(
+ struct xfs_scrub *sc)
+{
+ __u32 old_type = sc->sm->sm_type;
+ int error;
+
+ /*
+ * We must update sm_type temporarily so that the tree-to-tree cross
+ * reference checks will work in the correct direction, and also so
+ * that tracing will report correctly if there are more errors.
+ */
+ sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
+ error = xchk_iallocbt(sc);
+ if (error)
+ goto out;
+
+ if (xfs_has_finobt(sc->mp)) {
+ sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
+ error = xchk_iallocbt(sc);
+ }
+
+out:
+ sc->sm->sm_type = old_type;
+ return error;
+}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 59d7912fb75f..6e2fe2d6250b 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -20,10 +20,12 @@
#include "xfs_reflink.h"
#include "xfs_rmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_rtbitmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/* Prepare the attached inode for scrubbing. */
static inline int
@@ -38,6 +40,10 @@ xchk_prepare_iscrub(
if (error)
return error;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ return error;
+
xchk_ilock(sc, XFS_ILOCK_EXCL);
return 0;
}
@@ -94,8 +100,8 @@ xchk_setup_inode(
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
- /* Try a regular untrusted iget. */
- error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+ /* Try a safe untrusted iget. */
+ error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
if (!error)
return xchk_install_handle_iscrub(sc, ip);
if (error == -ENOENT)
@@ -180,8 +186,11 @@ xchk_setup_inode(
* saying the inode is allocated and the icache being unable to load
* the inode until we can flag the corruption in xchk_inode. The
* scrub function has to note the corruption, since we're not really
- * supposed to do that from the setup function.
+ * supposed to do that from the setup function. Save the mapping to
+ * make repairs to the ondisk inode buffer.
*/
+ if (xchk_could_repair(sc))
+ xrep_setup_inode(sc, &imap);
return 0;
out_cancel:
@@ -225,7 +234,7 @@ xchk_inode_extsize(
*/
if ((flags & XFS_DIFLAG_RTINHERIT) &&
(flags & XFS_DIFLAG_EXTSZINHERIT) &&
- value % sc->mp->m_sb.sb_rextsize > 0)
+ xfs_extlen_to_rtxmod(sc->mp, value) > 0)
xchk_ino_set_warning(sc, ino);
}
@@ -337,6 +346,10 @@ xchk_inode_flags2(
if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp))
goto bad;
+ /* no large extent counts without the filesystem feature */
+ if ((flags2 & XFS_DIFLAG2_NREXT64) && !xfs_has_large_extent_counts(mp))
+ goto bad;
+
return;
bad:
xchk_ino_set_corrupt(sc, ino);
@@ -547,7 +560,7 @@ xchk_dinode(
}
/* di_forkoff */
- if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
+ if (XFS_DFORK_BOFF(dip) >= mp->m_sb.sb_inodesize)
xchk_ino_set_corrupt(sc, ino);
if (naextents != 0 && dip->di_forkoff == 0)
xchk_ino_set_corrupt(sc, ino);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
new file mode 100644
index 000000000000..eab380e95ef4
--- /dev/null
+++ b/fs/xfs/scrub/inode_repair.c
@@ -0,0 +1,1750 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_ialloc.h"
+#include "xfs_da_format.h"
+#include "xfs_reflink.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap_util.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_quota_defs.h"
+#include "xfs_quota.h"
+#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_log_priv.h"
+#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/readdir.h"
+
+/*
+ * Inode Record Repair
+ * ===================
+ *
+ * Roughly speaking, inode problems can be classified based on whether or not
+ * they trip the dinode verifiers. If those trip, then we won't be able to
+ * xfs_iget ourselves the inode.
+ *
+ * Therefore, the xrep_dinode_* functions fix anything that will cause the
+ * inode buffer verifier or the dinode verifier. The xrep_inode_* functions
+ * fix things on live incore inodes. The inode repair functions make decisions
+ * with security and usability implications when reviving a file:
+ *
+ * - Files with zero di_mode or a garbage di_mode are converted to regular file
+ * that only root can read. This file may not actually contain user data,
+ * if the file was not previously a regular file. Setuid and setgid bits
+ * are cleared.
+ *
+ * - Zero-size directories can be truncated to look empty. It is necessary to
+ * run the bmapbtd and directory repair functions to fully rebuild the
+ * directory.
+ *
+ * - Zero-size symbolic link targets can be truncated to '?'. It is necessary
+ * to run the bmapbtd and symlink repair functions to salvage the symlink.
+ *
+ * - Invalid extent size hints will be removed.
+ *
+ * - Quotacheck will be scheduled if we repaired an inode that was so badly
+ * damaged that the ondisk inode had to be rebuilt.
+ *
+ * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
+ * Setuid and setgid bits are cleared.
+ *
+ * - Data and attr forks are reset to extents format with zero extents if the
+ * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta
+ * repair functions to recover the space mapping.
+ *
+ * - ACLs will not be recovered if the attr fork is zapped or the extended
+ * attribute structure itself requires salvaging.
+ *
+ * - If the attr fork is zapped, the user and group ids are reset to root and
+ * the setuid and setgid bits are removed.
+ */
+
+/*
+ * All the information we need to repair the ondisk inode if we can't iget the
+ * incore inode. We don't allocate this buffer unless we're going to perform
+ * a repair to the ondisk inode cluster buffer.
+ */
+struct xrep_inode {
+ /* Inode mapping that we saved from the initial lookup attempt. */
+ struct xfs_imap imap;
+
+ struct xfs_scrub *sc;
+
+ /* Blocks in use on the data device by data extents or bmbt blocks. */
+ xfs_rfsblock_t data_blocks;
+
+ /* Blocks in use on the rt device. */
+ xfs_rfsblock_t rt_blocks;
+
+ /* Blocks in use by the attr fork. */
+ xfs_rfsblock_t attr_blocks;
+
+ /* Number of data device extents for the data fork. */
+ xfs_extnum_t data_extents;
+
+ /*
+ * Number of realtime device extents for the data fork. If
+ * data_extents and rt_extents indicate that the data fork has extents
+ * on both devices, we'll just back away slowly.
+ */
+ xfs_extnum_t rt_extents;
+
+ /* Number of (data device) extents for the attr fork. */
+ xfs_aextnum_t attr_extents;
+
+ /* Sick state to set after zapping parts of the inode. */
+ unsigned int ino_sick_mask;
+
+ /* Must we remove all access from this file? */
+ bool zap_acls;
+
+ /* Inode scanner to see if we can find the ftype from dirents */
+ struct xchk_iscan ftype_iscan;
+ uint8_t alleged_ftype;
+};
+
+/*
+ * Setup function for inode repair. @imap contains the ondisk inode mapping
+ * information so that we can correct the ondisk inode cluster buffer if
+ * necessary to make iget work.
+ */
+int
+xrep_setup_inode(
+ struct xfs_scrub *sc,
+ const struct xfs_imap *imap)
+{
+ struct xrep_inode *ri;
+
+ sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ ri = sc->buf;
+ memcpy(&ri->imap, imap, sizeof(struct xfs_imap));
+ ri->sc = sc;
+ return 0;
+}
+
+/*
+ * Make sure this ondisk inode can pass the inode buffer verifier. This is
+ * not the same as the dinode verifier.
+ */
+STATIC void
+xrep_dinode_buf_core(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp,
+ unsigned int ioffset)
+{
+ struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset);
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agino_t agino;
+ bool crc_ok = false;
+ bool magic_ok = false;
+ bool unlinked_ok = false;
+
+ agino = be32_to_cpu(dip->di_next_unlinked);
+
+ if (xfs_verify_agino_or_null(bp->b_pag, agino))
+ unlinked_ok = true;
+
+ if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ xfs_dinode_good_version(mp, dip->di_version))
+ magic_ok = true;
+
+ if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ XFS_DINODE_CRC_OFF))
+ crc_ok = true;
+
+ if (magic_ok && unlinked_ok && crc_ok)
+ return;
+
+ if (!magic_ok) {
+ dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ dip->di_version = 3;
+ }
+ if (!unlinked_ok)
+ dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+ xfs_trans_log_buf(tp, bp, ioffset,
+ ioffset + sizeof(struct xfs_dinode) - 1);
+}
+
+/* Make sure this inode cluster buffer can pass the inode buffer verifier. */
+STATIC void
+xrep_dinode_buf(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = sc->mp;
+ int i;
+ int ni;
+
+ ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+ for (i = 0; i < ni; i++)
+ xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog);
+}
+
+/* Reinitialize things that never change in an inode. */
+STATIC void
+xrep_dinode_header(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip)
+{
+ trace_xrep_dinode_header(sc, dip);
+
+ dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ if (!xfs_dinode_good_version(sc->mp, dip->di_version))
+ dip->di_version = 3;
+ dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
+ uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
+ dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
+}
+
+/*
+ * If this directory entry points to the scrub target inode, then the directory
+ * we're scanning is the parent of the scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_inode *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ if (ino != sc->sm->sm_ino)
+ return 0;
+
+ /* Ignore garbage directory entry names. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ /*
+ * Uhoh, more than one parent for this inode and they don't agree on
+ * the file type?
+ */
+ if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
+ ri->alleged_ftype != name->type) {
+ trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
+ ri->alleged_ftype);
+ return -EFSCORRUPTED;
+ }
+
+ /* We found a potential parent; remember the ftype. */
+ trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
+ ri->alleged_ftype = name->type;
+ return 0;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_walk_directory(
+ struct xrep_inode *ri,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = ri->sc;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /*
+ * Scan the directory to see if there it contains an entry pointing to
+ * the directory that we are repairing.
+ */
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /*
+ * If this directory is known to be sick, we cannot scan it reliably
+ * and must abort.
+ */
+ if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+ XFS_SICK_INO_BMBTD |
+ XFS_SICK_INO_DIR)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ /*
+ * We cannot complete our parent pointer scan if a directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
+ if (error)
+ goto out_unlock;
+
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
+
+/*
+ * Try to find the mode of the inode being repaired by looking for directories
+ * that point down to this file.
+ */
+STATIC int
+xrep_dinode_find_mode(
+ struct xrep_inode *ri,
+ uint16_t *mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_inode *dp;
+ int error;
+
+ /* No ftype means we have no other metadata to consult. */
+ if (!xfs_has_ftype(sc->mp)) {
+ *mode = S_IFREG;
+ return 0;
+ }
+
+ /*
+ * Scan all directories for parents that might point down to this
+ * inode. Skip the inode being repaired during the scan since it
+ * cannot be its own parent. Note that we still hold the AGI locked
+ * so there's a real possibility that _iscan_iter can return EBUSY.
+ */
+ xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
+ ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
+ ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
+ while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
+ if (S_ISDIR(VFS_I(dp)->i_mode))
+ error = xrep_dinode_findmode_walk_directory(ri, dp);
+ xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
+ xchk_irele(sc, dp);
+ if (error < 0)
+ break;
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&ri->ftype_iscan);
+ xchk_iscan_teardown(&ri->ftype_iscan);
+
+ if (error == -EBUSY) {
+ if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
+ /*
+ * If we got an EBUSY after finding at least one
+ * dirent, that means the scan found an inode on the
+ * inactivation list and could not open it. Accept the
+ * alleged ftype and install a new mode below.
+ */
+ error = 0;
+ } else if (!(sc->flags & XCHK_TRY_HARDER)) {
+ /*
+ * Otherwise, retry the operation one time to see if
+ * the reason for the delay is an inode from the same
+ * cluster buffer waiting on the inactivation list.
+ */
+ error = -EDEADLOCK;
+ }
+ }
+ if (error)
+ return error;
+
+ /*
+ * Convert the discovered ftype into the file mode. If all else fails,
+ * return S_IFREG.
+ */
+ switch (ri->alleged_ftype) {
+ case XFS_DIR3_FT_DIR:
+ *mode = S_IFDIR;
+ break;
+ case XFS_DIR3_FT_WHT:
+ case XFS_DIR3_FT_CHRDEV:
+ *mode = S_IFCHR;
+ break;
+ case XFS_DIR3_FT_BLKDEV:
+ *mode = S_IFBLK;
+ break;
+ case XFS_DIR3_FT_FIFO:
+ *mode = S_IFIFO;
+ break;
+ case XFS_DIR3_FT_SOCK:
+ *mode = S_IFSOCK;
+ break;
+ case XFS_DIR3_FT_SYMLINK:
+ *mode = S_IFLNK;
+ break;
+ default:
+ *mode = S_IFREG;
+ break;
+ }
+ return 0;
+}
+
+/* Turn di_mode into /something/ recognizable. Returns true if we succeed. */
+STATIC int
+xrep_dinode_mode(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+ int error;
+
+ trace_xrep_dinode_mode(sc, dip);
+
+ if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
+ return 0;
+
+ /* Try to fix the mode. If we cannot, then leave everything alone. */
+ error = xrep_dinode_find_mode(ri, &mode);
+ switch (error) {
+ case -EINTR:
+ case -EBUSY:
+ case -EDEADLOCK:
+ /* temporary failure or fatal signal */
+ return error;
+ case 0:
+ /* found mode */
+ break;
+ default:
+ /* some other error, assume S_IFREG */
+ mode = S_IFREG;
+ break;
+ }
+
+ /* bad mode, so we set it to a file that only root can read */
+ dip->di_mode = cpu_to_be16(mode);
+ dip->di_uid = 0;
+ dip->di_gid = 0;
+ ri->zap_acls = true;
+ return 0;
+}
+
+/* Fix any conflicting flags that the verifiers complain about. */
+STATIC void
+xrep_dinode_flags(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ bool isrt)
+{
+ struct xfs_mount *mp = sc->mp;
+ uint64_t flags2 = be64_to_cpu(dip->di_flags2);
+ uint16_t flags = be16_to_cpu(dip->di_flags);
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ trace_xrep_dinode_flags(sc, dip);
+
+ if (isrt)
+ flags |= XFS_DIFLAG_REALTIME;
+ else
+ flags &= ~XFS_DIFLAG_REALTIME;
+
+ /*
+ * For regular files on a reflink filesystem, set the REFLINK flag to
+ * protect shared extents. A later stage will actually check those
+ * extents and clear the flag if possible.
+ */
+ if (xfs_has_reflink(mp) && S_ISREG(mode))
+ flags2 |= XFS_DIFLAG2_REFLINK;
+ else
+ flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
+ if (flags & XFS_DIFLAG_REALTIME)
+ flags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (!xfs_has_bigtime(mp))
+ flags2 &= ~XFS_DIFLAG2_BIGTIME;
+ if (!xfs_has_large_extent_counts(mp))
+ flags2 &= ~XFS_DIFLAG2_NREXT64;
+ if (flags2 & XFS_DIFLAG2_NREXT64)
+ dip->di_nrext64_pad = 0;
+ else if (dip->di_version >= 3)
+ dip->di_v3_pad = 0;
+ dip->di_flags = cpu_to_be16(flags);
+ dip->di_flags2 = cpu_to_be64(flags2);
+}
+
+/*
+ * Blow out symlink; now it points nowhere. We don't have to worry about
+ * incore state because this inode is failing the verifiers.
+ */
+STATIC void
+xrep_dinode_zap_symlink(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ char *p;
+
+ trace_xrep_dinode_zap_symlink(sc, dip);
+
+ dip->di_format = XFS_DINODE_FMT_LOCAL;
+ dip->di_size = cpu_to_be64(1);
+ p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ *p = '?';
+ ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
+}
+
+/*
+ * Blow out dir, make the parent point to the root. In the future repair will
+ * reconstruct this directory for us. Note that there's no in-core directory
+ * inode because the sf verifier tripped, so we don't have to worry about the
+ * dentry cache.
+ */
+STATIC void
+xrep_dinode_zap_dir(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_dir2_sf_hdr *sfp;
+ int i8count;
+
+ trace_xrep_dinode_zap_dir(sc, dip);
+
+ dip->di_format = XFS_DINODE_FMT_LOCAL;
+ i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
+ sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ sfp->count = 0;
+ sfp->i8count = i8count;
+ xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
+ dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
+ ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED;
+}
+
+/* Make sure we don't have a garbage file size. */
+STATIC void
+xrep_dinode_size(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ uint64_t size = be64_to_cpu(dip->di_size);
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ trace_xrep_dinode_size(sc, dip);
+
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ /* di_size can't be nonzero for special files */
+ dip->di_size = 0;
+ break;
+ case S_IFREG:
+ /* Regular files can't be larger than 2^63-1 bytes. */
+ dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
+ break;
+ case S_IFLNK:
+ /*
+ * Truncate ridiculously oversized symlinks. If the size is
+ * zero, reset it to point to the current directory. Both of
+ * these conditions trigger dinode verifier errors, so there
+ * is no in-core state to reset.
+ */
+ if (size > XFS_SYMLINK_MAXLEN)
+ dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
+ else if (size == 0)
+ xrep_dinode_zap_symlink(ri, dip);
+ break;
+ case S_IFDIR:
+ /*
+ * Directories can't have a size larger than 32G. If the size
+ * is zero, reset it to an empty directory. Both of these
+ * conditions trigger dinode verifier errors, so there is no
+ * in-core state to reset.
+ */
+ if (size > XFS_DIR2_SPACE_SIZE)
+ dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
+ else if (size == 0)
+ xrep_dinode_zap_dir(ri, dip);
+ break;
+ }
+}
+
+/* Fix extent size hints. */
+STATIC void
+xrep_dinode_extsize_hints(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip)
+{
+ struct xfs_mount *mp = sc->mp;
+ uint64_t flags2 = be64_to_cpu(dip->di_flags2);
+ uint16_t flags = be16_to_cpu(dip->di_flags);
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ xfs_failaddr_t fa;
+
+ trace_xrep_dinode_extsize_hints(sc, dip);
+
+ fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
+ mode, flags);
+ if (fa) {
+ dip->di_extsize = 0;
+ dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ }
+
+ if (dip->di_version < 3)
+ return;
+
+ fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
+ mode, flags, flags2);
+ if (fa) {
+ dip->di_cowextsize = 0;
+ dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
+ }
+}
+
+/* Count extents and blocks for an inode given an rmap. */
+STATIC int
+xrep_dinode_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_inode *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ /* We only care about this inode. */
+ if (rec->rm_owner != ri->sc->sm->sm_ino)
+ return 0;
+
+ if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
+ ri->attr_blocks += rec->rm_blockcount;
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ ri->attr_extents++;
+
+ return 0;
+ }
+
+ ri->data_blocks += rec->rm_blockcount;
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ ri->data_extents++;
+
+ return 0;
+}
+
+/* Count extents and blocks for an inode from all AG rmap data. */
+STATIC int
+xrep_dinode_count_ag_rmaps(
+ struct xrep_inode *ri,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf;
+ int error;
+
+ error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf);
+ if (error)
+ return error;
+
+ cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag);
+ error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(ri->sc->tp, agf);
+ return error;
+}
+
+/* Count extents and blocks for a given inode from all rmap data. */
+STATIC int
+xrep_dinode_count_rmaps(
+ struct xrep_inode *ri)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ int error;
+
+ if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
+ return -EOPNOTSUPP;
+
+ for_each_perag(ri->sc->mp, agno, pag) {
+ error = xrep_dinode_count_ag_rmaps(ri, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ /* Can't have extents on both the rt and the data device. */
+ if (ri->data_extents && ri->rt_extents)
+ return -EFSCORRUPTED;
+
+ trace_xrep_dinode_count_rmaps(ri->sc,
+ ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
+ ri->data_extents, ri->rt_extents, ri->attr_extents);
+ return 0;
+}
+
+/* Return true if this extents-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_extents_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size,
+ int whichfork)
+{
+ struct xfs_bmbt_irec new;
+ struct xfs_bmbt_rec *dp;
+ xfs_extnum_t nex;
+ bool isrt;
+ unsigned int i;
+
+ nex = xfs_dfork_nextents(dip, whichfork);
+ if (nex > dfork_size / sizeof(struct xfs_bmbt_rec))
+ return true;
+
+ dp = XFS_DFORK_PTR(dip, whichfork);
+
+ isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
+ for (i = 0; i < nex; i++, dp++) {
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(dp, &new);
+ fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork,
+ &new);
+ if (fa)
+ return true;
+ }
+
+ return false;
+}
+
+/* Return true if this btree-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_bmbt_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size,
+ int whichfork)
+{
+ struct xfs_bmdr_block *dfp;
+ xfs_extnum_t nex;
+ unsigned int i;
+ unsigned int dmxr;
+ unsigned int nrecs;
+ unsigned int level;
+
+ nex = xfs_dfork_nextents(dip, whichfork);
+ if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec))
+ return true;
+
+ if (dfork_size < sizeof(struct xfs_bmdr_block))
+ return true;
+
+ dfp = XFS_DFORK_PTR(dip, whichfork);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
+ return true;
+ if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
+ return true;
+
+ dmxr = xfs_bmdr_maxrecs(dfork_size, 0);
+ for (i = 1; i <= nrecs; i++) {
+ struct xfs_bmbt_key *fkp;
+ xfs_bmbt_ptr_t *fpp;
+ xfs_fileoff_t fileoff;
+ xfs_fsblock_t fsbno;
+
+ fkp = XFS_BMDR_KEY_ADDR(dfp, i);
+ fileoff = be64_to_cpu(fkp->br_startoff);
+ if (!xfs_verify_fileoff(sc->mp, fileoff))
+ return true;
+
+ fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
+ fsbno = be64_to_cpu(*fpp);
+ if (!xfs_verify_fsbno(sc->mp, fsbno))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check the data fork for things that will fail the ifork verifiers or the
+ * ifork formatters.
+ */
+STATIC bool
+xrep_dinode_check_dfork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ void *dfork_ptr;
+ int64_t data_size;
+ unsigned int fmt;
+ unsigned int dfork_size;
+
+ /*
+ * Verifier functions take signed int64_t, so check for bogus negative
+ * values first.
+ */
+ data_size = be64_to_cpu(dip->di_size);
+ if (data_size < 0)
+ return true;
+
+ fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ if (fmt != XFS_DINODE_FMT_DEV)
+ return true;
+ break;
+ case S_IFREG:
+ if (fmt == XFS_DINODE_FMT_LOCAL)
+ return true;
+ fallthrough;
+ case S_IFLNK:
+ case S_IFDIR:
+ switch (fmt) {
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ break;
+ default:
+ return true;
+ }
+ break;
+ default:
+ return true;
+ }
+
+ dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
+ dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+ switch (fmt) {
+ case XFS_DINODE_FMT_DEV:
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ /* dir/symlink structure cannot be larger than the fork */
+ if (data_size > dfork_size)
+ return true;
+ /* directory structure must pass verification. */
+ if (S_ISDIR(mode) &&
+ xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL)
+ return true;
+ /* symlink structure must pass verification. */
+ if (S_ISLNK(mode) &&
+ xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL)
+ return true;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size,
+ XFS_DATA_FORK))
+ return true;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size,
+ XFS_DATA_FORK))
+ return true;
+ break;
+ default:
+ return true;
+ }
+
+ return false;
+}
+
+static void
+xrep_dinode_set_data_nextents(
+ struct xfs_dinode *dip,
+ xfs_extnum_t nextents)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ dip->di_big_nextents = cpu_to_be64(nextents);
+ else
+ dip->di_nextents = cpu_to_be32(nextents);
+}
+
+static void
+xrep_dinode_set_attr_nextents(
+ struct xfs_dinode *dip,
+ xfs_extnum_t nextents)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ dip->di_big_anextents = cpu_to_be32(nextents);
+ else
+ dip->di_anextents = cpu_to_be16(nextents);
+}
+
+/* Reset the data fork to something sane. */
+STATIC void
+xrep_dinode_zap_dfork(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+
+ trace_xrep_dinode_zap_dfork(sc, dip);
+
+ ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED;
+
+ xrep_dinode_set_data_nextents(dip, 0);
+ ri->data_blocks = 0;
+ ri->rt_blocks = 0;
+
+ /* Special files always get reset to DEV */
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ dip->di_format = XFS_DINODE_FMT_DEV;
+ dip->di_size = 0;
+ return;
+ }
+
+ /*
+ * If we have data extents, reset to an empty map and hope the user
+ * will run the bmapbtd checker next.
+ */
+ if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) {
+ dip->di_format = XFS_DINODE_FMT_EXTENTS;
+ return;
+ }
+
+ /* Otherwise, reset the local format to the minimum. */
+ switch (mode & S_IFMT) {
+ case S_IFLNK:
+ xrep_dinode_zap_symlink(ri, dip);
+ break;
+ case S_IFDIR:
+ xrep_dinode_zap_dir(ri, dip);
+ break;
+ }
+}
+
+/*
+ * Check the attr fork for things that will fail the ifork verifiers or the
+ * ifork formatters.
+ */
+STATIC bool
+xrep_dinode_check_afork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip)
+{
+ struct xfs_attr_sf_hdr *afork_ptr;
+ size_t attr_size;
+ unsigned int afork_size;
+
+ if (XFS_DFORK_BOFF(dip) == 0)
+ return dip->di_aformat != XFS_DINODE_FMT_EXTENTS ||
+ xfs_dfork_attr_extents(dip) != 0;
+
+ afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
+ afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
+
+ switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
+ case XFS_DINODE_FMT_LOCAL:
+ /* Fork has to be large enough to extract the xattr size. */
+ if (afork_size < sizeof(struct xfs_attr_sf_hdr))
+ return true;
+
+ /* xattr structure cannot be larger than the fork */
+ attr_size = be16_to_cpu(afork_ptr->totsize);
+ if (attr_size > afork_size)
+ return true;
+
+ /* xattr structure must pass verification. */
+ return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (xrep_dinode_bad_extents_fork(sc, dip, afork_size,
+ XFS_ATTR_FORK))
+ return true;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size,
+ XFS_ATTR_FORK))
+ return true;
+ break;
+ default:
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Reset the attr fork to empty. Since the attr fork could have contained
+ * ACLs, make the file readable only by root.
+ */
+STATIC void
+xrep_dinode_zap_afork(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+
+ trace_xrep_dinode_zap_afork(sc, dip);
+
+ ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;
+
+ dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
+ xrep_dinode_set_attr_nextents(dip, 0);
+ ri->attr_blocks = 0;
+
+ /*
+ * If the data fork is in btree format, removing the attr fork entirely
+ * might cause verifier failures if the next level down in the bmbt
+ * could now fit in the data fork area.
+ */
+ if (dip->di_format != XFS_DINODE_FMT_BTREE)
+ dip->di_forkoff = 0;
+ dip->di_mode = cpu_to_be16(mode & ~0777);
+ dip->di_uid = 0;
+ dip->di_gid = 0;
+}
+
+/* Make sure the fork offset is a sensible value. */
+STATIC void
+xrep_dinode_ensure_forkoff(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ struct xfs_bmdr_block *bmdr;
+ struct xfs_scrub *sc = ri->sc;
+ xfs_extnum_t attr_extents, data_extents;
+ size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
+ unsigned int lit_sz = XFS_LITINO(sc->mp);
+ unsigned int afork_min, dfork_min;
+
+ trace_xrep_dinode_ensure_forkoff(sc, dip);
+
+ /*
+ * Before calling this function, xrep_dinode_core ensured that both
+ * forks actually fit inside their respective literal areas. If this
+ * was not the case, the fork was reset to FMT_EXTENTS with zero
+ * records. If the rmapbt scan found attr or data fork blocks, this
+ * will be noted in the dinode_stats, and we must leave enough room
+ * for the bmap repair code to reconstruct the mapping structure.
+ *
+ * First, compute the minimum space required for the attr fork.
+ */
+ switch (dip->di_aformat) {
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * If we still have a shortform xattr structure at all, that
+ * means the attr fork area was exactly large enough to fit
+ * the sf structure.
+ */
+ afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ attr_extents = xfs_dfork_attr_extents(dip);
+ if (attr_extents) {
+ /*
+ * We must maintain sufficient space to hold the entire
+ * extent map array in the data fork. Note that we
+ * previously zapped the fork if it had no chance of
+ * fitting in the inode.
+ */
+ afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
+ } else if (ri->attr_extents > 0) {
+ /*
+ * The attr fork thinks it has zero extents, but we
+ * found some xattr extents. We need to leave enough
+ * empty space here so that the incore attr fork will
+ * get created (and hence trigger the attr fork bmap
+ * repairer).
+ */
+ afork_min = bmdr_minsz;
+ } else {
+ /* No extents on disk or found in rmapbt. */
+ afork_min = 0;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /* Must have space for btree header and key/pointers. */
+ bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
+ afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+ break;
+ default:
+ /* We should never see any other formats. */
+ afork_min = 0;
+ break;
+ }
+
+ /* Compute the minimum space required for the data fork. */
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ dfork_min = sizeof(__be32);
+ break;
+ case XFS_DINODE_FMT_UUID:
+ dfork_min = sizeof(uuid_t);
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * If we still have a shortform data fork at all, that means
+ * the data fork area was large enough to fit whatever was in
+ * there.
+ */
+ dfork_min = be64_to_cpu(dip->di_size);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ data_extents = xfs_dfork_data_extents(dip);
+ if (data_extents) {
+ /*
+ * We must maintain sufficient space to hold the entire
+ * extent map array in the data fork. Note that we
+ * previously zapped the fork if it had no chance of
+ * fitting in the inode.
+ */
+ dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
+ } else if (ri->data_extents > 0 || ri->rt_extents > 0) {
+ /*
+ * The data fork thinks it has zero extents, but we
+ * found some data extents. We need to leave enough
+ * empty space here so that the data fork bmap repair
+ * will recover the mappings.
+ */
+ dfork_min = bmdr_minsz;
+ } else {
+ /* No extents on disk or found in rmapbt. */
+ dfork_min = 0;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /* Must have space for btree header and key/pointers. */
+ bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+ break;
+ default:
+ dfork_min = 0;
+ break;
+ }
+
+ /*
+ * Round all values up to the nearest 8 bytes, because that is the
+ * precision of di_forkoff.
+ */
+ afork_min = roundup(afork_min, 8);
+ dfork_min = roundup(dfork_min, 8);
+ bmdr_minsz = roundup(bmdr_minsz, 8);
+
+ ASSERT(dfork_min <= lit_sz);
+ ASSERT(afork_min <= lit_sz);
+
+ /*
+ * If the data fork was zapped and we don't have enough space for the
+ * recovery fork, move the attr fork up.
+ */
+ if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
+ xfs_dfork_data_extents(dip) == 0 &&
+ (ri->data_extents > 0 || ri->rt_extents > 0) &&
+ bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
+ if (bmdr_minsz + afork_min > lit_sz) {
+ /*
+ * The attr for and the stub fork we need to recover
+ * the data fork won't both fit. Zap the attr fork.
+ */
+ xrep_dinode_zap_afork(ri, dip, mode);
+ afork_min = bmdr_minsz;
+ } else {
+ void *before, *after;
+
+ /* Otherwise, just slide the attr fork up. */
+ before = XFS_DFORK_APTR(dip);
+ dip->di_forkoff = bmdr_minsz >> 3;
+ after = XFS_DFORK_APTR(dip);
+ memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
+ }
+ }
+
+ /*
+ * If the attr fork was zapped and we don't have enough space for the
+ * recovery fork, move the attr fork down.
+ */
+ if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ xfs_dfork_attr_extents(dip) == 0 &&
+ ri->attr_extents > 0 &&
+ bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
+ if (dip->di_format == XFS_DINODE_FMT_BTREE) {
+ /*
+ * If the data fork is in btree format then we can't
+ * adjust forkoff because that runs the risk of
+ * violating the extents/btree format transition rules.
+ */
+ } else if (bmdr_minsz + dfork_min > lit_sz) {
+ /*
+ * If we can't move the attr fork, too bad, we lose the
+ * attr fork and leak its blocks.
+ */
+ xrep_dinode_zap_afork(ri, dip, mode);
+ } else {
+ /*
+ * Otherwise, just slide the attr fork down. The attr
+ * fork is empty, so we don't have any old contents to
+ * move here.
+ */
+ dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;
+ }
+ }
+}
+
+/*
+ * Zap the data/attr forks if we spot anything that isn't going to pass the
+ * ifork verifiers or the ifork formatters, because we need to get the inode
+ * into good enough shape that the higher level repair functions can run.
+ */
+STATIC void
+xrep_dinode_zap_forks(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ xfs_extnum_t data_extents;
+ xfs_extnum_t attr_extents;
+ xfs_filblks_t nblocks;
+ uint16_t mode;
+ bool zap_datafork = false;
+ bool zap_attrfork = ri->zap_acls;
+
+ trace_xrep_dinode_zap_forks(sc, dip);
+
+ mode = be16_to_cpu(dip->di_mode);
+
+ data_extents = xfs_dfork_data_extents(dip);
+ attr_extents = xfs_dfork_attr_extents(dip);
+ nblocks = be64_to_cpu(dip->di_nblocks);
+
+ /* Inode counters don't make sense? */
+ if (data_extents > nblocks)
+ zap_datafork = true;
+ if (attr_extents > nblocks)
+ zap_attrfork = true;
+ if (data_extents + attr_extents > nblocks)
+ zap_datafork = zap_attrfork = true;
+
+ if (!zap_datafork)
+ zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
+ if (!zap_attrfork)
+ zap_attrfork = xrep_dinode_check_afork(sc, dip);
+
+ /* Zap whatever's bad. */
+ if (zap_attrfork)
+ xrep_dinode_zap_afork(ri, dip, mode);
+ if (zap_datafork)
+ xrep_dinode_zap_dfork(ri, dip, mode);
+ xrep_dinode_ensure_forkoff(ri, dip, mode);
+
+ /*
+ * Zero di_nblocks if we don't have any extents at all to satisfy the
+ * buffer verifier.
+ */
+ data_extents = xfs_dfork_data_extents(dip);
+ attr_extents = xfs_dfork_attr_extents(dip);
+ if (data_extents + attr_extents == 0)
+ dip->di_nblocks = 0;
+}
+
+/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */
+STATIC int
+xrep_dinode_core(
+ struct xrep_inode *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_buf *bp;
+ struct xfs_dinode *dip;
+ xfs_ino_t ino = sc->sm->sm_ino;
+ int error;
+ int iget_error;
+
+ /* Figure out what this inode had mapped in both forks. */
+ error = xrep_dinode_count_rmaps(ri);
+ if (error)
+ return error;
+
+ /* Read the inode cluster buffer. */
+ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
+ ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
+ NULL);
+ if (error)
+ return error;
+
+ /* Make sure we can pass the inode buffer verifier. */
+ xrep_dinode_buf(sc, bp);
+ bp->b_ops = &xfs_inode_buf_ops;
+
+ /* Fix everything the verifier will complain about. */
+ dip = xfs_buf_offset(bp, ri->imap.im_boffset);
+ xrep_dinode_header(sc, dip);
+ iget_error = xrep_dinode_mode(ri, dip);
+ if (iget_error)
+ goto write;
+ xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
+ xrep_dinode_size(ri, dip);
+ xrep_dinode_extsize_hints(sc, dip);
+ xrep_dinode_zap_forks(ri, dip);
+
+write:
+ /* Write out the inode. */
+ trace_xrep_dinode_fixed(sc, dip);
+ xfs_dinode_calc_crc(sc->mp, dip);
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
+ xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
+ ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
+
+ /*
+ * In theory, we've fixed the ondisk inode record enough that we should
+ * be able to load the inode into the cache. Try to iget that inode
+ * now while we hold the AGI and the inode cluster buffer and take the
+ * IOLOCK so that we can continue with repairs without anyone else
+ * accessing the inode. If iget fails, we still need to commit the
+ * changes.
+ */
+ if (!iget_error)
+ iget_error = xchk_iget(sc, ino, &sc->ip);
+ if (!iget_error)
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+ /*
+ * Commit the inode cluster buffer updates and drop the AGI buffer that
+ * we've been holding since scrub setup. From here on out, repairs
+ * deal only with the cached inode.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ if (iget_error)
+ return iget_error;
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ error = xrep_ino_dqattach(sc);
+ if (error)
+ return error;
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ if (ri->ino_sick_mask)
+ xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
+ return 0;
+}
+
+/* Fix everything xfs_dinode_verify cares about. */
+STATIC int
+xrep_dinode_problems(
+ struct xrep_inode *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ error = xrep_dinode_core(ri);
+ if (error)
+ return error;
+
+ /* We had to fix a totally busted inode, schedule quotacheck. */
+ if (XFS_IS_UQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+ if (XFS_IS_GQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+ if (XFS_IS_PQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+
+ return 0;
+}
+
+/*
+ * Fix problems that the verifiers don't care about. In general these are
+ * errors that don't cause problems elsewhere in the kernel that we can easily
+ * detect, so we don't check them all that rigorously.
+ */
+
+/* Make sure block and extent counts are ok. */
+STATIC int
+xrep_inode_blockcounts(
+ struct xfs_scrub *sc)
+{
+ struct xfs_ifork *ifp;
+ xfs_filblks_t count;
+ xfs_filblks_t acount;
+ xfs_extnum_t nextents;
+ int error;
+
+ trace_xrep_inode_blockcounts(sc);
+
+ /* Set data fork counters from the data fork mappings. */
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
+ &nextents, &count);
+ if (error)
+ return error;
+ if (xfs_is_reflink_inode(sc->ip)) {
+ /*
+ * data fork blockcount can exceed physical storage if a user
+ * reflinks the same block over and over again.
+ */
+ ;
+ } else if (XFS_IS_REALTIME_INODE(sc->ip)) {
+ if (count >= sc->mp->m_sb.sb_rblocks)
+ return -EFSCORRUPTED;
+ } else {
+ if (count >= sc->mp->m_sb.sb_dblocks)
+ return -EFSCORRUPTED;
+ }
+ error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
+ if (error)
+ return error;
+ sc->ip->i_df.if_nextents = nextents;
+
+ /* Set attr fork counters from the attr fork mappings. */
+ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+ if (ifp) {
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
+ &nextents, &acount);
+ if (error)
+ return error;
+ if (count >= sc->mp->m_sb.sb_dblocks)
+ return -EFSCORRUPTED;
+ error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
+ nextents);
+ if (error)
+ return error;
+ ifp->if_nextents = nextents;
+ } else {
+ acount = 0;
+ }
+
+ sc->ip->i_nblocks = count + acount;
+ return 0;
+}
+
+/* Check for invalid uid/gid/prid. */
+STATIC void
+xrep_inode_ids(
+ struct xfs_scrub *sc)
+{
+ bool dirty = false;
+
+ trace_xrep_inode_ids(sc);
+
+ if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
+ i_uid_write(VFS_I(sc->ip), 0);
+ dirty = true;
+ if (XFS_IS_UQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+ }
+
+ if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
+ i_gid_write(VFS_I(sc->ip), 0);
+ dirty = true;
+ if (XFS_IS_GQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+ }
+
+ if (sc->ip->i_projid == -1U) {
+ sc->ip->i_projid = 0;
+ dirty = true;
+ if (XFS_IS_PQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+ }
+
+ /* strip setuid/setgid if we touched any of the ids */
+ if (dirty)
+ VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
+}
+
+static inline void
+xrep_clamp_timestamp(
+ struct xfs_inode *ip,
+ struct timespec64 *ts)
+{
+ ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
+ *ts = timestamp_truncate(*ts, VFS_I(ip));
+}
+
+/* Nanosecond counters can't have more than 1 billion. */
+STATIC void
+xrep_inode_timestamps(
+ struct xfs_inode *ip)
+{
+ struct timespec64 tstamp;
+ struct inode *inode = VFS_I(ip);
+
+ tstamp = inode_get_atime(inode);
+ xrep_clamp_timestamp(ip, &tstamp);
+ inode_set_atime_to_ts(inode, tstamp);
+
+ tstamp = inode_get_mtime(inode);
+ xrep_clamp_timestamp(ip, &tstamp);
+ inode_set_mtime_to_ts(inode, tstamp);
+
+ tstamp = inode_get_ctime(inode);
+ xrep_clamp_timestamp(ip, &tstamp);
+ inode_set_ctime_to_ts(inode, tstamp);
+
+ xrep_clamp_timestamp(ip, &ip->i_crtime);
+}
+
+/* Fix inode flags that don't make sense together. */
+STATIC void
+xrep_inode_flags(
+ struct xfs_scrub *sc)
+{
+ uint16_t mode;
+
+ trace_xrep_inode_flags(sc);
+
+ mode = VFS_I(sc->ip)->i_mode;
+
+ /* Clear junk flags */
+ if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
+ sc->ip->i_diflags &= ~XFS_DIFLAG_ANY;
+
+ /* NEWRTBM only applies to realtime bitmaps */
+ if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
+ sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ else
+ sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
+
+ /* These only make sense for directories. */
+ if (!S_ISDIR(mode))
+ sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
+ XFS_DIFLAG_EXTSZINHERIT |
+ XFS_DIFLAG_PROJINHERIT |
+ XFS_DIFLAG_NOSYMLINKS);
+
+ /* These only make sense for files. */
+ if (!S_ISREG(mode))
+ sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
+ XFS_DIFLAG_EXTSIZE);
+
+ /* These only make sense for non-rt files. */
+ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
+ sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
+
+ /* Immutable and append only? Drop the append. */
+ if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
+ (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
+ sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
+
+ /* Clear junk flags. */
+ if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY;
+
+ /* No reflink flag unless we support it and it's a file. */
+ if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+
+ /* DAX only applies to files and dirs. */
+ if (!(S_ISREG(mode) || S_ISDIR(mode)))
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+
+ /* No reflink files on the realtime device. */
+ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+}
+
+/*
+ * Fix size problems with block/node format directories. If we fail to find
+ * the extent list, just bail out and let the bmapbtd repair functions clean
+ * up that mess.
+ */
+STATIC void
+xrep_inode_blockdir_size(
+ struct xfs_scrub *sc)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t off;
+ int error;
+
+ trace_xrep_inode_blockdir_size(sc);
+
+ error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
+ if (error)
+ return;
+
+ /* Find the last block before 32G; this is the dir size. */
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
+ if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
+ /* zero-extents directory? */
+ return;
+ }
+
+ off = got.br_startoff + got.br_blockcount;
+ sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
+ XFS_FSB_TO_B(sc->mp, off));
+}
+
+/* Fix size problems with short format directories. */
+STATIC void
+xrep_inode_sfdir_size(
+ struct xfs_scrub *sc)
+{
+ struct xfs_ifork *ifp;
+
+ trace_xrep_inode_sfdir_size(sc);
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ sc->ip->i_disk_size = ifp->if_bytes;
+}
+
+/*
+ * Fix any irregularities in a directory inode's size now that we can iterate
+ * extent maps and access other regular inode data.
+ */
+STATIC void
+xrep_inode_dir_size(
+ struct xfs_scrub *sc)
+{
+ trace_xrep_inode_dir_size(sc);
+
+ switch (sc->ip->i_df.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ xrep_inode_blockdir_size(sc);
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ xrep_inode_sfdir_size(sc);
+ break;
+ }
+}
+
+/* Fix extent size hint problems. */
+STATIC void
+xrep_inode_extsize(
+ struct xfs_scrub *sc)
+{
+ /* Fix misaligned extent size hints on a directory. */
+ if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) {
+ sc->ip->i_extsize = 0;
+ sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
+ }
+}
+
+/* Fix any irregularities in an inode that the verifiers don't catch. */
+STATIC int
+xrep_inode_problems(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ error = xrep_inode_blockcounts(sc);
+ if (error)
+ return error;
+ xrep_inode_timestamps(sc->ip);
+ xrep_inode_flags(sc);
+ xrep_inode_ids(sc);
+ /*
+ * We can now do a better job fixing the size of a directory now that
+ * we can scan the data fork extents than we could in xrep_dinode_size.
+ */
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+ xrep_inode_dir_size(sc);
+ xrep_inode_extsize(sc);
+
+ trace_xrep_inode_fixed(sc);
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ return xrep_roll_trans(sc);
+}
+
+/* Repair an inode's fields. */
+int
+xrep_inode(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ /*
+ * No inode? That means we failed the _iget verifiers. Repair all
+ * the things that the inode verifiers care about, then retry _iget.
+ */
+ if (!sc->ip) {
+ struct xrep_inode *ri = sc->buf;
+
+ ASSERT(ri != NULL);
+
+ error = xrep_dinode_problems(ri);
+ if (error == -EBUSY) {
+ /*
+ * Directory scan to recover inode mode encountered a
+ * busy inode, so we did not continue repairing things.
+ */
+ return 0;
+ }
+ if (error)
+ return error;
+
+ /* By this point we had better have a working incore inode. */
+ if (!sc->ip)
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* If we found corruption of any kind, try to fix it. */
+ if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
+ error = xrep_inode_problems(sc);
+ if (error)
+ return error;
+ }
+
+ /* See if we can clear the reflink flag. */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+ if (error)
+ return error;
+ }
+
+ return xrep_defer_finish(sc);
+}
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
new file mode 100644
index 000000000000..ec3478bc505e
--- /dev/null
+++ b/fs/xfs/scrub/iscan.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_bit.h"
+#include "xfs_icache.h"
+#include "scrub/scrub.h"
+#include "scrub/iscan.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Live File Scan
+ * ==============
+ *
+ * Live file scans walk every inode in a live filesystem. This is more or
+ * less like a regular iwalk, except that when we're advancing the scan cursor,
+ * we must ensure that inodes cannot be added or deleted anywhere between the
+ * old cursor value and the new cursor value. If we're advancing the cursor
+ * by one inode, the caller must hold that inode; if we're finding the next
+ * inode to scan, we must grab the AGI and hold it until we've updated the
+ * scan cursor.
+ *
+ * Callers are expected to use this code to scan all files in the filesystem to
+ * construct a new metadata index of some kind. The scan races against other
+ * live updates, which means there must be a provision to update the new index
+ * when updates are made to inodes that already been scanned. The iscan lock
+ * can be used in live update hook code to stop the scan and protect this data
+ * structure.
+ *
+ * To keep the new index up to date with other metadata updates being made to
+ * the live filesystem, it is assumed that the caller will add hooks as needed
+ * to be notified when a metadata update occurs. The inode scanner must tell
+ * the hook code when an inode has been visited with xchk_iscan_mark_visit.
+ * Hook functions can use xchk_iscan_want_live_update to decide if the
+ * scanner's observations must be updated.
+ */
+
+/*
+ * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so
+ * that the scan ignores that inode.
+ */
+STATIC void
+xchk_iscan_mask_skipino(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_inobt_rec_incore *rec,
+ xfs_agino_t lastrecino)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino);
+ xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino);
+
+ if (pag->pag_agno != skip_agno)
+ return;
+ if (skip_agino < rec->ir_startino)
+ return;
+ if (skip_agino > lastrecino)
+ return;
+
+ rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1);
+}
+
+/*
+ * Set *cursor to the next allocated inode after whatever it's set to now.
+ * If there are no more inodes in this AG, cursor is set to NULLAGINO.
+ */
+STATIC int
+xchk_iscan_find_next(
+ struct xchk_iscan *iscan,
+ struct xfs_buf *agi_bp,
+ struct xfs_perag *pag,
+ xfs_inofree_t *allocmaskp,
+ xfs_agino_t *cursor,
+ uint8_t *nr_inodesp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ xfs_agnumber_t agno = pag->pag_agno;
+ xfs_agino_t lastino = NULLAGINO;
+ xfs_agino_t first, last;
+ xfs_agino_t agino = *cursor;
+ int has_rec;
+ int error;
+
+ /* If the cursor is beyond the end of this AG, move to the next one. */
+ xfs_agino_range(mp, agno, &first, &last);
+ if (agino > last) {
+ *cursor = NULLAGINO;
+ return 0;
+ }
+
+ /*
+ * Look up the inode chunk for the current cursor position. If there
+ * is no chunk here, we want the next one.
+ */
+ cur = xfs_inobt_init_cursor(pag, tp, agi_bp);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec);
+ if (!error && !has_rec)
+ error = xfs_btree_increment(cur, 0, &has_rec);
+ for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) {
+ xfs_inofree_t allocmask;
+
+ /*
+ * If we've run out of inobt records in this AG, move the
+ * cursor on to the next AG and exit. The caller can try
+ * again with the next AG.
+ */
+ if (!has_rec) {
+ *cursor = NULLAGINO;
+ break;
+ }
+
+ error = xfs_inobt_get_rec(cur, &rec, &has_rec);
+ if (error)
+ break;
+ if (!has_rec) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ /* Make sure that we always move forward. */
+ if (lastino != NULLAGINO &&
+ XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+ lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+ /*
+ * If this record only covers inodes that come before the
+ * cursor, advance to the next record.
+ */
+ if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ continue;
+
+ if (iscan->skip_ino)
+ xchk_iscan_mask_skipino(iscan, pag, &rec, lastino);
+
+ /*
+ * If the incoming lookup put us in the middle of an inobt
+ * record, mark it and the previous inodes "free" so that the
+ * search for allocated inodes will start at the cursor.
+ * We don't care about ir_freecount here.
+ */
+ if (agino >= rec.ir_startino)
+ rec.ir_free |= xfs_inobt_maskn(0,
+ agino + 1 - rec.ir_startino);
+
+ /*
+ * If there are allocated inodes in this chunk, find them
+ * and update the scan cursor.
+ */
+ allocmask = ~rec.ir_free;
+ if (hweight64(allocmask) > 0) {
+ int next = xfs_lowbit64(allocmask);
+
+ ASSERT(next >= 0);
+ *cursor = rec.ir_startino + next;
+ *allocmaskp = allocmask >> next;
+ *nr_inodesp = XFS_INODES_PER_CHUNK - next;
+ break;
+ }
+ }
+
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Advance both the scan and the visited cursors.
+ *
+ * The inumber address space for a given filesystem is sparse, which means that
+ * the scan cursor can jump a long ways in a single iter() call. There are no
+ * inodes in these sparse areas, so we must move the visited cursor forward at
+ * the same time so that the scan user can receive live updates for inodes that
+ * may get created once we release the AGI buffer.
+ */
+static inline void
+xchk_iscan_move_cursor(
+ struct xchk_iscan *iscan,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t cursor, visited;
+
+ BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO);
+
+ /*
+ * Special-case ino == 0 here so that we never set visited_ino to
+ * NULLFSINO when wrapping around EOFS, for that will let through all
+ * live updates.
+ */
+ cursor = XFS_AGINO_TO_INO(mp, agno, agino);
+ if (cursor == 0)
+ visited = XFS_MAXINUMBER;
+ else
+ visited = cursor - 1;
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = cursor;
+ iscan->__visited_ino = visited;
+ trace_xchk_iscan_move_cursor(iscan);
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Prepare to return agno/agino to the iscan caller by moving the lastino
+ * cursor to the previous inode. Do this while we still hold the AGI so that
+ * no other threads can create or delete inodes in this AG.
+ */
+static inline void
+xchk_iscan_finish(
+ struct xchk_iscan *iscan)
+{
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = NULLFSINO;
+
+ /* All live updates will be applied from now on */
+ iscan->__visited_ino = NULLFSINO;
+
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance ino to the next inode that the inobt thinks is allocated, being
+ * careful to jump to the next AG if we've reached the right end of this AG's
+ * inode btree. Advancing ino effectively means that we've pushed the inode
+ * scan forward, so set the iscan cursor to (ino - 1) so that our live update
+ * predicates will track inode allocations in that part of the inode number
+ * key space once we release the AGI buffer.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, or the usual negative errno.
+ */
+STATIC int
+xchk_iscan_advance(
+ struct xchk_iscan *iscan,
+ struct xfs_perag **pagp,
+ struct xfs_buf **agi_bpp,
+ xfs_inofree_t *allocmaskp,
+ uint8_t *nr_inodesp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agi_bp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ xfs_agino_t agino;
+ int ret;
+
+ ASSERT(iscan->cursor_ino >= iscan->__visited_ino);
+
+ do {
+ if (xchk_iscan_aborted(iscan))
+ return -ECANCELED;
+
+ agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino);
+ pag = xfs_perag_get(mp, agno);
+ if (!pag)
+ return -ECANCELED;
+
+ ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+ if (ret)
+ goto out_pag;
+
+ agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
+ ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
+ &agino, nr_inodesp);
+ if (ret)
+ goto out_buf;
+
+ if (agino != NULLAGINO) {
+ /*
+ * Found the next inode in this AG, so return it along
+ * with the AGI buffer and the perag structure to
+ * ensure it cannot go away.
+ */
+ xchk_iscan_move_cursor(iscan, agno, agino);
+ *agi_bpp = agi_bp;
+ *pagp = pag;
+ return 1;
+ }
+
+ /*
+ * Did not find any more inodes in this AG, move on to the next
+ * AG.
+ */
+ agno = (agno + 1) % mp->m_sb.sb_agcount;
+ xchk_iscan_move_cursor(iscan, agno, 0);
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ trace_xchk_iscan_advance_ag(iscan);
+ } while (iscan->cursor_ino != iscan->scan_start_ino);
+
+ xchk_iscan_finish(iscan);
+ return 0;
+
+out_buf:
+ xfs_trans_brelse(sc->tp, agi_bp);
+out_pag:
+ xfs_perag_put(pag);
+ return ret;
+}
+
+/*
+ * Grabbing the inode failed, so we need to back up the scan and ask the caller
+ * to try to _advance the scan again. Returns -EBUSY if we've run out of retry
+ * opportunities, -ECANCELED if the process has a fatal signal pending, or
+ * -EAGAIN if we should try again.
+ */
+STATIC int
+xchk_iscan_iget_retry(
+ struct xchk_iscan *iscan,
+ bool wait)
+{
+ ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1);
+
+ if (!iscan->iget_timeout ||
+ time_is_before_jiffies(iscan->__iget_deadline))
+ return -EBUSY;
+
+ if (wait) {
+ unsigned long relax;
+
+ /*
+ * Sleep for a period of time to let the rest of the system
+ * catch up. If we return early, someone sent a kill signal to
+ * the calling process.
+ */
+ relax = msecs_to_jiffies(iscan->iget_retry_delay);
+ trace_xchk_iscan_iget_retry_wait(iscan);
+
+ if (schedule_timeout_killable(relax) ||
+ xchk_iscan_aborted(iscan))
+ return -ECANCELED;
+ }
+
+ iscan->cursor_ino--;
+ return -EAGAIN;
+}
+
+/*
+ * Grab an inode as part of an inode scan. While scanning this inode, the
+ * caller must ensure that no other threads can modify the inode until a call
+ * to xchk_iscan_visit succeeds.
+ *
+ * Returns the number of incore inodes grabbed; -EAGAIN if the caller should
+ * call again xchk_iscan_advance; -EBUSY if we couldn't grab an inode;
+ * -ECANCELED if there's a fatal signal pending; or some other negative errno.
+ */
+STATIC int
+xchk_iscan_iget(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_buf *agi_bp,
+ xfs_inofree_t allocmask,
+ uint8_t nr_inodes)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t ino = iscan->cursor_ino;
+ unsigned int idx = 0;
+ unsigned int i;
+ int error;
+
+ ASSERT(iscan->__inodes[0] == NULL);
+
+ /* Fill the first slot in the inode array. */
+ error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ &iscan->__inodes[idx]);
+
+ trace_xchk_iscan_iget(iscan, error);
+
+ if (error == -ENOENT || error == -EAGAIN) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ /*
+ * It's possible that this inode has lost all of its links but
+ * hasn't yet been inactivated. If we don't have a transaction
+ * or it's not writable, flush the inodegc workers and wait.
+ */
+ xfs_inodegc_flush(mp);
+ return xchk_iscan_iget_retry(iscan, true);
+ }
+
+ if (error == -EINVAL) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ /*
+ * We thought the inode was allocated, but the inode btree
+ * lookup failed, which means that it was freed since the last
+ * time we advanced the cursor. Back up and try again. This
+ * should never happen since still hold the AGI buffer from the
+ * inobt check, but we need to be careful about infinite loops.
+ */
+ return xchk_iscan_iget_retry(iscan, false);
+ }
+
+ if (error) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ return error;
+ }
+ idx++;
+ ino++;
+ allocmask >>= 1;
+
+ /*
+ * Now that we've filled the first slot in __inodes, try to fill the
+ * rest of the batch with consecutively ordered inodes. to reduce the
+ * number of _iter calls. Make a bitmap of unallocated inodes from the
+ * zeroes in the inuse bitmap; these inodes will not be scanned, but
+ * the _want_live_update predicate will pass through all live updates.
+ *
+ * If we can't iget an allocated inode, stop and return what we have.
+ */
+ mutex_lock(&iscan->lock);
+ iscan->__batch_ino = ino - 1;
+ iscan->__skipped_inomask = 0;
+ mutex_unlock(&iscan->lock);
+
+ for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) {
+ if (!(allocmask & 1)) {
+ ASSERT(!(iscan->__skipped_inomask & (1ULL << i)));
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = ino;
+ iscan->__skipped_inomask |= (1ULL << i);
+ mutex_unlock(&iscan->lock);
+ continue;
+ }
+
+ ASSERT(iscan->__inodes[idx] == NULL);
+
+ error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ &iscan->__inodes[idx]);
+ if (error)
+ break;
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = ino;
+ mutex_unlock(&iscan->lock);
+ idx++;
+ }
+
+ trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx);
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ return idx;
+}
+
+/*
+ * Advance the visit cursor to reflect skipped inodes beyond whatever we
+ * scanned.
+ */
+STATIC void
+xchk_iscan_finish_batch(
+ struct xchk_iscan *iscan)
+{
+ xfs_ino_t highest_skipped;
+
+ mutex_lock(&iscan->lock);
+
+ if (iscan->__batch_ino != NULLFSINO) {
+ highest_skipped = iscan->__batch_ino +
+ xfs_highbit64(iscan->__skipped_inomask);
+ iscan->__visited_ino = max(iscan->__visited_ino,
+ highest_skipped);
+
+ trace_xchk_iscan_skip(iscan);
+ }
+
+ iscan->__batch_ino = NULLFSINO;
+ iscan->__skipped_inomask = 0;
+
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return up to
+ * 64 consecutive allocated inodes starting with the cursor position.
+ */
+STATIC int
+xchk_iscan_iter_batch(
+ struct xchk_iscan *iscan)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ int ret;
+
+ xchk_iscan_finish_batch(iscan);
+
+ if (iscan->iget_timeout)
+ iscan->__iget_deadline = jiffies +
+ msecs_to_jiffies(iscan->iget_timeout);
+
+ do {
+ struct xfs_buf *agi_bp = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_inofree_t allocmask = 0;
+ uint8_t nr_inodes = 0;
+
+ ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask,
+ &nr_inodes);
+ if (ret != 1)
+ return ret;
+
+ if (xchk_iscan_aborted(iscan)) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ ret = -ECANCELED;
+ break;
+ }
+
+ ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return the
+ * incore inode structure associated with it.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be
+ * grabbed, or the usual negative errno.
+ *
+ * If the function returns -EBUSY and the caller can handle skipping an inode,
+ * it may call this function again to continue the scan with the next allocated
+ * inode.
+ */
+int
+xchk_iscan_iter(
+ struct xchk_iscan *iscan,
+ struct xfs_inode **ipp)
+{
+ unsigned int i;
+ int error;
+
+ /* Find a cached inode, or go get another batch. */
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (iscan->__inodes[i])
+ goto foundit;
+ }
+
+ error = xchk_iscan_iter_batch(iscan);
+ if (error <= 0)
+ return error;
+
+ ASSERT(iscan->__inodes[0] != NULL);
+ i = 0;
+
+foundit:
+ /* Give the caller our reference. */
+ *ipp = iscan->__inodes[i];
+ iscan->__inodes[i] = NULL;
+ return 1;
+}
+
+/* Clean up an xfs_iscan_iter call by dropping any inodes that we still hold. */
+void
+xchk_iscan_iter_finish(
+ struct xchk_iscan *iscan)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ unsigned int i;
+
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (iscan->__inodes[i]) {
+ xchk_irele(sc, iscan->__inodes[i]);
+ iscan->__inodes[i] = NULL;
+ }
+ }
+}
+
+/* Mark this inode scan finished and release resources. */
+void
+xchk_iscan_teardown(
+ struct xchk_iscan *iscan)
+{
+ xchk_iscan_iter_finish(iscan);
+ xchk_iscan_finish(iscan);
+ mutex_destroy(&iscan->lock);
+}
+
+/* Pick an AG from which to start a scan. */
+static inline xfs_ino_t
+xchk_iscan_rotor(
+ struct xfs_mount *mp)
+{
+ static atomic_t agi_rotor;
+ unsigned int r = atomic_inc_return(&agi_rotor) - 1;
+
+ /*
+ * Rotoring *backwards* through the AGs, so we add one here before
+ * subtracting from the agcount to arrive at an AG number.
+ */
+ r = (r % mp->m_sb.sb_agcount) + 1;
+
+ return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0);
+}
+
+/*
+ * Set ourselves up to start an inode scan. If the @iget_timeout and
+ * @iget_retry_delay parameters are set, the scan will try to iget each inode
+ * for @iget_timeout milliseconds. If an iget call indicates that the inode is
+ * waiting to be inactivated, the CPU will relax for @iget_retry_delay
+ * milliseconds after pushing the inactivation workers.
+ */
+void
+xchk_iscan_start(
+ struct xfs_scrub *sc,
+ unsigned int iget_timeout,
+ unsigned int iget_retry_delay,
+ struct xchk_iscan *iscan)
+{
+ xfs_ino_t start_ino;
+
+ start_ino = xchk_iscan_rotor(sc->mp);
+
+ iscan->__batch_ino = NULLFSINO;
+ iscan->__skipped_inomask = 0;
+
+ iscan->sc = sc;
+ clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+ iscan->iget_timeout = iget_timeout;
+ iscan->iget_retry_delay = iget_retry_delay;
+ iscan->__visited_ino = start_ino;
+ iscan->cursor_ino = start_ino;
+ iscan->scan_start_ino = start_ino;
+ mutex_init(&iscan->lock);
+ memset(iscan->__inodes, 0, sizeof(iscan->__inodes));
+
+ trace_xchk_iscan_start(iscan, start_ino);
+}
+
+/*
+ * Mark this inode as having been visited. Callers must hold a sufficiently
+ * exclusive lock on the inode to prevent concurrent modifications.
+ */
+void
+xchk_iscan_mark_visited(
+ struct xchk_iscan *iscan,
+ struct xfs_inode *ip)
+{
+ mutex_lock(&iscan->lock);
+ iscan->__visited_ino = ip->i_ino;
+ trace_xchk_iscan_visit(iscan);
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Did we skip this inode because it wasn't allocated when we loaded the batch?
+ * If so, it is newly allocated and will not be scanned. All live updates to
+ * this inode must be passed to the caller to maintain scan correctness.
+ */
+static inline bool
+xchk_iscan_skipped(
+ const struct xchk_iscan *iscan,
+ xfs_ino_t ino)
+{
+ if (iscan->__batch_ino == NULLFSINO)
+ return false;
+ if (ino < iscan->__batch_ino)
+ return false;
+ if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK)
+ return false;
+
+ return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino));
+}
+
+/*
+ * Do we need a live update for this inode? This is true if the scanner thread
+ * has visited this inode and the scan hasn't been aborted due to errors.
+ * Callers must hold a sufficiently exclusive lock on the inode to prevent
+ * scanners from reading any inode metadata.
+ */
+bool
+xchk_iscan_want_live_update(
+ struct xchk_iscan *iscan,
+ xfs_ino_t ino)
+{
+ bool ret = false;
+
+ if (xchk_iscan_aborted(iscan))
+ return false;
+
+ mutex_lock(&iscan->lock);
+
+ trace_xchk_iscan_want_live_update(iscan, ino);
+
+ /* Scan is finished, caller should receive all updates. */
+ if (iscan->__visited_ino == NULLFSINO) {
+ ret = true;
+ goto unlock;
+ }
+
+ /*
+ * No inodes have been visited yet, so the visited cursor points at the
+ * start of the scan range. The caller should not receive any updates.
+ */
+ if (iscan->scan_start_ino == iscan->__visited_ino) {
+ ret = false;
+ goto unlock;
+ }
+
+ /*
+ * This inode was not allocated at the time of the iscan batch.
+ * The caller should receive all updates.
+ */
+ if (xchk_iscan_skipped(iscan, ino)) {
+ ret = true;
+ goto unlock;
+ }
+
+ /*
+ * The visited cursor hasn't yet wrapped around the end of the FS. If
+ * @ino is inside the starred range, the caller should receive updates:
+ *
+ * 0 ------------ S ************ V ------------ EOFS
+ */
+ if (iscan->scan_start_ino <= iscan->__visited_ino) {
+ if (ino >= iscan->scan_start_ino &&
+ ino <= iscan->__visited_ino)
+ ret = true;
+
+ goto unlock;
+ }
+
+ /*
+ * The visited cursor wrapped around the end of the FS. If @ino is
+ * inside the starred range, the caller should receive updates:
+ *
+ * 0 ************ V ------------ S ************ EOFS
+ */
+ if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino)
+ ret = true;
+
+unlock:
+ mutex_unlock(&iscan->lock);
+ return ret;
+}
diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h
new file mode 100644
index 000000000000..71f657552dfa
--- /dev/null
+++ b/fs/xfs/scrub/iscan.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ISCAN_H__
+#define __XFS_SCRUB_ISCAN_H__
+
+struct xchk_iscan {
+ struct xfs_scrub *sc;
+
+ /* Lock to protect the scan cursor. */
+ struct mutex lock;
+
+ /*
+ * This is the first inode in the inumber address space that we
+ * examined. When the scan wraps around back to here, the scan is
+ * finished.
+ */
+ xfs_ino_t scan_start_ino;
+
+ /* This is the inode that will be examined next. */
+ xfs_ino_t cursor_ino;
+
+ /* If nonzero and non-NULL, skip this inode when scanning. */
+ xfs_ino_t skip_ino;
+
+ /*
+ * This is the last inode that we've successfully scanned, either
+ * because the caller scanned it, or we moved the cursor past an empty
+ * part of the inode address space. Scan callers should only use the
+ * xchk_iscan_visit function to modify this.
+ */
+ xfs_ino_t __visited_ino;
+
+ /* Operational state of the livescan. */
+ unsigned long __opstate;
+
+ /* Give up on iterating @cursor_ino if we can't iget it by this time. */
+ unsigned long __iget_deadline;
+
+ /* Amount of time (in ms) that we will try to iget an inode. */
+ unsigned int iget_timeout;
+
+ /* Wait this many ms to retry an iget. */
+ unsigned int iget_retry_delay;
+
+ /*
+ * The scan grabs batches of inodes and stashes them here before
+ * handing them out with _iter. Unallocated inodes are set in the
+ * mask so that all updates to that inode are selected for live
+ * update propagation.
+ */
+ xfs_ino_t __batch_ino;
+ xfs_inofree_t __skipped_inomask;
+ struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK];
+};
+
+/* Set if the scan has been aborted due to some event in the fs. */
+#define XCHK_ISCAN_OPSTATE_ABORTED (1)
+
+static inline bool
+xchk_iscan_aborted(const struct xchk_iscan *iscan)
+{
+ return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+}
+
+static inline void
+xchk_iscan_abort(struct xchk_iscan *iscan)
+{
+ set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+}
+
+void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout,
+ unsigned int iget_retry_delay, struct xchk_iscan *iscan);
+void xchk_iscan_teardown(struct xchk_iscan *iscan);
+
+int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp);
+void xchk_iscan_iter_finish(struct xchk_iscan *iscan);
+
+void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip);
+bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino);
+
+#endif /* __XFS_SCRUB_ISCAN_H__ */
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
new file mode 100644
index 000000000000..4a0271123d94
--- /dev/null
+++ b/fs/xfs/scrub/newbt.c
@@ -0,0 +1,567 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_defer.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/newbt.h"
+
+/*
+ * Estimate proper slack values for a btree that's being reloaded.
+ *
+ * Under most circumstances, we'll take whatever default loading value the
+ * btree bulk loading code calculates for us. However, there are some
+ * exceptions to this rule:
+ *
+ * (0) If someone turned one of the debug knobs.
+ * (1) If this is a per-AG btree and the AG has less than 10% space free.
+ * (2) If this is an inode btree and the FS has less than 10% space free.
+
+ * In either case, format the new btree blocks almost completely full to
+ * minimize space usage.
+ */
+static void
+xrep_newbt_estimate_slack(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_btree_bload *bload = &xnr->bload;
+ uint64_t free;
+ uint64_t sz;
+
+ /*
+ * The xfs_globals values are set to -1 (i.e. take the bload defaults)
+ * unless someone has set them otherwise, so we just pull the values
+ * here.
+ */
+ bload->leaf_slack = xfs_globals.bload_leaf_slack;
+ bload->node_slack = xfs_globals.bload_node_slack;
+
+ if (sc->ops->type == ST_PERAG) {
+ free = sc->sa.pag->pagf_freeblks;
+ sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
+ } else {
+ free = percpu_counter_sum(&sc->mp->m_fdblocks);
+ sz = sc->mp->m_sb.sb_dblocks;
+ }
+
+ /* No further changes if there's more than 10% free space left. */
+ if (free >= div_u64(sz, 10))
+ return;
+
+ /*
+ * We're low on space; load the btrees as tightly as possible. Leave
+ * a couple of open slots in each btree block so that we don't end up
+ * splitting the btrees like crazy after a mount.
+ */
+ if (bload->leaf_slack < 0)
+ bload->leaf_slack = 2;
+ if (bload->node_slack < 0)
+ bload->node_slack = 2;
+}
+
+/* Initialize accounting resources for staging a new AG btree. */
+void
+xrep_newbt_init_ag(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo,
+ xfs_fsblock_t alloc_hint,
+ enum xfs_ag_resv_type resv)
+{
+ memset(xnr, 0, sizeof(struct xrep_newbt));
+ xnr->sc = sc;
+ xnr->oinfo = *oinfo; /* structure copy */
+ xnr->alloc_hint = alloc_hint;
+ xnr->resv = resv;
+ INIT_LIST_HEAD(&xnr->resv_list);
+ xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
+ xrep_newbt_estimate_slack(xnr);
+}
+
+/* Initialize accounting resources for staging a new inode fork btree. */
+int
+xrep_newbt_init_inode(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc,
+ int whichfork,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_ifork *ifp;
+
+ ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
+ if (!ifp)
+ return -ENOMEM;
+
+ xrep_newbt_init_ag(xnr, sc, oinfo,
+ XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
+ XFS_AG_RESV_NONE);
+ xnr->ifake.if_fork = ifp;
+ xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
+ return 0;
+}
+
+/*
+ * Initialize accounting resources for staging a new btree. Callers are
+ * expected to add their own reservations (and clean them up) manually.
+ */
+void
+xrep_newbt_init_bare(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc)
+{
+ xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
+ XFS_AG_RESV_NONE);
+}
+
+/*
+ * Designate specific blocks to be used to build our new btree. @pag must be
+ * a passive reference.
+ */
+STATIC int
+xrep_newbt_add_blocks(
+ struct xrep_newbt *xnr,
+ struct xfs_perag *pag,
+ const struct xfs_alloc_arg *args)
+{
+ struct xfs_mount *mp = xnr->sc->mp;
+ struct xrep_newbt_resv *resv;
+ int error;
+
+ resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
+ if (!resv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&resv->list);
+ resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+ resv->len = args->len;
+ resv->used = 0;
+ resv->pag = xfs_perag_hold(pag);
+
+ if (args->tp) {
+ ASSERT(xnr->oinfo.oi_offset == 0);
+
+ error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
+ if (error)
+ goto out_pag;
+ }
+
+ list_add_tail(&resv->list, &xnr->resv_list);
+ return 0;
+out_pag:
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ return error;
+}
+
+/*
+ * Add an extent to the new btree reservation pool. Callers are required to
+ * reap this reservation manually if the repair is cancelled. @pag must be a
+ * passive reference.
+ */
+int
+xrep_newbt_add_extent(
+ struct xrep_newbt *xnr,
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = xnr->sc->mp;
+ struct xfs_alloc_arg args = {
+ .tp = NULL, /* no autoreap */
+ .oinfo = xnr->oinfo,
+ .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
+ .len = len,
+ .resv = xnr->resv,
+ };
+
+ return xrep_newbt_add_blocks(xnr, pag, &args);
+}
+
+/* Don't let our allocation hint take us beyond this AG */
+static inline void
+xrep_newbt_validate_ag_alloc_hint(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
+
+ if (agno == sc->sa.pag->pag_agno &&
+ xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
+ return;
+
+ xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ XFS_AGFL_BLOCK(sc->mp) + 1);
+}
+
+/* Allocate disk space for a new per-AG btree. */
+STATIC int
+xrep_newbt_alloc_ag_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ ASSERT(sc->sa.pag != NULL);
+
+ while (nr_blocks > 0) {
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = mp,
+ .oinfo = xnr->oinfo,
+ .minlen = 1,
+ .maxlen = nr_blocks,
+ .prod = 1,
+ .resv = xnr->resv,
+ };
+ xfs_agnumber_t agno;
+
+ xrep_newbt_validate_ag_alloc_hint(xnr);
+
+ if (xnr->alloc_vextent)
+ error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
+ else
+ error = xfs_alloc_vextent_near_bno(&args,
+ xnr->alloc_hint);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+
+ trace_xrep_newbt_alloc_ag_blocks(mp, agno,
+ XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
+ xnr->oinfo.oi_owner);
+
+ if (agno != sc->sa.pag->pag_agno) {
+ ASSERT(agno == sc->sa.pag->pag_agno);
+ return -EFSCORRUPTED;
+ }
+
+ error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
+ if (error)
+ return error;
+
+ nr_blocks -= args.len;
+ xnr->alloc_hint = args.fsbno + args.len;
+
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Don't let our allocation hint take us beyond EOFS */
+static inline void
+xrep_newbt_validate_file_alloc_hint(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+
+ if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
+ return;
+
+ xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
+}
+
+/* Allocate disk space for our new file-based btree. */
+STATIC int
+xrep_newbt_alloc_file_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ while (nr_blocks > 0) {
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = mp,
+ .oinfo = xnr->oinfo,
+ .minlen = 1,
+ .maxlen = nr_blocks,
+ .prod = 1,
+ .resv = xnr->resv,
+ };
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ xrep_newbt_validate_file_alloc_hint(xnr);
+
+ if (xnr->alloc_vextent)
+ error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
+ else
+ error = xfs_alloc_vextent_start_ag(&args,
+ xnr->alloc_hint);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+
+ trace_xrep_newbt_alloc_file_blocks(mp, agno,
+ XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
+ xnr->oinfo.oi_owner);
+
+ pag = xfs_perag_get(mp, agno);
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xrep_newbt_add_blocks(xnr, pag, &args);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ nr_blocks -= args.len;
+ xnr->alloc_hint = args.fsbno + args.len;
+
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Allocate disk space for our new btree. */
+int
+xrep_newbt_alloc_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ if (xnr->sc->ip)
+ return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
+ return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
+}
+
+/*
+ * Free the unused part of a space extent that was reserved for a new ondisk
+ * structure. Returns the number of EFIs logged or a negative errno.
+ */
+STATIC int
+xrep_newbt_free_extent(
+ struct xrep_newbt *xnr,
+ struct xrep_newbt_resv *resv,
+ bool btree_committed)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ xfs_agblock_t free_agbno = resv->agbno;
+ xfs_extlen_t free_aglen = resv->len;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ if (!btree_committed || resv->used == 0) {
+ /*
+ * If we're not committing a new btree or we didn't use the
+ * space reservation, let the existing EFI free the entire
+ * space extent.
+ */
+ trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
+ free_agbno, free_aglen, xnr->oinfo.oi_owner);
+ xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
+ return 1;
+ }
+
+ /*
+ * We used space and committed the btree. Cancel the autoreap, remove
+ * the written blocks from the reservation, and possibly log a new EFI
+ * to free any unused reservation space.
+ */
+ xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
+ free_agbno += resv->used;
+ free_aglen -= resv->used;
+
+ if (free_aglen == 0)
+ return 0;
+
+ trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
+ free_aglen, xnr->oinfo.oi_owner);
+
+ ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
+ ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
+
+ /*
+ * Use EFIs to free the reservations. This reduces the chance
+ * that we leak blocks if the system goes down.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
+ error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
+ xnr->resv, true);
+ if (error)
+ return error;
+
+ return 1;
+}
+
+/* Free all the accounting info and disk space we reserved for a new btree. */
+STATIC int
+xrep_newbt_free(
+ struct xrep_newbt *xnr,
+ bool btree_committed)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xrep_newbt_resv *resv, *n;
+ unsigned int freed = 0;
+ int error = 0;
+
+ /*
+ * If the filesystem already went down, we can't free the blocks. Skip
+ * ahead to freeing the incore metadata because we can't fix anything.
+ */
+ if (xfs_is_shutdown(sc->mp))
+ goto junkit;
+
+ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ int ret;
+
+ ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
+ list_del(&resv->list);
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ if (ret < 0) {
+ error = ret;
+ goto junkit;
+ }
+
+ freed += ret;
+ if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ goto junkit;
+ freed = 0;
+ }
+ }
+
+ if (freed)
+ error = xrep_defer_finish(sc);
+
+junkit:
+ /*
+ * If we still have reservations attached to @newbt, cleanup must have
+ * failed and the filesystem is about to go down. Clean up the incore
+ * reservations and try to commit to freeing the space we used.
+ */
+ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
+ list_del(&resv->list);
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ }
+
+ if (sc->ip) {
+ kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
+ xnr->ifake.if_fork = NULL;
+ }
+
+ return error;
+}
+
+/*
+ * Free all the accounting info and unused disk space allocations after
+ * committing a new btree.
+ */
+int
+xrep_newbt_commit(
+ struct xrep_newbt *xnr)
+{
+ return xrep_newbt_free(xnr, true);
+}
+
+/*
+ * Free all the accounting info and all of the disk space we reserved for a new
+ * btree that we're not going to commit. We want to try to roll things back
+ * cleanly for things like ENOSPC midway through allocation.
+ */
+void
+xrep_newbt_cancel(
+ struct xrep_newbt *xnr)
+{
+ xrep_newbt_free(xnr, false);
+}
+
+/* Feed one of the reserved btree blocks to the bulk loader. */
+int
+xrep_newbt_claim_block(
+ struct xfs_btree_cur *cur,
+ struct xrep_newbt *xnr,
+ union xfs_btree_ptr *ptr)
+{
+ struct xrep_newbt_resv *resv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agblock_t agbno;
+
+ /*
+ * The first item in the list should always have a free block unless
+ * we're completely out.
+ */
+ resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
+ if (resv->used == resv->len)
+ return -ENOSPC;
+
+ /*
+ * Peel off a block from the start of the reservation. We allocate
+ * blocks in order to place blocks on disk in increasing record or key
+ * order. The block reservations tend to end up on the list in
+ * decreasing order, which hopefully results in leaf blocks ending up
+ * together.
+ */
+ agbno = resv->agbno + resv->used;
+ resv->used++;
+
+ /* If we used all the blocks in this reservation, move it to the end. */
+ if (resv->used == resv->len)
+ list_move_tail(&resv->list, &xnr->resv_list);
+
+ trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
+ xnr->oinfo.oi_owner);
+
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
+ ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
+ agbno));
+ else
+ ptr->s = cpu_to_be32(agbno);
+
+ /* Relog all the EFIs. */
+ return xrep_defer_finish(xnr->sc);
+}
+
+/* How many reserved blocks are unused? */
+unsigned int
+xrep_newbt_unused_blocks(
+ struct xrep_newbt *xnr)
+{
+ struct xrep_newbt_resv *resv;
+ unsigned int unused = 0;
+
+ list_for_each_entry(resv, &xnr->resv_list, list)
+ unused += resv->len - resv->used;
+ return unused;
+}
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
new file mode 100644
index 000000000000..3d804d31af24
--- /dev/null
+++ b/fs/xfs/scrub/newbt.h
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_NEWBT_H__
+#define __XFS_SCRUB_NEWBT_H__
+
+struct xfs_alloc_arg;
+
+struct xrep_newbt_resv {
+ /* Link to list of extents that we've reserved. */
+ struct list_head list;
+
+ struct xfs_perag *pag;
+
+ /* Auto-freeing this reservation if we don't commit. */
+ struct xfs_alloc_autoreap autoreap;
+
+ /* AG block of the extent we reserved. */
+ xfs_agblock_t agbno;
+
+ /* Length of the reservation. */
+ xfs_extlen_t len;
+
+ /* How much of this reservation has been used. */
+ xfs_extlen_t used;
+};
+
+struct xrep_newbt {
+ struct xfs_scrub *sc;
+
+ /* Custom allocation function, or NULL for xfs_alloc_vextent */
+ int (*alloc_vextent)(struct xfs_scrub *sc,
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t alloc_hint);
+
+ /* List of extents that we've reserved. */
+ struct list_head resv_list;
+
+ /* Fake root for new btree. */
+ union {
+ struct xbtree_afakeroot afake;
+ struct xbtree_ifakeroot ifake;
+ };
+
+ /* rmap owner of these blocks */
+ struct xfs_owner_info oinfo;
+
+ /* btree geometry for the bulk loader */
+ struct xfs_btree_bload bload;
+
+ /* Allocation hint */
+ xfs_fsblock_t alloc_hint;
+
+ /* per-ag reservation type */
+ enum xfs_ag_resv_type resv;
+};
+
+void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc);
+void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint,
+ enum xfs_ag_resv_type resv);
+int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+ int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
+int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag,
+ xfs_agblock_t agbno, xfs_extlen_t len);
+void xrep_newbt_cancel(struct xrep_newbt *xnr);
+int xrep_newbt_commit(struct xrep_newbt *xnr);
+int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
+ union xfs_btree_ptr *ptr);
+unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr);
+
+#endif /* __XFS_SCRUB_NEWBT_H__ */
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
new file mode 100644
index 000000000000..8a7d9557897c
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+#include "scrub/readdir.h"
+
+/*
+ * Live Inode Link Count Checking
+ * ==============================
+ *
+ * Inode link counts are "summary" metadata, in the sense that they are
+ * computed as the number of directory entries referencing each file on the
+ * filesystem. Therefore, we compute the correct link counts by creating a
+ * shadow link count structure and walking every inode.
+ */
+
+/* Set us up to scrub inode link counts. */
+int
+xchk_setup_nlinks(
+ struct xfs_scrub *sc)
+{
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+ sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 1: Collecting file link counts. For each file, we create a shadow link
+ * counting structure, then walk the entire directory tree, incrementing parent
+ * and child link counts for each directory entry seen.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the link counts for an inode that we've already scanned.
+ * This will cause our counts to be incorrect. Therefore, we hook all
+ * directory entry updates because that is when link count updates occur. By
+ * shadowing transaction updates in this manner, live nlink check can ensure by
+ * locking the inode and the shadow structure that its own copies are not out
+ * of date. Because the hook code runs in a different process context from the
+ * scrub code and the scrub state flags are not accessed atomically, failures
+ * in the hook code must abort the iscan and the scrubber must notice the
+ * aborted scan and set the incomplete flag.
+ *
+ * Note that we use jump labels and srcu notifier hooks to minimize the
+ * overhead when live nlinks is /not/ running. Locking order for nlink
+ * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
+ */
+
+/*
+ * Add a delta to an nlink counter, clamping the value to U32_MAX. Because
+ * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results
+ * even if we lose some precision.
+ */
+static inline void
+careful_add(
+ xfs_nlink_t *nlinkp,
+ int delta)
+{
+ uint64_t new_value = (uint64_t)(*nlinkp) + delta;
+
+ BUILD_BUG_ON(XFS_MAXLINK > U32_MAX);
+ *nlinkp = min_t(uint64_t, new_value, U32_MAX);
+}
+
+/* Update incore link count information. Caller must hold the nlinks lock. */
+STATIC int
+xchk_nlinks_update_incore(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino,
+ int parents_delta,
+ int backrefs_delta,
+ int children_delta)
+{
+ struct xchk_nlink nl;
+ int error;
+
+ if (!xnc->nlinks)
+ return 0;
+
+ error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
+ if (error)
+ return error;
+
+ trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta,
+ backrefs_delta, children_delta);
+
+ careful_add(&nl.parents, parents_delta);
+ careful_add(&nl.backrefs, backrefs_delta);
+ careful_add(&nl.children, children_delta);
+
+ nl.flags |= XCHK_NLINK_WRITTEN;
+ error = xfarray_store(xnc->nlinks, ino, &nl);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete.
+ */
+ error = -ECANCELED;
+ }
+ return error;
+}
+
+/*
+ * Apply a link count change from the regular filesystem into our shadow link
+ * count structure based on a directory update in progress.
+ */
+STATIC int
+xchk_nlinks_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_dir_update_params *p = data;
+ struct xchk_nlink_ctrs *xnc;
+ int error;
+
+ xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
+
+ trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
+ p->delta, p->name->name, p->name->len);
+
+ /*
+ * If we've already scanned @dp, update the number of parents that link
+ * to @ip. If @ip is a subdirectory, update the number of child links
+ * going out of @dp.
+ */
+ if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
+ 0, 0);
+ if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode))
+ error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+ 0, p->delta);
+ mutex_unlock(&xnc->lock);
+ if (error)
+ goto out_abort;
+ }
+
+ /*
+ * If @ip is a subdirectory and we've already scanned it, update the
+ * number of backrefs pointing to @dp.
+ */
+ if (S_ISDIR(VFS_IC(p->ip)->i_mode) &&
+ xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) {
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+ p->delta, 0);
+ mutex_unlock(&xnc->lock);
+ if (error)
+ goto out_abort;
+ }
+
+ return NOTIFY_DONE;
+
+out_abort:
+ xchk_iscan_abort(&xnc->collect_iscan);
+ return NOTIFY_DONE;
+}
+
+/* Bump the observed link count for the inode referenced by this entry. */
+STATIC int
+xchk_nlinks_collect_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xchk_nlink_ctrs *xnc = priv;
+ bool dot = false, dotdot = false;
+ int error;
+
+ /* Does this name make sense? */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ if (name->len == 1 && name->name[0] == '.')
+ dot = true;
+ else if (name->len == 2 && name->name[0] == '.' &&
+ name->name[1] == '.')
+ dotdot = true;
+
+ /* Don't accept a '.' entry that points somewhere else. */
+ if (dot && ino != dp->i_ino) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ /* Don't accept an invalid inode number. */
+ if (!xfs_verify_dir_ino(sc->mp, ino)) {
+ error = -ECANCELED;
+ goto out_abort;
+ }
+
+ /* Update the shadow link counts if we haven't already failed. */
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name);
+
+ mutex_lock(&xnc->lock);
+
+ /*
+ * If this is a dotdot entry, it is a back link from dp to ino. How
+ * we handle this depends on whether or not dp is the root directory.
+ *
+ * The root directory is its own parent, so we pretend the dotdot entry
+ * establishes the "parent" of the root directory. Increment the
+ * number of parents of the root directory.
+ *
+ * Otherwise, increment the number of backrefs pointing back to ino.
+ */
+ if (dotdot) {
+ if (dp == sc->mp->m_rootip)
+ error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+ else
+ error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * If this dirent is a forward link from dp to ino, increment the
+ * number of parents linking into ino.
+ */
+ if (!dot && !dotdot) {
+ error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * If this dirent is a forward link to a subdirectory, increment the
+ * number of child links of dp.
+ */
+ if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) {
+ error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1);
+ if (error)
+ goto out_unlock;
+ }
+
+ mutex_unlock(&xnc->lock);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&xnc->lock);
+out_abort:
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+ xchk_set_incomplete(sc);
+ return error;
+}
+
+/* Walk a directory to bump the observed link counts of the children. */
+STATIC int
+xchk_nlinks_collect_dir(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = xnc->sc;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /* Prevent anyone from changing this directory while we walk it. */
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /*
+ * The dotdot entry of an unlinked directory still points to the last
+ * parent, but the parent no longer links to this directory. Skip the
+ * directory to avoid overcounting.
+ */
+ if (VFS_I(dp)->i_nlink == 0)
+ goto out_unlock;
+
+ /*
+ * We cannot count file links if the directory looks as though it has
+ * been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_abort;
+ }
+
+ error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc);
+ if (error == -ECANCELED) {
+ error = 0;
+ goto out_unlock;
+ }
+ if (error)
+ goto out_abort;
+
+ xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
+ goto out_unlock;
+
+out_abort:
+ xchk_set_incomplete(sc);
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ return error;
+}
+
+/* If this looks like a valid pointer, count it. */
+static inline int
+xchk_nlinks_collect_metafile(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino)
+{
+ if (!xfs_verify_ino(xnc->sc->mp, ino))
+ return 0;
+
+ trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino);
+ return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+}
+
+/* Bump the link counts of metadata files rooted in the superblock. */
+STATIC int
+xchk_nlinks_collect_metafiles(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xfs_mount *mp = xnc->sc->mp;
+ int error = -ECANCELED;
+
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan))
+ goto out_incomplete;
+
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino);
+ if (error)
+ goto out_abort;
+
+ error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino);
+ if (error)
+ goto out_abort;
+ mutex_unlock(&xnc->lock);
+
+ return 0;
+
+out_abort:
+ mutex_unlock(&xnc->lock);
+ xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+ xchk_set_incomplete(xnc->sc);
+ return error;
+}
+
+/* Advance the collection scan cursor for this non-directory file. */
+static inline int
+xchk_nlinks_collect_file(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *ip)
+{
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ xchk_iscan_mark_visited(&xnc->collect_iscan, ip);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+}
+
+/* Walk all directories and count inode links. */
+STATIC int
+xchk_nlinks_collect(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xfs_scrub *sc = xnc->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /* Count the rt and quota files that are rooted in the superblock. */
+ error = xchk_nlinks_collect_metafiles(xnc);
+ if (error)
+ return error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Cancel the transaction to release the log grant space while we scan
+ * the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done even though
+ * we take the IOLOCK to quiesce the file because empty transactions
+ * do not take sb_internal.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
+ if (S_ISDIR(VFS_I(ip)->i_mode))
+ error = xchk_nlinks_collect_dir(xnc, ip);
+ else
+ error = xchk_nlinks_collect_file(xnc, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&xnc->collect_iscan);
+ if (error) {
+ xchk_set_incomplete(sc);
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Switch out for a real transaction in preparation for building a new
+ * tree.
+ */
+ xchk_trans_cancel(sc);
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 2: Comparing file link counters. Walk each inode and compare the link
+ * counts against our shadow information; and then walk each shadow link count
+ * structure (that wasn't covered in the first part), comparing it against the
+ * file.
+ */
+
+/* Read the observed link count for comparison with the actual inode. */
+STATIC int
+xchk_nlinks_comparison_read(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino,
+ struct xchk_nlink *obs)
+{
+ struct xchk_nlink nl;
+ int error;
+
+ error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
+ if (error)
+ return error;
+
+ nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN);
+
+ error = xfarray_store(xnc->nlinks, ino, &nl);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete. This
+ * shouldn't really happen outside of the collection phase.
+ */
+ xchk_set_incomplete(xnc->sc);
+ return -ECANCELED;
+ }
+ if (error)
+ return error;
+
+ /* Copy the counters, but do not expose the internal state. */
+ obs->parents = nl.parents;
+ obs->backrefs = nl.backrefs;
+ obs->children = nl.children;
+ obs->flags = 0;
+ return 0;
+}
+
+/* Check our link count against an inode. */
+STATIC int
+xchk_nlinks_compare_inode(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode *ip)
+{
+ struct xchk_nlink obs;
+ struct xfs_scrub *sc = xnc->sc;
+ uint64_t total_links;
+ unsigned int actual_nlink;
+ int error;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ mutex_lock(&xnc->lock);
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ xchk_set_incomplete(xnc->sc);
+ error = -ECANCELED;
+ goto out_scanlock;
+ }
+
+ error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ /*
+ * If we don't have ftype to get an accurate count of the subdirectory
+ * entries in this directory, take advantage of the fact that on a
+ * consistent ftype=0 filesystem, the number of subdirectory
+ * backreferences (dotdot entries) pointing towards this directory
+ * should be equal to the number of subdirectory entries in the
+ * directory.
+ */
+ if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode))
+ obs.children = obs.backrefs;
+
+ total_links = xchk_nlink_total(ip, &obs);
+ actual_nlink = VFS_I(ip)->i_nlink;
+
+ trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs);
+
+ /*
+ * If we found so many parents that we'd overflow i_nlink, we must flag
+ * this as a corruption. The VFS won't let users increase the link
+ * count, but it will let them decrease it.
+ */
+ if (total_links > XFS_MAXLINK) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ /* Link counts should match. */
+ if (total_links != actual_nlink) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) {
+ /*
+ * The collection phase ignores directories with zero link
+ * count, so we ignore them here too.
+ *
+ * The number of subdirectory backreferences (dotdot entries)
+ * pointing towards this directory should be equal to the
+ * number of subdirectory entries in the directory.
+ */
+ if (obs.children != obs.backrefs)
+ xchk_ino_xref_set_corrupt(sc, ip->i_ino);
+ } else {
+ /*
+ * Non-directories and unlinked directories should not have
+ * back references.
+ */
+ if (obs.backrefs != 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+
+ /*
+ * Non-directories and unlinked directories should not have
+ * children.
+ */
+ if (obs.children != 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ }
+
+ if (ip == sc->mp->m_rootip) {
+ /*
+ * For the root of a directory tree, both the '.' and '..'
+ * entries should point to the root directory. The dotdot
+ * entry is counted as a parent of the root /and/ a backref of
+ * the root directory.
+ */
+ if (obs.parents != 1) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ } else if (actual_nlink > 0) {
+ /*
+ * Linked files that are not the root directory should have at
+ * least one parent.
+ */
+ if (obs.parents == 0) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ goto out_corrupt;
+ }
+ }
+
+out_corrupt:
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ error = -ECANCELED;
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+/*
+ * Check our link count against an inode that wasn't checked previously. This
+ * is intended to catch directories with dangling links, though we could be
+ * racing with inode allocation in other threads.
+ */
+STATIC int
+xchk_nlinks_compare_inum(
+ struct xchk_nlink_ctrs *xnc,
+ xfs_ino_t ino)
+{
+ struct xchk_nlink obs;
+ struct xfs_mount *mp = xnc->sc->mp;
+ struct xfs_trans *tp = xnc->sc->tp;
+ struct xfs_buf *agi_bp;
+ struct xfs_inode *ip;
+ int error;
+
+ /*
+ * The first iget failed, so try again with the variant that returns
+ * either an incore inode or the AGI buffer. If the function returns
+ * EINVAL/ENOENT, it should have passed us the AGI buffer so that we
+ * can guarantee that the inode won't be allocated while we check for
+ * a zero link count in the observed link count data.
+ */
+ error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip);
+ if (!error) {
+ /* Actually got an inode, so use the inode compare. */
+ error = xchk_nlinks_compare_inode(xnc, ip);
+ xchk_irele(xnc->sc, ip);
+ return error;
+ }
+ if (error == -ENOENT || error == -EINVAL) {
+ /* No inode was found. Check for zero link count below. */
+ error = 0;
+ }
+ if (error)
+ goto out_agi;
+
+ /* Ensure that we have protected against inode allocation/freeing. */
+ if (agi_bp == NULL) {
+ ASSERT(agi_bp != NULL);
+ xchk_set_incomplete(xnc->sc);
+ return -ECANCELED;
+ }
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ xchk_set_incomplete(xnc->sc);
+ error = -ECANCELED;
+ goto out_agi;
+ }
+
+ mutex_lock(&xnc->lock);
+ error = xchk_nlinks_comparison_read(xnc, ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ trace_xchk_nlinks_check_zero(mp, ino, &obs);
+
+ /*
+ * If we can't grab the inode, the link count had better be zero. We
+ * still hold the AGI to prevent inode allocation/freeing.
+ */
+ if (xchk_nlink_total(NULL, &obs) != 0) {
+ xchk_ino_set_corrupt(xnc->sc, ino);
+ error = -ECANCELED;
+ }
+
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+out_agi:
+ if (agi_bp)
+ xfs_trans_brelse(tp, agi_bp);
+ return error;
+}
+
+/*
+ * Try to visit every inode in the filesystem to compare the link count. Move
+ * on if we can't grab an inode, since we'll revisit unchecked nlink records in
+ * the second part.
+ */
+static int
+xchk_nlinks_compare_iter(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode **ipp)
+{
+ int error;
+
+ do {
+ error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
+ } while (error == -EBUSY);
+
+ return error;
+}
+
+/* Compare the link counts we observed against the live information. */
+STATIC int
+xchk_nlinks_compare(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xchk_nlink nl;
+ struct xfs_scrub *sc = xnc->sc;
+ struct xfs_inode *ip;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /*
+ * Create a new empty transaction so that we can advance the iscan
+ * cursor without deadlocking if the inobt has a cycle and push on the
+ * inactivation workqueue.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ /*
+ * Use the inobt to walk all allocated inodes to compare the link
+ * counts. Inodes skipped by _compare_iter will be tried again in the
+ * next phase of the scan.
+ */
+ xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan);
+ while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) {
+ error = xchk_nlinks_compare_inode(xnc, ip);
+ xchk_iscan_mark_visited(&xnc->compare_iscan, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&xnc->compare_iscan);
+ xchk_iscan_teardown(&xnc->compare_iscan);
+ if (error)
+ return error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /*
+ * Walk all the non-null nlink observations that weren't checked in the
+ * previous step.
+ */
+ mutex_lock(&xnc->lock);
+ while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) {
+ xfs_ino_t ino = cur - 1;
+
+ if (nl.flags & XCHK_NLINK_COMPARE_SCANNED)
+ continue;
+
+ mutex_unlock(&xnc->lock);
+
+ error = xchk_nlinks_compare_inum(xnc, ino);
+ if (error)
+ return error;
+
+ if (xchk_should_terminate(xnc->sc, &error))
+ return error;
+
+ mutex_lock(&xnc->lock);
+ }
+ mutex_unlock(&xnc->lock);
+
+ return error;
+}
+
+/* Tear down everything associated with a nlinks check. */
+static void
+xchk_nlinks_teardown_scan(
+ void *priv)
+{
+ struct xchk_nlink_ctrs *xnc = priv;
+
+ /* Discourage any hook functions that might be running. */
+ xchk_iscan_abort(&xnc->collect_iscan);
+
+ xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
+
+ xfarray_destroy(xnc->nlinks);
+ xnc->nlinks = NULL;
+
+ xchk_iscan_teardown(&xnc->collect_iscan);
+ mutex_destroy(&xnc->lock);
+ xnc->sc = NULL;
+}
+
+/*
+ * Scan all inodes in the entire filesystem to generate link count data. If
+ * the scan is successful, the counts will be left alive for a repair. If any
+ * error occurs, we'll tear everything down.
+ */
+STATIC int
+xchk_nlinks_setup_scan(
+ struct xfs_scrub *sc,
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ unsigned long long max_inos;
+ xfs_agnumber_t last_agno = mp->m_sb.sb_agcount - 1;
+ xfs_agino_t first_agino, last_agino;
+ int error;
+
+ ASSERT(xnc->sc == NULL);
+ xnc->sc = sc;
+
+ mutex_init(&xnc->lock);
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan);
+
+ /*
+ * Set up enough space to store an nlink record for the highest
+ * possible inode number in this system.
+ */
+ xfs_agino_range(mp, last_agno, &first_agino, &last_agino);
+ max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1;
+ descr = xchk_xfile_descr(sc, "file link counts");
+ error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos),
+ sizeof(struct xchk_nlink), &xnc->nlinks);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+
+ /*
+ * Hook into the directory entry code so that we can capture updates to
+ * file link counts. The hook only triggers for inodes that were
+ * already scanned, and the scanner thread takes each inode's ILOCK,
+ * which means that any in-progress inode updates will finish before we
+ * can scan the inode.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+ xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
+ error = xfs_dir_hook_add(mp, &xnc->dhook);
+ if (error)
+ goto out_teardown;
+
+ /* Use deferred cleanup to pass the inode link count data to repair. */
+ sc->buf_cleanup = xchk_nlinks_teardown_scan;
+ return 0;
+
+out_teardown:
+ xchk_nlinks_teardown_scan(xnc);
+ return error;
+}
+
+/* Scrub the link count of all inodes on the filesystem. */
+int
+xchk_nlinks(
+ struct xfs_scrub *sc)
+{
+ struct xchk_nlink_ctrs *xnc = sc->buf;
+ int error = 0;
+
+ /* Set ourselves up to check link counts on the live filesystem. */
+ error = xchk_nlinks_setup_scan(sc, xnc);
+ if (error)
+ return error;
+
+ /* Walk all inodes, picking up link count information. */
+ error = xchk_nlinks_collect(xnc);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ /* Fail fast if we're not playing with a full dataset. */
+ if (xchk_iscan_aborted(&xnc->collect_iscan))
+ xchk_set_incomplete(sc);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
+ /* Compare link counts. */
+ error = xchk_nlinks_compare(xnc);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ /* Check one last time for an incomplete dataset. */
+ if (xchk_iscan_aborted(&xnc->collect_iscan))
+ xchk_set_incomplete(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h
new file mode 100644
index 000000000000..a950f3daf204
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_NLINKS_H__
+#define __XFS_SCRUB_NLINKS_H__
+
+/* Live link count control structure. */
+struct xchk_nlink_ctrs {
+ struct xfs_scrub *sc;
+
+ /* Shadow link count data and its mutex. */
+ struct xfarray *nlinks;
+ struct mutex lock;
+
+ /*
+ * The collection step uses a separate iscan context from the compare
+ * step because the collection iscan coordinates live updates to the
+ * observation data while this scanner is running. The compare iscan
+ * is secondary and can be reinitialized as needed.
+ */
+ struct xchk_iscan collect_iscan;
+ struct xchk_iscan compare_iscan;
+
+ /*
+ * Hook into directory updates so that we can receive live updates
+ * from other writer threads.
+ */
+ struct xfs_dir_hook dhook;
+};
+
+/*
+ * In-core link counts for a given inode in the filesystem.
+ *
+ * For an empty rootdir, the directory entries and the field to which they are
+ * accounted are as follows:
+ *
+ * Root directory:
+ *
+ * . points to self (root.child)
+ * .. points to self (root.parent)
+ * f1 points to a child file (f1.parent)
+ * d1 points to a child dir (d1.parent, root.child)
+ *
+ * Subdirectory d1:
+ *
+ * . points to self (d1.child)
+ * .. points to root dir (root.backref)
+ * f2 points to child file (f2.parent)
+ * f3 points to root.f1 (f1.parent)
+ *
+ * root.nlink == 3 (root.dot, root.dotdot, root.d1)
+ * d1.nlink == 2 (root.d1, d1.dot)
+ * f1.nlink == 2 (root.f1, d1.f3)
+ * f2.nlink == 1 (d1.f2)
+ */
+struct xchk_nlink {
+ /* Count of forward links from parent directories to this file. */
+ xfs_nlink_t parents;
+
+ /*
+ * Count of back links to this parent directory from child
+ * subdirectories.
+ */
+ xfs_nlink_t backrefs;
+
+ /*
+ * Count of forward links from this directory to all child files and
+ * the number of dot entries. Should be zero for non-directories.
+ */
+ xfs_nlink_t children;
+
+ /* Record state flags */
+ unsigned int flags;
+};
+
+/*
+ * This incore link count has been written at least once. We never want to
+ * store an xchk_nlink that looks uninitialized.
+ */
+#define XCHK_NLINK_WRITTEN (1U << 0)
+
+/* Already checked this link count record. */
+#define XCHK_NLINK_COMPARE_SCANNED (1U << 1)
+
+/* Already made a repair with this link count record. */
+#define XREP_NLINK_DIRTY (1U << 2)
+
+/* Compute total link count, using large enough variables to detect overflow. */
+static inline uint64_t
+xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live)
+{
+ uint64_t ret = live->parents;
+
+ /* Add one link count for the dot entry of any linked directory. */
+ if (ip && S_ISDIR(VFS_I(ip)->i_mode) && VFS_I(ip)->i_nlink)
+ ret++;
+ return ret + live->children;
+}
+
+#endif /* __XFS_SCRUB_NLINKS_H__ */
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
new file mode 100644
index 000000000000..b87618322f55
--- /dev/null
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Inode Link Count Repair
+ * ============================
+ *
+ * Use the live inode link count information that we collected to replace the
+ * nlink values of the incore inodes. A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * inode is locked.
+ */
+
+/*
+ * Correct the link count of the given inode. Because we have to grab locks
+ * and resources in a certain order, it's possible that this will be a no-op.
+ */
+STATIC int
+xrep_nlinks_repair_inode(
+ struct xchk_nlink_ctrs *xnc)
+{
+ struct xchk_nlink obs;
+ struct xfs_scrub *sc = xnc->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ uint64_t total_links;
+ uint64_t actual_nlink;
+ bool dirty = false;
+ int error;
+
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp);
+ if (error)
+ return error;
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(sc->tp, ip, 0);
+
+ mutex_lock(&xnc->lock);
+
+ if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+ error = -ECANCELED;
+ goto out_scanlock;
+ }
+
+ error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs);
+ if (error)
+ goto out_scanlock;
+
+ /*
+ * We're done accessing the shared scan data, so we can drop the lock.
+ * We still hold @ip's ILOCK, so its link count cannot change.
+ */
+ mutex_unlock(&xnc->lock);
+
+ total_links = xchk_nlink_total(ip, &obs);
+ actual_nlink = VFS_I(ip)->i_nlink;
+
+ /*
+ * Non-directories cannot have directories pointing up to them.
+ *
+ * We previously set error to zero, but set it again because one static
+ * checker author fears that programmers will fail to maintain this
+ * invariant and built their tool to flag this as a security risk. A
+ * different tool author made their bot complain about the redundant
+ * store. This is a never-ending and stupid battle; both tools missed
+ * *actual bugs* elsewhere; and I no longer care.
+ */
+ if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.children != 0) {
+ trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+ error = 0;
+ goto out_trans;
+ }
+
+ /*
+ * We did not find any links to this inode. If the inode agrees, we
+ * have nothing further to do. If not, the inode has a nonzero link
+ * count and we don't have anywhere to graft the child onto. Dropping
+ * a live inode's link count to zero can cause unexpected shutdowns in
+ * inactivation, so leave it alone.
+ */
+ if (total_links == 0) {
+ if (actual_nlink != 0)
+ trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+ goto out_trans;
+ }
+
+ /* Commit the new link count if it changed. */
+ if (total_links != actual_nlink) {
+ if (total_links > XFS_MAXLINK) {
+ trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+ goto out_trans;
+ }
+
+ trace_xrep_nlinks_update_inode(mp, ip, &obs);
+
+ set_nlink(VFS_I(ip), total_links);
+ dirty = true;
+ }
+
+ if (!dirty) {
+ error = 0;
+ goto out_trans;
+ }
+
+ xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
+
+ error = xrep_trans_commit(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ return error;
+
+out_scanlock:
+ mutex_unlock(&xnc->lock);
+out_trans:
+ xchk_trans_cancel(sc);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * Try to visit every inode in the filesystem for repairs. Move on if we can't
+ * grab an inode, since we're still making forward progress.
+ */
+static int
+xrep_nlinks_iter(
+ struct xchk_nlink_ctrs *xnc,
+ struct xfs_inode **ipp)
+{
+ int error;
+
+ do {
+ error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
+ } while (error == -EBUSY);
+
+ return error;
+}
+
+/* Commit the new inode link counters. */
+int
+xrep_nlinks(
+ struct xfs_scrub *sc)
+{
+ struct xchk_nlink_ctrs *xnc = sc->buf;
+ int error;
+
+ /*
+ * We need ftype for an accurate count of the number of child
+ * subdirectory links. Child subdirectories with a back link (dotdot
+ * entry) but no forward link are unfixable, so we cannot repair the
+ * link count of the parent directory based on the back link count
+ * alone. Filesystems without ftype support are rare (old V4) so we
+ * just skip out here.
+ */
+ if (!xfs_has_ftype(sc->mp))
+ return -EOPNOTSUPP;
+
+ /*
+ * Use the inobt to walk all allocated inodes to compare and fix the
+ * link counts. Retry iget every tenth of a second for up to 30
+ * seconds -- even if repair misses a few inodes, we still try to fix
+ * as many of them as we can.
+ */
+ xchk_iscan_start(sc, 30000, 100, &xnc->compare_iscan);
+ ASSERT(sc->ip == NULL);
+
+ while ((error = xrep_nlinks_iter(xnc, &sc->ip)) == 1) {
+ /*
+ * Commit the scrub transaction so that we can create repair
+ * transactions with the correct reservations.
+ */
+ xchk_trans_cancel(sc);
+
+ error = xrep_nlinks_repair_inode(xnc);
+ xchk_iscan_mark_visited(&xnc->compare_iscan, sc->ip);
+ xchk_irele(sc, sc->ip);
+ sc->ip = NULL;
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+
+ /*
+ * Create a new empty transaction so that we can advance the
+ * iscan cursor without deadlocking if the inobt has a cycle.
+ * We can only push the inactivation workqueues with an empty
+ * transaction.
+ */
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ break;
+ }
+ xchk_iscan_iter_finish(&xnc->compare_iscan);
+ xchk_iscan_teardown(&xnc->compare_iscan);
+
+ return error;
+}
diff --git a/fs/xfs/scrub/off_bitmap.h b/fs/xfs/scrub/off_bitmap.h
new file mode 100644
index 000000000000..0d3f9e6c1aad
--- /dev/null
+++ b/fs/xfs/scrub/off_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_OFF_BITMAP_H__
+#define __XFS_SCRUB_OFF_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_fileoff_t */
+
+struct xoff_bitmap {
+ struct xbitmap64 offbitmap;
+};
+
+static inline void xoff_bitmap_init(struct xoff_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->offbitmap);
+}
+
+static inline void xoff_bitmap_destroy(struct xoff_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->offbitmap);
+}
+
+static inline int xoff_bitmap_set(struct xoff_bitmap *bitmap,
+ xfs_fileoff_t off, xfs_filblks_t len)
+{
+ return xbitmap64_set(&bitmap->offbitmap, off, len);
+}
+
+static inline int xoff_bitmap_walk(struct xoff_bitmap *bitmap,
+ xbitmap64_walk_fn fn, void *priv)
+{
+ return xbitmap64_walk(&bitmap->offbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_OFF_BITMAP_H__ */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index e6155d86f791..7db873672146 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -156,6 +156,16 @@ xchk_parent_validate(
goto out_rele;
}
+ /*
+ * We cannot yet validate this parent pointer if the directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ xchk_set_incomplete(sc);
+ goto out_unlock;
+ }
+
/* Look for a directory entry in the parent pointing to the child. */
error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc);
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
@@ -217,6 +227,13 @@ xchk_parent(
*/
error = xchk_parent_validate(sc, parent_ino);
} while (error == -EAGAIN);
+ if (error == -EBUSY) {
+ /*
+ * We could not scan a directory, so we marked the check
+ * incomplete. No further error return is necessary.
+ */
+ return 0;
+ }
return error;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 5671c8153433..183d531875ea 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -6,6 +6,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_bit.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
@@ -17,9 +18,10 @@
#include "xfs_bmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
+#include "scrub/quota.h"
/* Convert a scrub type code to a DQ flag, or return 0 if error. */
-static inline xfs_dqtype_t
+xfs_dqtype_t
xchk_quota_to_dqtype(
struct xfs_scrub *sc)
{
@@ -75,14 +77,70 @@ struct xchk_quota_info {
xfs_dqid_t last_id;
};
+/* There's a written block backing this dquot, right? */
+STATIC int
+xchk_quota_item_bmap(
+ struct xfs_scrub *sc,
+ struct xfs_dquot *dq,
+ xfs_fileoff_t offset)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_mount *mp = sc->mp;
+ int nmaps = 1;
+ int error;
+
+ if (!xfs_verify_fileoff(mp, offset)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return 0;
+ }
+
+ if (dq->q_fileoffset != offset) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return 0;
+ }
+
+ error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0);
+ if (error)
+ return error;
+
+ if (nmaps != 1) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return 0;
+ }
+
+ if (!xfs_verify_fsbno(mp, irec.br_startblock))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ if (XFS_FSB_TO_DADDR(mp, irec.br_startblock) != dq->q_blkno)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ if (!xfs_bmap_is_written_extent(&irec))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ return 0;
+}
+
+/* Complain if a quota timer is incorrectly set. */
+static inline void
+xchk_quota_item_timer(
+ struct xfs_scrub *sc,
+ xfs_fileoff_t offset,
+ const struct xfs_dquot_res *res)
+{
+ if ((res->softlimit && res->count > res->softlimit) ||
+ (res->hardlimit && res->count > res->hardlimit)) {
+ if (!res->timer)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ } else {
+ if (res->timer)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ }
+}
+
/* Scrub the fields in an individual quota item. */
STATIC int
xchk_quota_item(
- struct xfs_dquot *dq,
- xfs_dqtype_t dqtype,
- void *priv)
+ struct xchk_quota_info *sqi,
+ struct xfs_dquot *dq)
{
- struct xchk_quota_info *sqi = priv;
struct xfs_scrub *sc = sqi->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_quotainfo *qi = mp->m_quotainfo;
@@ -94,6 +152,17 @@ xchk_quota_item(
return error;
/*
+ * We want to validate the bmap record for the storage backing this
+ * dquot, so we need to lock the dquot and the quota file. For quota
+ * operations, the locking order is first the ILOCK and then the dquot.
+ * However, dqiterate gave us a locked dquot, so drop the dquot lock to
+ * get the ILOCK.
+ */
+ xfs_dqunlock(dq);
+ xchk_ilock(sc, XFS_ILOCK_SHARED);
+ xfs_dqlock(dq);
+
+ /*
* Except for the root dquot, the actual dquot we got must either have
* the same or higher id as we saw before.
*/
@@ -103,6 +172,11 @@ xchk_quota_item(
sqi->last_id = dq->q_id;
+ error = xchk_quota_item_bmap(sc, dq, offset);
+ xchk_iunlock(sc, XFS_ILOCK_SHARED);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error))
+ return error;
+
/*
* Warn if the hard limits are larger than the fs.
* Administrators can do this, though in production this seems
@@ -166,6 +240,10 @@ xchk_quota_item(
dq->q_rtb.count > dq->q_rtb.hardlimit)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ xchk_quota_item_timer(sc, offset, &dq->q_blk);
+ xchk_quota_item_timer(sc, offset, &dq->q_ino);
+ xchk_quota_item_timer(sc, offset, &dq->q_rtb);
+
out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return -ECANCELED;
@@ -191,7 +269,7 @@ xchk_quota_data_fork(
return error;
/* Check for data fork problems that apply only to quota files. */
- max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
+ max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk;
ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error))
@@ -218,9 +296,11 @@ int
xchk_quota(
struct xfs_scrub *sc)
{
- struct xchk_quota_info sqi;
+ struct xchk_dqiter cursor = { };
+ struct xchk_quota_info sqi = { .sc = sc };
struct xfs_mount *mp = sc->mp;
struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_dquot *dq;
xfs_dqtype_t dqtype;
int error = 0;
@@ -239,10 +319,15 @@ xchk_quota(
* functions.
*/
xchk_iunlock(sc, sc->ilock_flags);
- sqi.sc = sc;
- sqi.last_id = 0;
- error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi);
- xchk_ilock(sc, XFS_ILOCK_EXCL);
+
+ /* Now look for things that the quota verifiers won't complain about. */
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xchk_quota_item(&sqi, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
if (error == -ECANCELED)
error = 0;
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK,
diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h
new file mode 100644
index 000000000000..6c7134ce2385
--- /dev/null
+++ b/fs/xfs/scrub/quota.h
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_QUOTA_H__
+#define __XFS_SCRUB_QUOTA_H__
+
+xfs_dqtype_t xchk_quota_to_dqtype(struct xfs_scrub *sc);
+
+/* dquot iteration code */
+
+struct xchk_dqiter {
+ struct xfs_scrub *sc;
+
+ /* Quota file that we're walking. */
+ struct xfs_inode *quota_ip;
+
+ /* Cached data fork mapping for the dquot. */
+ struct xfs_bmbt_irec bmap;
+
+ /* The next dquot to scan. */
+ uint64_t id;
+
+ /* Quota type (user/group/project). */
+ xfs_dqtype_t dqtype;
+
+ /* Data fork sequence number to detect stale mappings. */
+ unsigned int if_seq;
+};
+
+void xchk_dqiter_init(struct xchk_dqiter *cursor, struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype);
+int xchk_dquot_iter(struct xchk_dqiter *cursor, struct xfs_dquot **dqpp);
+
+#endif /* __XFS_SCRUB_QUOTA_H__ */
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
new file mode 100644
index 000000000000..0bab4c30cb85
--- /dev/null
+++ b/fs/xfs/scrub/quota_repair.c
@@ -0,0 +1,575 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_dquot.h"
+#include "xfs_dquot_item.h"
+#include "xfs_reflink.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/quota.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Quota Repair
+ * ============
+ *
+ * Quota repairs are fairly simplistic; we fix everything that the dquot
+ * verifiers complain about, cap any counters or limits that make no sense,
+ * and schedule a quotacheck if we had to fix anything. We also repair any
+ * data fork extent records that don't apply to metadata files.
+ */
+
+struct xrep_quota_info {
+ struct xfs_scrub *sc;
+ bool need_quotacheck;
+};
+
+/*
+ * Allocate a new block into a sparse hole in the quota file backing this
+ * dquot, initialize the block, and commit the whole mess.
+ */
+STATIC int
+xrep_quota_item_fill_bmap_hole(
+ struct xfs_scrub *sc,
+ struct xfs_dquot *dq,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_buf *bp;
+ struct xfs_mount *mp = sc->mp;
+ int nmaps = 1;
+ int error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Map a block into the file. */
+ error = xfs_trans_reserve_more(sc->tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+ 0);
+ if (error)
+ return error;
+
+ error = xfs_bmapi_write(sc->tp, sc->ip, dq->q_fileoffset,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0,
+ irec, &nmaps);
+ if (error)
+ return error;
+ if (nmaps != 1)
+ return -ENOSPC;
+
+ dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock);
+
+ trace_xrep_dquot_item_fill_bmap_hole(sc->mp, dq->q_type, dq->q_id);
+
+ /* Initialize the new block. */
+ error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, dq->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_dquot_buf_ops;
+
+ xfs_qm_init_dquot_blk(sc->tp, dq->q_id, dq->q_type, bp);
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+
+ /*
+ * Finish the mapping transactions and roll one more time to
+ * disconnect sc->ip from sc->tp.
+ */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ return xfs_trans_roll(&sc->tp);
+}
+
+/* Make sure there's a written block backing this dquot */
+STATIC int
+xrep_quota_item_bmap(
+ struct xfs_scrub *sc,
+ struct xfs_dquot *dq,
+ bool *dirty)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ xfs_fileoff_t offset = dq->q_id / qi->qi_dqperchunk;
+ int nmaps = 1;
+ int error;
+
+ /* The computed file offset should always be valid. */
+ if (!xfs_verify_fileoff(mp, offset)) {
+ ASSERT(xfs_verify_fileoff(mp, offset));
+ return -EFSCORRUPTED;
+ }
+ dq->q_fileoffset = offset;
+
+ error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0);
+ if (error)
+ return error;
+
+ if (nmaps < 1 || !xfs_bmap_is_real_extent(&irec)) {
+ /* Hole/delalloc extent; allocate a real block. */
+ error = xrep_quota_item_fill_bmap_hole(sc, dq, &irec);
+ if (error)
+ return error;
+ } else if (irec.br_state != XFS_EXT_NORM) {
+ /* Unwritten extent, which we already took care of? */
+ ASSERT(irec.br_state == XFS_EXT_NORM);
+ return -EFSCORRUPTED;
+ } else if (dq->q_blkno != XFS_FSB_TO_DADDR(mp, irec.br_startblock)) {
+ /*
+ * If the cached daddr is incorrect, repair probably punched a
+ * hole out of the quota file and filled it back in with a new
+ * block. Update the block mapping in the dquot.
+ */
+ dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec.br_startblock);
+ }
+
+ *dirty = true;
+ return 0;
+}
+
+/* Reset quota timers if incorrectly set. */
+static inline void
+xrep_quota_item_timer(
+ struct xfs_scrub *sc,
+ const struct xfs_dquot_res *res,
+ bool *dirty)
+{
+ if ((res->softlimit && res->count > res->softlimit) ||
+ (res->hardlimit && res->count > res->hardlimit)) {
+ if (!res->timer)
+ *dirty = true;
+ } else {
+ if (res->timer)
+ *dirty = true;
+ }
+}
+
+/* Scrub the fields in an individual quota item. */
+STATIC int
+xrep_quota_item(
+ struct xrep_quota_info *rqi,
+ struct xfs_dquot *dq)
+{
+ struct xfs_scrub *sc = rqi->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t fs_icount;
+ bool dirty = false;
+ int error = 0;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * We might need to fix holes in the bmap record for the storage
+ * backing this dquot, so we need to lock the dquot and the quota file.
+ * dqiterate gave us a locked dquot, so drop the dquot lock to get the
+ * ILOCK_EXCL.
+ */
+ xfs_dqunlock(dq);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ xfs_dqlock(dq);
+
+ error = xrep_quota_item_bmap(sc, dq, &dirty);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ /* Check the limits. */
+ if (dq->q_blk.softlimit > dq->q_blk.hardlimit) {
+ dq->q_blk.softlimit = dq->q_blk.hardlimit;
+ dirty = true;
+ }
+
+ if (dq->q_ino.softlimit > dq->q_ino.hardlimit) {
+ dq->q_ino.softlimit = dq->q_ino.hardlimit;
+ dirty = true;
+ }
+
+ if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) {
+ dq->q_rtb.softlimit = dq->q_rtb.hardlimit;
+ dirty = true;
+ }
+
+ /*
+ * Check that usage doesn't exceed physical limits. However, on
+ * a reflink filesystem we're allowed to exceed physical space
+ * if there are no quota limits. We don't know what the real number
+ * is, but we can make quotacheck find out for us.
+ */
+ if (!xfs_has_reflink(mp) && dq->q_blk.count > mp->m_sb.sb_dblocks) {
+ dq->q_blk.reserved -= dq->q_blk.count;
+ dq->q_blk.reserved += mp->m_sb.sb_dblocks;
+ dq->q_blk.count = mp->m_sb.sb_dblocks;
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+ fs_icount = percpu_counter_sum(&mp->m_icount);
+ if (dq->q_ino.count > fs_icount) {
+ dq->q_ino.reserved -= dq->q_ino.count;
+ dq->q_ino.reserved += fs_icount;
+ dq->q_ino.count = fs_icount;
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+ if (dq->q_rtb.count > mp->m_sb.sb_rblocks) {
+ dq->q_rtb.reserved -= dq->q_rtb.count;
+ dq->q_rtb.reserved += mp->m_sb.sb_rblocks;
+ dq->q_rtb.count = mp->m_sb.sb_rblocks;
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+
+ xrep_quota_item_timer(sc, &dq->q_blk, &dirty);
+ xrep_quota_item_timer(sc, &dq->q_ino, &dirty);
+ xrep_quota_item_timer(sc, &dq->q_rtb, &dirty);
+
+ if (!dirty)
+ return 0;
+
+ trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id);
+
+ dq->q_flags |= XFS_DQFLAG_DIRTY;
+ xfs_trans_dqjoin(sc->tp, dq);
+ if (dq->q_id) {
+ xfs_qm_adjust_dqlimits(dq);
+ xfs_qm_adjust_dqtimers(dq);
+ }
+ xfs_trans_log_dquot(sc->tp, dq);
+ error = xfs_trans_roll(&sc->tp);
+ xfs_dqlock(dq);
+ return error;
+}
+
+/* Fix a quota timer so that we can pass the verifier. */
+STATIC void
+xrep_quota_fix_timer(
+ struct xfs_mount *mp,
+ const struct xfs_disk_dquot *ddq,
+ __be64 softlimit,
+ __be64 countnow,
+ __be32 *timer,
+ time64_t timelimit)
+{
+ uint64_t soft = be64_to_cpu(softlimit);
+ uint64_t count = be64_to_cpu(countnow);
+ time64_t new_timer;
+ uint32_t t;
+
+ if (!soft || count <= soft || *timer != 0)
+ return;
+
+ new_timer = xfs_dquot_set_timeout(mp,
+ ktime_get_real_seconds() + timelimit);
+ if (ddq->d_type & XFS_DQTYPE_BIGTIME)
+ t = xfs_dq_unix_to_bigtime(new_timer);
+ else
+ t = new_timer;
+
+ *timer = cpu_to_be32(t);
+}
+
+/* Fix anything the verifiers complain about. */
+STATIC int
+xrep_quota_block(
+ struct xfs_scrub *sc,
+ xfs_daddr_t daddr,
+ xfs_dqtype_t dqtype,
+ xfs_dqid_t id)
+{
+ struct xfs_dqblk *dqblk;
+ struct xfs_disk_dquot *ddq;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ struct xfs_def_quota *defq = xfs_get_defquota(qi, dqtype);
+ struct xfs_buf *bp = NULL;
+ enum xfs_blft buftype = 0;
+ int i;
+ int error;
+
+ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr,
+ qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops);
+ switch (error) {
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Failed verifier, retry read with no ops. */
+ error = xfs_trans_read_buf(sc->mp, sc->tp,
+ sc->mp->m_ddev_targp, daddr, qi->qi_dqchunklen,
+ 0, &bp, NULL);
+ if (error)
+ return error;
+ break;
+ case 0:
+ dqblk = bp->b_addr;
+ ddq = &dqblk[0].dd_diskdq;
+
+ /*
+ * If there's nothing that would impede a dqiterate, we're
+ * done.
+ */
+ if ((ddq->d_type & XFS_DQTYPE_REC_MASK) != dqtype ||
+ id == be32_to_cpu(ddq->d_id)) {
+ xfs_trans_brelse(sc->tp, bp);
+ return 0;
+ }
+ break;
+ default:
+ return error;
+ }
+
+ /* Something's wrong with the block, fix the whole thing. */
+ dqblk = bp->b_addr;
+ bp->b_ops = &xfs_dquot_buf_ops;
+ for (i = 0; i < qi->qi_dqperchunk; i++, dqblk++) {
+ ddq = &dqblk->dd_diskdq;
+
+ trace_xrep_disk_dquot(sc->mp, dqtype, id + i);
+
+ ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ ddq->d_version = XFS_DQUOT_VERSION;
+ ddq->d_type = dqtype;
+ ddq->d_id = cpu_to_be32(id + i);
+
+ if (xfs_has_bigtime(sc->mp) && ddq->d_id)
+ ddq->d_type |= XFS_DQTYPE_BIGTIME;
+
+ xrep_quota_fix_timer(sc->mp, ddq, ddq->d_blk_softlimit,
+ ddq->d_bcount, &ddq->d_btimer,
+ defq->blk.time);
+
+ xrep_quota_fix_timer(sc->mp, ddq, ddq->d_ino_softlimit,
+ ddq->d_icount, &ddq->d_itimer,
+ defq->ino.time);
+
+ xrep_quota_fix_timer(sc->mp, ddq, ddq->d_rtb_softlimit,
+ ddq->d_rtbcount, &ddq->d_rtbtimer,
+ defq->rtb.time);
+
+ /* We only support v5 filesystems so always set these. */
+ uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid);
+ xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ dqblk->dd_lsn = 0;
+ }
+ switch (dqtype) {
+ case XFS_DQTYPE_USER:
+ buftype = XFS_BLFT_UDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_GROUP:
+ buftype = XFS_BLFT_GDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_PROJ:
+ buftype = XFS_BLFT_PDQUOT_BUF;
+ break;
+ }
+ xfs_trans_buf_set_type(sc->tp, bp, buftype);
+ xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
+ return xrep_roll_trans(sc);
+}
+
+/*
+ * Repair a quota file's data fork. The function returns with the inode
+ * joined.
+ */
+STATIC int
+xrep_quota_data_fork(
+ struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype)
+{
+ struct xfs_bmbt_irec irec = { 0 };
+ struct xfs_iext_cursor icur;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t max_dqid_off;
+ xfs_fileoff_t off;
+ xfs_fsblock_t fsbno;
+ bool truncate = false;
+ bool joined = false;
+ int error = 0;
+
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ goto out;
+
+ /* Check for data fork problems that apply only to quota files. */
+ max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk;
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ if (isnullstartblock(irec.br_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ if (irec.br_startoff > max_dqid_off ||
+ irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
+ truncate = true;
+ break;
+ }
+
+ /* Convert unwritten extents to real ones. */
+ if (irec.br_state == XFS_EXT_UNWRITTEN) {
+ struct xfs_bmbt_irec nrec;
+ int nmap = 1;
+
+ if (!joined) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+ }
+
+ error = xfs_bmapi_write(sc->tp, sc->ip,
+ irec.br_startoff, irec.br_blockcount,
+ XFS_BMAPI_CONVERT, 0, &nrec, &nmap);
+ if (error)
+ goto out;
+ if (nmap != 1) {
+ error = -ENOSPC;
+ goto out;
+ }
+ ASSERT(nrec.br_startoff == irec.br_startoff);
+ ASSERT(nrec.br_blockcount == irec.br_blockcount);
+
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ goto out;
+ }
+ }
+
+ if (!joined) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+ }
+
+ if (truncate) {
+ /* Erase everything after the block containing the max dquot */
+ error = xfs_bunmapi_range(&sc->tp, sc->ip, 0,
+ max_dqid_off * sc->mp->m_sb.sb_blocksize,
+ XFS_MAX_FILEOFF);
+ if (error)
+ goto out;
+
+ /* Remove all CoW reservations. */
+ error = xfs_reflink_cancel_cow_blocks(sc->ip, &sc->tp, 0,
+ XFS_MAX_FILEOFF, true);
+ if (error)
+ goto out;
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+
+ /*
+ * Always re-log the inode so that our permanent transaction
+ * can keep on rolling it forward in the log.
+ */
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ }
+
+ /* Now go fix anything that fails the verifiers. */
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ for (fsbno = irec.br_startblock, off = irec.br_startoff;
+ fsbno < irec.br_startblock + irec.br_blockcount;
+ fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB,
+ off += XFS_DQUOT_CLUSTER_SIZE_FSB) {
+ error = xrep_quota_block(sc,
+ XFS_FSB_TO_DADDR(sc->mp, fsbno),
+ dqtype, off * qi->qi_dqperchunk);
+ if (error)
+ goto out;
+ }
+ }
+
+out:
+ return error;
+}
+
+/*
+ * Go fix anything in the quota items that we could have been mad about. Now
+ * that we've checked the quota inode data fork we have to drop ILOCK_EXCL to
+ * use the regular dquot functions.
+ */
+STATIC int
+xrep_quota_problems(
+ struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype)
+{
+ struct xchk_dqiter cursor = { };
+ struct xrep_quota_info rqi = { .sc = sc };
+ struct xfs_dquot *dq;
+ int error;
+
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xrep_quota_item(&rqi, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
+ if (error)
+ return error;
+
+ /* Make a quotacheck happen. */
+ if (rqi.need_quotacheck)
+ xrep_force_quotacheck(sc, dqtype);
+ return 0;
+}
+
+/* Repair all of a quota type's items. */
+int
+xrep_quota(
+ struct xfs_scrub *sc)
+{
+ xfs_dqtype_t dqtype;
+ int error;
+
+ dqtype = xchk_quota_to_dqtype(sc);
+
+ /*
+ * Re-take the ILOCK so that we can fix any problems that we found
+ * with the data fork mappings, or with the dquot bufs themselves.
+ */
+ if (!(sc->ilock_flags & XFS_ILOCK_EXCL))
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ error = xrep_quota_data_fork(sc, dqtype);
+ if (error)
+ return error;
+
+ /*
+ * Finish deferred items and roll the transaction to unjoin the quota
+ * inode from transaction so that we can unlock the quota inode; we
+ * play only with dquots from now on.
+ */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+ xchk_iunlock(sc, sc->ilock_flags);
+
+ /* Fix anything the dquot verifiers don't complain about. */
+ error = xrep_quota_problems(sc, dqtype);
+ if (error)
+ return error;
+
+ return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
new file mode 100644
index 000000000000..c77eb2de8df7
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck
+ * ===============
+ *
+ * Quota counters are "summary" metadata, in the sense that they are computed
+ * as the summation of the block usage counts for every file on the filesystem.
+ * Therefore, we compute the correct icount, bcount, and rtbcount values by
+ * creating a shadow quota counter structure and walking every inode.
+ */
+
+/* Track the quota deltas for a dquot in a transaction. */
+struct xqcheck_dqtrx {
+ xfs_dqtype_t q_type;
+ xfs_dqid_t q_id;
+
+ int64_t icount_delta;
+
+ int64_t bcount_delta;
+ int64_t delbcnt_delta;
+
+ int64_t rtbcount_delta;
+ int64_t delrtb_delta;
+};
+
+#define XQCHECK_MAX_NR_DQTRXS (XFS_QM_TRANS_DQTYPES * XFS_QM_TRANS_MAXDQS)
+
+/*
+ * Track the quota deltas for all dquots attached to a transaction if the
+ * quota deltas are being applied to an inode that we already scanned.
+ */
+struct xqcheck_dqacct {
+ struct rhash_head hash;
+ uintptr_t tx_id;
+ struct xqcheck_dqtrx dqtrx[XQCHECK_MAX_NR_DQTRXS];
+ unsigned int refcount;
+};
+
+/* Free a shadow dquot accounting structure. */
+static void
+xqcheck_dqacct_free(
+ void *ptr,
+ void *arg)
+{
+ struct xqcheck_dqacct *dqa = ptr;
+
+ kfree(dqa);
+}
+
+/* Set us up to scrub quota counters. */
+int
+xchk_setup_quotacheck(
+ struct xfs_scrub *sc)
+{
+ if (!XFS_IS_QUOTA_ON(sc->mp))
+ return -ENOENT;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_QUOTA);
+
+ sc->buf = kzalloc(sizeof(struct xqcheck), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 1: Collecting dquot resource usage counts. For each xfs_dquot attached
+ * to each inode, we create a shadow dquot, and compute the inode count and add
+ * the data/rt block usage from what we see.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error. Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the quota counters for an inode that we've already
+ * scanned. This will cause our counts to be incorrect. Therefore, we hook
+ * the live transaction code in two places: (1) when the callers update the
+ * per-transaction dqtrx structure to log quota counter updates; and (2) when
+ * transaction commit actually logs those updates to the incore dquot. By
+ * shadowing transaction updates in this manner, live quotacheck can ensure
+ * by locking the dquot and the shadow structure that its own copies are not
+ * out of date. Because the hook code runs in a different process context from
+ * the scrub code and the scrub state flags are not accessed atomically,
+ * failures in the hook code must abort the iscan and the scrubber must notice
+ * the aborted scan and set the incomplete flag.
+ *
+ * Note that we use srcu notifier hooks to minimize the overhead when live
+ * quotacheck is /not/ running.
+ */
+
+/* Update an incore dquot counter information from a live update. */
+static int
+xqcheck_update_incore_counts(
+ struct xqcheck *xqc,
+ struct xfarray *counts,
+ xfs_dqid_t id,
+ int64_t inodes,
+ int64_t nblks,
+ int64_t rtblks)
+{
+ struct xqcheck_dquot xcdq;
+ int error;
+
+ error = xfarray_load_sparse(counts, id, &xcdq);
+ if (error)
+ return error;
+
+ xcdq.flags |= XQCHECK_DQUOT_WRITTEN;
+ xcdq.icount += inodes;
+ xcdq.bcount += nblks;
+ xcdq.rtbcount += rtblks;
+
+ error = xfarray_store(counts, id, &xcdq);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete.
+ */
+ error = -ECANCELED;
+ }
+ return error;
+}
+
+/* Decide if this is the shadow dquot accounting structure for a transaction. */
+static int
+xqcheck_dqacct_obj_cmpfn(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const uintptr_t *tx_idp = arg->key;
+ const struct xqcheck_dqacct *dqa = obj;
+
+ if (dqa->tx_id != *tx_idp)
+ return 1;
+ return 0;
+}
+
+static const struct rhashtable_params xqcheck_dqacct_hash_params = {
+ .min_size = 32,
+ .key_len = sizeof(uintptr_t),
+ .key_offset = offsetof(struct xqcheck_dqacct, tx_id),
+ .head_offset = offsetof(struct xqcheck_dqacct, hash),
+ .automatic_shrinking = true,
+ .obj_cmpfn = xqcheck_dqacct_obj_cmpfn,
+};
+
+/* Find a shadow dqtrx slot for the given dquot. */
+STATIC struct xqcheck_dqtrx *
+xqcheck_get_dqtrx(
+ struct xqcheck_dqacct *dqa,
+ xfs_dqtype_t q_type,
+ xfs_dqid_t q_id)
+{
+ int i;
+
+ for (i = 0; i < XQCHECK_MAX_NR_DQTRXS; i++) {
+ if (dqa->dqtrx[i].q_type == 0 ||
+ (dqa->dqtrx[i].q_type == q_type &&
+ dqa->dqtrx[i].q_id == q_id))
+ return &dqa->dqtrx[i];
+ }
+
+ return NULL;
+}
+
+/*
+ * Create and fill out a quota delta tracking structure to shadow the updates
+ * going on in the regular quota code.
+ */
+static int
+xqcheck_mod_live_ino_dqtrx(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_mod_ino_dqtrx_params *p = data;
+ struct xqcheck *xqc;
+ struct xqcheck_dqacct *dqa;
+ struct xqcheck_dqtrx *dqtrx;
+ int error;
+
+ xqc = container_of(nb, struct xqcheck, qhook.mod_hook.nb);
+
+ /* Skip quota reservation fields. */
+ switch (action) {
+ case XFS_TRANS_DQ_BCOUNT:
+ case XFS_TRANS_DQ_DELBCOUNT:
+ case XFS_TRANS_DQ_ICOUNT:
+ case XFS_TRANS_DQ_RTBCOUNT:
+ case XFS_TRANS_DQ_DELRTBCOUNT:
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ /* Ignore dqtrx updates for quota types we don't care about. */
+ switch (p->q_type) {
+ case XFS_DQTYPE_USER:
+ if (!xqc->ucounts)
+ return NOTIFY_DONE;
+ break;
+ case XFS_DQTYPE_GROUP:
+ if (!xqc->gcounts)
+ return NOTIFY_DONE;
+ break;
+ case XFS_DQTYPE_PROJ:
+ if (!xqc->pcounts)
+ return NOTIFY_DONE;
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ /* Skip inodes that haven't been scanned yet. */
+ if (!xchk_iscan_want_live_update(&xqc->iscan, p->ino))
+ return NOTIFY_DONE;
+
+ /* Make a shadow quota accounting tracker for this transaction. */
+ mutex_lock(&xqc->lock);
+ dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id,
+ xqcheck_dqacct_hash_params);
+ if (!dqa) {
+ dqa = kzalloc(sizeof(struct xqcheck_dqacct), XCHK_GFP_FLAGS);
+ if (!dqa)
+ goto out_abort;
+
+ dqa->tx_id = p->tx_id;
+ error = rhashtable_insert_fast(&xqc->shadow_dquot_acct,
+ &dqa->hash, xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_abort;
+ }
+
+ /* Find the shadow dqtrx (or an empty slot) here. */
+ dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id);
+ if (!dqtrx)
+ goto out_abort;
+ if (dqtrx->q_type == 0) {
+ dqtrx->q_type = p->q_type;
+ dqtrx->q_id = p->q_id;
+ dqa->refcount++;
+ }
+
+ /* Update counter */
+ switch (action) {
+ case XFS_TRANS_DQ_BCOUNT:
+ dqtrx->bcount_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_DELBCOUNT:
+ dqtrx->delbcnt_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_ICOUNT:
+ dqtrx->icount_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_RTBCOUNT:
+ dqtrx->rtbcount_delta += p->delta;
+ break;
+ case XFS_TRANS_DQ_DELRTBCOUNT:
+ dqtrx->delrtb_delta += p->delta;
+ break;
+ }
+
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Apply the transaction quota deltas to our shadow quota accounting info when
+ * the regular quota code are doing the same.
+ */
+static int
+xqcheck_apply_live_dqtrx(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_apply_dqtrx_params *p = data;
+ struct xqcheck *xqc;
+ struct xqcheck_dqacct *dqa;
+ struct xqcheck_dqtrx *dqtrx;
+ struct xfarray *counts;
+ int error;
+
+ xqc = container_of(nb, struct xqcheck, qhook.apply_hook.nb);
+
+ /* Map the dquot type to an incore counter object. */
+ switch (p->q_type) {
+ case XFS_DQTYPE_USER:
+ counts = xqc->ucounts;
+ break;
+ case XFS_DQTYPE_GROUP:
+ counts = xqc->gcounts;
+ break;
+ case XFS_DQTYPE_PROJ:
+ counts = xqc->pcounts;
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ if (xchk_iscan_aborted(&xqc->iscan) || counts == NULL)
+ return NOTIFY_DONE;
+
+ /*
+ * Find the shadow dqtrx for this transaction and dquot, if any deltas
+ * need to be applied here. If not, we're finished early.
+ */
+ mutex_lock(&xqc->lock);
+ dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id,
+ xqcheck_dqacct_hash_params);
+ if (!dqa)
+ goto out_unlock;
+ dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id);
+ if (!dqtrx || dqtrx->q_type == 0)
+ goto out_unlock;
+
+ /* Update our shadow dquot if we're committing. */
+ if (action == XFS_APPLY_DQTRX_COMMIT) {
+ error = xqcheck_update_incore_counts(xqc, counts, p->q_id,
+ dqtrx->icount_delta,
+ dqtrx->bcount_delta + dqtrx->delbcnt_delta,
+ dqtrx->rtbcount_delta + dqtrx->delrtb_delta);
+ if (error)
+ goto out_abort;
+ }
+
+ /* Free the shadow accounting structure if that was the last user. */
+ dqa->refcount--;
+ if (dqa->refcount == 0) {
+ error = rhashtable_remove_fast(&xqc->shadow_dquot_acct,
+ &dqa->hash, xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_abort;
+ xqcheck_dqacct_free(dqa, NULL);
+ }
+
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+out_unlock:
+ mutex_unlock(&xqc->lock);
+ return NOTIFY_DONE;
+}
+
+/* Record this inode's quota usage in our shadow quota counter data. */
+STATIC int
+xqcheck_collect_inode(
+ struct xqcheck *xqc,
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp = xqc->sc->tp;
+ xfs_filblks_t nblks, rtblks;
+ uint ilock_flags = 0;
+ xfs_dqid_t id;
+ bool isreg = S_ISREG(VFS_I(ip)->i_mode);
+ int error = 0;
+
+ if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) {
+ /*
+ * Quota files are never counted towards quota, so we do not
+ * need to take the lock.
+ */
+ xchk_iscan_mark_visited(&xqc->iscan, ip);
+ return 0;
+ }
+
+ /* Figure out the data / rt device block counts. */
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (isreg)
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * Read in the data fork for rt files so that _count_blocks
+ * can count the number of blocks allocated from the rt volume.
+ * Inodes do not track that separately.
+ */
+ ilock_flags = xfs_ilock_data_map_shared(ip);
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_abort;
+ } else {
+ ilock_flags = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ }
+ xfs_inode_count_blocks(tp, ip, &nblks, &rtblks);
+
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ error = -ECANCELED;
+ goto out_incomplete;
+ }
+
+ /* Update the shadow dquot counters. */
+ mutex_lock(&xqc->lock);
+ if (xqc->ucounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_USER);
+ error = xqcheck_update_incore_counts(xqc, xqc->ucounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+
+ if (xqc->gcounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_GROUP);
+ error = xqcheck_update_incore_counts(xqc, xqc->gcounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+
+ if (xqc->pcounts) {
+ id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_PROJ);
+ error = xqcheck_update_incore_counts(xqc, xqc->pcounts, id, 1,
+ nblks, rtblks);
+ if (error)
+ goto out_mutex;
+ }
+ mutex_unlock(&xqc->lock);
+
+ xchk_iscan_mark_visited(&xqc->iscan, ip);
+ goto out_ilock;
+
+out_mutex:
+ mutex_unlock(&xqc->lock);
+out_abort:
+ xchk_iscan_abort(&xqc->iscan);
+out_incomplete:
+ xchk_set_incomplete(xqc->sc);
+out_ilock:
+ xfs_iunlock(ip, ilock_flags);
+ if (isreg)
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return error;
+}
+
+/* Walk all the allocated inodes and run a quota scan on them. */
+STATIC int
+xqcheck_collect_counts(
+ struct xqcheck *xqc)
+{
+ struct xfs_scrub *sc = xqc->sc;
+ struct xfs_inode *ip;
+ int error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Cancel the transaction to release the log grant space while we scan
+ * the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done without
+ * risk of deadlock between sb_internal and the IOLOCK (we take the
+ * IOLOCK to quiesce the file before scanning) because empty
+ * transactions do not take sb_internal.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) {
+ error = xqcheck_collect_inode(xqc, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&xqc->iscan);
+ if (error) {
+ xchk_set_incomplete(sc);
+ /*
+ * If we couldn't grab an inode that was busy with a state
+ * change, change the error code so that we exit to userspace
+ * as quickly as possible.
+ */
+ if (error == -EBUSY)
+ return -ECANCELED;
+ return error;
+ }
+
+ /*
+ * Switch out for a real transaction in preparation for building a new
+ * tree.
+ */
+ xchk_trans_cancel(sc);
+ return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 2: Comparing dquot resource counters. Walk each xfs_dquot, comparing
+ * the resource usage counters against our shadow dquots; and then walk each
+ * shadow dquot (that wasn't covered in the first part), comparing it against
+ * the xfs_dquot.
+ */
+
+/*
+ * Check the dquot data against what we observed. Caller must hold the dquot
+ * lock.
+ */
+STATIC int
+xqcheck_compare_dquot(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype,
+ struct xfs_dquot *dq)
+{
+ struct xqcheck_dquot xcdq;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ int error;
+
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ xchk_set_incomplete(xqc->sc);
+ return -ECANCELED;
+ }
+
+ mutex_lock(&xqc->lock);
+ error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+ if (error)
+ goto out_unlock;
+
+ if (xcdq.icount != dq->q_ino.count)
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+ if (xcdq.bcount != dq->q_blk.count)
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+ if (xcdq.rtbcount != dq->q_rtb.count)
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+ xcdq.flags |= (XQCHECK_DQUOT_COMPARE_SCANNED | XQCHECK_DQUOT_WRITTEN);
+ error = xfarray_store(counts, dq->q_id, &xcdq);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the check and
+ * must notify userspace that the check was incomplete. This
+ * should never happen outside of the collection phase.
+ */
+ xchk_set_incomplete(xqc->sc);
+ error = -ECANCELED;
+ }
+ mutex_unlock(&xqc->lock);
+ if (error)
+ return error;
+
+ if (xqc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -ECANCELED;
+
+ return 0;
+
+out_unlock:
+ mutex_unlock(&xqc->lock);
+ return error;
+}
+
+/*
+ * Walk all the observed dquots, and make sure there's a matching incore
+ * dquot and that its counts match ours.
+ */
+STATIC int
+xqcheck_walk_observations(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype)
+{
+ struct xqcheck_dquot xcdq;
+ struct xfs_dquot *dq;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ mutex_lock(&xqc->lock);
+ while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
+ xfs_dqid_t id = cur - 1;
+
+ if (xcdq.flags & XQCHECK_DQUOT_COMPARE_SCANNED)
+ continue;
+
+ mutex_unlock(&xqc->lock);
+
+ error = xfs_qm_dqget(xqc->sc->mp, id, dqtype, false, &dq);
+ if (error == -ENOENT) {
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, id);
+ return 0;
+ }
+ if (error)
+ return error;
+
+ error = xqcheck_compare_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ return error;
+
+ if (xchk_should_terminate(xqc->sc, &error))
+ return error;
+
+ mutex_lock(&xqc->lock);
+ }
+ mutex_unlock(&xqc->lock);
+
+ return error;
+}
+
+/* Compare the quota counters we observed against the live dquots. */
+STATIC int
+xqcheck_compare_dqtype(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype)
+{
+ struct xchk_dqiter cursor = { };
+ struct xfs_scrub *sc = xqc->sc;
+ struct xfs_dquot *dq;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* If the quota CHKD flag is cleared, we need to repair this quota. */
+ if (!(xfs_quota_chkd_flag(dqtype) & sc->mp->m_qflags)) {
+ xchk_qcheck_set_corrupt(xqc->sc, dqtype, 0);
+ return 0;
+ }
+
+ /* Compare what we observed against the actual dquots. */
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xqcheck_compare_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
+ if (error)
+ return error;
+
+ /* Walk all the observed dquots and compare to the incore ones. */
+ return xqcheck_walk_observations(xqc, dqtype);
+}
+
+/* Tear down everything associated with a quotacheck. */
+static void
+xqcheck_teardown_scan(
+ void *priv)
+{
+ struct xqcheck *xqc = priv;
+ struct xfs_quotainfo *qi = xqc->sc->mp->m_quotainfo;
+
+ /* Discourage any hook functions that might be running. */
+ xchk_iscan_abort(&xqc->iscan);
+
+ /*
+ * As noted above, the apply hook is responsible for cleaning up the
+ * shadow dquot accounting data when a transaction completes. The mod
+ * hook must be removed before the apply hook so that we don't
+ * mistakenly leave an active shadow account for the mod hook to get
+ * its hands on. No hooks should be running after these functions
+ * return.
+ */
+ xfs_dqtrx_hook_del(qi, &xqc->qhook);
+
+ if (xqc->shadow_dquot_acct.key_len) {
+ rhashtable_free_and_destroy(&xqc->shadow_dquot_acct,
+ xqcheck_dqacct_free, NULL);
+ xqc->shadow_dquot_acct.key_len = 0;
+ }
+
+ if (xqc->pcounts) {
+ xfarray_destroy(xqc->pcounts);
+ xqc->pcounts = NULL;
+ }
+
+ if (xqc->gcounts) {
+ xfarray_destroy(xqc->gcounts);
+ xqc->gcounts = NULL;
+ }
+
+ if (xqc->ucounts) {
+ xfarray_destroy(xqc->ucounts);
+ xqc->ucounts = NULL;
+ }
+
+ xchk_iscan_teardown(&xqc->iscan);
+ mutex_destroy(&xqc->lock);
+ xqc->sc = NULL;
+}
+
+/*
+ * Scan all inodes in the entire filesystem to generate quota counter data.
+ * If the scan is successful, the quota data will be left alive for a repair.
+ * If any error occurs, we'll tear everything down.
+ */
+STATIC int
+xqcheck_setup_scan(
+ struct xfs_scrub *sc,
+ struct xqcheck *xqc)
+{
+ char *descr;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ unsigned long long max_dquots = XFS_DQ_ID_MAX + 1ULL;
+ int error;
+
+ ASSERT(xqc->sc == NULL);
+ xqc->sc = sc;
+
+ mutex_init(&xqc->lock);
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &xqc->iscan);
+
+ error = -ENOMEM;
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_USER)) {
+ descr = xchk_xfile_descr(sc, "user dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->ucounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_GROUP)) {
+ descr = xchk_xfile_descr(sc, "group dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->gcounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_PROJ)) {
+ descr = xchk_xfile_descr(sc, "project dquot records");
+ error = xfarray_create(descr, max_dquots,
+ sizeof(struct xqcheck_dquot), &xqc->pcounts);
+ kfree(descr);
+ if (error)
+ goto out_teardown;
+ }
+
+ /*
+ * Set up hash table to map transactions to our internal shadow dqtrx
+ * structures.
+ */
+ error = rhashtable_init(&xqc->shadow_dquot_acct,
+ &xqcheck_dqacct_hash_params);
+ if (error)
+ goto out_teardown;
+
+ /*
+ * Hook into the quota code. The hook only triggers for inodes that
+ * were already scanned, and the scanner thread takes each inode's
+ * ILOCK, which means that any in-progress inode updates will finish
+ * before we can scan the inode.
+ *
+ * The apply hook (which removes the shadow dquot accounting struct)
+ * must be installed before the mod hook so that we never fail to catch
+ * the end of a quota update sequence and leave stale shadow data.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_QUOTA);
+ xfs_dqtrx_hook_setup(&xqc->qhook, xqcheck_mod_live_ino_dqtrx,
+ xqcheck_apply_live_dqtrx);
+
+ error = xfs_dqtrx_hook_add(qi, &xqc->qhook);
+ if (error)
+ goto out_teardown;
+
+ /* Use deferred cleanup to pass the quota count data to repair. */
+ sc->buf_cleanup = xqcheck_teardown_scan;
+ return 0;
+
+out_teardown:
+ xqcheck_teardown_scan(xqc);
+ return error;
+}
+
+/* Scrub all counters for a given quota type. */
+int
+xchk_quotacheck(
+ struct xfs_scrub *sc)
+{
+ struct xqcheck *xqc = sc->buf;
+ int error = 0;
+
+ /* Check quota counters on the live filesystem. */
+ error = xqcheck_setup_scan(sc, xqc);
+ if (error)
+ return error;
+
+ /* Walk all inodes, picking up quota information. */
+ error = xqcheck_collect_counts(xqc);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ /* Fail fast if we're not playing with a full dataset. */
+ if (xchk_iscan_aborted(&xqc->iscan))
+ xchk_set_incomplete(sc);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
+ /* Compare quota counters. */
+ if (xqc->ucounts) {
+ error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_USER);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+ }
+ if (xqc->gcounts) {
+ error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_GROUP);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+ }
+ if (xqc->pcounts) {
+ error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_PROJ);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+ }
+
+ /* Check one last time for an incomplete dataset. */
+ if (xchk_iscan_aborted(&xqc->iscan))
+ xchk_set_incomplete(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h
new file mode 100644
index 000000000000..4ea5f249c978
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_QUOTACHECK_H__
+#define __XFS_SCRUB_QUOTACHECK_H__
+
+/* Quota counters for live quotacheck. */
+struct xqcheck_dquot {
+ /* block usage count */
+ int64_t bcount;
+
+ /* inode usage count */
+ int64_t icount;
+
+ /* realtime block usage count */
+ int64_t rtbcount;
+
+ /* Record state */
+ unsigned int flags;
+};
+
+/*
+ * This incore dquot record has been written at least once. We never want to
+ * store an xqcheck_dquot that looks uninitialized.
+ */
+#define XQCHECK_DQUOT_WRITTEN (1U << 0)
+
+/* Already checked this dquot. */
+#define XQCHECK_DQUOT_COMPARE_SCANNED (1U << 1)
+
+/* Already repaired this dquot. */
+#define XQCHECK_DQUOT_REPAIR_SCANNED (1U << 2)
+
+/* Live quotacheck control structure. */
+struct xqcheck {
+ struct xfs_scrub *sc;
+
+ /* Shadow dquot counter data. */
+ struct xfarray *ucounts;
+ struct xfarray *gcounts;
+ struct xfarray *pcounts;
+
+ /* Lock protecting quotacheck count observations */
+ struct mutex lock;
+
+ struct xchk_iscan iscan;
+
+ /* Hooks into the quota code. */
+ struct xfs_dqtrx_hook qhook;
+
+ /* Shadow quota delta tracking structure. */
+ struct rhashtable shadow_dquot_acct;
+};
+
+/* Return the incore counter array for a given quota type. */
+static inline struct xfarray *
+xqcheck_counters_for(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype)
+{
+ switch (dqtype) {
+ case XFS_DQTYPE_USER:
+ return xqc->ucounts;
+ case XFS_DQTYPE_GROUP:
+ return xqc->gcounts;
+ case XFS_DQTYPE_PROJ:
+ return xqc->pcounts;
+ }
+
+ ASSERT(0);
+ return NULL;
+}
+
+#endif /* __XFS_SCRUB_QUOTACHECK_H__ */
diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c
new file mode 100644
index 000000000000..dd8554c755b5
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck_repair.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck Repair
+ * ======================
+ *
+ * Use the live quota counter information that we collected to replace the
+ * counter values in the incore dquots. A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * dquot is locked.
+ */
+
+/* Commit new counters to a dquot. */
+static int
+xqcheck_commit_dquot(
+ struct xqcheck *xqc,
+ xfs_dqtype_t dqtype,
+ struct xfs_dquot *dq)
+{
+ struct xqcheck_dquot xcdq;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ int64_t delta;
+ bool dirty = false;
+ int error = 0;
+
+ /* Unlock the dquot just long enough to allocate a transaction. */
+ xfs_dqunlock(dq);
+ error = xchk_trans_alloc(xqc->sc, 0);
+ xfs_dqlock(dq);
+ if (error)
+ return error;
+
+ xfs_trans_dqjoin(xqc->sc->tp, dq);
+
+ if (xchk_iscan_aborted(&xqc->iscan)) {
+ error = -ECANCELED;
+ goto out_cancel;
+ }
+
+ mutex_lock(&xqc->lock);
+ error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+ if (error)
+ goto out_unlock;
+
+ /* Adjust counters as needed. */
+ delta = (int64_t)xcdq.icount - dq->q_ino.count;
+ if (delta) {
+ dq->q_ino.reserved += delta;
+ dq->q_ino.count += delta;
+ dirty = true;
+ }
+
+ delta = (int64_t)xcdq.bcount - dq->q_blk.count;
+ if (delta) {
+ dq->q_blk.reserved += delta;
+ dq->q_blk.count += delta;
+ dirty = true;
+ }
+
+ delta = (int64_t)xcdq.rtbcount - dq->q_rtb.count;
+ if (delta) {
+ dq->q_rtb.reserved += delta;
+ dq->q_rtb.count += delta;
+ dirty = true;
+ }
+
+ xcdq.flags |= (XQCHECK_DQUOT_REPAIR_SCANNED | XQCHECK_DQUOT_WRITTEN);
+ error = xfarray_store(counts, dq->q_id, &xcdq);
+ if (error == -EFBIG) {
+ /*
+ * EFBIG means we tried to store data at too high a byte offset
+ * in the sparse array. IOWs, we cannot complete the repair
+ * and must cancel the whole operation. This should never
+ * happen, but we need to catch it anyway.
+ */
+ error = -ECANCELED;
+ }
+ mutex_unlock(&xqc->lock);
+ if (error || !dirty)
+ goto out_cancel;
+
+ trace_xrep_quotacheck_dquot(xqc->sc->mp, dq->q_type, dq->q_id);
+
+ /* Commit the dirty dquot to disk. */
+ dq->q_flags |= XFS_DQFLAG_DIRTY;
+ if (dq->q_id)
+ xfs_qm_adjust_dqtimers(dq);
+ xfs_trans_log_dquot(xqc->sc->tp, dq);
+
+ /*
+ * Transaction commit unlocks the dquot, so we must re-lock it so that
+ * the caller can put the reference (which apparently requires a locked
+ * dquot).
+ */
+ error = xrep_trans_commit(xqc->sc);
+ xfs_dqlock(dq);
+ return error;
+
+out_unlock:
+ mutex_unlock(&xqc->lock);
+out_cancel:
+ xchk_trans_cancel(xqc->sc);
+
+ /* Re-lock the dquot so the caller can put the reference. */
+ xfs_dqlock(dq);
+ return error;
+}
+
+/* Commit new quota counters for a particular quota type. */
+STATIC int
+xqcheck_commit_dqtype(
+ struct xqcheck *xqc,
+ unsigned int dqtype)
+{
+ struct xchk_dqiter cursor = { };
+ struct xqcheck_dquot xcdq;
+ struct xfs_scrub *sc = xqc->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfarray *counts = xqcheck_counters_for(xqc, dqtype);
+ struct xfs_dquot *dq;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ int error;
+
+ /*
+ * Update the counters of every dquot that the quota file knows about.
+ */
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xqcheck_commit_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Make a second pass to deal with the dquots that we know about but
+ * the quota file previously did not know about.
+ */
+ mutex_lock(&xqc->lock);
+ while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
+ xfs_dqid_t id = cur - 1;
+
+ if (xcdq.flags & XQCHECK_DQUOT_REPAIR_SCANNED)
+ continue;
+
+ mutex_unlock(&xqc->lock);
+
+ /*
+ * Grab the dquot, allowing for dquot block allocation in a
+ * separate transaction. We committed the scrub transaction
+ * in a previous step, so we will not be creating nested
+ * transactions here.
+ */
+ error = xfs_qm_dqget(mp, id, dqtype, true, &dq);
+ if (error)
+ return error;
+
+ error = xqcheck_commit_dquot(xqc, dqtype, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ return error;
+
+ mutex_lock(&xqc->lock);
+ }
+ mutex_unlock(&xqc->lock);
+
+ return error;
+}
+
+/* Figure out quota CHKD flags for the running quota types. */
+static inline unsigned int
+xqcheck_chkd_flags(
+ struct xfs_mount *mp)
+{
+ unsigned int ret = 0;
+
+ if (XFS_IS_UQUOTA_ON(mp))
+ ret |= XFS_UQUOTA_CHKD;
+ if (XFS_IS_GQUOTA_ON(mp))
+ ret |= XFS_GQUOTA_CHKD;
+ if (XFS_IS_PQUOTA_ON(mp))
+ ret |= XFS_PQUOTA_CHKD;
+ return ret;
+}
+
+/* Commit the new dquot counters. */
+int
+xrep_quotacheck(
+ struct xfs_scrub *sc)
+{
+ struct xqcheck *xqc = sc->buf;
+ unsigned int qflags = xqcheck_chkd_flags(sc->mp);
+ int error;
+
+ /*
+ * Clear the CHKD flag for the running quota types and commit the scrub
+ * transaction so that we can allocate new quota block mappings if we
+ * have to. If we crash after this point, the sb still has the CHKD
+ * flags cleared, so mount quotacheck will fix all of this up.
+ */
+ xrep_update_qflags(sc, qflags, 0);
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ /* Commit the new counters to the dquots. */
+ if (xqc->ucounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_USER);
+ if (error)
+ return error;
+ }
+ if (xqc->gcounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_GROUP);
+ if (error)
+ return error;
+ }
+ if (xqc->pcounts) {
+ error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_PROJ);
+ if (error)
+ return error;
+ }
+
+ /* Set the CHKD flags now that we've fixed quota counts. */
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ xrep_update_qflags(sc, 0, qflags);
+ return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/rcbag.c b/fs/xfs/scrub/rcbag.c
new file mode 100644
index 000000000000..e1e52bc20713
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/scrub.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/rcbag.h"
+#include "scrub/trace.h"
+
+struct rcbag {
+ struct xfs_mount *mp;
+ struct xfbtree xfbtree;
+ uint64_t nr_items;
+};
+
+int
+rcbag_init(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *btp,
+ struct rcbag **bagp)
+{
+ struct rcbag *bag;
+ int error;
+
+ bag = kzalloc(sizeof(struct rcbag), XCHK_GFP_FLAGS);
+ if (!bag)
+ return -ENOMEM;
+
+ bag->nr_items = 0;
+ bag->mp = mp;
+
+ error = rcbagbt_mem_init(mp, &bag->xfbtree, btp);
+ if (error)
+ goto out_bag;
+
+ *bagp = bag;
+ return 0;
+
+out_bag:
+ kfree(bag);
+ return error;
+}
+
+void
+rcbag_free(
+ struct rcbag **bagp)
+{
+ struct rcbag *bag = *bagp;
+
+ xfbtree_destroy(&bag->xfbtree);
+ kfree(bag);
+ *bagp = NULL;
+}
+
+/* Track an rmap in the refcount bag. */
+int
+rcbag_add(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ const struct xfs_rmap_irec *rmap)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ int has;
+ int error;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = rcbagbt_lookup_eq(cur, rmap, &has);
+ if (error)
+ goto out_cur;
+
+ if (has) {
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ bagrec.rbg_refcount++;
+ error = rcbagbt_update(cur, &bagrec);
+ if (error)
+ goto out_cur;
+ } else {
+ bagrec.rbg_startblock = rmap->rm_startblock;
+ bagrec.rbg_blockcount = rmap->rm_blockcount;
+ bagrec.rbg_refcount = 1;
+
+ error = rcbagbt_insert(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+
+ error = xfbtree_trans_commit(&bag->xfbtree, tp);
+ if (error)
+ return error;
+
+ bag->nr_items++;
+ return 0;
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ xfbtree_trans_cancel(&bag->xfbtree, tp);
+ return error;
+}
+
+/* Return the number of records in the bag. */
+uint64_t
+rcbag_count(
+ const struct rcbag *rcbag)
+{
+ return rcbag->nr_items;
+}
+
+static inline uint32_t rcbag_rec_next_bno(const struct rcbag_rec *r)
+{
+ return r->rbg_startblock + r->rbg_blockcount;
+}
+
+/*
+ * Find the next block where the refcount changes, given the next rmap we
+ * looked at and the ones we're already tracking.
+ */
+int
+rcbag_next_edge(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ const struct xfs_rmap_irec *next_rmap,
+ bool next_valid,
+ uint32_t *next_bnop)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ uint32_t next_bno = NULLAGBLOCK;
+ int has;
+ int error;
+
+ if (next_valid)
+ next_bno = next_rmap->rm_startblock;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_increment(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ next_bno = min(next_bno, rcbag_rec_next_bno(&bagrec));
+ }
+
+ /*
+ * We should have found /something/ because either next_rrm is the next
+ * interesting rmap to look at after emitting this refcount extent, or
+ * there are other rmaps in rmap_bag contributing to the current
+ * sharing count. But if something is seriously wrong, bail out.
+ */
+ if (next_bno == NULLAGBLOCK) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+
+ *next_bnop = next_bno;
+ return 0;
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/* Pop all refcount bag records that end at next_bno */
+int
+rcbag_remove_ending_at(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ uint32_t next_bno)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ int has;
+ int error;
+
+ /* go to the right edge of the tree */
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ memset(&cur->bc_rec, 0xFF, sizeof(cur->bc_rec));
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &has);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_decrement(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ if (rcbag_rec_next_bno(&bagrec) != next_bno)
+ continue;
+
+ error = xfs_btree_delete(cur, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ bag->nr_items -= bagrec.rbg_refcount;
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+ return xfbtree_trans_commit(&bag->xfbtree, tp);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ xfbtree_trans_cancel(&bag->xfbtree, tp);
+ return error;
+}
+
+/* Dump the rcbag. */
+void
+rcbag_dump(
+ struct rcbag *bag,
+ struct xfs_trans *tp)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ unsigned long long nr = 0;
+ int has;
+ int error;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_increment(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ xfs_err(bag->mp, "[%llu]: bno 0x%x fsbcount 0x%x refcount 0x%llx\n",
+ nr++,
+ (unsigned int)bagrec.rbg_startblock,
+ (unsigned int)bagrec.rbg_blockcount,
+ (unsigned long long)bagrec.rbg_refcount);
+ }
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+}
diff --git a/fs/xfs/scrub/rcbag.h b/fs/xfs/scrub/rcbag.h
new file mode 100644
index 000000000000..e29ef788ba72
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_H__
+#define __XFS_SCRUB_RCBAG_H__
+
+struct xfs_mount;
+struct rcbag;
+struct xfs_buftarg;
+
+int rcbag_init(struct xfs_mount *mp, struct xfs_buftarg *btp,
+ struct rcbag **bagp);
+void rcbag_free(struct rcbag **bagp);
+int rcbag_add(struct rcbag *bag, struct xfs_trans *tp,
+ const struct xfs_rmap_irec *rmap);
+uint64_t rcbag_count(const struct rcbag *bag);
+
+int rcbag_next_edge(struct rcbag *bag, struct xfs_trans *tp,
+ const struct xfs_rmap_irec *next_rmap, bool next_valid,
+ uint32_t *next_bnop);
+int rcbag_remove_ending_at(struct rcbag *bag, struct xfs_trans *tp,
+ uint32_t next_bno);
+
+void rcbag_dump(struct rcbag *bag, struct xfs_trans *tp);
+
+#endif /* __XFS_SCRUB_RCBAG_H__ */
diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c
new file mode 100644
index 000000000000..709356dc6256
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/trace.h"
+
+static struct kmem_cache *rcbagbt_cur_cache;
+
+STATIC void
+rcbagbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
+{
+ struct rcbag_key *bag_key = (struct rcbag_key *)key;
+ const struct rcbag_rec *bag_rec = (const struct rcbag_rec *)rec;
+
+ BUILD_BUG_ON(sizeof(struct rcbag_key) > sizeof(union xfs_btree_key));
+ BUILD_BUG_ON(sizeof(struct rcbag_rec) > sizeof(union xfs_btree_rec));
+
+ bag_key->rbg_startblock = bag_rec->rbg_startblock;
+ bag_key->rbg_blockcount = bag_rec->rbg_blockcount;
+}
+
+STATIC void
+rcbagbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ struct rcbag_rec *bag_rec = (struct rcbag_rec *)rec;
+ struct rcbag_rec *bag_irec = (struct rcbag_rec *)&cur->bc_rec;
+
+ bag_rec->rbg_startblock = bag_irec->rbg_startblock;
+ bag_rec->rbg_blockcount = bag_irec->rbg_blockcount;
+ bag_rec->rbg_refcount = bag_irec->rbg_refcount;
+}
+
+STATIC int64_t
+rcbagbt_key_diff(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
+{
+ struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
+ const struct rcbag_key *kp = (const struct rcbag_key *)key;
+
+ if (kp->rbg_startblock > rec->rbg_startblock)
+ return 1;
+ if (kp->rbg_startblock < rec->rbg_startblock)
+ return -1;
+
+ if (kp->rbg_blockcount > rec->rbg_blockcount)
+ return 1;
+ if (kp->rbg_blockcount < rec->rbg_blockcount)
+ return -1;
+
+ return 0;
+}
+
+STATIC int64_t
+rcbagbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
+{
+ const struct rcbag_key *kp1 = (const struct rcbag_key *)k1;
+ const struct rcbag_key *kp2 = (const struct rcbag_key *)k2;
+
+ ASSERT(mask == NULL);
+
+ if (kp1->rbg_startblock > kp2->rbg_startblock)
+ return 1;
+ if (kp1->rbg_startblock < kp2->rbg_startblock)
+ return -1;
+
+ if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+ return 1;
+ if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+ return -1;
+
+ return 0;
+}
+
+STATIC int
+rcbagbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
+{
+ const struct rcbag_key *kp1 = (const struct rcbag_key *)k1;
+ const struct rcbag_key *kp2 = (const struct rcbag_key *)k2;
+
+ if (kp1->rbg_startblock > kp2->rbg_startblock)
+ return 0;
+ if (kp1->rbg_startblock < kp2->rbg_startblock)
+ return 1;
+
+ if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+ return 0;
+ if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+ return 1;
+
+ return 0;
+}
+
+STATIC int
+rcbagbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
+{
+ const struct rcbag_rec *rp1 = (const struct rcbag_rec *)r1;
+ const struct rcbag_rec *rp2 = (const struct rcbag_rec *)r2;
+
+ if (rp1->rbg_startblock > rp2->rbg_startblock)
+ return 0;
+ if (rp1->rbg_startblock < rp2->rbg_startblock)
+ return 1;
+
+ if (rp1->rbg_blockcount > rp2->rbg_blockcount)
+ return 0;
+ if (rp1->rbg_blockcount < rp2->rbg_blockcount)
+ return 1;
+
+ return 0;
+}
+
+static xfs_failaddr_t
+rcbagbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_failaddr_t fa;
+ unsigned int level;
+ unsigned int maxrecs;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+ if (fa)
+ return fa;
+
+ level = be16_to_cpu(block->bb_level);
+ if (level >= rcbagbt_maxlevels_possible())
+ return __this_address;
+
+ maxrecs = rcbagbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0);
+ return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+rcbagbt_rw_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa = rcbagbt_verify(bp);
+
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops rcbagbt_mem_buf_ops = {
+ .name = "rcbagbt_mem",
+ .magic = { 0, cpu_to_be32(RCBAG_MAGIC) },
+ .verify_read = rcbagbt_rw_verify,
+ .verify_write = rcbagbt_rw_verify,
+ .verify_struct = rcbagbt_verify,
+};
+
+static const struct xfs_btree_ops rcbagbt_mem_ops = {
+ .name = "rcbag",
+ .type = XFS_BTREE_TYPE_MEM,
+
+ .rec_len = sizeof(struct rcbag_rec),
+ .key_len = sizeof(struct rcbag_key),
+ .ptr_len = XFS_BTREE_LONG_PTR_LEN,
+
+ .lru_refs = 1,
+ .statoff = XFS_STATS_CALC_INDEX(xs_rcbag_2),
+
+ .dup_cursor = xfbtree_dup_cursor,
+ .set_root = xfbtree_set_root,
+ .alloc_block = xfbtree_alloc_block,
+ .free_block = xfbtree_free_block,
+ .get_minrecs = xfbtree_get_minrecs,
+ .get_maxrecs = xfbtree_get_maxrecs,
+ .init_key_from_rec = rcbagbt_init_key_from_rec,
+ .init_rec_from_cur = rcbagbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfbtree_init_ptr_from_cur,
+ .key_diff = rcbagbt_key_diff,
+ .buf_ops = &rcbagbt_mem_buf_ops,
+ .diff_two_keys = rcbagbt_diff_two_keys,
+ .keys_inorder = rcbagbt_keys_inorder,
+ .recs_inorder = rcbagbt_recs_inorder,
+};
+
+/* Create a cursor for an in-memory btree. */
+struct xfs_btree_cur *
+rcbagbt_mem_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfbtree *xfbtree)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_btree_alloc_cursor(mp, tp, &rcbagbt_mem_ops,
+ rcbagbt_maxlevels_possible(), rcbagbt_cur_cache);
+
+ cur->bc_mem.xfbtree = xfbtree;
+ cur->bc_nlevels = xfbtree->nlevels;
+ return cur;
+}
+
+/* Create an in-memory refcount bag btree. */
+int
+rcbagbt_mem_init(
+ struct xfs_mount *mp,
+ struct xfbtree *xfbt,
+ struct xfs_buftarg *btp)
+{
+ xfbt->owner = 0;
+ return xfbtree_init(mp, xfbt, btp, &rcbagbt_mem_ops);
+}
+
+/* Calculate number of records in a refcount bag btree block. */
+static inline unsigned int
+rcbagbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct rcbag_rec);
+ return blocklen /
+ (sizeof(struct rcbag_key) + sizeof(rcbag_ptr_t));
+}
+
+/*
+ * Calculate number of records in an refcount bag btree block.
+ */
+unsigned int
+rcbagbt_maxrecs(
+ struct xfs_mount *mp,
+ unsigned int blocklen,
+ bool leaf)
+{
+ blocklen -= RCBAG_BLOCK_LEN;
+ return rcbagbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for refcount bag btrees. */
+unsigned int
+rcbagbt_maxlevels_possible(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_space_to_height(minrecs, ULLONG_MAX);
+}
+
+/* Calculate the refcount bag btree size for some records. */
+unsigned long long
+rcbagbt_calc_size(
+ unsigned long long nr_records)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+ minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_calc_size(minrecs, nr_records);
+}
+
+int __init
+rcbagbt_init_cur_cache(void)
+{
+ rcbagbt_cur_cache = kmem_cache_create("xfs_rcbagbt_cur",
+ xfs_btree_cur_sizeof(rcbagbt_maxlevels_possible()),
+ 0, 0, NULL);
+
+ if (!rcbagbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+rcbagbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(rcbagbt_cur_cache);
+ rcbagbt_cur_cache = NULL;
+}
+
+/* Look up the refcount bag record corresponding to this reverse mapping. */
+int
+rcbagbt_lookup_eq(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rmap,
+ int *success)
+{
+ struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
+
+ rec->rbg_startblock = rmap->rm_startblock;
+ rec->rbg_blockcount = rmap->rm_blockcount;
+
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, success);
+}
+
+/* Get the data from the pointed-to record. */
+int
+rcbagbt_get_rec(
+ struct xfs_btree_cur *cur,
+ struct rcbag_rec *rec,
+ int *has)
+{
+ union xfs_btree_rec *btrec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &btrec, has);
+ if (error || !(*has))
+ return error;
+
+ memcpy(rec, btrec, sizeof(struct rcbag_rec));
+ return 0;
+}
+
+/* Update the record referred to by cur to the value given. */
+int
+rcbagbt_update(
+ struct xfs_btree_cur *cur,
+ const struct rcbag_rec *rec)
+{
+ union xfs_btree_rec btrec;
+
+ memcpy(&btrec, rec, sizeof(struct rcbag_rec));
+ return xfs_btree_update(cur, &btrec);
+}
+
+/* Update the record referred to by cur to the value given. */
+int
+rcbagbt_insert(
+ struct xfs_btree_cur *cur,
+ const struct rcbag_rec *rec,
+ int *success)
+{
+ struct rcbag_rec *btrec = (struct rcbag_rec *)&cur->bc_rec;
+
+ memcpy(btrec, rec, sizeof(struct rcbag_rec));
+ return xfs_btree_insert(cur, success);
+}
diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h
new file mode 100644
index 000000000000..03cadb032552
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.h
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_BTREE_H__
+#define __XFS_SCRUB_RCBAG_BTREE_H__
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+#define RCBAG_MAGIC 0x74826671 /* 'JRBG' */
+
+struct rcbag_key {
+ uint32_t rbg_startblock;
+ uint32_t rbg_blockcount;
+};
+
+struct rcbag_rec {
+ uint32_t rbg_startblock;
+ uint32_t rbg_blockcount;
+ uint64_t rbg_refcount;
+};
+
+typedef __be64 rcbag_ptr_t;
+
+/* reflinks only exist on crc enabled filesystems */
+#define RCBAG_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define RCBAG_REC_ADDR(block, index) \
+ ((struct rcbag_rec *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct rcbag_rec))))
+
+#define RCBAG_KEY_ADDR(block, index) \
+ ((struct rcbag_key *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ ((index) - 1) * sizeof(struct rcbag_key)))
+
+#define RCBAG_PTR_ADDR(block, index, maxrecs) \
+ ((rcbag_ptr_t *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ (maxrecs) * sizeof(struct rcbag_key) + \
+ ((index) - 1) * sizeof(rcbag_ptr_t)))
+
+unsigned int rcbagbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
+
+unsigned long long rcbagbt_calc_size(unsigned long long nr_records);
+
+unsigned int rcbagbt_maxlevels_possible(void);
+
+int __init rcbagbt_init_cur_cache(void);
+void rcbagbt_destroy_cur_cache(void);
+
+struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfbtree *xfbtree);
+int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+ struct xfs_buftarg *btp);
+
+int rcbagbt_lookup_eq(struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rmap, int *success);
+int rcbagbt_get_rec(struct xfs_btree_cur *cur, struct rcbag_rec *rec, int *has);
+int rcbagbt_update(struct xfs_btree_cur *cur, const struct rcbag_rec *rec);
+int rcbagbt_insert(struct xfs_btree_cur *cur, const struct rcbag_rec *rec,
+ int *success);
+
+#else
+# define rcbagbt_init_cur_cache() 0
+# define rcbagbt_destroy_cur_cache() ((void)0)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_SCRUB_RCBAG_BTREE_H__ */
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
index e51c1544be63..dfdcb96b6c16 100644
--- a/fs/xfs/scrub/readdir.c
+++ b/fs/xfs/scrub/readdir.c
@@ -36,16 +36,14 @@ xchk_dir_walk_sf(
struct xfs_mount *mp = dp->i_mount;
struct xfs_da_geometry *geo = mp->m_dir_geo;
struct xfs_dir2_sf_entry *sfep;
- struct xfs_dir2_sf_hdr *sfp;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
xfs_ino_t ino;
xfs_dir2_dataptr_t dapos;
unsigned int i;
int error;
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
-
- sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
/* dot entry */
dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
@@ -283,7 +281,7 @@ xchk_dir_walk(
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
return xchk_dir_walk_sf(sc, dp, dirent_fn, priv);
@@ -334,7 +332,7 @@ xchk_dir_lookup(
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
error = xfs_dir2_sf_lookup(&args);
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 86a62420e02c..0252a3b5b65a 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -20,6 +20,7 @@
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
@@ -31,11 +32,14 @@
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
+#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/fsb_bitmap.h"
#include "scrub/reap.h"
/*
@@ -73,10 +77,10 @@
* with only the same rmap owner but the block is not owned by something with
* the same rmap owner, the block will be freed.
*
- * The caller is responsible for locking the AG headers for the entire rebuild
- * operation so that nothing else can sneak in and change the AG state while
- * we're not looking. We must also invalidate any buffers associated with
- * @bitmap.
+ * The caller is responsible for locking the AG headers/inode for the entire
+ * rebuild operation so that nothing else can sneak in and change the incore
+ * state while we're not looking. We must also invalidate any buffers
+ * associated with @bitmap.
*/
/* Information about reaping extents after a repair. */
@@ -110,7 +114,7 @@ xreap_put_freelist(
int error;
/* Make sure there's space on the freelist. */
- error = xrep_fix_freelist(sc, true);
+ error = xrep_fix_freelist(sc, 0);
if (error)
return error;
@@ -247,7 +251,7 @@ xreap_agextent_binval(
max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
- for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) {
+ for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) {
struct xfs_buf *bp = NULL;
xfs_daddr_t daddr;
int error;
@@ -377,6 +381,17 @@ xreap_agextent_iter(
trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
rs->force_roll = true;
+
+ if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
+ /*
+ * If we're unmapping CoW staging extents, remove the
+ * records from the refcountbt, which will remove the
+ * rmap record as well.
+ */
+ xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ return 0;
+ }
+
return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
*aglenp, rs->oinfo);
}
@@ -395,6 +410,26 @@ xreap_agextent_iter(
return 0;
}
+ /*
+ * If we're getting rid of CoW staging extents, use deferred work items
+ * to remove the refcountbt records (which removes the rmap records)
+ * and free the extent. We're not worried about the system going down
+ * here because log recovery walks the refcount btree to clean out the
+ * CoW staging extents.
+ */
+ if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
+ ASSERT(rs->resv == XFS_AG_RESV_NONE);
+
+ xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
+ rs->resv, true);
+ if (error)
+ return error;
+
+ rs->force_roll = true;
+ return 0;
+ }
+
/* Put blocks back on the AGFL one at a time. */
if (rs->resv == XFS_AG_RESV_AGFL) {
ASSERT(*aglenp == 1);
@@ -409,13 +444,17 @@ xreap_agextent_iter(
/*
* Use deferred frees to get rid of the old btree blocks to try to
* minimize the window in which we could crash and lose the old blocks.
+ * Add a defer ops barrier every other extent to avoid stressing the
+ * system with large EFIs.
*/
- error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
+ error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
rs->resv, true);
if (error)
return error;
rs->deferred++;
+ if (rs->deferred % 2 == 0)
+ xfs_defer_add_barrier(sc->tp);
return 0;
}
@@ -425,13 +464,12 @@ xreap_agextent_iter(
*/
STATIC int
xreap_agmeta_extent(
- uint64_t fsbno,
- uint64_t len,
+ uint32_t agbno,
+ uint32_t len,
void *priv)
{
struct xreap_state *rs = priv;
struct xfs_scrub *sc = rs->sc;
- xfs_agblock_t agbno = fsbno;
xfs_agblock_t agbno_next = agbno + len;
int error = 0;
@@ -496,3 +534,115 @@ xrep_reap_agblocks(
return 0;
}
+
+/*
+ * Break a file metadata extent into sub-extents by fate (crosslinked, not
+ * crosslinked), and dispose of each sub-extent separately. The extent must
+ * not cross an AG boundary.
+ */
+STATIC int
+xreap_fsmeta_extent(
+ uint64_t fsbno,
+ uint64_t len,
+ void *priv)
+{
+ struct xreap_state *rs = priv;
+ struct xfs_scrub *sc = rs->sc;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+ xfs_agblock_t agbno_next = agbno + len;
+ int error = 0;
+
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
+ ASSERT(sc->ip != NULL);
+ ASSERT(!sc->sa.pag);
+
+ /*
+ * We're reaping blocks after repairing file metadata, which means that
+ * we have to init the xchk_ag structure ourselves.
+ */
+ sc->sa.pag = xfs_perag_get(sc->mp, agno);
+ if (!sc->sa.pag)
+ return -EFSCORRUPTED;
+
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
+ if (error)
+ goto out_pag;
+
+ while (agbno < agbno_next) {
+ xfs_extlen_t aglen;
+ bool crosslinked;
+
+ error = xreap_agextent_select(rs, agbno, agbno_next,
+ &crosslinked, &aglen);
+ if (error)
+ goto out_agf;
+
+ error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
+ if (error)
+ goto out_agf;
+
+ if (xreap_want_defer_finish(rs)) {
+ /*
+ * Holds the AGF buffer across the deferred chain
+ * processing.
+ */
+ error = xrep_defer_finish(sc);
+ if (error)
+ goto out_agf;
+ xreap_defer_finish_reset(rs);
+ } else if (xreap_want_roll(rs)) {
+ /*
+ * Hold the AGF buffer across the transaction roll so
+ * that we don't have to reattach it to the scrub
+ * context.
+ */
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+ xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+ if (error)
+ goto out_agf;
+ xreap_reset(rs);
+ }
+
+ agbno += aglen;
+ }
+
+out_agf:
+ xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
+ sc->sa.agf_bp = NULL;
+out_pag:
+ xfs_perag_put(sc->sa.pag);
+ sc->sa.pag = NULL;
+ return error;
+}
+
+/*
+ * Dispose of every block of every fs metadata extent in the bitmap.
+ * Do not use this to dispose of the mappings in an ondisk inode fork.
+ */
+int
+xrep_reap_fsblocks(
+ struct xfs_scrub *sc,
+ struct xfsb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = oinfo,
+ .resv = XFS_AG_RESV_NONE,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip != NULL);
+
+ error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs))
+ return xrep_defer_finish(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
index fe24626af164..0b69f16dd98f 100644
--- a/fs/xfs/scrub/reap.h
+++ b/fs/xfs/scrub/reap.h
@@ -6,7 +6,12 @@
#ifndef __XFS_SCRUB_REAP_H__
#define __XFS_SCRUB_REAP_H__
+struct xagb_bitmap;
+struct xfsb_bitmap;
+
int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo);
#endif /* __XFS_SCRUB_REAP_H__ */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 304ea1e1bfb0..d0c7d4a29c0f 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -7,8 +7,10 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
@@ -17,6 +19,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub reference count btrees.
@@ -27,6 +30,15 @@ xchk_setup_ag_refcountbt(
{
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ if (xchk_could_repair(sc)) {
+ int error;
+
+ error = xrep_setup_ag_refcountbt(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_ag_btree(sc, false);
}
@@ -441,7 +453,7 @@ xchk_refcountbt_rec(
struct xchk_refcbt_records *rrc = bs->private;
xfs_refcount_btrec_to_irec(rec, &irec);
- if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) {
+ if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
new file mode 100644
index 000000000000..a00d7ce7ae5b
--- /dev/null
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -0,0 +1,751 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_inode.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_error.h"
+#include "xfs_ag.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+#include "scrub/rcbag.h"
+
+/*
+ * Rebuilding the Reference Count Btree
+ * ====================================
+ *
+ * This algorithm is "borrowed" from xfs_repair. Imagine the rmap
+ * entries as rectangles representing extents of physical blocks, and
+ * that the rectangles can be laid down to allow them to overlap each
+ * other; then we know that we must emit a refcnt btree entry wherever
+ * the amount of overlap changes, i.e. the emission stimulus is
+ * level-triggered:
+ *
+ * - ---
+ * -- ----- ---- --- ------
+ * -- ---- ----------- ---- ---------
+ * -------------------------------- -----------
+ * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
+ * 2 1 23 21 3 43 234 2123 1 01 2 3 0
+ *
+ * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
+ *
+ * Note that in the actual refcnt btree we don't store the refcount < 2
+ * cases because the bnobt tells us which blocks are free; single-use
+ * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
+ * supports storing multiple entries covering a given block we could
+ * theoretically dispense with the refcntbt and simply count rmaps, but
+ * that's inefficient in the (hot) write path, so we'll take the cost of
+ * the extra tree to save time. Also there's no guarantee that rmap
+ * will be enabled.
+ *
+ * Given an array of rmaps sorted by physical block number, a starting
+ * physical block (sp), a bag to hold rmaps that cover sp, and the next
+ * physical block where the level changes (np), we can reconstruct the
+ * refcount btree as follows:
+ *
+ * While there are still unprocessed rmaps in the array,
+ * - Set sp to the physical block (pblk) of the next unprocessed rmap.
+ * - Add to the bag all rmaps in the array where startblock == sp.
+ * - Set np to the physical block where the bag size will change. This
+ * is the minimum of (the pblk of the next unprocessed rmap) and
+ * (startblock + len of each rmap in the bag).
+ * - Record the bag size as old_bag_size.
+ *
+ * - While the bag isn't empty,
+ * - Remove from the bag all rmaps where startblock + len == np.
+ * - Add to the bag all rmaps in the array where startblock == np.
+ * - If the bag size isn't old_bag_size, store the refcount entry
+ * (sp, np - sp, bag_size) in the refcnt btree.
+ * - If the bag is empty, break out of the inner loop.
+ * - Set old_bag_size to the bag size
+ * - Set sp = np.
+ * - Set np to the physical block where the bag size will change.
+ * This is the minimum of (the pblk of the next unprocessed rmap)
+ * and (startblock + len of each rmap in the bag).
+ *
+ * Like all the other repairers, we make a list of all the refcount
+ * records we need, then reinitialize the refcount btree root and
+ * insert all the records.
+ */
+
+struct xrep_refc {
+ /* refcount extents */
+ struct xfarray *refcount_records;
+
+ /* new refcountbt information */
+ struct xrep_newbt new_btree;
+
+ /* old refcountbt blocks */
+ struct xagb_bitmap old_refcountbt_blocks;
+
+ struct xfs_scrub *sc;
+
+ /* get_records()'s position in the refcount record array. */
+ xfarray_idx_t array_cur;
+
+ /* # of refcountbt blocks */
+ xfs_extlen_t btblocks;
+};
+
+/* Set us up to repair refcount btrees. */
+int
+xrep_setup_ag_refcountbt(
+ struct xfs_scrub *sc)
+{
+ char *descr;
+ int error;
+
+ descr = xchk_xfile_ag_descr(sc, "rmap record bag");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ return error;
+}
+
+/* Check for any obvious conflicts with this shared/CoW staging extent. */
+STATIC int
+xrep_refc_check_ext(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock,
+ rec->rc_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ /* Must not be an inode chunk. */
+ error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
+ rec->rc_startblock, rec->rc_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Record a reference count extent. */
+STATIC int
+xrep_refc_stash(
+ struct xrep_refc *rr,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t refcount)
+{
+ struct xfs_refcount_irec irec = {
+ .rc_startblock = agbno,
+ .rc_blockcount = len,
+ .rc_domain = domain,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount);
+
+ error = xrep_refc_check_ext(rr->sc, &irec);
+ if (error)
+ return error;
+
+ trace_xrep_refc_found(sc->sa.pag, &irec);
+
+ return xfarray_append(rr->refcount_records, &irec);
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_refc_stash_cow(
+ struct xrep_refc *rr,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1);
+}
+
+/* Decide if an rmap could describe a shared extent. */
+static inline bool
+xrep_refc_rmap_shareable(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rmap)
+{
+ /* AG metadata are never sharable */
+ if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ return false;
+
+ /* Metadata in files are never shareable */
+ if (xfs_internal_inum(mp, rmap->rm_owner))
+ return false;
+
+ /* Metadata and unwritten file blocks are not shareable. */
+ if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+ XFS_RMAP_UNWRITTEN))
+ return false;
+
+ return true;
+}
+
+/*
+ * Walk along the reverse mapping records until we find one that could describe
+ * a shared extent.
+ */
+STATIC int
+xrep_refc_walk_rmaps(
+ struct xrep_refc *rr,
+ struct xfs_rmap_irec *rmap,
+ bool *have_rec)
+{
+ struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur;
+ struct xfs_mount *mp = cur->bc_mp;
+ int have_gt;
+ int error = 0;
+
+ *have_rec = false;
+
+ /*
+ * Loop through the remaining rmaps. Remember CoW staging
+ * extents and the refcountbt blocks from the old tree for later
+ * disposal. We can only share written data fork extents, so
+ * keep looping until we find an rmap for one.
+ */
+ do {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfs_btree_increment(cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (!have_gt)
+ return 0;
+
+ error = xfs_rmap_get_rec(cur, rmap, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, !have_gt)) {
+ xfs_btree_mark_sick(cur);
+ return -EFSCORRUPTED;
+ }
+
+ if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
+ error = xrep_refc_stash_cow(rr, rmap->rm_startblock,
+ rmap->rm_blockcount);
+ if (error)
+ return error;
+ } else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) {
+ /* refcountbt block, dump it when we're done. */
+ rr->btblocks += rmap->rm_blockcount;
+ error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
+ rmap->rm_startblock,
+ rmap->rm_blockcount);
+ if (error)
+ return error;
+ }
+ } while (!xrep_refc_rmap_shareable(mp, rmap));
+
+ *have_rec = true;
+ return 0;
+}
+
+static inline uint32_t
+xrep_refc_encode_startblock(
+ const struct xfs_refcount_irec *irec)
+{
+ uint32_t start;
+
+ start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
+ if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
+ start |= XFS_REFC_COWFLAG;
+
+ return start;
+}
+
+/* Sort in the same order as the ondisk records. */
+static int
+xrep_refc_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_refcount_irec *ap = a;
+ const struct xfs_refcount_irec *bp = b;
+ uint32_t sa, sb;
+
+ sa = xrep_refc_encode_startblock(ap);
+ sb = xrep_refc_encode_startblock(bp);
+
+ if (sa > sb)
+ return 1;
+ if (sa < sb)
+ return -1;
+ return 0;
+}
+
+/*
+ * Sort the refcount extents by startblock or else the btree records will be in
+ * the wrong order. Make sure the records do not overlap in physical space.
+ */
+STATIC int
+xrep_refc_sort_records(
+ struct xrep_refc *rr)
+{
+ struct xfs_refcount_irec irec;
+ xfarray_idx_t cur;
+ enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED;
+ xfs_agblock_t next_agbno = 0;
+ int error;
+
+ error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp,
+ XFARRAY_SORT_KILLABLE);
+ if (error)
+ return error;
+
+ foreach_xfarray_idx(rr->refcount_records, cur) {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfarray_load(rr->refcount_records, cur, &irec);
+ if (error)
+ return error;
+
+ if (dom == XFS_REFC_DOMAIN_SHARED &&
+ irec.rc_domain == XFS_REFC_DOMAIN_COW) {
+ dom = irec.rc_domain;
+ next_agbno = 0;
+ }
+
+ if (dom != irec.rc_domain)
+ return -EFSCORRUPTED;
+ if (irec.rc_startblock < next_agbno)
+ return -EFSCORRUPTED;
+
+ next_agbno = irec.rc_startblock + irec.rc_blockcount;
+ }
+
+ return error;
+}
+
+/*
+ * Walk forward through the rmap btree to collect all rmaps starting at
+ * @bno in @rmap_bag. These represent the file(s) that share ownership of
+ * the current block. Upon return, the rmap cursor points to the last record
+ * satisfying the startblock constraint.
+ */
+static int
+xrep_refc_push_rmaps_at(
+ struct xrep_refc *rr,
+ struct rcbag *rcstack,
+ xfs_agblock_t bno,
+ struct xfs_rmap_irec *rmap,
+ bool *have)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int have_gt;
+ int error;
+
+ while (*have && rmap->rm_startblock == bno) {
+ error = rcbag_add(rcstack, rr->sc->tp, rmap);
+ if (error)
+ return error;
+
+ error = xrep_refc_walk_rmaps(rr, rmap, have);
+ if (error)
+ return error;
+ }
+
+ error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
+ xfs_btree_mark_sick(sc->sa.rmap_cur);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/* Iterate all the rmap records to generate reference count data. */
+STATIC int
+xrep_refc_find_refcounts(
+ struct xrep_refc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct rcbag *rcstack;
+ uint64_t old_stack_height;
+ xfs_agblock_t sbno;
+ xfs_agblock_t cbno;
+ xfs_agblock_t nbno;
+ bool have;
+ int error;
+
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ /*
+ * Set up a bag to store all the rmap records that we're tracking to
+ * generate a reference count record. If the size of the bag exceeds
+ * MAXREFCOUNT, we clamp rc_refcount.
+ */
+ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
+ if (error)
+ goto out_cur;
+
+ /* Start the rmapbt cursor to the left of all records. */
+ error = xfs_btree_goto_left_edge(sc->sa.rmap_cur);
+ if (error)
+ goto out_bag;
+
+ /* Process reverse mappings into refcount data. */
+ while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
+ struct xfs_rmap_irec rmap;
+
+ /* Push all rmaps with pblk == sbno onto the stack */
+ error = xrep_refc_walk_rmaps(rr, &rmap, &have);
+ if (error)
+ goto out_bag;
+ if (!have)
+ break;
+ sbno = cbno = rmap.rm_startblock;
+ error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap,
+ &have);
+ if (error)
+ goto out_bag;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ old_stack_height = rcbag_count(rcstack);
+
+ /* While stack isn't empty... */
+ while (rcbag_count(rcstack) > 0) {
+ /* Pop all rmaps that end at nbno */
+ error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
+ if (error)
+ goto out_bag;
+
+ /* Push array items that start at nbno */
+ error = xrep_refc_walk_rmaps(rr, &rmap, &have);
+ if (error)
+ goto out_bag;
+ if (have) {
+ error = xrep_refc_push_rmaps_at(rr, rcstack,
+ nbno, &rmap, &have);
+ if (error)
+ goto out_bag;
+ }
+
+ /* Emit refcount if necessary */
+ ASSERT(nbno > cbno);
+ if (rcbag_count(rcstack) != old_stack_height) {
+ if (old_stack_height > 1) {
+ error = xrep_refc_stash(rr,
+ XFS_REFC_DOMAIN_SHARED,
+ cbno, nbno - cbno,
+ old_stack_height);
+ if (error)
+ goto out_bag;
+ }
+ cbno = nbno;
+ }
+
+ /* Stack empty, go find the next rmap */
+ if (rcbag_count(rcstack) == 0)
+ break;
+ old_stack_height = rcbag_count(rcstack);
+ sbno = nbno;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
+ &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ }
+ }
+
+ ASSERT(rcbag_count(rcstack) == 0);
+out_bag:
+ rcbag_free(&rcstack);
+out_cur:
+ xchk_ag_btcur_free(&sc->sa);
+ return error;
+}
+
+/* Retrieve refcountbt data for bulk load. */
+STATIC int
+xrep_refc_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ struct xrep_refc *rr = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load(rr->refcount_records, rr->array_cur++,
+ irec);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_refc_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_refc *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Update the AGF counters. */
+STATIC int
+xrep_refc_reset_counters(
+ struct xrep_refc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+
+ /*
+ * After we commit the new btree to disk, it is possible that the
+ * process to reap the old btree blocks will race with the AIL trying
+ * to checkpoint the old btree blocks into the filesystem. If the new
+ * tree is shorter than the old one, the refcountbt write verifier will
+ * fail and the AIL will shut down the filesystem.
+ *
+ * To avoid this, save the old incore btree height values as the alt
+ * height values before re-initializing the perag info from the updated
+ * AGF to capture all the new values.
+ */
+ pag->pagf_repair_refcount_level = pag->pagf_refcount_level;
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagf(sc);
+}
+
+/*
+ * Use the collected refcount information to stage a new refcount btree. If
+ * this is successful we'll return with the new btree root information logged
+ * to the repair transaction but not yet committed.
+ */
+STATIC int
+xrep_refc_build_new_tree(
+ struct xrep_refc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_btree_cur *refc_cur;
+ struct xfs_perag *pag = sc->sa.pag;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ error = xrep_refc_sort_records(rr);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp));
+ xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno,
+ XFS_AG_RESV_METADATA);
+ rr->new_btree.bload.get_records = xrep_refc_get_records;
+ rr->new_btree.bload.claim_block = xrep_refc_claim_block;
+
+ /* Compute how many blocks we'll need. */
+ refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake);
+ error = xfs_btree_bload_compute_geometry(refc_cur,
+ &rr->new_btree.bload,
+ xfarray_length(rr->refcount_records));
+ if (error)
+ goto err_cur;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ rr->new_btree.bload.nr_blocks);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the incore btree height so
+ * that we don't trip the verifiers when writing the new btree blocks
+ * to disk.
+ */
+ pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height;
+
+ /* Add all observed refcount records. */
+ rr->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_level;
+
+ /*
+ * Install the new btree in the AG header. After this point the old
+ * btree is no longer accessible and the new tree is live.
+ */
+ xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(refc_cur, 0);
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_refc_reset_counters(rr);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_ag_trans(sc);
+
+err_level:
+ pag->pagf_repair_refcount_level = 0;
+err_cur:
+ xfs_btree_del_cursor(refc_cur, error);
+err_newbt:
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_refc_remove_old_tree(
+ struct xrep_refc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ int error;
+
+ /* Free the old refcountbt blocks if they're not in use. */
+ error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks,
+ &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've zapped all the old refcountbt blocks we can turn off
+ * the alternate height mechanism and reset the per-AG space
+ * reservations.
+ */
+ pag->pagf_repair_refcount_level = 0;
+ sc->flags |= XREP_RESET_PERAG_RESV;
+ return 0;
+}
+
+/* Rebuild the refcount btree. */
+int
+xrep_refcountbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_refc *rr;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+ rr->sc = sc;
+
+ /* Set up enough storage to handle one refcount record per block. */
+ descr = xchk_xfile_ag_descr(sc, "reference count records");
+ error = xfarray_create(descr, mp->m_sb.sb_agblocks,
+ sizeof(struct xfs_refcount_irec),
+ &rr->refcount_records);
+ kfree(descr);
+ if (error)
+ goto out_rr;
+
+ /* Collect all reference counts. */
+ xagb_bitmap_init(&rr->old_refcountbt_blocks);
+ error = xrep_refc_find_refcounts(rr);
+ if (error)
+ goto out_bitmap;
+
+ /* Rebuild the refcount information. */
+ error = xrep_refc_build_new_tree(rr);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old tree. */
+ error = xrep_refc_remove_old_tree(rr);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xagb_bitmap_destroy(&rr->old_refcountbt_blocks);
+ xfarray_destroy(rr->refcount_records);
+out_rr:
+ kfree(rr);
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1b8b5439f2d7..f43dce771cdd 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -27,12 +27,18 @@
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_defer.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_reflink.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/stats.h"
+#include "scrub/xfile.h"
/*
* Attempt to repair some metadata, if the metadata is corrupt and userspace
@@ -176,6 +182,16 @@ xrep_roll_ag_trans(
return 0;
}
+/* Roll the scrub transaction, holding the primary metadata locked. */
+int
+xrep_roll_trans(
+ struct xfs_scrub *sc)
+{
+ if (!sc->ip)
+ return xrep_roll_ag_trans(sc);
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+}
+
/* Finish all deferred work attached to the repair transaction. */
int
xrep_defer_finish(
@@ -387,7 +403,7 @@ xrep_calc_ag_resblks(
int
xrep_fix_freelist(
struct xfs_scrub *sc,
- bool can_shrink)
+ int alloc_flags)
{
struct xfs_alloc_arg args = {0};
@@ -397,8 +413,7 @@ xrep_fix_freelist(
args.alignment = 1;
args.pag = sc->sa.pag;
- return xfs_alloc_fix_freelist(&args,
- can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+ return xfs_alloc_fix_freelist(&args, alloc_flags);
}
/*
@@ -673,6 +688,45 @@ xrep_find_ag_btree_roots(
return error;
}
+#ifdef CONFIG_XFS_QUOTA
+/* Update some quota flags in the superblock. */
+void
+xrep_update_qflags(
+ struct xfs_scrub *sc,
+ unsigned int clear_flags,
+ unsigned int set_flags)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+
+ mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+ if ((mp->m_qflags & clear_flags) == 0 &&
+ (mp->m_qflags & set_flags) == set_flags)
+ goto no_update;
+
+ mp->m_qflags &= ~clear_flags;
+ mp->m_qflags |= set_flags;
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags &= ~clear_flags;
+ mp->m_sb.sb_qflags |= set_flags;
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * Update the quota flags in the ondisk superblock without touching
+ * the summary counters. We have not quiesced inode chunk allocation,
+ * so we cannot coordinate with updates to the icount and ifree percpu
+ * counters.
+ */
+ bp = xfs_trans_getsb(sc->tp);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+
+no_update:
+ mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+}
+
/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
@@ -685,13 +739,7 @@ xrep_force_quotacheck(
if (!(flag & sc->mp->m_qflags))
return;
- mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
- sc->mp->m_qflags &= ~flag;
- spin_lock(&sc->mp->m_sb_lock);
- sc->mp->m_sb.sb_qflags &= ~flag;
- spin_unlock(&sc->mp->m_sb_lock);
- xfs_log_sb(sc->tp);
- mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+ xrep_update_qflags(sc, flag, 0);
}
/*
@@ -699,10 +747,10 @@ xrep_force_quotacheck(
*
* This function ensures that the appropriate dquots are attached to an inode.
* We cannot allow the dquot code to allocate an on-disk dquot block here
- * because we're already in transaction context with the inode locked. The
- * on-disk dquot should already exist anyway. If the quota code signals
- * corruption or missing quota information, schedule quotacheck, which will
- * repair corruptions in the quota metadata.
+ * because we're already in transaction context. The on-disk dquot should
+ * already exist anyway. If the quota code signals corruption or missing quota
+ * information, schedule quotacheck, which will repair corruptions in the quota
+ * metadata.
*/
int
xrep_ino_dqattach(
@@ -710,7 +758,10 @@ xrep_ino_dqattach(
{
int error;
- error = xfs_qm_dqattach_locked(sc->ip, false);
+ ASSERT(sc->tp != NULL);
+ ASSERT(sc->ip != NULL);
+
+ error = xfs_qm_dqattach(sc->ip);
switch (error) {
case -EFSBADCRC:
case -EFSCORRUPTED:
@@ -734,3 +785,419 @@ xrep_ino_dqattach(
return error;
}
+#endif /* CONFIG_XFS_QUOTA */
+
+/*
+ * Ensure that the inode being repaired is ready to handle a certain number of
+ * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode
+ * being repaired and have joined it to the scrub transaction.
+ */
+int
+xrep_ino_ensure_extent_count(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_extnum_t nextents)
+{
+ xfs_extnum_t max_extents;
+ bool inode_has_nrext64;
+
+ inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip);
+ max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork);
+ if (nextents <= max_extents)
+ return 0;
+ if (inode_has_nrext64)
+ return -EFSCORRUPTED;
+ if (!xfs_has_large_extent_counts(sc->mp))
+ return -EFSCORRUPTED;
+
+ max_extents = xfs_iext_max_nextents(true, whichfork);
+ if (nextents > max_extents)
+ return -EFSCORRUPTED;
+
+ sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ return 0;
+}
+
+/*
+ * Initialize all the btree cursors for an AG repair except for the btree that
+ * we're rebuilding.
+ */
+void
+xrep_ag_btcur_init(
+ struct xfs_scrub *sc,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ /* Set up a bnobt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
+ sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
+ sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
+ }
+
+ /* Set up a inobt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
+ sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
+ sa->agi_bp);
+ if (xfs_has_finobt(mp))
+ sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag,
+ sc->tp, sa->agi_bp);
+ }
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT &&
+ xfs_has_rmapbt(mp))
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT &&
+ xfs_has_reflink(mp))
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sc->sa.pag);
+}
+
+/*
+ * Reinitialize the in-core AG state after a repair by rereading the AGF
+ * buffer. We had better get the same AGF buffer as the one that's attached
+ * to the scrub context.
+ */
+int
+xrep_reinit_pagf(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(pag);
+ ASSERT(xfs_perag_initialised_agf(pag));
+
+ clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp);
+ if (error)
+ return error;
+
+ if (bp != sc->sa.agf_bp) {
+ ASSERT(bp == sc->sa.agf_bp);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Reinitialize the in-core AG state after a repair by rereading the AGI
+ * buffer. We had better get the same AGI buffer as the one that's attached
+ * to the scrub context.
+ */
+int
+xrep_reinit_pagi(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(pag);
+ ASSERT(xfs_perag_initialised_agi(pag));
+
+ clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
+ error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
+ if (error)
+ return error;
+
+ if (bp != sc->sa.agi_bp) {
+ ASSERT(bp == sc->sa.agi_bp);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Given an active reference to a perag structure, load AG headers and cursors.
+ * This should only be called to scan an AG while repairing file-based metadata.
+ */
+int
+xrep_ag_init(
+ struct xfs_scrub *sc,
+ struct xfs_perag *pag,
+ struct xchk_ag *sa)
+{
+ int error;
+
+ ASSERT(!sa->pag);
+
+ error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
+ if (error)
+ return error;
+
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp);
+ if (error)
+ return error;
+
+ /* Grab our own passive reference from the caller's ref. */
+ sa->pag = xfs_perag_hold(pag);
+ xrep_ag_btcur_init(sc, sa);
+ return 0;
+}
+
+/* Reinitialize the per-AG block reservation for the AG we just fixed. */
+int
+xrep_reset_perag_resv(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (!(sc->flags & XREP_RESET_PERAG_RESV))
+ return 0;
+
+ ASSERT(sc->sa.pag != NULL);
+ ASSERT(sc->ops->type == ST_PERAG);
+ ASSERT(sc->tp);
+
+ sc->flags &= ~XREP_RESET_PERAG_RESV;
+ error = xfs_ag_resv_free(sc->sa.pag);
+ if (error)
+ goto out;
+ error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
+ if (error == -ENOSPC) {
+ xfs_err(sc->mp,
+"Insufficient free space to reset per-AG reservation for AG %u after repair.",
+ sc->sa.pag->pag_agno);
+ error = 0;
+ }
+
+out:
+ return error;
+}
+
+/* Decide if we are going to call the repair function for a scrub type. */
+bool
+xrep_will_attempt(
+ struct xfs_scrub *sc)
+{
+ /* Userspace asked us to rebuild the structure regardless. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
+ return true;
+
+ /* Let debug users force us into the repair routines. */
+ if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ return true;
+
+ /* Metadata is corrupt or failed cross-referencing. */
+ if (xchk_needs_repair(sc->sm))
+ return true;
+
+ return false;
+}
+
+/* Try to fix some part of a metadata inode by calling another scrubber. */
+STATIC int
+xrep_metadata_inode_subtype(
+ struct xfs_scrub *sc,
+ unsigned int scrub_type)
+{
+ __u32 smtype = sc->sm->sm_type;
+ __u32 smflags = sc->sm->sm_flags;
+ unsigned int sick_mask = sc->sick_mask;
+ int error;
+
+ /*
+ * Let's see if the inode needs repair. We're going to open-code calls
+ * to the scrub and repair functions so that we can hang on to the
+ * resources that we already acquired instead of using the standard
+ * setup/teardown routines.
+ */
+ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ sc->sm->sm_type = scrub_type;
+
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xchk_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xchk_bmap_data(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTA:
+ error = xchk_bmap_attr(sc);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+ if (error)
+ goto out;
+
+ if (!xrep_will_attempt(sc))
+ goto out;
+
+ /*
+ * Repair some part of the inode. This will potentially join the inode
+ * to the transaction.
+ */
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xrep_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xrep_bmap(sc, XFS_DATA_FORK, false);
+ break;
+ case XFS_SCRUB_TYPE_BMBTA:
+ error = xrep_bmap(sc, XFS_ATTR_FORK, false);
+ break;
+ }
+ if (error)
+ goto out;
+
+ /*
+ * Finish all deferred intent items and then roll the transaction so
+ * that the inode will not be joined to the transaction when we exit
+ * the function.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ goto out;
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ goto out;
+
+ /*
+ * Clear the corruption flags and re-check the metadata that we just
+ * repaired.
+ */
+ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xchk_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xchk_bmap_data(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTA:
+ error = xchk_bmap_attr(sc);
+ break;
+ }
+ if (error)
+ goto out;
+
+ /* If corruption persists, the repair has failed. */
+ if (xchk_needs_repair(sc->sm)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+out:
+ sc->sick_mask = sick_mask;
+ sc->sm->sm_type = smtype;
+ sc->sm->sm_flags = smflags;
+ return error;
+}
+
+/*
+ * Repair the ondisk forks of a metadata inode. The caller must ensure that
+ * sc->ip points to the metadata inode and the ILOCK is held on that inode.
+ * The inode must not be joined to the transaction before the call, and will
+ * not be afterwards.
+ */
+int
+xrep_metadata_inode_forks(
+ struct xfs_scrub *sc)
+{
+ bool dirty = false;
+ int error;
+
+ /* Repair the inode record and the data fork. */
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
+ if (error)
+ return error;
+
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
+ if (error)
+ return error;
+
+ /* Make sure the attr fork looks ok before we delete it. */
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
+ if (error)
+ return error;
+
+ /* Clear the reflink flag since metadata never shares. */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ dirty = true;
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If we modified the inode, roll the transaction but don't rejoin the
+ * inode to the new transaction because xrep_bmap_data can do that.
+ */
+ if (dirty) {
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+ dirty = false;
+ }
+
+ return 0;
+}
+
+/*
+ * Set up an in-memory buffer cache so that we can use the xfbtree. Allocating
+ * a shmem file might take loks, so we cannot be in transaction context. Park
+ * our resources in the scrub context and let the teardown function take care
+ * of them at the right time.
+ */
+int
+xrep_setup_xfbtree(
+ struct xfs_scrub *sc,
+ const char *descr)
+{
+ ASSERT(sc->tp == NULL);
+
+ return xmbuf_alloc(sc->mp, descr, &sc->xmbtp);
+}
+
+/*
+ * Create a dummy transaction for use in a live update hook function. This
+ * function MUST NOT be called from regular repair code because the current
+ * process' transaction is saved via the cookie.
+ */
+int
+xrep_trans_alloc_hook_dummy(
+ struct xfs_mount *mp,
+ void **cookiep,
+ struct xfs_trans **tpp)
+{
+ int error;
+
+ *cookiep = current->journal_info;
+ current->journal_info = NULL;
+
+ error = xfs_trans_alloc_empty(mp, tpp);
+ if (!error)
+ return 0;
+
+ current->journal_info = *cookiep;
+ *cookiep = NULL;
+ return error;
+}
+
+/* Cancel a dummy transaction used by a live update hook function. */
+void
+xrep_trans_cancel_hook_dummy(
+ void **cookiep,
+ struct xfs_trans *tp)
+{
+ xfs_trans_cancel(tp);
+ current->journal_info = *cookiep;
+ *cookiep = NULL;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 60d2a9ae5f2e..ce082d941459 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -28,17 +28,30 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
/* Repair helpers */
int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run);
+bool xrep_will_attempt(struct xfs_scrub *sc);
void xrep_failure(struct xfs_mount *mp);
int xrep_roll_ag_trans(struct xfs_scrub *sc);
+int xrep_roll_trans(struct xfs_scrub *sc);
int xrep_defer_finish(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
enum xfs_ag_resv_type type);
xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
+static inline int
+xrep_trans_commit(
+ struct xfs_scrub *sc)
+{
+ int error = xfs_trans_commit(sc->tp);
+
+ sc->tp = NULL;
+ return error;
+}
+
struct xbitmap;
struct xagb_bitmap;
+struct xfsb_bitmap;
-int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
+int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags);
struct xrep_find_ag_btree {
/* in: rmap owner of the btree we're looking for */
@@ -57,8 +70,41 @@ struct xrep_find_ag_btree {
int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
+
+#ifdef CONFIG_XFS_QUOTA
+void xrep_update_qflags(struct xfs_scrub *sc, unsigned int clear_flags,
+ unsigned int set_flags);
void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
int xrep_ino_dqattach(struct xfs_scrub *sc);
+#else
+# define xrep_force_quotacheck(sc, type) ((void)0)
+# define xrep_ino_dqattach(sc) (0)
+#endif /* CONFIG_XFS_QUOTA */
+
+int xrep_setup_xfbtree(struct xfs_scrub *sc, const char *descr);
+
+int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork,
+ xfs_extnum_t nextents);
+int xrep_reset_perag_resv(struct xfs_scrub *sc);
+int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten);
+int xrep_metadata_inode_forks(struct xfs_scrub *sc);
+int xrep_setup_ag_rmapbt(struct xfs_scrub *sc);
+int xrep_setup_ag_refcountbt(struct xfs_scrub *sc);
+
+/* Repair setup functions */
+int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
+
+struct xfs_imap;
+int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap);
+
+void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
+int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag,
+ struct xchk_ag *sa);
+
+/* Metadata revalidators */
+
+int xrep_revalidate_allocbt(struct xfs_scrub *sc);
+int xrep_revalidate_iallocbt(struct xfs_scrub *sc);
/* Metadata repairers */
@@ -67,9 +113,43 @@ int xrep_superblock(struct xfs_scrub *sc);
int xrep_agf(struct xfs_scrub *sc);
int xrep_agfl(struct xfs_scrub *sc);
int xrep_agi(struct xfs_scrub *sc);
+int xrep_allocbt(struct xfs_scrub *sc);
+int xrep_iallocbt(struct xfs_scrub *sc);
+int xrep_rmapbt(struct xfs_scrub *sc);
+int xrep_refcountbt(struct xfs_scrub *sc);
+int xrep_inode(struct xfs_scrub *sc);
+int xrep_bmap_data(struct xfs_scrub *sc);
+int xrep_bmap_attr(struct xfs_scrub *sc);
+int xrep_bmap_cow(struct xfs_scrub *sc);
+int xrep_nlinks(struct xfs_scrub *sc);
+int xrep_fscounters(struct xfs_scrub *sc);
+
+#ifdef CONFIG_XFS_RT
+int xrep_rtbitmap(struct xfs_scrub *sc);
+#else
+# define xrep_rtbitmap xrep_notsupported
+#endif /* CONFIG_XFS_RT */
+
+#ifdef CONFIG_XFS_QUOTA
+int xrep_quota(struct xfs_scrub *sc);
+int xrep_quotacheck(struct xfs_scrub *sc);
+#else
+# define xrep_quota xrep_notsupported
+# define xrep_quotacheck xrep_notsupported
+#endif /* CONFIG_XFS_QUOTA */
+
+int xrep_reinit_pagf(struct xfs_scrub *sc);
+int xrep_reinit_pagi(struct xfs_scrub *sc);
+
+int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
+ struct xfs_trans **tpp);
+void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
#else
+#define xrep_ino_dqattach(sc) (0)
+#define xrep_will_attempt(sc) (false)
+
static inline int
xrep_attempt(
struct xfs_scrub *sc,
@@ -87,11 +167,51 @@ xrep_calc_ag_resblks(
return 0;
}
+static inline int
+xrep_reset_perag_resv(
+ struct xfs_scrub *sc)
+{
+ if (!(sc->flags & XREP_RESET_PERAG_RESV))
+ return 0;
+
+ ASSERT(0);
+ return -EOPNOTSUPP;
+}
+
+/* repair setup functions for no-repair */
+static inline int
+xrep_setup_nothing(
+ struct xfs_scrub *sc)
+{
+ return 0;
+}
+#define xrep_setup_ag_allocbt xrep_setup_nothing
+#define xrep_setup_ag_rmapbt xrep_setup_nothing
+#define xrep_setup_ag_refcountbt xrep_setup_nothing
+
+#define xrep_setup_inode(sc, imap) ((void)0)
+
+#define xrep_revalidate_allocbt (NULL)
+#define xrep_revalidate_iallocbt (NULL)
+
#define xrep_probe xrep_notsupported
#define xrep_superblock xrep_notsupported
#define xrep_agf xrep_notsupported
#define xrep_agfl xrep_notsupported
#define xrep_agi xrep_notsupported
+#define xrep_allocbt xrep_notsupported
+#define xrep_iallocbt xrep_notsupported
+#define xrep_rmapbt xrep_notsupported
+#define xrep_refcountbt xrep_notsupported
+#define xrep_inode xrep_notsupported
+#define xrep_bmap_data xrep_notsupported
+#define xrep_bmap_attr xrep_notsupported
+#define xrep_bmap_cow xrep_notsupported
+#define xrep_rtbitmap xrep_notsupported
+#define xrep_quota xrep_notsupported
+#define xrep_quotacheck xrep_notsupported
+#define xrep_nlinks xrep_notsupported
+#define xrep_fscounters xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index d29a26ecddd6..ba5bbc3fb754 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -24,6 +24,8 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub reverse mapping btrees.
@@ -35,6 +37,14 @@ xchk_setup_ag_rmapbt(
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+ if (xchk_could_repair(sc)) {
+ int error;
+
+ error = xrep_setup_ag_rmapbt(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_ag_btree(sc, false);
}
@@ -348,7 +358,7 @@ xchk_rmapbt_rec(
struct xfs_rmap_irec irec;
if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
- xfs_rmap_check_irec(bs->cur, &irec) != NULL) {
+ xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -411,8 +421,8 @@ xchk_rmapbt_walk_ag_metadata(
/* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */
cur = sc->sa.bno_cur;
if (!cur)
- cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
- sc->sa.pag, XFS_BTNUM_BNO);
+ cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
if (cur != sc->sa.bno_cur)
xfs_btree_del_cursor(cur, error);
@@ -421,8 +431,8 @@ xchk_rmapbt_walk_ag_metadata(
cur = sc->sa.cnt_cur;
if (!cur)
- cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
- sc->sa.pag, XFS_BTNUM_CNT);
+ cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
if (cur != sc->sa.cnt_cur)
xfs_btree_del_cursor(cur, error);
@@ -446,8 +456,7 @@ xchk_rmapbt_walk_ag_metadata(
/* OWN_INOBT: inobt, finobt */
cur = sc->sa.ino_cur;
if (!cur)
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp,
- XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp);
error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
if (cur != sc->sa.ino_cur)
xfs_btree_del_cursor(cur, error);
@@ -457,8 +466,8 @@ xchk_rmapbt_walk_ag_metadata(
if (xfs_has_finobt(sc->mp)) {
cur = sc->sa.fino_cur;
if (!cur)
- cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
- sc->sa.agi_bp, XFS_BTNUM_FINO);
+ cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp,
+ sc->sa.agi_bp);
error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
if (cur != sc->sa.fino_cur)
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
new file mode 100644
index 000000000000..e8e07b683eab
--- /dev/null
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -0,0 +1,1697 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Reverse Mapping Btree Repair
+ * ============================
+ *
+ * This is the most involved of all the AG space btree rebuilds. Everywhere
+ * else in XFS we lock inodes and then AG data structures, but generating the
+ * list of rmap records requires that we be able to scan both block mapping
+ * btrees of every inode in the filesystem to see if it owns any extents in
+ * this AG. We can't tolerate any inode updates while we do this, so we
+ * freeze the filesystem to lock everyone else out, and grant ourselves
+ * special privileges to run transactions with regular background reclamation
+ * turned off.
+ *
+ * We also have to be very careful not to allow inode reclaim to start a
+ * transaction because all transactions (other than our own) will block.
+ * Deferred inode inactivation helps us out there.
+ *
+ * I) Reverse mappings for all non-space metadata and file data are collected
+ * according to the following algorithm:
+ *
+ * 1. For each fork of each inode:
+ * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
+ * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
+ * bmaps into rmap records (see 1.1.4). Set bits in BMBIT for each btree
+ * block.
+ * 1.3. If the incore extent map is loaded but the fork is in btree format,
+ * just visit the bmbt blocks to set the corresponding BMBIT areas.
+ * 1.4. From the incore extent map, accumulate each bmap that falls into our
+ * target AG. Remember, multiple bmap records can map to a single rmap
+ * record, so we cannot simply emit rmap records 1:1.
+ * 1.5. Emit rmap records for each extent in BMBIT and free it.
+ * 2. Create bitmaps INOBIT and ICHUNKBIT.
+ * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
+ * and set bits in INOBIT for each btree block. If the inobt has no records
+ * at all, we must be careful to record its root in INOBIT.
+ * 4. For each block in the finobt, set the corresponding INOBIT area.
+ * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
+ * 6. Create bitmaps REFCBIT and COWBIT.
+ * 7. For each CoW staging extent in the refcountbt, set the corresponding
+ * areas in COWBIT.
+ * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
+ * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
+ * A. Emit rmap for the AG headers.
+ * B. Emit rmap for the log, if there is one.
+ *
+ * II) The rmapbt shape and space metadata rmaps are computed as follows:
+ *
+ * 1. Count the rmaps collected in the previous step. (= NR)
+ * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
+ * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
+ * 4. Create bitmap AGBIT.
+ * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
+ * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
+ * 7. Count the extents in AGBIT. (= AGNR)
+ * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
+ * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
+ * and clear AGBIT. Go to step 5.
+ * A. Emit rmaps for each extent in AGBIT.
+ *
+ * III) The rmapbt is constructed and set in place as follows:
+ *
+ * 1. Sort the rmap records.
+ * 2. Bulk load the rmaps.
+ *
+ * IV) Reap the old btree blocks.
+ *
+ * 1. Create a bitmap OLDRMBIT.
+ * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
+ * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
+ * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are
+ * the parts of the AG that the rmap didn't find during its scan of the
+ * primary metadata and aren't known to be in the free space, which implies
+ * that they were the old rmapbt blocks.
+ * 5. Commit.
+ *
+ * We use the 'xrep_rmap' prefix for all the rmap functions.
+ */
+
+/* Context for collecting rmaps */
+struct xrep_rmap {
+ /* new rmapbt information */
+ struct xrep_newbt new_btree;
+
+ /* lock for the xfbtree and xfile */
+ struct mutex lock;
+
+ /* rmap records generated from primary metadata */
+ struct xfbtree rmap_btree;
+
+ struct xfs_scrub *sc;
+
+ /* in-memory btree cursor for the xfs_btree_bload iteration */
+ struct xfs_btree_cur *mcur;
+
+ /* Hooks into rmap update code. */
+ struct xfs_rmap_hook rhook;
+
+ /* inode scan cursor */
+ struct xchk_iscan iscan;
+
+ /* Number of non-freespace records found. */
+ unsigned long long nr_records;
+
+ /* bnobt/cntbt contribution to btreeblks */
+ xfs_agblock_t freesp_btblocks;
+
+ /* old agf_rmap_blocks counter */
+ unsigned int old_rmapbt_fsbcount;
+};
+
+/* Set us up to repair reverse mapping btrees. */
+int
+xrep_setup_ag_rmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rmap *rr;
+ char *descr;
+ int error;
+
+ xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
+
+ descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
+ error = xrep_setup_xfbtree(sc, descr);
+ kfree(descr);
+ if (error)
+ return error;
+
+ rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+
+ rr->sc = sc;
+ sc->buf = rr;
+ return 0;
+}
+
+/* Make sure there's nothing funny about this mapping. */
+STATIC int
+xrep_rmap_check_mapping(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+ rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Store a reverse-mapping record. */
+static inline int
+xrep_rmap_stash(
+ struct xrep_rmap *rr,
+ xfs_agblock_t startblock,
+ xfs_extlen_t blockcount,
+ uint64_t owner,
+ uint64_t offset,
+ unsigned int flags)
+{
+ struct xfs_rmap_irec rmap = {
+ .rm_startblock = startblock,
+ .rm_blockcount = blockcount,
+ .rm_owner = owner,
+ .rm_offset = offset,
+ .rm_flags = flags,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_btree_cur *mcur;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);
+
+ mutex_lock(&rr->lock);
+ mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
+ error = xfs_rmap_map_raw(mcur, &rmap);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
+ error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
+ if (error)
+ goto out_abort;
+
+ mutex_unlock(&rr->lock);
+ return 0;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
+out_abort:
+ xchk_iscan_abort(&rr->iscan);
+ mutex_unlock(&rr->lock);
+ return error;
+}
+
+struct xrep_rmap_stash_run {
+ struct xrep_rmap *rr;
+ uint64_t owner;
+ unsigned int rmap_flags;
+};
+
+static int
+xrep_rmap_stash_run(
+ uint32_t start,
+ uint32_t len,
+ void *priv)
+{
+ struct xrep_rmap_stash_run *rsr = priv;
+ struct xrep_rmap *rr = rsr->rr;
+
+ return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
+}
+
+/*
+ * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
+ * that the ranges are in units of FS blocks.
+ */
+STATIC int
+xrep_rmap_stash_bitmap(
+ struct xrep_rmap *rr,
+ struct xagb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xrep_rmap_stash_run rsr = {
+ .rr = rr,
+ .owner = oinfo->oi_owner,
+ .rmap_flags = 0,
+ };
+
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
+
+ return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
+}
+
+/* Section (I): Finding all file and bmbt extents. */
+
+/* Context for accumulating rmaps for an inode fork. */
+struct xrep_rmap_ifork {
+ /*
+ * Accumulate rmap data here to turn multiple adjacent bmaps into a
+ * single rmap.
+ */
+ struct xfs_rmap_irec accum;
+
+ /* Bitmap of bmbt blocks in this AG. */
+ struct xagb_bitmap bmbt_blocks;
+
+ struct xrep_rmap *rr;
+
+ /* Which inode fork? */
+ int whichfork;
+};
+
+/* Stash an rmap that we accumulated while walking an inode fork. */
+STATIC int
+xrep_rmap_stash_accumulated(
+ struct xrep_rmap_ifork *rf)
+{
+ if (rf->accum.rm_blockcount == 0)
+ return 0;
+
+ return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
+ rf->accum.rm_blockcount, rf->accum.rm_owner,
+ rf->accum.rm_offset, rf->accum.rm_flags);
+}
+
+/* Accumulate a bmbt record. */
+STATIC int
+xrep_rmap_visit_bmbt(
+ struct xfs_btree_cur *cur,
+ struct xfs_bmbt_irec *rec,
+ void *priv)
+{
+ struct xrep_rmap_ifork *rf = priv;
+ struct xfs_mount *mp = rf->rr->sc->mp;
+ struct xfs_rmap_irec *accum = &rf->accum;
+ xfs_agblock_t agbno;
+ unsigned int rmap_flags = 0;
+ int error;
+
+ if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
+ rf->rr->sc->sa.pag->pag_agno)
+ return 0;
+
+ agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
+ if (rf->whichfork == XFS_ATTR_FORK)
+ rmap_flags |= XFS_RMAP_ATTR_FORK;
+ if (rec->br_state == XFS_EXT_UNWRITTEN)
+ rmap_flags |= XFS_RMAP_UNWRITTEN;
+
+ /* If this bmap is adjacent to the previous one, just add it. */
+ if (accum->rm_blockcount > 0 &&
+ rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
+ agbno == accum->rm_startblock + accum->rm_blockcount &&
+ rmap_flags == accum->rm_flags) {
+ accum->rm_blockcount += rec->br_blockcount;
+ return 0;
+ }
+
+ /* Otherwise stash the old rmap and start accumulating a new one. */
+ error = xrep_rmap_stash_accumulated(rf);
+ if (error)
+ return error;
+
+ accum->rm_startblock = agbno;
+ accum->rm_blockcount = rec->br_blockcount;
+ accum->rm_offset = rec->br_startoff;
+ accum->rm_flags = rmap_flags;
+ return 0;
+}
+
+/* Add a btree block to the bitmap. */
+STATIC int
+xrep_rmap_visit_iroot_btree_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xrep_rmap_ifork *rf = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t agbno;
+
+ xfs_btree_get_block(cur, level, &bp);
+ if (!bp)
+ return 0;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno)
+ return 0;
+
+ agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
+}
+
+/*
+ * Iterate a metadata btree rooted in an inode to collect rmap records for
+ * anything in this fork that matches the AG.
+ */
+STATIC int
+xrep_rmap_scan_iroot_btree(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_owner_info oinfo;
+ struct xrep_rmap *rr = rf->rr;
+ int error;
+
+ xagb_bitmap_init(&rf->bmbt_blocks);
+
+ /* Record all the blocks in the btree itself. */
+ error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
+ XFS_BTREE_VISIT_ALL, rf);
+ if (error)
+ goto out;
+
+ /* Emit rmaps for the btree blocks. */
+ xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
+ error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
+ if (error)
+ goto out;
+
+ /* Stash any remaining accumulated rmaps. */
+ error = xrep_rmap_stash_accumulated(rf);
+out:
+ xagb_bitmap_destroy(&rf->bmbt_blocks);
+ return error;
+}
+
+static inline bool
+is_rt_data_fork(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK;
+}
+
+/*
+ * Iterate the block mapping btree to collect rmap records for anything in this
+ * fork that matches the AG. Sets @mappings_done to true if we've scanned the
+ * block mappings in this fork.
+ */
+STATIC int
+xrep_rmap_scan_bmbt(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_inode *ip,
+ bool *mappings_done)
+{
+ struct xrep_rmap *rr = rf->rr;
+ struct xfs_btree_cur *cur;
+ struct xfs_ifork *ifp;
+ int error;
+
+ *mappings_done = false;
+ ifp = xfs_ifork_ptr(ip, rf->whichfork);
+ cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
+
+ if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
+ xfs_need_iread_extents(ifp)) {
+ /*
+ * If the incore extent cache isn't loaded, scan the bmbt for
+ * mapping records. This avoids loading the incore extent
+ * tree, which will increase memory pressure at a time when
+ * we're trying to run as quickly as we possibly can. Ignore
+ * realtime extents.
+ */
+ error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
+ if (error)
+ goto out_cur;
+
+ *mappings_done = true;
+ }
+
+ /* Scan for the bmbt blocks, which always live on the data device. */
+ error = xrep_rmap_scan_iroot_btree(rf, cur);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Iterate the in-core extent cache to collect rmap records for anything in
+ * this fork that matches the AG.
+ */
+STATIC int
+xrep_rmap_scan_iext(
+ struct xrep_rmap_ifork *rf,
+ struct xfs_ifork *ifp)
+{
+ struct xfs_bmbt_irec rec;
+ struct xfs_iext_cursor icur;
+ int error;
+
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
+ if (error)
+ return error;
+ }
+
+ return xrep_rmap_stash_accumulated(rf);
+}
+
+/* Find all the extents from a given AG in an inode fork. */
+STATIC int
+xrep_rmap_scan_ifork(
+ struct xrep_rmap *rr,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xrep_rmap_ifork rf = {
+ .accum = { .rm_owner = ip->i_ino, },
+ .rr = rr,
+ .whichfork = whichfork,
+ };
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int error = 0;
+
+ if (!ifp)
+ return 0;
+
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ bool mappings_done;
+
+ /*
+ * Scan the bmap btree for data device mappings. This includes
+ * the btree blocks themselves, even if this is a realtime
+ * file.
+ */
+ error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
+ if (error || mappings_done)
+ return error;
+ } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+ return 0;
+ }
+
+ /* Scan incore extent cache if this isn't a realtime file. */
+ if (xfs_ifork_is_realtime(ip, whichfork))
+ return 0;
+
+ return xrep_rmap_scan_iext(&rf, ifp);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
+ * attr bmbt. Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_rmap_scan_ilock(
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ if (xfs_need_iread_extents(&ip->i_df)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ goto lock;
+ }
+
+ if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+
+lock:
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
+/* Record reverse mappings for a file. */
+STATIC int
+xrep_rmap_scan_inode(
+ struct xrep_rmap *rr,
+ struct xfs_inode *ip)
+{
+ unsigned int lock_mode = 0;
+ int error;
+
+ /*
+ * Directory updates (create/link/unlink/rename) drop the directory's
+ * ILOCK before finishing any rmapbt updates associated with directory
+ * shape changes. For this scan to coordinate correctly with the live
+ * update hook, we must take the only lock (i_rwsem) that is held all
+ * the way to dir op completion. This will get fixed by the parent
+ * pointer patchset.
+ */
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ lock_mode = XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lock_mode);
+ }
+ lock_mode |= xrep_rmap_scan_ilock(ip);
+
+ /* Check the data fork. */
+ error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ /* Check the attr fork. */
+ error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
+ if (error)
+ goto out_unlock;
+
+ /* COW fork extents are "owned" by the refcount btree. */
+
+ xchk_iscan_mark_visited(&rr->iscan, ip);
+out_unlock:
+ xfs_iunlock(ip, lock_mode);
+ return error;
+}
+
+/* Section (I): Find all AG metadata extents except for free space metadata. */
+
+struct xrep_rmap_inodes {
+ struct xrep_rmap *rr;
+ struct xagb_bitmap inobt_blocks; /* INOBIT */
+ struct xagb_bitmap ichunk_blocks; /* ICHUNKBIT */
+};
+
+/* Record inode btree rmaps. */
+STATIC int
+xrep_rmap_walk_inobt(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xrep_rmap_inodes *ri = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agino_t agino;
+ xfs_agino_t iperhole;
+ unsigned int i;
+ int error;
+
+ /* Record the inobt blocks. */
+ error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
+ if (error)
+ return error;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+ if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
+ return -EFSCORRUPTED;
+
+ agino = irec.ir_startino;
+
+ /* Record a non-sparse inode chunk. */
+ if (!xfs_inobt_issparse(irec.ir_holemask)) {
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ aglen = max_t(xfs_extlen_t, 1,
+ XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);
+
+ return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+ }
+
+ /* Iterate each chunk. */
+ iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
+ XFS_INODES_PER_HOLEMASK_BIT);
+ aglen = iperhole / mp->m_sb.sb_inopblock;
+ for (i = 0, agino = irec.ir_startino;
+ i < XFS_INOBT_HOLEMASK_BITS;
+ i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
+ /* Skip holes. */
+ if (irec.ir_holemask & (1 << i))
+ continue;
+
+ /* Record the inode chunk otherwise. */
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
+STATIC int
+xrep_rmap_find_inode_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xrep_rmap_inodes ri = {
+ .rr = rr,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ xagb_bitmap_init(&ri.inobt_blocks);
+ xagb_bitmap_init(&ri.ichunk_blocks);
+
+ /*
+ * Iterate every record in the inobt so we can capture all the inode
+ * chunks and the blocks in the inobt itself.
+ */
+ error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Note that if there are zero records in the inobt then query_all does
+ * nothing and we have to account the empty inobt root manually.
+ */
+ if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+
+ error = xagb_bitmap_set(&ri.inobt_blocks,
+ be32_to_cpu(agi->agi_root), 1);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Scan the finobt too. */
+ if (xfs_has_finobt(sc->mp)) {
+ error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
+ sc->sa.fino_cur);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Generate rmaps for everything. */
+ error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
+ &XFS_RMAP_OINFO_INOBT);
+ if (error)
+ goto out_bitmap;
+ error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
+ &XFS_RMAP_OINFO_INODES);
+
+out_bitmap:
+ xagb_bitmap_destroy(&ri.inobt_blocks);
+ xagb_bitmap_destroy(&ri.ichunk_blocks);
+ return error;
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_rmap_walk_cowblocks(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *irec,
+ void *priv)
+{
+ struct xagb_bitmap *bitmap = priv;
+
+ if (!xfs_refcount_check_domain(irec) ||
+ irec->rc_domain != XFS_REFC_DOMAIN_COW)
+ return -EFSCORRUPTED;
+
+ return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
+}
+
+/*
+ * Collect rmaps for the blocks containing the refcount btree, and all CoW
+ * staging extents.
+ */
+STATIC int
+xrep_rmap_find_refcount_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xagb_bitmap refcountbt_blocks; /* REFCBIT */
+ struct xagb_bitmap cow_blocks; /* COWBIT */
+ struct xfs_refcount_irec low = {
+ .rc_startblock = 0,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_refcount_irec high = {
+ .rc_startblock = -1U,
+ .rc_domain = XFS_REFC_DOMAIN_COW,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ if (!xfs_has_reflink(sc->mp))
+ return 0;
+
+ xagb_bitmap_init(&refcountbt_blocks);
+ xagb_bitmap_init(&cow_blocks);
+
+ /* refcountbt */
+ error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
+ if (error)
+ goto out_bitmap;
+
+ /* Collect rmaps for CoW staging extents. */
+ error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
+ xrep_rmap_walk_cowblocks, &cow_blocks);
+ if (error)
+ goto out_bitmap;
+
+ /* Generate rmaps for everything. */
+ error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
+ if (error)
+ goto out_bitmap;
+ error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
+ &XFS_RMAP_OINFO_REFC);
+
+out_bitmap:
+ xagb_bitmap_destroy(&cow_blocks);
+ xagb_bitmap_destroy(&refcountbt_blocks);
+ return error;
+}
+
+/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
+STATIC int
+xrep_rmap_find_agheader_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ /* Create a record for the AG sb->agfl. */
+ return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
+ XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
+ XFS_RMAP_OWN_FS, 0, 0);
+}
+
+/* Generate rmaps for the log, if it's in this AG. */
+STATIC int
+xrep_rmap_find_log_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno))
+ return 0;
+
+ return xrep_rmap_stash(rr,
+ XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
+ sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
+}
+
+/* Check and count all the records that we gathered. */
+STATIC int
+xrep_rmap_check_record(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rmap *rr = priv;
+ int error;
+
+ error = xrep_rmap_check_mapping(rr->sc, rec);
+ if (error)
+ return error;
+
+ rr->nr_records++;
+ return 0;
+}
+
+/*
+ * Generate all the reverse-mappings for this AG, a list of the old rmapbt
+ * blocks, and the new btreeblks count. Figure out if we have enough free
+ * space to reconstruct the inode btrees. The caller must clean up the lists
+ * if anything goes wrong. This implements section (I) above.
+ */
+STATIC int
+xrep_rmap_find_rmaps(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xchk_ag *sa = &sc->sa;
+ struct xfs_inode *ip;
+ struct xfs_btree_cur *mcur;
+ int error;
+
+ /* Find all the per-AG metadata. */
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ error = xrep_rmap_find_inode_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_refcount_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_agheader_rmaps(rr);
+ if (error)
+ goto end_agscan;
+
+ error = xrep_rmap_find_log_rmaps(rr);
+end_agscan:
+ xchk_ag_btcur_free(&sc->sa);
+ if (error)
+ return error;
+
+ /*
+ * Set up for a potentially lengthy filesystem scan by reducing our
+ * transaction resource usage for the duration. Specifically:
+ *
+ * Unlock the AG header buffers and cancel the transaction to release
+ * the log grant space while we scan the filesystem.
+ *
+ * Create a new empty transaction to eliminate the possibility of the
+ * inode scan deadlocking on cyclical metadata.
+ *
+ * We pass the empty transaction to the file scanning function to avoid
+ * repeatedly cycling empty transactions. This can be done even though
+ * we take the IOLOCK to quiesce the file because empty transactions
+ * do not take sb_internal.
+ */
+ sa->agf_bp = NULL;
+ sa->agi_bp = NULL;
+ xchk_trans_cancel(sc);
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ /* Iterate all AGs for inodes rmaps. */
+ while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
+ error = xrep_rmap_scan_inode(rr, ip);
+ xchk_irele(sc, ip);
+ if (error)
+ break;
+
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&rr->iscan);
+ if (error)
+ return error;
+
+ /*
+ * Switch out for a real transaction and lock the AG headers in
+ * preparation for building a new tree.
+ */
+ xchk_trans_cancel(sc);
+ error = xchk_setup_fs(sc);
+ if (error)
+ return error;
+ error = xchk_perag_drain_and_lock(sc);
+ if (error)
+ return error;
+
+ /*
+ * If a hook failed to update the in-memory btree, we lack the data to
+ * continue the repair.
+ */
+ if (xchk_iscan_aborted(&rr->iscan))
+ return -EFSCORRUPTED;
+
+ /*
+ * Now that we have everything locked again, we need to count the
+ * number of rmap records stashed in the btree. This should reflect
+ * all actively-owned space in the filesystem. At the same time, check
+ * all our records before we start building a new btree, which requires
+ * a bnobt cursor.
+ */
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+
+ rr->nr_records = 0;
+ error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);
+
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ xfs_btree_del_cursor(mcur, error);
+
+ return error;
+}
+
+/* Section (II): Reserving space for new rmapbt and setting free space bitmap */
+
+struct xrep_rmap_agfl {
+ struct xagb_bitmap *bitmap;
+ xfs_agnumber_t agno;
+};
+
+/* Add an AGFL block to the rmap list. */
+STATIC int
+xrep_rmap_walk_agfl(
+ struct xfs_mount *mp,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xrep_rmap_agfl *ra = priv;
+
+ return xagb_bitmap_set(ra->bitmap, agbno, 1);
+}
+
+/*
+ * Run one round of reserving space for the new rmapbt and recomputing the
+ * number of blocks needed to store the previously observed rmapbt records and
+ * the ones we'll create for the free space metadata. When we don't need more
+ * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
+ * true.
+ */
+STATIC int
+xrep_rmap_try_reserve(
+ struct xrep_rmap *rr,
+ struct xfs_btree_cur *rmap_cur,
+ struct xagb_bitmap *freesp_blocks,
+ uint64_t *blocks_reserved,
+ bool *done)
+{
+ struct xrep_rmap_agfl ra = {
+ .bitmap = freesp_blocks,
+ .agno = rr->sc->sa.pag->pag_agno,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xrep_newbt_resv *resv, *n;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_buf *agfl_bp;
+ uint64_t nr_blocks; /* RMB */
+ uint64_t freesp_records;
+ int error;
+
+ /*
+ * We're going to recompute new_btree.bload.nr_blocks at the end of
+ * this function to reflect however many btree blocks we need to store
+ * all the rmap records (including the ones that reflect the changes we
+ * made to support the new rmapbt blocks), so we save the old value
+ * here so we can decide if we've reserved enough blocks.
+ */
+ nr_blocks = rr->new_btree.bload.nr_blocks;
+
+ /*
+ * Make sure we've reserved enough space for the new btree. This can
+ * change the shape of the free space btrees, which can cause secondary
+ * interactions with the rmap records because all three space btrees
+ * have the same rmap owner. We'll account for all that below.
+ */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ nr_blocks - *blocks_reserved);
+ if (error)
+ return error;
+
+ *blocks_reserved = rr->new_btree.bload.nr_blocks;
+
+ /* Clear everything in the bitmap. */
+ xagb_bitmap_destroy(freesp_blocks);
+
+ /* Set all the bnobt blocks in the bitmap. */
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ if (error)
+ return error;
+
+ /* Set all the cntbt blocks in the bitmap. */
+ sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
+ xfs_btree_del_cursor(sc->sa.cnt_cur, error);
+ sc->sa.cnt_cur = NULL;
+ if (error)
+ return error;
+
+ /* Record our new btreeblks value. */
+ rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
+
+ /* Set all the new rmapbt blocks in the bitmap. */
+ list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
+ error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
+ if (error)
+ return error;
+ }
+
+ /* Set all the AGFL blocks in the bitmap. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
+ error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
+ if (error)
+ return error;
+
+ /* Count the extents in the bitmap. */
+ freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
+
+ /* Compute how many blocks we'll need for all the rmaps. */
+ error = xfs_btree_bload_compute_geometry(rmap_cur,
+ &rr->new_btree.bload, rr->nr_records + freesp_records);
+ if (error)
+ return error;
+
+ /* We're done when we don't need more blocks. */
+ *done = nr_blocks >= rr->new_btree.bload.nr_blocks;
+ return 0;
+}
+
+/*
+ * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
+ * the free space metadata. This implements section (II) above.
+ */
+STATIC int
+xrep_rmap_reserve_space(
+ struct xrep_rmap *rr,
+ struct xfs_btree_cur *rmap_cur)
+{
+ struct xagb_bitmap freesp_blocks; /* AGBIT */
+ uint64_t blocks_reserved = 0;
+ bool done = false;
+ int error;
+
+ /* Compute how many blocks we'll need for the rmaps collected so far. */
+ error = xfs_btree_bload_compute_geometry(rmap_cur,
+ &rr->new_btree.bload, rr->nr_records);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ xagb_bitmap_init(&freesp_blocks);
+
+ /*
+ * Iteratively reserve space for the new rmapbt and recompute the
+ * number of blocks needed to store the previously observed rmapbt
+ * records and the ones we'll create for the free space metadata.
+ * Finish when we don't need more blocks.
+ */
+ do {
+ error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
+ &blocks_reserved, &done);
+ if (error)
+ goto out_bitmap;
+ } while (!done);
+
+ /* Emit rmaps for everything in the free space bitmap. */
+ xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
+ error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
+ xchk_ag_btcur_free(&rr->sc->sa);
+
+out_bitmap:
+ xagb_bitmap_destroy(&freesp_blocks);
+ return error;
+}
+
+/* Section (III): Building the new rmap btree. */
+
+/* Update the AGF counters. */
+STATIC int
+xrep_rmap_reset_counters(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ xfs_agblock_t rmap_btblocks;
+
+ /*
+ * The AGF header contains extra information related to the reverse
+ * mapping btree, so we must update those fields here.
+ */
+ rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
+ agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
+
+ /*
+ * After we commit the new btree to disk, it is possible that the
+ * process to reap the old btree blocks will race with the AIL trying
+ * to checkpoint the old btree blocks into the filesystem. If the new
+ * tree is shorter than the old one, the rmapbt write verifier will
+ * fail and the AIL will shut down the filesystem.
+ *
+ * To avoid this, save the old incore btree height values as the alt
+ * height values before re-initializing the perag info from the updated
+ * AGF to capture all the new values.
+ */
+ pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagf(sc);
+}
+
+/* Retrieve rmapbt data for bulk load. */
+STATIC int
+xrep_rmap_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xrep_rmap *rr = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ int stat = 0;
+
+ error = xfs_btree_increment(rr->mcur, 0, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return -EFSCORRUPTED;
+
+ error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return -EFSCORRUPTED;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rmap_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_rmap *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Custom allocation function for new rmap btrees. */
+STATIC int
+xrep_rmap_alloc_vextent(
+ struct xfs_scrub *sc,
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t alloc_hint)
+{
+ int error;
+
+ /*
+ * We don't want an rmap update on the allocation, since we iteratively
+ * compute the OWN_AG records /after/ allocating blocks for the records
+ * that we already know we need to store. Therefore, fix the freelist
+ * with the NORMAP flag set so that we don't also try to create an rmap
+ * for new AGFL blocks.
+ */
+ error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
+ if (error)
+ return error;
+
+ /*
+ * If xrep_fix_freelist fixed the freelist by moving blocks from the
+ * free space btrees or by removing blocks from the AGFL and queueing
+ * an EFI to free the block, the transaction will be dirty. This
+ * second case is of interest to us.
+ *
+ * Later on, we will need to compare gaps in the new recordset against
+ * the block usage of all OWN_AG owners in order to free the old
+ * btree's blocks, which means that we can't have EFIs for former AGFL
+ * blocks attached to the repair transaction when we commit the new
+ * btree.
+ *
+ * xrep_newbt_alloc_blocks guarantees this for us by calling
+ * xrep_defer_finish to commit anything that fix_freelist may have
+ * added to the transaction.
+ */
+ return xfs_alloc_vextent_near_bno(args, alloc_hint);
+}
+
+
+/* Count the records in this btree. */
+STATIC int
+xrep_rmap_count_records(
+ struct xfs_btree_cur *cur,
+ unsigned long long *nr)
+{
+ int running = 1;
+ int error;
+
+ *nr = 0;
+
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ return error;
+
+ while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
+ if (running)
+ (*nr)++;
+ }
+
+ return error;
+}
+/*
+ * Use the collected rmap information to stage a new rmap btree. If this is
+ * successful we'll return with the new btree root information logged to the
+ * repair transaction but not yet committed. This implements section (III)
+ * above.
+ */
+STATIC int
+xrep_rmap_build_new_tree(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_btree_cur *rmap_cur;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ /*
+ * Preserve the old rmapbt block count so that we can adjust the
+ * per-AG rmapbt reservation after we commit the new btree root and
+ * want to dispose of the old btree blocks.
+ */
+ rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header. The new blocks are accounted to the
+ * rmapbt per-AG reservation, which we will adjust further after
+ * committing the new btree.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp));
+ xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
+ fsbno, XFS_AG_RESV_RMAPBT);
+ rr->new_btree.bload.get_records = xrep_rmap_get_records;
+ rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
+ rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
+ rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
+ xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
+
+ /*
+ * Initialize @rr->new_btree, reserve space for the new rmapbt,
+ * and compute OWN_AG rmaps.
+ */
+ error = xrep_rmap_reserve_space(rr, rmap_cur);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Count the rmapbt records again, because the space reservation
+ * for the rmapbt itself probably added more records to the btree.
+ */
+ rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
+ &rr->rmap_btree);
+
+ error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
+ if (error)
+ goto err_mcur;
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the incore btree height so
+ * that we don't trip the verifiers when writing the new btree blocks
+ * to disk.
+ */
+ pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
+
+ /*
+ * Move the cursor to the left edge of the tree so that the first
+ * increment in ->get_records positions us at the first record.
+ */
+ error = xfs_btree_goto_left_edge(rr->mcur);
+ if (error)
+ goto err_level;
+
+ /* Add all observed rmap records. */
+ error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_level;
+
+ /*
+ * Install the new btree in the AG header. After this point the old
+ * btree is no longer accessible and the new tree is live.
+ */
+ xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(rmap_cur, 0);
+ xfs_btree_del_cursor(rr->mcur, 0);
+ rr->mcur = NULL;
+
+ /*
+ * Now that we've written the new btree to disk, we don't need to keep
+ * updating the in-memory btree. Abort the scan to stop live updates.
+ */
+ xchk_iscan_abort(&rr->iscan);
+
+ /*
+ * The newly committed rmap recordset includes mappings for the blocks
+ * that we reserved to build the new btree. If there is excess space
+ * reservation to be freed, the corresponding rmap records must also be
+ * removed.
+ */
+ rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_rmap_reset_counters(rr);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_ag_trans(sc);
+
+err_level:
+ pag->pagf_repair_rmap_level = 0;
+err_mcur:
+ xfs_btree_del_cursor(rr->mcur, error);
+err_cur:
+ xfs_btree_del_cursor(rmap_cur, error);
+err_newbt:
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/* Section (IV): Reaping the old btree. */
+
+struct xrep_rmap_find_gaps {
+ struct xagb_bitmap rmap_gaps;
+ xfs_agblock_t next_agbno;
+};
+
+/* Subtract each free extent in the bnobt from the rmap gaps. */
+STATIC int
+xrep_rmap_find_freesp(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *rec,
+ void *priv)
+{
+ struct xrep_rmap_find_gaps *rfg = priv;
+
+ return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
+ rec->ar_blockcount);
+}
+
+/* Record the free space we find, as part of cleaning out the btree. */
+STATIC int
+xrep_rmap_find_gaps(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_rmap_find_gaps *rfg = priv;
+ int error;
+
+ if (rec->rm_startblock > rfg->next_agbno) {
+ error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
+ rec->rm_startblock - rfg->next_agbno);
+ if (error)
+ return error;
+ }
+
+ rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
+ rec->rm_startblock + rec->rm_blockcount);
+ return 0;
+}
+
+/*
+ * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make
+ * a list of gaps in the rmap records and a list of the extents mentioned in
+ * the bnobt. Any block that's in the new rmapbt gap list but not mentioned
+ * in the bnobt is a block from the old rmapbt and can be removed.
+ */
+STATIC int
+xrep_rmap_remove_old_tree(
+ struct xrep_rmap *rr)
+{
+ struct xrep_rmap_find_gaps rfg = {
+ .next_agbno = 0,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_btree_cur *mcur;
+ xfs_agblock_t agend;
+ int error;
+
+ xagb_bitmap_init(&rfg.rmap_gaps);
+
+ /* Compute free space from the new rmapbt. */
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+
+ error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_bitmap;
+
+ /* Insert a record for space between the last rmap and EOAG. */
+ agend = be32_to_cpu(agf->agf_length);
+ if (rfg.next_agbno < agend) {
+ error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
+ agend - rfg.next_agbno);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Compute free space from the existing bnobt. */
+ sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
+ &rfg);
+ xfs_btree_del_cursor(sc->sa.bno_cur, error);
+ sc->sa.bno_cur = NULL;
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Free the "free" blocks that the new rmapbt knows about but the bnobt
+ * doesn't--these are the old rmapbt blocks. Credit the old rmapbt
+ * block usage count back to the per-AG rmapbt reservation (and not
+ * fdblocks, since the rmap btree lives in free space) to keep the
+ * reservation and free space accounting correct.
+ */
+ error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
+ &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Now that we've zapped all the old rmapbt blocks we can turn off
+ * the alternate height mechanism and reset the per-AG space
+ * reservation.
+ */
+ pag->pagf_repair_rmap_level = 0;
+ sc->flags |= XREP_RESET_PERAG_RESV;
+out_bitmap:
+ xagb_bitmap_destroy(&rfg.rmap_gaps);
+ return error;
+}
+
+static inline bool
+xrep_rmapbt_want_live_update(
+ struct xchk_iscan *iscan,
+ const struct xfs_owner_info *oi)
+{
+ if (xchk_iscan_aborted(iscan))
+ return false;
+
+ /*
+ * Before unlocking the AG header to perform the inode scan, we
+ * recorded reverse mappings for all AG metadata except for the OWN_AG
+ * metadata. IOWs, the in-memory btree knows about the AG headers, the
+ * two inode btrees, the CoW staging extents, and the refcount btrees.
+ * For these types of metadata, we need to record the live updates in
+ * the in-memory rmap btree.
+ *
+ * However, we do not scan the free space btrees or the AGFL until we
+ * have re-locked the AGF and are ready to reserve space for the new
+ * rmap btree, so we do not want live updates for OWN_AG metadata.
+ */
+ if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
+ return oi->oi_owner != XFS_RMAP_OWN_AG;
+
+ /* Ignore updates to files that the scanner hasn't visited yet. */
+ return xchk_iscan_want_live_update(iscan, oi->oi_owner);
+}
+
+/*
+ * Apply a rmapbt update from the regular filesystem into our shadow btree.
+ * We're running from the thread that owns the AGF buffer and is generating
+ * the update, so we must be careful about which parts of the struct xrep_rmap
+ * that we change.
+ */
+static int
+xrep_rmapbt_live_update(
+ struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct xfs_rmap_update_params *p = data;
+ struct xrep_rmap *rr;
+ struct xfs_mount *mp;
+ struct xfs_btree_cur *mcur;
+ struct xfs_trans *tp;
+ void *txcookie;
+ int error;
+
+ rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
+ mp = rr->sc->mp;
+
+ if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
+ goto out_unlock;
+
+ trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p);
+
+ error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
+ if (error)
+ goto out_abort;
+
+ mutex_lock(&rr->lock);
+ mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
+ error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
+ p->blockcount, &p->oinfo, p->unwritten);
+ xfs_btree_del_cursor(mcur, error);
+ if (error)
+ goto out_cancel;
+
+ error = xfbtree_trans_commit(&rr->rmap_btree, tp);
+ if (error)
+ goto out_cancel;
+
+ xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ mutex_unlock(&rr->lock);
+ return NOTIFY_DONE;
+
+out_cancel:
+ xfbtree_trans_cancel(&rr->rmap_btree, tp);
+ xrep_trans_cancel_hook_dummy(&txcookie, tp);
+out_abort:
+ mutex_unlock(&rr->lock);
+ xchk_iscan_abort(&rr->iscan);
+out_unlock:
+ return NOTIFY_DONE;
+}
+
+/* Set up the filesystem scan components. */
+STATIC int
+xrep_rmap_setup_scan(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int error;
+
+ mutex_init(&rr->lock);
+
+ /* Set up in-memory rmap btree */
+ error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
+ sc->sa.pag->pag_agno);
+ if (error)
+ goto out_mutex;
+
+ /* Retry iget every tenth of a second for up to 30 seconds. */
+ xchk_iscan_start(sc, 30000, 100, &rr->iscan);
+
+ /*
+ * Hook into live rmap operations so that we can update our in-memory
+ * btree to reflect live changes on the filesystem. Since we drop the
+ * AGF buffer to scan all the inodes, we need this piece to avoid
+ * installing a stale btree.
+ */
+ ASSERT(sc->flags & XCHK_FSGATES_RMAP);
+ xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
+ error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook);
+ if (error)
+ goto out_iscan;
+ return 0;
+
+out_iscan:
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rmap_btree);
+out_mutex:
+ mutex_destroy(&rr->lock);
+ return error;
+}
+
+/* Tear down scan components. */
+STATIC void
+xrep_rmap_teardown(
+ struct xrep_rmap *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+
+ xchk_iscan_abort(&rr->iscan);
+ xfs_rmap_hook_del(sc->sa.pag, &rr->rhook);
+ xchk_iscan_teardown(&rr->iscan);
+ xfbtree_destroy(&rr->rmap_btree);
+ mutex_destroy(&rr->lock);
+}
+
+/* Repair the rmap btree for some AG. */
+int
+xrep_rmapbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_rmap *rr = sc->buf;
+ int error;
+
+ error = xrep_rmap_setup_scan(rr);
+ if (error)
+ return error;
+
+ /*
+ * Collect rmaps for everything in this AG that isn't space metadata.
+ * These rmaps won't change even as we try to allocate blocks.
+ */
+ error = xrep_rmap_find_rmaps(rr);
+ if (error)
+ goto out_records;
+
+ /* Rebuild the rmap information. */
+ error = xrep_rmap_build_new_tree(rr);
+ if (error)
+ goto out_records;
+
+ /* Kill the old tree. */
+ error = xrep_rmap_remove_old_tree(rr);
+ if (error)
+ goto out_records;
+
+out_records:
+ xrep_rmap_teardown(rr);
+ return error;
+}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 008ddb599e13..46583517377f 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -11,20 +11,37 @@
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/rtbitmap.h"
/* Set us up with the realtime metadata locked. */
int
xchk_setup_rtbitmap(
struct xfs_scrub *sc)
{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_rtbitmap *rtb;
int error;
- error = xchk_trans_alloc(sc, 0);
+ rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS);
+ if (!rtb)
+ return -ENOMEM;
+ sc->buf = rtb;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtbitmap(sc, rtb);
+ if (error)
+ return error;
+ }
+
+ error = xchk_trans_alloc(sc, rtb->resblks);
if (error)
return error;
@@ -32,7 +49,22 @@ xchk_setup_rtbitmap(
if (error)
return error;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ return error;
+
xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+
+ /*
+ * Now that we've locked the rtbitmap, we can't race with growfsrt
+ * trying to expand the bitmap or change the size of the rt volume.
+ * Hence it is safe to compute and check the geometry values.
+ */
+ if (mp->m_sb.sb_rblocks) {
+ rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
+ rtb->rextslog = xfs_compute_rextslog(rtb->rextents);
+ rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents);
+ }
return 0;
}
@@ -48,12 +80,12 @@ xchk_rtbitmap_rec(
{
struct xfs_scrub *sc = priv;
xfs_rtblock_t startblock;
- xfs_rtblock_t blockcount;
+ xfs_filblks_t blockcount;
- startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
- blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ startblock = xfs_rtx_to_rtb(mp, rec->ar_startext);
+ blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount);
- if (!xfs_verify_rtext(mp, startblock, blockcount))
+ if (!xfs_verify_rtbext(mp, startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -63,21 +95,30 @@ STATIC int
xchk_rtbitmap_check_extents(
struct xfs_scrub *sc)
{
- struct xfs_mount *mp = sc->mp;
struct xfs_bmbt_irec map;
- xfs_rtblock_t off;
- int nmap;
+ struct xfs_iext_cursor icur;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ xfs_fileoff_t off = 0;
+ xfs_fileoff_t endoff;
int error = 0;
- for (off = 0; off < mp->m_sb.sb_rbmblocks;) {
+ /* Mappings may not cross or lie beyond EOF. */
+ endoff = XFS_B_TO_FSB(mp, ip->i_disk_size);
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff);
+ return 0;
+ }
+
+ while (off < endoff) {
+ int nmap = 1;
+
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
break;
/* Make sure we have a written extent. */
- nmap = 1;
- error = xfs_bmapi_read(mp->m_rbmip, off,
- mp->m_sb.sb_rbmblocks - off, &map, &nmap,
+ error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap,
XFS_DATA_FORK);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
break;
@@ -98,12 +139,48 @@ int
xchk_rtbitmap(
struct xfs_scrub *sc)
{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_rtbitmap *rtb = sc->buf;
int error;
- /* Is the size of the rtbitmap correct? */
- if (sc->mp->m_rbmip->i_disk_size !=
- XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) {
- xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
+ /* Is sb_rextents correct? */
+ if (mp->m_sb.sb_rextents != rtb->rextents) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /* Is sb_rextslog correct? */
+ if (mp->m_sb.sb_rextslog != rtb->rextslog) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /*
+ * Is sb_rbmblocks large enough to handle the current rt volume? In no
+ * case can we exceed 4bn bitmap blocks since the super field is a u32.
+ */
+ if (rtb->rbmblocks > U32_MAX) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+ if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /* The bitmap file length must be aligned to an fsblock. */
+ if (mp->m_rbmip->i_disk_size & mp->m_blockmask) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /*
+ * Is the bitmap file itself large enough to handle the rt volume?
+ * growfsrt expands the bitmap file before updating sb_rextents, so the
+ * file can be larger than sb_rbmblocks.
+ */
+ if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
return 0;
}
@@ -116,38 +193,33 @@ xchk_rtbitmap(
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc);
+ error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
- goto out;
+ return error;
-out:
- return error;
+ return 0;
}
/* xref check that the extent is not free in the rtbitmap */
void
xchk_xref_is_used_rt_space(
struct xfs_scrub *sc,
- xfs_rtblock_t fsbno,
+ xfs_rtblock_t rtbno,
xfs_extlen_t len)
{
- xfs_rtblock_t startext;
- xfs_rtblock_t endext;
- xfs_rtblock_t extcount;
+ xfs_rtxnum_t startext;
+ xfs_rtxnum_t endext;
bool is_free;
int error;
if (xchk_skip_xref(sc->sm))
return;
- startext = fsbno;
- endext = fsbno + len - 1;
- do_div(startext, sc->mp->m_sb.sb_rextsize);
- do_div(endext, sc->mp->m_sb.sb_rextsize);
- extcount = endext - startext + 1;
+ startext = xfs_rtb_to_rtx(sc->mp, rtbno);
+ endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
- &is_free);
+ error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext,
+ endext - startext + 1, &is_free);
if (!xchk_should_check_xref(sc, &error, NULL))
goto out_unlock;
if (is_free)
diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h
new file mode 100644
index 000000000000..85304ff019e1
--- /dev/null
+++ b/fs/xfs/scrub/rtbitmap.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RTBITMAP_H__
+#define __XFS_SCRUB_RTBITMAP_H__
+
+struct xchk_rtbitmap {
+ uint64_t rextents;
+ uint64_t rbmblocks;
+ unsigned int rextslog;
+ unsigned int resblks;
+};
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb);
+#else
+# define xrep_setup_rtbitmap(sc, rtb) (0)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_RTBITMAP_H__ */
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
new file mode 100644
index 000000000000..46f5d5f605c9
--- /dev/null
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/rtbitmap.h"
+
+/* Set up to repair the realtime bitmap file metadata. */
+int
+xrep_setup_rtbitmap(
+ struct xfs_scrub *sc,
+ struct xchk_rtbitmap *rtb)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks = 0;
+
+ /*
+ * Reserve enough blocks to write out a completely new bmbt for a
+ * maximally fragmented bitmap file. We do not hold the rtbitmap
+ * ILOCK yet, so this is entirely speculative.
+ */
+ blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks);
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+
+ rtb->resblks += blocks;
+ return 0;
+}
+
+/*
+ * Make sure that the given range of the data fork of the realtime file is
+ * mapped to written blocks. The caller must ensure that the inode is joined
+ * to the transaction.
+ */
+STATIC int
+xrep_rtbitmap_data_mappings(
+ struct xfs_scrub *sc,
+ xfs_filblks_t len)
+{
+ struct xfs_bmbt_irec map;
+ xfs_fileoff_t off = 0;
+ int error;
+
+ ASSERT(sc->ip != NULL);
+
+ while (off < len) {
+ int nmaps = 1;
+
+ /*
+ * If we have a real extent mapping this block then we're
+ * in ok shape.
+ */
+ error = xfs_bmapi_read(sc->ip, off, len - off, &map, &nmaps,
+ XFS_DATA_FORK);
+ if (error)
+ return error;
+ if (nmaps == 0) {
+ ASSERT(nmaps != 0);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Written extents are ok. Holes are not filled because we
+ * do not know the freespace information.
+ */
+ if (xfs_bmap_is_written_extent(&map) ||
+ map.br_startblock == HOLESTARTBLOCK) {
+ off = map.br_startoff + map.br_blockcount;
+ continue;
+ }
+
+ /*
+ * If we find a delalloc reservation then something is very
+ * very wrong. Bail out.
+ */
+ if (map.br_startblock == DELAYSTARTBLOCK)
+ return -EFSCORRUPTED;
+
+ /* Make sure we're really converting an unwritten extent. */
+ if (map.br_state != XFS_EXT_UNWRITTEN) {
+ ASSERT(map.br_state == XFS_EXT_UNWRITTEN);
+ return -EFSCORRUPTED;
+ }
+
+ /* Make sure this block has a real zeroed extent mapped. */
+ nmaps = 1;
+ error = xfs_bmapi_write(sc->tp, sc->ip, map.br_startoff,
+ map.br_blockcount,
+ XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO,
+ 0, &map, &nmaps);
+ if (error)
+ return error;
+ if (nmaps != 1)
+ return -EFSCORRUPTED;
+
+ /* Commit new extent and all deferred work. */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+
+ off = map.br_startoff + map.br_blockcount;
+ }
+
+ return 0;
+}
+
+/* Fix broken rt volume geometry. */
+STATIC int
+xrep_rtbitmap_geometry(
+ struct xfs_scrub *sc,
+ struct xchk_rtbitmap *rtb)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+
+ /* Superblock fields */
+ if (mp->m_sb.sb_rextents != rtb->rextents)
+ xfs_trans_mod_sb(sc->tp, XFS_TRANS_SB_REXTENTS,
+ rtb->rextents - mp->m_sb.sb_rextents);
+
+ if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+ rtb->rbmblocks - mp->m_sb.sb_rbmblocks);
+
+ if (mp->m_sb.sb_rextslog != rtb->rextslog)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+ rtb->rextslog - mp->m_sb.sb_rextslog);
+
+ /* Fix broken isize */
+ sc->ip->i_disk_size = roundup_64(sc->ip->i_disk_size,
+ mp->m_sb.sb_blocksize);
+
+ if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks))
+ sc->ip->i_disk_size = XFS_FSB_TO_B(mp, rtb->rbmblocks);
+
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ return xrep_roll_trans(sc);
+}
+
+/* Repair the realtime bitmap file metadata. */
+int
+xrep_rtbitmap(
+ struct xfs_scrub *sc)
+{
+ struct xchk_rtbitmap *rtb = sc->buf;
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks = 0;
+ int error;
+
+ /* Impossibly large rtbitmap means we can't touch the filesystem. */
+ if (rtb->rbmblocks > U32_MAX)
+ return 0;
+
+ /*
+ * If the size of the rt bitmap file is larger than what we reserved,
+ * figure out if we need to adjust the block reservation in the
+ * transaction.
+ */
+ blocks = xfs_bmbt_calc_size(mp, rtb->rbmblocks);
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+ if (blocks > rtb->resblks) {
+ error = xfs_trans_reserve_more(sc->tp, blocks, 0);
+ if (error)
+ return error;
+
+ rtb->resblks += blocks;
+ }
+
+ /* Fix inode core and forks. */
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ return error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Ensure no unwritten extents. */
+ error = xrep_rtbitmap_data_mappings(sc, rtb->rbmblocks);
+ if (error)
+ return error;
+
+ /* Fix inconsistent bitmap geometry */
+ return xrep_rtbitmap_geometry(sc, rtb);
+}
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 437ed9acbb27..5055092bd9e8 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -13,9 +13,10 @@
#include "xfs_inode.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
+#include "xfs_sb.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -31,6 +32,18 @@
* (potentially large) amount of data in pageable memory.
*/
+struct xchk_rtsummary {
+ struct xfs_rtalloc_args args;
+
+ uint64_t rextents;
+ uint64_t rbmblocks;
+ uint64_t rsumsize;
+ unsigned int rsumlevels;
+
+ /* Memory buffer for the summary comparison. */
+ union xfs_suminfo_raw words[];
+};
+
/* Set us up to check the rtsummary file. */
int
xchk_setup_rtsummary(
@@ -38,8 +51,15 @@ xchk_setup_rtsummary(
{
struct xfs_mount *mp = sc->mp;
char *descr;
+ struct xchk_rtsummary *rts;
int error;
+ rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize),
+ XCHK_GFP_FLAGS);
+ if (!rts)
+ return -ENOMEM;
+ sc->buf = rts;
+
/*
* Create an xfile to construct a new rtsummary file. The xfile allows
* us to avoid pinning kernel memory for this purpose.
@@ -54,15 +74,14 @@ xchk_setup_rtsummary(
if (error)
return error;
- /* Allocate a memory buffer for the summary comparison. */
- sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS);
- if (!sc->buf)
- return -ENOMEM;
-
error = xchk_install_live_inode(sc, mp->m_rsumip);
if (error)
return error;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ return error;
+
/*
* Locking order requires us to take the rtbitmap first. We must be
* careful to unlock it ourselves when we are done with the rtbitmap
@@ -71,44 +90,71 @@ xchk_setup_rtsummary(
*/
xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+
+ /*
+ * Now that we've locked the rtbitmap and rtsummary, we can't race with
+ * growfsrt trying to expand the summary or change the size of the rt
+ * volume. Hence it is safe to compute and check the geometry values.
+ */
+ if (mp->m_sb.sb_rblocks) {
+ xfs_filblks_t rsumblocks;
+ int rextslog;
+
+ rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
+ rextslog = xfs_compute_rextslog(rts->rextents);
+ rts->rsumlevels = rextslog + 1;
+ rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents);
+ rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels,
+ rts->rbmblocks);
+ rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks);
+ }
return 0;
}
/* Helper functions to record suminfo words in an xfile. */
-typedef unsigned int xchk_rtsumoff_t;
-
static inline int
xfsum_load(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- xfs_suminfo_t *info)
+ xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo)
{
- return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t),
+ return xfile_load(sc->xfile, rawinfo,
+ sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
static inline int
xfsum_store(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- const xfs_suminfo_t info)
+ xfs_rtsumoff_t sumoff,
+ const union xfs_suminfo_raw rawinfo)
{
- return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t),
+ return xfile_store(sc->xfile, &rawinfo,
+ sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
static inline int
xfsum_copyout(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- xfs_suminfo_t *info,
+ xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo,
unsigned int nr_words)
{
- return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG,
+ return xfile_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
sumoff << XFS_WORDLOG);
}
+static inline xfs_suminfo_t
+xchk_rtsum_inc(
+ struct xfs_mount *mp,
+ union xfs_suminfo_raw *v)
+{
+ v->old += 1;
+ return v->old;
+}
+
/* Update the summary file to reflect the free extent that we've accumulated. */
STATIC int
xchk_rtsum_record_free(
@@ -121,23 +167,24 @@ xchk_rtsum_record_free(
xfs_fileoff_t rbmoff;
xfs_rtblock_t rtbno;
xfs_filblks_t rtlen;
- xchk_rtsumoff_t offs;
+ xfs_rtsumoff_t offs;
unsigned int lenlog;
- xfs_suminfo_t v = 0;
+ union xfs_suminfo_raw v;
+ xfs_suminfo_t value;
int error = 0;
if (xchk_should_terminate(sc, &error))
return error;
/* Compute the relevant location in the rtsum file. */
- rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext);
- lenlog = XFS_RTBLOCKLOG(rec->ar_extcount);
- offs = XFS_SUMOFFS(mp, lenlog, rbmoff);
+ rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext);
+ lenlog = xfs_highbit64(rec->ar_extcount);
+ offs = xfs_rtsumoffs(mp, lenlog, rbmoff);
- rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
- rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
+ rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);
- if (!xfs_verify_rtext(mp, rtbno, rtlen)) {
+ if (!xfs_verify_rtbext(mp, rtbno, rtlen)) {
xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
return -EFSCORRUPTED;
}
@@ -147,9 +194,9 @@ xchk_rtsum_record_free(
if (error)
return error;
- v++;
+ value = xchk_rtsum_inc(sc->mp, &v);
trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount,
- lenlog, offs, v);
+ lenlog, offs, value);
return xfsum_store(sc, offs, v);
}
@@ -160,12 +207,11 @@ xchk_rtsum_compute(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
- unsigned long long rtbmp_bytes;
+ unsigned long long rtbmp_blocks;
/* If the bitmap size doesn't match the computed size, bail. */
- rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY);
- if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) !=
- mp->m_rbmip->i_disk_size)
+ rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents);
+ if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size)
return -EFSCORRUPTED;
return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free,
@@ -177,15 +223,29 @@ STATIC int
xchk_rtsum_compare(
struct xfs_scrub *sc)
{
- struct xfs_mount *mp = sc->mp;
- struct xfs_buf *bp;
struct xfs_bmbt_irec map;
- xfs_fileoff_t off;
- xchk_rtsumoff_t sumoff = 0;
- int nmap;
+ struct xfs_iext_cursor icur;
+
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct xchk_rtsummary *rts = sc->buf;
+ xfs_fileoff_t off = 0;
+ xfs_fileoff_t endoff;
+ xfs_rtsumoff_t sumoff = 0;
+ int error = 0;
+
+ rts->args.mp = sc->mp;
+ rts->args.tp = sc->tp;
- for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) {
- int error = 0;
+ /* Mappings may not cross or lie beyond EOF. */
+ endoff = XFS_B_TO_FSB(mp, ip->i_disk_size);
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff);
+ return 0;
+ }
+
+ while (off < endoff) {
+ int nmap = 1;
if (xchk_should_terminate(sc, &error))
return error;
@@ -193,8 +253,7 @@ xchk_rtsum_compare(
return 0;
/* Make sure we have a written extent. */
- nmap = 1;
- error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap,
+ error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap,
XFS_DATA_FORK);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
return error;
@@ -204,23 +263,33 @@ xchk_rtsum_compare(
return 0;
}
+ off += map.br_blockcount;
+ }
+
+ for (off = 0; off < endoff; off++) {
+ union xfs_suminfo_raw *ondisk_info;
+
/* Read a block's worth of ondisk rtsummary file. */
- error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp);
+ error = xfs_rtsummary_read_buf(&rts->args, off);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
return error;
/* Read a block's worth of computed rtsummary file. */
- error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize);
+ error = xfsum_copyout(sc, sumoff, rts->words, mp->m_blockwsize);
if (error) {
- xfs_trans_brelse(sc->tp, bp);
+ xfs_rtbuf_cache_relse(&rts->args);
return error;
}
- if (memcmp(bp->b_addr, sc->buf,
- mp->m_blockwsize << XFS_WORDLOG) != 0)
+ ondisk_info = xfs_rsumblock_infoptr(&rts->args, 0);
+ if (memcmp(ondisk_info, rts->words,
+ mp->m_blockwsize << XFS_WORDLOG) != 0) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ xfs_rtbuf_cache_relse(&rts->args);
+ return error;
+ }
- xfs_trans_brelse(sc->tp, bp);
+ xfs_rtbuf_cache_relse(&rts->args);
sumoff += mp->m_blockwsize;
}
@@ -233,8 +302,43 @@ xchk_rtsummary(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
+ struct xchk_rtsummary *rts = sc->buf;
int error = 0;
+ /* Is sb_rextents correct? */
+ if (mp->m_sb.sb_rextents != rts->rextents) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ goto out_rbm;
+ }
+
+ /* Is m_rsumlevels correct? */
+ if (mp->m_rsumlevels != rts->rsumlevels) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
+ /* Is m_rsumsize correct? */
+ if (mp->m_rsumsize != rts->rsumsize) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
+ /* The summary file length must be aligned to an fsblock. */
+ if (mp->m_rsumip->i_disk_size & mp->m_blockmask) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
+ /*
+ * Is the summary file itself large enough to handle the rt volume?
+ * growfsrt expands the summary file before updating sb_rextents, so
+ * the file can be larger than rsumsize.
+ */
+ if (mp->m_rsumip->i_disk_size < rts->rsumsize) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
/* Invoke the fork scrubber. */
error = xchk_metadata_inode_forks(sc);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 4849efcaa33a..20fac9723c08 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -14,9 +14,9 @@
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
-#include "xfs_errortag.h"
-#include "xfs_error.h"
#include "xfs_scrub.h"
+#include "xfs_buf_mem.h"
+#include "xfs_rmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -159,6 +159,15 @@ xchk_fsgates_disable(
if (sc->flags & XCHK_FSGATES_DRAIN)
xfs_drain_wait_disable();
+ if (sc->flags & XCHK_FSGATES_QUOTA)
+ xfs_dqtrx_hook_disable();
+
+ if (sc->flags & XCHK_FSGATES_DIRENTS)
+ xfs_dir_hook_disable();
+
+ if (sc->flags & XCHK_FSGATES_RMAP)
+ xfs_rmap_hook_disable();
+
sc->flags &= ~XCHK_FSGATES_ALL;
}
@@ -186,6 +195,10 @@ xchk_teardown(
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
mnt_drop_write_file(sc->file);
}
+ if (sc->xmbtp) {
+ xmbuf_free(sc->xmbtp);
+ sc->xmbtp = NULL;
+ }
if (sc->xfile) {
xfile_destroy(sc->xfile);
sc->xfile = NULL;
@@ -238,65 +251,69 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
[XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_allocbt,
- .scrub = xchk_bnobt,
- .repair = xrep_notsupported,
+ .scrub = xchk_allocbt,
+ .repair = xrep_allocbt,
+ .repair_eval = xrep_revalidate_allocbt,
},
[XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_allocbt,
- .scrub = xchk_cntbt,
- .repair = xrep_notsupported,
+ .scrub = xchk_allocbt,
+ .repair = xrep_allocbt,
+ .repair_eval = xrep_revalidate_allocbt,
},
[XFS_SCRUB_TYPE_INOBT] = { /* inobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_iallocbt,
- .scrub = xchk_inobt,
- .repair = xrep_notsupported,
+ .scrub = xchk_iallocbt,
+ .repair = xrep_iallocbt,
+ .repair_eval = xrep_revalidate_iallocbt,
},
[XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_iallocbt,
- .scrub = xchk_finobt,
+ .scrub = xchk_iallocbt,
.has = xfs_has_finobt,
- .repair = xrep_notsupported,
+ .repair = xrep_iallocbt,
+ .repair_eval = xrep_revalidate_iallocbt,
},
[XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_rmapbt,
.scrub = xchk_rmapbt,
.has = xfs_has_rmapbt,
- .repair = xrep_notsupported,
+ .repair = xrep_rmapbt,
},
[XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_refcountbt,
.scrub = xchk_refcountbt,
.has = xfs_has_reflink,
- .repair = xrep_notsupported,
+ .repair = xrep_refcountbt,
},
[XFS_SCRUB_TYPE_INODE] = { /* inode record */
.type = ST_INODE,
.setup = xchk_setup_inode,
.scrub = xchk_inode,
- .repair = xrep_notsupported,
+ .repair = xrep_inode,
},
[XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */
.type = ST_INODE,
.setup = xchk_setup_inode_bmap,
.scrub = xchk_bmap_data,
- .repair = xrep_notsupported,
+ .repair = xrep_bmap_data,
},
[XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */
.type = ST_INODE,
.setup = xchk_setup_inode_bmap,
.scrub = xchk_bmap_attr,
- .repair = xrep_notsupported,
+ .repair = xrep_bmap_attr,
},
[XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */
.type = ST_INODE,
.setup = xchk_setup_inode_bmap,
.scrub = xchk_bmap_cow,
- .repair = xrep_notsupported,
+ .repair = xrep_bmap_cow,
},
[XFS_SCRUB_TYPE_DIR] = { /* directory */
.type = ST_INODE,
@@ -326,39 +343,55 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
- .has = xfs_has_realtime,
- .repair = xrep_notsupported,
+ .repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_FS,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
- .has = xfs_has_realtime,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
.type = ST_FS,
.setup = xchk_setup_quota,
.scrub = xchk_quota,
- .repair = xrep_notsupported,
+ .repair = xrep_quota,
},
[XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */
.type = ST_FS,
.setup = xchk_setup_quota,
.scrub = xchk_quota,
- .repair = xrep_notsupported,
+ .repair = xrep_quota,
},
[XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */
.type = ST_FS,
.setup = xchk_setup_quota,
.scrub = xchk_quota,
- .repair = xrep_notsupported,
+ .repair = xrep_quota,
},
[XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */
.type = ST_FS,
.setup = xchk_setup_fscounters,
.scrub = xchk_fscounters,
- .repair = xrep_notsupported,
+ .repair = xrep_fscounters,
+ },
+ [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */
+ .type = ST_FS,
+ .setup = xchk_setup_quotacheck,
+ .scrub = xchk_quotacheck,
+ .repair = xrep_quotacheck,
+ },
+ [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */
+ .type = ST_FS,
+ .setup = xchk_setup_nlinks,
+ .scrub = xchk_nlinks,
+ .repair = xrep_nlinks,
+ },
+ [XFS_SCRUB_TYPE_HEALTHY] = { /* fs healthy; clean all reminders */
+ .type = ST_FS,
+ .setup = xchk_setup_fs,
+ .scrub = xchk_health_record,
+ .repair = xrep_notsupported,
},
};
@@ -531,7 +564,10 @@ retry_op:
/* Scrub for errors. */
check_start = xchk_stats_now();
- error = sc->ops->scrub(sc);
+ if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
+ error = sc->ops->repair_eval(sc);
+ else
+ error = sc->ops->scrub(sc);
run.scrub_ns += xchk_stats_elapsed_ns(check_start);
if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
goto try_harder;
@@ -542,23 +578,12 @@ retry_op:
xchk_update_health(sc);
- if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
- !(sc->flags & XREP_ALREADY_FIXED)) {
- bool needs_fix = xchk_needs_repair(sc->sm);
-
- /* Userspace asked us to rebuild the structure regardless. */
- if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
- needs_fix = true;
-
- /* Let debug users force us into the repair routines. */
- if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
- needs_fix = true;
-
+ if (xchk_could_repair(sc)) {
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
*/
- if (!needs_fix) {
+ if (!xrep_will_attempt(sc)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
goto out_nofix;
}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 1ef9c6b4842a..9ad65b604fe1 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -35,6 +35,14 @@ struct xchk_meta_ops {
/* Repair or optimize the metadata. */
int (*repair)(struct xfs_scrub *);
+ /*
+ * Re-scrub the metadata we repaired, in case there's extra work that
+ * we need to do to check our repair work. If this is NULL, we'll use
+ * the ->scrub function pointer, assuming that the regular scrub is
+ * sufficient.
+ */
+ int (*repair_eval)(struct xfs_scrub *sc);
+
/* Decide if we even have this piece of metadata. */
bool (*has)(struct xfs_mount *);
@@ -91,6 +99,9 @@ struct xfs_scrub {
/* xfile used by the scrubbers; freed at teardown. */
struct xfile *xfile;
+ /* buffer target for in-memory btrees; also freed at teardown. */
+ struct xfs_buftarg *xmbtp;
+
/* Lock flags for @ip. */
uint ilock_flags;
@@ -113,6 +124,10 @@ struct xfs_scrub {
#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
+#define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */
+#define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */
+#define XCHK_FSGATES_RMAP (1U << 6) /* rmapbt live update enabled */
+#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
/*
@@ -121,7 +136,10 @@ struct xfs_scrub {
* features are gated off via dynamic code patching, which is why the state
* must be enabled during scrub setup and can only be torn down afterwards.
*/
-#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN)
+#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \
+ XCHK_FSGATES_QUOTA | \
+ XCHK_FSGATES_DIRENTS | \
+ XCHK_FSGATES_RMAP)
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
@@ -129,10 +147,8 @@ int xchk_superblock(struct xfs_scrub *sc);
int xchk_agf(struct xfs_scrub *sc);
int xchk_agfl(struct xfs_scrub *sc);
int xchk_agi(struct xfs_scrub *sc);
-int xchk_bnobt(struct xfs_scrub *sc);
-int xchk_cntbt(struct xfs_scrub *sc);
-int xchk_inobt(struct xfs_scrub *sc);
-int xchk_finobt(struct xfs_scrub *sc);
+int xchk_allocbt(struct xfs_scrub *sc);
+int xchk_iallocbt(struct xfs_scrub *sc);
int xchk_rmapbt(struct xfs_scrub *sc);
int xchk_refcountbt(struct xfs_scrub *sc);
int xchk_inode(struct xfs_scrub *sc);
@@ -160,14 +176,21 @@ xchk_rtsummary(struct xfs_scrub *sc)
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_quota(struct xfs_scrub *sc);
+int xchk_quotacheck(struct xfs_scrub *sc);
#else
static inline int
xchk_quota(struct xfs_scrub *sc)
{
return -ENOENT;
}
+static inline int
+xchk_quotacheck(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
#endif
int xchk_fscounters(struct xfs_scrub *sc);
+int xchk_nlinks(struct xfs_scrub *sc);
/* cross-referencing helpers */
void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index cd91db4a5548..42cafbed94ac 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -77,6 +77,8 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_GQUOTA] = "grpquota",
[XFS_SCRUB_TYPE_PQUOTA] = "prjquota",
[XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters",
+ [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck",
+ [XFS_SCRUB_TYPE_NLINKS] = "nlinks",
};
/* Format the scrub stats into a text buffer, similar to pcp style. */
@@ -329,9 +331,9 @@ xchk_stats_register(
if (!cs->cs_debugfs)
return;
- debugfs_create_file("stats", 0644, cs->cs_debugfs, cs,
+ debugfs_create_file("stats", 0444, cs->cs_debugfs, cs,
&scrub_stats_fops);
- debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs,
+ debugfs_create_file("clear_stats", 0200, cs->cs_debugfs, cs,
&clear_scrub_stats_fops);
}
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 38708fb9a5d7..d77d8a9598f6 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -12,8 +12,11 @@
#include "xfs_log_format.h"
#include "xfs_inode.h"
#include "xfs_symlink.h"
+#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
+#include "scrub/health.h"
/* Set us up to scrub a symbolic link. */
int
@@ -41,29 +44,37 @@ xchk_symlink(
if (!S_ISLNK(VFS_I(ip)->i_mode))
return -ENOENT;
+
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_SYMLINK_ZAPPED)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
len = ip->i_disk_size;
/* Plausible size? */
if (len > XFS_SYMLINK_MAXLEN || len <= 0) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out;
+ return 0;
}
/* Inline symlink? */
if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
if (len > xfs_inode_data_fork_size(ip) ||
- len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip)))
+ len > strnlen(ifp->if_data, xfs_inode_data_fork_size(ip)))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out;
+ return 0;
}
/* Remote symlink; must read the contents. */
- error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf);
+ error = xfs_symlink_remote_read(sc->ip, sc->buf);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
- goto out;
+ return error;
if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
-out:
- return error;
+
+ /* If a remote symlink is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED);
+ return 0;
}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 46249e7b17e0..3dd281d6d185 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -13,9 +13,19 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_quota.h"
+#include "xfs_quota_defs.h"
+#include "xfs_da_format.h"
+#include "xfs_dir2.h"
+#include "xfs_rmap.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
+#include "scrub/quota.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/fscounters.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
@@ -28,7 +38,7 @@ xchk_btree_cur_fsbno(
xfs_buf_daddr(cur->bc_levels[level].bp));
if (level == cur->bc_nlevels - 1 &&
- (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
return NULLFSBLOCK;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index cbd4d01e253c..5b294be52c55 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -15,10 +15,17 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+#include "xfs_quota_defs.h"
+struct xfs_scrub;
struct xfile;
struct xfarray;
struct xfarray_sortinfo;
+struct xchk_dqiter;
+struct xchk_iscan;
+struct xchk_nlink;
+struct xchk_fscounters;
+struct xfs_rmap_update_params;
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -26,14 +33,6 @@ struct xfarray_sortinfo;
* ring buffer. Somehow this was only worth mentioning in the ftrace sample
* code.
*/
-TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
-
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
@@ -62,6 +61,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -88,7 +90,10 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \
{ XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \
{ XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \
- { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }
+ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \
+ { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \
+ { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \
+ { XFS_SCRUB_TYPE_HEALTHY, "healthy" }
#define XFS_SCRUB_FLAG_STRINGS \
{ XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
@@ -106,8 +111,21 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
{ XCHK_NEED_DRAIN, "need_drain" }, \
+ { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \
+ { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \
+ { XCHK_FSGATES_RMAP, "fsgates_rmap" }, \
+ { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \
{ XREP_ALREADY_FIXED, "already_fixed" }
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC);
+TRACE_DEFINE_ENUM(XFS_RMAP_FREE);
+
DECLARE_EVENT_CLASS(xchk_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
int error),
@@ -347,6 +365,77 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error);
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning);
+#ifdef CONFIG_XFS_QUOTA
+DECLARE_EVENT_CLASS(xchk_dqiter_class,
+ TP_PROTO(struct xchk_dqiter *cursor, uint64_t id),
+ TP_ARGS(cursor, id),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, dqtype)
+ __field(xfs_ino_t, ino)
+ __field(unsigned long long, cur_id)
+ __field(unsigned long long, id)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_exntst_t, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev;
+ __entry->dqtype = cursor->dqtype;
+ __entry->ino = cursor->quota_ip->i_ino;
+ __entry->cur_id = cursor->id;
+ __entry->startoff = cursor->bmap.br_startoff;
+ __entry->startblock = cursor->bmap.br_startblock;
+ __entry->blockcount = cursor->bmap.br_blockcount;
+ __entry->state = cursor->bmap.br_state;
+ __entry->id = id;
+ ),
+ TP_printk("dev %d:%d dquot type %s ino 0x%llx cursor_id 0x%llx startoff 0x%llx startblock 0x%llx blockcount 0x%llx state %u id 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS),
+ __entry->ino,
+ __entry->cur_id,
+ __entry->startoff,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->state,
+ __entry->id)
+);
+
+#define DEFINE_SCRUB_DQITER_EVENT(name) \
+DEFINE_EVENT(xchk_dqiter_class, name, \
+ TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), \
+ TP_ARGS(cursor, id))
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap);
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap);
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore);
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter);
+
+TRACE_EVENT(xchk_qcheck_error,
+ TP_PROTO(struct xfs_scrub *sc, xfs_dqtype_t dqtype, xfs_dqid_t id,
+ void *ret_ip),
+ TP_ARGS(sc, dqtype, id, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, dqtype)
+ __field(xfs_dqid_t, id)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->dqtype = dqtype;
+ __entry->id = id;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d dquot type %s id 0x%x ret_ip %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS),
+ __entry->id,
+ __entry->ret_ip)
+);
+#endif /* CONFIG_XFS_QUOTA */
+
TRACE_EVENT(xchk_incomplete,
TP_PROTO(struct xfs_scrub *sc, void *ret_ip),
TP_ARGS(sc, ret_ip),
@@ -373,7 +462,7 @@ TRACE_EVENT(xchk_btree_op_error,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -386,7 +475,7 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -394,10 +483,10 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -415,7 +504,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__field(xfs_ino_t, ino)
__field(int, whichfork)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(int, ptr)
__field(xfs_agnumber_t, agno)
@@ -429,7 +518,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->ptr = cur->bc_levels[level].ptr;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
@@ -437,12 +526,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -458,7 +547,7 @@ TRACE_EVENT(xchk_btree_error,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -469,17 +558,17 @@ TRACE_EVENT(xchk_btree_error,
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+ TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -496,7 +585,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
__field(xfs_ino_t, ino)
__field(int, whichfork)
__field(unsigned int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
@@ -509,19 +598,19 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->ino = sc->ip->i_ino;
__entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -536,7 +625,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(int, type)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
__field(int, level)
@@ -548,17 +637,17 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_levels[level].ptr;
),
- TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
+ TP_printk("dev %d:%d type %s %sbt agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->agno,
__entry->bno,
__entry->level,
@@ -811,18 +900,11 @@ TRACE_EVENT(xfile_destroy,
__field(loff_t, size)
),
TP_fast_assign(
- struct xfile_stat statbuf;
- int ret;
+ struct inode *inode = file_inode(xf->file);
- ret = xfile_stat(xf, &statbuf);
- if (!ret) {
- __entry->bytes = statbuf.bytes;
- __entry->size = statbuf.size;
- } else {
- __entry->bytes = -1;
- __entry->size = -1;
- }
- __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->ino = inode->i_ino;
+ __entry->bytes = inode->i_blocks << SECTOR_SHIFT;
+ __entry->size = i_size_read(inode);
),
TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx",
__entry->ino,
@@ -841,19 +923,12 @@ DECLARE_EVENT_CLASS(xfile_class,
__field(unsigned long long, bytecount)
),
TP_fast_assign(
- struct xfile_stat statbuf;
- int ret;
+ struct inode *inode = file_inode(xf->file);
- ret = xfile_stat(xf, &statbuf);
- if (!ret) {
- __entry->bytes_used = statbuf.bytes;
- __entry->size = statbuf.size;
- } else {
- __entry->bytes_used = -1;
- __entry->size = -1;
- }
- __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->ino = inode->i_ino;
+ __entry->bytes_used = inode->i_blocks << SECTOR_SHIFT;
__entry->pos = pos;
+ __entry->size = i_size_read(inode);
__entry->bytecount = bytecount;
),
TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx",
@@ -867,11 +942,11 @@ DECLARE_EVENT_CLASS(xfile_class,
DEFINE_EVENT(xfile_class, name, \
TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \
TP_ARGS(xf, pos, bytecount))
-DEFINE_XFILE_EVENT(xfile_pread);
-DEFINE_XFILE_EVENT(xfile_pwrite);
+DEFINE_XFILE_EVENT(xfile_load);
+DEFINE_XFILE_EVENT(xfile_store);
DEFINE_XFILE_EVENT(xfile_seek_data);
-DEFINE_XFILE_EVENT(xfile_get_page);
-DEFINE_XFILE_EVENT(xfile_put_page);
+DEFINE_XFILE_EVENT(xfile_get_folio);
+DEFINE_XFILE_EVENT(xfile_put_folio);
TRACE_EVENT(xfarray_create,
TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity),
@@ -918,7 +993,7 @@ TRACE_EVENT(xfarray_isort,
__entry->hi - __entry->lo)
);
-TRACE_EVENT(xfarray_pagesort,
+TRACE_EVENT(xfarray_foliosort,
TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
TP_ARGS(si, lo, hi),
TP_STRUCT__entry(
@@ -989,6 +1064,47 @@ TRACE_EVENT(xfarray_sort,
__entry->bytes)
);
+TRACE_EVENT(xfarray_sort_scan,
+ TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx),
+ TP_ARGS(si, idx),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, nr)
+ __field(size_t, obj_size)
+ __field(unsigned long long, idx)
+ __field(unsigned long long, folio_pos)
+ __field(unsigned long, folio_bytes)
+ __field(unsigned long long, first_idx)
+ __field(unsigned long long, last_idx)
+ ),
+ TP_fast_assign(
+ __entry->nr = si->array->nr;
+ __entry->obj_size = si->array->obj_size;
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->idx = idx;
+ if (si->folio) {
+ __entry->folio_pos = folio_pos(si->folio);
+ __entry->folio_bytes = folio_size(si->folio);
+ __entry->first_idx = si->first_folio_idx;
+ __entry->last_idx = si->last_folio_idx;
+ } else {
+ __entry->folio_pos = 0;
+ __entry->folio_bytes = 0;
+ __entry->first_idx = 0;
+ __entry->last_idx = 0;
+ }
+ ),
+ TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu",
+ __entry->ino,
+ __entry->nr,
+ __entry->obj_size,
+ __entry->idx,
+ __entry->folio_pos,
+ __entry->folio_bytes,
+ __entry->first_idx,
+ __entry->last_idx)
+);
+
TRACE_EVENT(xfarray_sort_stats,
TP_PROTO(struct xfarray_sortinfo *si, int error),
TP_ARGS(si, error),
@@ -1036,17 +1152,18 @@ TRACE_EVENT(xfarray_sort_stats,
#ifdef CONFIG_XFS_RT
TRACE_EVENT(xchk_rtsum_record_free,
- TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start,
- uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v),
- TP_ARGS(mp, start, len, log, pos, v),
+ TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start,
+ xfs_rtbxlen_t len, unsigned int log, loff_t pos,
+ xfs_suminfo_t value),
+ TP_ARGS(mp, start, len, log, pos, value),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(dev_t, rtdev)
- __field(xfs_rtblock_t, start)
+ __field(xfs_rtxnum_t, start)
__field(unsigned long long, len)
__field(unsigned int, log)
__field(loff_t, pos)
- __field(xfs_suminfo_t, v)
+ __field(xfs_suminfo_t, value)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
@@ -1055,7 +1172,7 @@ TRACE_EVENT(xchk_rtsum_record_free,
__entry->len = len;
__entry->log = log;
__entry->pos = pos;
- __entry->v = v;
+ __entry->value = value;
),
TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1064,10 +1181,327 @@ TRACE_EVENT(xchk_rtsum_record_free,
__entry->len,
__entry->log,
__entry->pos,
- __entry->v)
+ __entry->value)
);
#endif /* CONFIG_XFS_RT */
+DECLARE_EVENT_CLASS(xchk_iscan_class,
+ TP_PROTO(struct xchk_iscan *iscan),
+ TP_ARGS(iscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited)
+)
+#define DEFINE_ISCAN_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan), \
+ TP_ARGS(iscan))
+DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor);
+DEFINE_ISCAN_EVENT(xchk_iscan_visit);
+DEFINE_ISCAN_EVENT(xchk_iscan_skip);
+DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag);
+
+DECLARE_EVENT_CLASS(xchk_iscan_ino_class,
+ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino),
+ TP_ARGS(iscan, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, startino)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->startino = iscan->scan_start_ino;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->startino,
+ __entry->cursor,
+ __entry->visited,
+ __entry->ino)
+)
+#define DEFINE_ISCAN_INO_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_ino_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \
+ TP_ARGS(iscan, ino))
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update);
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_start);
+
+TRACE_EVENT(xchk_iscan_iget,
+ TP_PROTO(struct xchk_iscan *iscan, int error),
+ TP_ARGS(iscan, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->error)
+);
+
+TRACE_EVENT(xchk_iscan_iget_batch,
+ TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan,
+ unsigned int nr, unsigned int avail),
+ TP_ARGS(mp, iscan, nr, avail),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(unsigned int, nr)
+ __field(unsigned int, avail)
+ __field(unsigned int, unavail)
+ __field(xfs_ino_t, batch_ino)
+ __field(unsigned long long, skipmask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->nr = nr;
+ __entry->avail = avail;
+ __entry->unavail = hweight64(iscan->__skipped_inomask);
+ __entry->batch_ino = iscan->__batch_ino;
+ __entry->skipmask = iscan->__skipped_inomask;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->batch_ino,
+ __entry->skipmask,
+ __entry->nr,
+ __entry->avail,
+ __entry->unavail)
+);
+
+TRACE_EVENT(xchk_iscan_iget_retry_wait,
+ TP_PROTO(struct xchk_iscan *iscan),
+ TP_ARGS(iscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(unsigned int, retry_delay)
+ __field(unsigned long, remaining)
+ __field(unsigned int, iget_timeout)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->retry_delay = iscan->iget_retry_delay;
+ __entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies);
+ __entry->iget_timeout = iscan->iget_timeout;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->remaining,
+ __entry->iget_timeout,
+ __entry->retry_delay)
+);
+
+TRACE_EVENT(xchk_nlinks_collect_dirent,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,
+ xfs_ino_t ino, const struct xfs_name *name),
+ TP_ARGS(mp, dp, ino, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp->i_ino;
+ __entry->ino = ino;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_collect_metafile,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino),
+ TP_ARGS(mp, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino)
+);
+
+TRACE_EVENT(xchk_nlinks_live_update,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp,
+ int action, xfs_ino_t ino, int delta,
+ const char *name, unsigned int namelen),
+ TP_ARGS(mp, dp, action, ino, delta, name, namelen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dir)
+ __field(int, action)
+ __field(xfs_ino_t, ino)
+ __field(int, delta)
+ __field(unsigned int, namelen)
+ __dynamic_array(char, name, namelen)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->dir = dp ? dp->i_ino : NULLFSINO;
+ __entry->action = action;
+ __entry->ino = ino;
+ __entry->delta = delta;
+ __entry->namelen = namelen;
+ memcpy(__get_str(name), name, namelen);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dir,
+ __entry->ino,
+ __entry->delta,
+ __entry->namelen,
+ __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_check_zero,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *live),
+ TP_ARGS(mp, ino, live),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+
+TRACE_EVENT(xchk_nlinks_update_incore,
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+ const struct xchk_nlink *live, int parents_delta,
+ int backrefs_delta, int children_delta),
+ TP_ARGS(mp, ino, live, parents_delta, backrefs_delta, children_delta),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ __field(int, parents_delta)
+ __field(int, backrefs_delta)
+ __field(int, children_delta)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ __entry->parents_delta = parents_delta;
+ __entry->backrefs_delta = backrefs_delta;
+ __entry->children_delta = children_delta;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parents %d:%u backrefs %d:%u children %d:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parents_delta,
+ __entry->parents,
+ __entry->backrefs_delta,
+ __entry->backrefs,
+ __entry->children_delta,
+ __entry->children)
+);
+
+DECLARE_EVENT_CLASS(xchk_nlinks_diff_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip,
+ const struct xchk_nlink *live),
+ TP_ARGS(mp, ip, live),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(uint8_t, ftype)
+ __field(xfs_nlink_t, nlink)
+ __field(xfs_nlink_t, parents)
+ __field(xfs_nlink_t, backrefs)
+ __field(xfs_nlink_t, children)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
+ __entry->nlink = VFS_I(ip)->i_nlink;
+ __entry->parents = live->parents;
+ __entry->backrefs = live->backrefs;
+ __entry->children = live->children;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx ftype %s nlink %u parents %u backrefs %u children %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __entry->nlink,
+ __entry->parents,
+ __entry->backrefs,
+ __entry->children)
+);
+#define DEFINE_SCRUB_NLINKS_DIFF_EVENT(name) \
+DEFINE_EVENT(xchk_nlinks_diff_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \
+ const struct xchk_nlink *live), \
+ TP_ARGS(mp, ip, live))
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode);
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
@@ -1171,37 +1605,156 @@ DEFINE_EVENT(xrep_rmap_class, name, \
xfs_agblock_t agbno, xfs_extlen_t len, \
uint64_t owner, uint64_t offset, unsigned int flags), \
TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
-DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn);
-DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn);
-DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
-DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap);
+DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap);
-TRACE_EVENT(xrep_refcount_extent_fn,
+TRACE_EVENT(xrep_abt_found,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_refcount_irec *irec),
- TP_ARGS(mp, agno, irec),
+ const struct xfs_alloc_rec_incore *rec),
+ TP_ARGS(mp, agno, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
- __field(xfs_nlink_t, refcount)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
- __entry->startblock = irec->rc_startblock;
- __entry->blockcount = irec->rc_blockcount;
- __entry->refcount = irec->rc_refcount;
+ __entry->startblock = rec->ar_startblock;
+ __entry->blockcount = rec->ar_blockcount;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
+ __entry->blockcount)
+)
+
+TRACE_EVENT(xrep_ibt_found,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ const struct xfs_inobt_rec_incore *rec),
+ TP_ARGS(mp, agno, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(uint16_t, holemask)
+ __field(uint8_t, count)
+ __field(uint8_t, freecount)
+ __field(uint64_t, freemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = rec->ir_startino;
+ __entry->holemask = rec->ir_holemask;
+ __entry->count = rec->ir_count;
+ __entry->freecount = rec->ir_freecount;
+ __entry->freemask = rec->ir_free;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->holemask,
+ __entry->count,
+ __entry->freecount,
+ __entry->freemask)
+)
+
+TRACE_EVENT(xrep_refc_found,
+ TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec),
+ TP_ARGS(pag, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->domain = rec->rc_domain;
+ __entry->startblock = rec->rc_startblock;
+ __entry->blockcount = rec->rc_blockcount;
+ __entry->refcount = rec->rc_refcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->startblock,
__entry->blockcount,
__entry->refcount)
)
+TRACE_EVENT(xrep_bmap_found,
+ TP_PROTO(struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, whichfork, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_filblks_t, len)
+ __field(xfs_fsblock_t, pblk)
+ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->lblk = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->pblk = irec->br_startblock;
+ __entry->state = irec->br_state;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->lblk,
+ __entry->len,
+ __entry->pblk,
+ __entry->state)
+);
+
+TRACE_EVENT(xrep_rmap_found,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ const struct xfs_rmap_irec *rec),
+ TP_ARGS(mp, agno, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = rec->rm_startblock;
+ __entry->len = rec->rm_blockcount;
+ __entry->owner = rec->rm_owner;
+ __entry->offset = rec->rm_offset;
+ __entry->flags = rec->rm_flags;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+
TRACE_EVENT(xrep_findroot_block,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
uint32_t magic, uint16_t level),
@@ -1286,51 +1839,440 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
__entry->refcbt_sz)
)
TRACE_EVENT(xrep_reset_counters,
- TP_PROTO(struct xfs_mount *mp),
- TP_ARGS(mp),
+ TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc),
+ TP_ARGS(mp, fsc),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(uint64_t, icount)
+ __field(uint64_t, ifree)
+ __field(uint64_t, fdblocks)
+ __field(uint64_t, frextents)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
+ __entry->icount = fsc->icount;
+ __entry->ifree = fsc->ifree;
+ __entry->fdblocks = fsc->fdblocks;
+ __entry->frextents = fsc->frextents;
),
- TP_printk("dev %d:%d",
- MAJOR(__entry->dev), MINOR(__entry->dev))
+ TP_printk("dev %d:%d icount %llu ifree %llu fdblocks %llu frextents %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->icount,
+ __entry->ifree,
+ __entry->fdblocks,
+ __entry->frextents)
)
-TRACE_EVENT(xrep_ialloc_insert,
+DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t startino, uint16_t holemask, uint8_t count,
- uint8_t freecount, uint64_t freemask),
- TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask),
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ int64_t owner),
+ TP_ARGS(mp, agno, agbno, len, owner),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
- __field(xfs_agino_t, startino)
- __field(uint16_t, holemask)
- __field(uint8_t, count)
- __field(uint8_t, freecount)
- __field(uint64_t, freemask)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(int64_t, owner)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
- __entry->startino = startino;
- __entry->holemask = holemask;
- __entry->count = count;
- __entry->freecount = freecount;
- __entry->freemask = freemask;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = owner;
),
- TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
- __entry->startino,
- __entry->holemask,
- __entry->count,
- __entry->freecount,
- __entry->freemask)
+ __entry->agbno,
+ __entry->len,
+ __entry->owner)
+);
+#define DEFINE_NEWBT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xrep_newbt_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, \
+ int64_t owner), \
+ TP_ARGS(mp, agno, agbno, len, owner))
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block);
+
+DECLARE_EVENT_CLASS(xrep_dinode_class,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip),
+ TP_ARGS(sc, dip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(uint16_t, mode)
+ __field(uint8_t, version)
+ __field(uint8_t, format)
+ __field(uint32_t, uid)
+ __field(uint32_t, gid)
+ __field(uint64_t, size)
+ __field(uint64_t, nblocks)
+ __field(uint32_t, extsize)
+ __field(uint32_t, nextents)
+ __field(uint16_t, anextents)
+ __field(uint8_t, forkoff)
+ __field(uint8_t, aformat)
+ __field(uint16_t, flags)
+ __field(uint32_t, gen)
+ __field(uint64_t, flags2)
+ __field(uint32_t, cowextsize)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->mode = be16_to_cpu(dip->di_mode);
+ __entry->version = dip->di_version;
+ __entry->format = dip->di_format;
+ __entry->uid = be32_to_cpu(dip->di_uid);
+ __entry->gid = be32_to_cpu(dip->di_gid);
+ __entry->size = be64_to_cpu(dip->di_size);
+ __entry->nblocks = be64_to_cpu(dip->di_nblocks);
+ __entry->extsize = be32_to_cpu(dip->di_extsize);
+ __entry->nextents = be32_to_cpu(dip->di_nextents);
+ __entry->anextents = be16_to_cpu(dip->di_anextents);
+ __entry->forkoff = dip->di_forkoff;
+ __entry->aformat = dip->di_aformat;
+ __entry->flags = be16_to_cpu(dip->di_flags);
+ __entry->gen = be32_to_cpu(dip->di_gen);
+ __entry->flags2 = be64_to_cpu(dip->di_flags2);
+ __entry->cowextsize = be32_to_cpu(dip->di_cowextsize);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->mode,
+ __entry->version,
+ __entry->format,
+ __entry->uid,
+ __entry->gid,
+ __entry->size,
+ __entry->nblocks,
+ __entry->extsize,
+ __entry->nextents,
+ __entry->anextents,
+ __entry->forkoff,
+ __entry->aformat,
+ __entry->flags,
+ __entry->gen,
+ __entry->flags2,
+ __entry->cowextsize)
+)
+
+#define DEFINE_REPAIR_DINODE_EVENT(name) \
+DEFINE_EVENT(xrep_dinode_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \
+ TP_ARGS(sc, dip))
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_forks);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dfork);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_afork);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_ensure_forkoff);
+
+DECLARE_EVENT_CLASS(xrep_inode_class,
+ TP_PROTO(struct xfs_scrub *sc),
+ TP_ARGS(sc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_rfsblock_t, nblocks)
+ __field(uint16_t, flags)
+ __field(uint64_t, flags2)
+ __field(uint32_t, nextents)
+ __field(uint8_t, format)
+ __field(uint32_t, anextents)
+ __field(uint8_t, aformat)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->size = sc->ip->i_disk_size;
+ __entry->nblocks = sc->ip->i_nblocks;
+ __entry->flags = sc->ip->i_diflags;
+ __entry->flags2 = sc->ip->i_diflags2;
+ __entry->nextents = sc->ip->i_df.if_nextents;
+ __entry->format = sc->ip->i_df.if_format;
+ __entry->anextents = sc->ip->i_af.if_nextents;
+ __entry->aformat = sc->ip->i_af.if_format;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->nblocks,
+ __entry->flags,
+ __entry->flags2,
+ __entry->nextents,
+ __entry->format,
+ __entry->anextents,
+ __entry->aformat)
)
+#define DEFINE_REPAIR_INODE_EVENT(name) \
+DEFINE_EVENT(xrep_inode_class, name, \
+ TP_PROTO(struct xfs_scrub *sc), \
+ TP_ARGS(sc))
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed);
+
+TRACE_EVENT(xrep_dinode_count_rmaps,
+ TP_PROTO(struct xfs_scrub *sc, xfs_rfsblock_t data_blocks,
+ xfs_rfsblock_t rt_blocks, xfs_rfsblock_t attr_blocks,
+ xfs_extnum_t data_extents, xfs_extnum_t rt_extents,
+ xfs_aextnum_t attr_extents),
+ TP_ARGS(sc, data_blocks, rt_blocks, attr_blocks, data_extents,
+ rt_extents, attr_extents),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_rfsblock_t, data_blocks)
+ __field(xfs_rfsblock_t, rt_blocks)
+ __field(xfs_rfsblock_t, attr_blocks)
+ __field(xfs_extnum_t, data_extents)
+ __field(xfs_extnum_t, rt_extents)
+ __field(xfs_aextnum_t, attr_extents)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->data_blocks = data_blocks;
+ __entry->rt_blocks = rt_blocks;
+ __entry->attr_blocks = attr_blocks;
+ __entry->data_extents = data_extents;
+ __entry->rt_extents = rt_extents;
+ __entry->attr_extents = attr_extents;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx dblocks 0x%llx rtblocks 0x%llx ablocks 0x%llx dextents %llu rtextents %llu aextents %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->data_blocks,
+ __entry->rt_blocks,
+ __entry->attr_blocks,
+ __entry->data_extents,
+ __entry->rt_extents,
+ __entry->attr_extents)
+);
+
+TRACE_EVENT(xrep_dinode_findmode_dirent,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+ unsigned int ftype),
+ TP_ARGS(sc, dp, ftype),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->ftype = ftype;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR))
+);
+
+TRACE_EVENT(xrep_dinode_findmode_dirent_inval,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+ unsigned int ftype, unsigned int found_ftype),
+ TP_ARGS(sc, dp, ftype, found_ftype),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, ftype)
+ __field(unsigned int, found_ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->ftype = ftype;
+ __entry->found_ftype = found_ftype;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR))
+);
+
+TRACE_EVENT(xrep_cow_mark_file_range,
+ TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock,
+ xfs_fileoff_t startoff, xfs_filblks_t blockcount),
+ TP_ARGS(ip, startblock, startoff, blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_filblks_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->startoff = startoff;
+ __entry->startblock = startblock;
+ __entry->blockcount = blockcount;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->startoff,
+ __entry->startblock,
+ __entry->blockcount)
+);
+
+TRACE_EVENT(xrep_cow_replace_mapping,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_bmbt_irec *irec,
+ xfs_fsblock_t new_startblock, xfs_extlen_t new_blockcount),
+ TP_ARGS(ip, irec, new_startblock, new_blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_exntst_t, state)
+ __field(xfs_fsblock_t, new_startblock)
+ __field(xfs_extlen_t, new_blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->startoff = irec->br_startoff;
+ __entry->startblock = irec->br_startblock;
+ __entry->blockcount = irec->br_blockcount;
+ __entry->state = irec->br_state;
+ __entry->new_startblock = new_startblock;
+ __entry->new_blockcount = new_blockcount;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx startoff 0x%llx startblock 0x%llx fsbcount 0x%llx state 0x%x new_startblock 0x%llx new_fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->startoff,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->state,
+ __entry->new_startblock,
+ __entry->new_blockcount)
+);
+
+TRACE_EVENT(xrep_cow_free_staging,
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno,
+ xfs_extlen_t blockcount),
+ TP_ARGS(pag, agbno, blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->agbno = agbno;
+ __entry->blockcount = blockcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->blockcount)
+);
+
+#ifdef CONFIG_XFS_QUOTA
+DECLARE_EVENT_CLASS(xrep_dquot_class,
+ TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id),
+ TP_ARGS(mp, type, id),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint8_t, type)
+ __field(uint32_t, id)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->id = id;
+ __entry->type = type;
+ ),
+ TP_printk("dev %d:%d type %s id 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __entry->id)
+);
+
+#define DEFINE_XREP_DQUOT_EVENT(name) \
+DEFINE_EVENT(xrep_dquot_class, name, \
+ TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), \
+ TP_ARGS(mp, type, id))
+DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item);
+DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot);
+DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole);
+DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot);
+#endif /* CONFIG_XFS_QUOTA */
+
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode);
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode);
+
+TRACE_EVENT(xrep_rmap_live_update,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op,
+ const struct xfs_rmap_update_params *p),
+ TP_ARGS(mp, agno, op, p),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, op)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->op = op;
+ __entry->agbno = p->startblock;
+ __entry->len = p->blockcount;
+ xfs_owner_info_unpack(&p->oinfo, &__entry->owner,
+ &__entry->offset, &__entry->flags);
+ if (p->unwritten)
+ __entry->flags |= XFS_RMAP_UNWRITTEN;
+ ),
+ TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->op,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index f0f532c10a5a..17c982a4821d 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -16,7 +16,7 @@
* Large Arrays of Fixed-Size Records
* ==================================
*
- * This memory array uses an xfile (which itself is a memfd "file") to store
+ * This memory array uses an xfile (which itself is a shmem file) to store
* large numbers of fixed-size records in memory that can be paged out. This
* puts less stress on the memory reclaim algorithms during an online repair
* because we don't have to pin so much memory. However, array access is less
@@ -136,7 +136,7 @@ xfarray_load(
if (idx >= array->nr)
return -ENODATA;
- return xfile_obj_load(array->xfile, ptr, array->obj_size,
+ return xfile_load(array->xfile, ptr, array->obj_size,
xfarray_pos(array, idx));
}
@@ -152,7 +152,7 @@ xfarray_is_unset(
if (array->unset_slots == 0)
return false;
- error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
+ error = xfile_load(array->xfile, temp, array->obj_size, pos);
if (!error && xfarray_element_is_null(array, temp))
return true;
@@ -184,7 +184,7 @@ xfarray_unset(
return 0;
memset(temp, 0, array->obj_size);
- error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
+ error = xfile_store(array->xfile, temp, array->obj_size, pos);
if (error)
return error;
@@ -209,7 +209,7 @@ xfarray_store(
ASSERT(!xfarray_element_is_null(array, ptr));
- ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ ret = xfile_store(array->xfile, ptr, array->obj_size,
xfarray_pos(array, idx));
if (ret)
return ret;
@@ -245,12 +245,12 @@ xfarray_store_anywhere(
for (pos = 0;
pos < endpos && array->unset_slots > 0;
pos += array->obj_size) {
- error = xfile_obj_load(array->xfile, temp, array->obj_size,
+ error = xfile_load(array->xfile, temp, array->obj_size,
pos);
if (error || !xfarray_element_is_null(array, temp))
continue;
- error = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ error = xfile_store(array->xfile, ptr, array->obj_size,
pos);
if (error)
return error;
@@ -552,7 +552,7 @@ xfarray_isort(
trace_xfarray_isort(si, lo, hi);
xfarray_sort_bump_loads(si);
- error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
+ error = xfile_load(si->array->xfile, scratch, len, lo_pos);
if (error)
return error;
@@ -560,88 +560,45 @@ xfarray_isort(
sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
xfarray_sort_bump_stores(si);
- return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
+ return xfile_store(si->array->xfile, scratch, len, lo_pos);
}
-/* Grab a page for sorting records. */
-static inline int
-xfarray_sort_get_page(
- struct xfarray_sortinfo *si,
- loff_t pos,
- uint64_t len)
-{
- int error;
-
- error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
- if (error)
- return error;
-
- /*
- * xfile pages must never be mapped into userspace, so we skip the
- * dcache flush when mapping the page.
- */
- si->page_kaddr = kmap_local_page(si->xfpage.page);
- return 0;
-}
-
-/* Release a page we grabbed for sorting records. */
-static inline int
-xfarray_sort_put_page(
- struct xfarray_sortinfo *si)
-{
- if (!si->page_kaddr)
- return 0;
-
- kunmap_local(si->page_kaddr);
- si->page_kaddr = NULL;
-
- return xfile_put_page(si->array->xfile, &si->xfpage);
-}
-
-/* Decide if these records are eligible for in-page sorting. */
-static inline bool
-xfarray_want_pagesort(
- struct xfarray_sortinfo *si,
- xfarray_idx_t lo,
- xfarray_idx_t hi)
-{
- pgoff_t lo_page;
- pgoff_t hi_page;
- loff_t end_pos;
-
- /* We can only map one page at a time. */
- lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
- end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
- hi_page = end_pos >> PAGE_SHIFT;
-
- return lo_page == hi_page;
-}
-
-/* Sort a bunch of records that all live in the same memory page. */
+/*
+ * Sort the records from lo to hi (inclusive) if they are all backed by the
+ * same memory folio. Returns 1 if it sorted, 0 if it did not, or a negative
+ * errno.
+ */
STATIC int
-xfarray_pagesort(
+xfarray_foliosort(
struct xfarray_sortinfo *si,
xfarray_idx_t lo,
xfarray_idx_t hi)
{
+ struct folio *folio;
void *startp;
loff_t lo_pos = xfarray_pos(si->array, lo);
- uint64_t len = xfarray_pos(si->array, hi - lo);
- int error = 0;
+ uint64_t len = xfarray_pos(si->array, hi - lo + 1);
- trace_xfarray_pagesort(si, lo, hi);
+ /* No single folio could back this many records. */
+ if (len > XFILE_MAX_FOLIO_SIZE)
+ return 0;
xfarray_sort_bump_loads(si);
- error = xfarray_sort_get_page(si, lo_pos, len);
- if (error)
- return error;
+ folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ if (!folio)
+ return 0;
+
+ trace_xfarray_foliosort(si, lo, hi);
xfarray_sort_bump_heapsorts(si);
- startp = si->page_kaddr + offset_in_page(lo_pos);
+ startp = folio_address(folio) + offset_in_folio(folio, lo_pos);
sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
xfarray_sort_bump_stores(si);
- return xfarray_sort_put_page(si);
+ xfile_put_folio(si->array->xfile, folio);
+ return 1;
}
/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
@@ -829,63 +786,78 @@ xfarray_qsort_push(
return 0;
}
+static inline void
+xfarray_sort_scan_done(
+ struct xfarray_sortinfo *si)
+{
+ if (si->folio)
+ xfile_put_folio(si->array->xfile, si->folio);
+ si->folio = NULL;
+}
+
/*
- * Load an element from the array into the first scratchpad and cache the page,
- * if possible.
+ * Cache the folio backing the start of the given array element. If the array
+ * element is contained entirely within the folio, return a pointer to the
+ * cached folio. Otherwise, load the element into the scratchpad and return a
+ * pointer to the scratchpad.
*/
static inline int
-xfarray_sort_load_cached(
+xfarray_sort_scan(
struct xfarray_sortinfo *si,
xfarray_idx_t idx,
- void *ptr)
+ void **ptrp)
{
loff_t idx_pos = xfarray_pos(si->array, idx);
- pgoff_t startpage;
- pgoff_t endpage;
int error = 0;
- /*
- * If this load would split a page, release the cached page, if any,
- * and perform a traditional read.
- */
- startpage = idx_pos >> PAGE_SHIFT;
- endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
- if (startpage != endpage) {
- error = xfarray_sort_put_page(si);
- if (error)
- return error;
+ if (xfarray_sort_terminated(si, &error))
+ return error;
- if (xfarray_sort_terminated(si, &error))
- return error;
+ trace_xfarray_sort_scan(si, idx);
- return xfile_obj_load(si->array->xfile, ptr,
- si->array->obj_size, idx_pos);
- }
+ /* If the cached folio doesn't cover this index, release it. */
+ if (si->folio &&
+ (idx < si->first_folio_idx || idx > si->last_folio_idx))
+ xfarray_sort_scan_done(si);
- /* If the cached page is not the one we want, release it. */
- if (xfile_page_cached(&si->xfpage) &&
- xfile_page_index(&si->xfpage) != startpage) {
- error = xfarray_sort_put_page(si);
- if (error)
- return error;
+ /* Grab the first folio that backs this array element. */
+ if (!si->folio) {
+ loff_t next_pos;
+
+ si->folio = xfile_get_folio(si->array->xfile, idx_pos,
+ si->array->obj_size, XFILE_ALLOC);
+ if (IS_ERR(si->folio))
+ return PTR_ERR(si->folio);
+
+ si->first_folio_idx = xfarray_idx(si->array,
+ folio_pos(si->folio) + si->array->obj_size - 1);
+
+ next_pos = folio_pos(si->folio) + folio_size(si->folio);
+ si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
+ if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
+ si->last_folio_idx--;
+
+ trace_xfarray_sort_scan(si, idx);
}
/*
- * If we don't have a cached page (and we know the load is contained
- * in a single page) then grab it.
+ * If this folio still doesn't cover the desired element, it must cross
+ * a folio boundary. Read into the scratchpad and we're done.
*/
- if (!xfile_page_cached(&si->xfpage)) {
- if (xfarray_sort_terminated(si, &error))
- return error;
+ if (idx < si->first_folio_idx || idx > si->last_folio_idx) {
+ void *temp = xfarray_scratch(si->array);
- error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
- PAGE_SIZE);
+ error = xfile_load(si->array->xfile, temp, si->array->obj_size,
+ idx_pos);
if (error)
return error;
+
+ *ptrp = temp;
+ return 0;
}
- memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
- si->array->obj_size);
+ /* Otherwise return a pointer to the array element in the folio. */
+ *ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos);
return 0;
}
@@ -952,6 +924,8 @@ xfarray_sort(
pivot = xfarray_sortinfo_pivot(si);
while (si->stack_depth >= 0) {
+ int ret;
+
lo = si_lo[si->stack_depth];
hi = si_hi[si->stack_depth];
@@ -964,13 +938,13 @@ xfarray_sort(
}
/*
- * If directly mapping the page and sorting can solve our
+ * If directly mapping the folio and sorting can solve our
* problems, we're done.
*/
- if (xfarray_want_pagesort(si, lo, hi)) {
- error = xfarray_pagesort(si, lo, hi);
- if (error)
- goto out_free;
+ ret = xfarray_foliosort(si, lo, hi);
+ if (ret < 0)
+ goto out_free;
+ if (ret == 1) {
si->stack_depth--;
continue;
}
@@ -995,25 +969,24 @@ xfarray_sort(
* than the pivot is on the right side of the range.
*/
while (lo < hi) {
+ void *p;
+
/*
* Decrement hi until it finds an a[hi] less than the
* pivot value.
*/
- error = xfarray_sort_load_cached(si, hi, scratch);
+ error = xfarray_sort_scan(si, hi, &p);
if (error)
goto out_free;
- while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
- lo < hi) {
+ while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) {
hi--;
- error = xfarray_sort_load_cached(si, hi,
- scratch);
+ error = xfarray_sort_scan(si, hi, &p);
if (error)
goto out_free;
}
- error = xfarray_sort_put_page(si);
- if (error)
- goto out_free;
-
+ if (p != scratch)
+ memcpy(scratch, p, si->array->obj_size);
+ xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error))
goto out_free;
@@ -1028,21 +1001,18 @@ xfarray_sort(
* Increment lo until it finds an a[lo] greater than
* the pivot value.
*/
- error = xfarray_sort_load_cached(si, lo, scratch);
+ error = xfarray_sort_scan(si, lo, &p);
if (error)
goto out_free;
- while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
- lo < hi) {
+ while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) {
lo++;
- error = xfarray_sort_load_cached(si, lo,
- scratch);
+ error = xfarray_sort_scan(si, lo, &p);
if (error)
goto out_free;
}
- error = xfarray_sort_put_page(si);
- if (error)
- goto out_free;
-
+ if (p != scratch)
+ memcpy(scratch, p, si->array->obj_size);
+ xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error))
goto out_free;
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index 4ecac01363d9..acb2f94c56c1 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -45,6 +45,25 @@ int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
+/*
+ * Load an array element, but zero the buffer if there's no data because we
+ * haven't stored to that array element yet.
+ */
+static inline int
+xfarray_load_sparse(
+ struct xfarray *array,
+ uint64_t idx,
+ void *rec)
+{
+ int error = xfarray_load(array, idx, rec);
+
+ if (error == -ENODATA) {
+ memset(rec, 0, array->obj_size);
+ return 0;
+ }
+ return error;
+}
+
/* Append an element to the array. */
static inline int xfarray_append(struct xfarray *array, const void *ptr)
{
@@ -54,6 +73,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr)
uint64_t xfarray_length(struct xfarray *array);
int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec);
+/*
+ * Iterate the non-null elements in a sparse xfarray. Callers should
+ * initialize *idx to XFARRAY_CURSOR_INIT before the first call; on return, it
+ * will be set to one more than the index of the record that was retrieved.
+ * Returns 1 if a record was retrieved, 0 if there weren't any more records, or
+ * a negative errno.
+ */
+static inline int
+xfarray_iter(
+ struct xfarray *array,
+ xfarray_idx_t *idx,
+ void *rec)
+{
+ int ret = xfarray_load_next(array, idx, rec);
+
+ if (ret == -ENODATA)
+ return 0;
+ if (ret == 0)
+ return 1;
+ return ret;
+}
+
/* Declarations for xfile array sort functionality. */
typedef cmp_func_t xfarray_cmp_fn;
@@ -83,9 +124,14 @@ struct xfarray_sortinfo {
/* XFARRAY_SORT_* flags; see below. */
unsigned int flags;
- /* Cache a page here for faster access. */
- struct xfile_page xfpage;
- void *page_kaddr;
+ /* Cache a folio here for faster scanning for pivots */
+ struct folio *folio;
+
+ /* First array index in folio that is completely readable */
+ xfarray_idx_t first_folio_idx;
+
+ /* Last array index in folio that is completely readable */
+ xfarray_idx_t last_folio_idx;
#ifdef DEBUG
/* Performance statistics. */
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 090c3ead43fd..8cdd863db585 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -34,13 +34,6 @@
* xfiles assume that the caller will handle all required concurrency
* management; standard vfs locks (freezer and inode) are not taken. Reads
* and writes are satisfied directly from the page cache.
- *
- * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
- * of a hole cause a page to be mapped into the file. If you are going to
- * create a sparse xfile, please be careful about reading from uninitialized
- * parts of the file. These pages are !Uptodate and will eventually be
- * reclaimed if not written, but in the short term this boosts memory
- * consumption.
*/
/*
@@ -62,38 +55,27 @@ xfile_create(
{
struct inode *inode;
struct xfile *xf;
- int error = -ENOMEM;
+ int error;
xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
if (!xf)
return -ENOMEM;
- xf->file = shmem_file_setup(description, isize, 0);
- if (!xf->file)
- goto out_xfile;
+ xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE);
if (IS_ERR(xf->file)) {
error = PTR_ERR(xf->file);
goto out_xfile;
}
- /*
- * We want a large sparse file that we can pread, pwrite, and seek.
- * xfile users are responsible for keeping the xfile hidden away from
- * all other callers, so we skip timestamp updates and security checks.
- * Make the inode only accessible by root, just in case the xfile ever
- * escapes.
- */
- xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
- FMODE_LSEEK;
- xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
inode = file_inode(xf->file);
- inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
- inode->i_mode &= ~0177;
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
-
lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);
+ /*
+ * We don't want to bother with kmapping data during repair, so don't
+ * allow highmem pages to back this mapping.
+ */
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+
trace_xfile_create(xf);
*xfilep = xf;
@@ -118,164 +100,128 @@ xfile_destroy(
}
/*
- * Read a memory object directly from the xfile's page cache. Unlike regular
- * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
- * high an offset, instead of truncating the read. Otherwise, we return
- * bytes read or an error code, like regular pread.
+ * Load an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
*/
-ssize_t
-xfile_pread(
+int
+xfile_load(
struct xfile *xf,
void *buf,
size_t count,
loff_t pos)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- struct page *page = NULL;
- ssize_t read = 0;
unsigned int pflags;
- int error = 0;
if (count > MAX_RW_COUNT)
- return -E2BIG;
+ return -ENOMEM;
if (inode->i_sb->s_maxbytes - pos < count)
- return -EFBIG;
+ return -ENOMEM;
- trace_xfile_pread(xf, pos, count);
+ trace_xfile_load(xf, pos, count);
pflags = memalloc_nofs_save();
while (count > 0) {
- void *p, *kaddr;
+ struct folio *folio;
unsigned int len;
+ unsigned int offset;
- len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
-
- /*
- * In-kernel reads of a shmem file cause it to allocate a page
- * if the mapping shows a hole. Therefore, if we hit ENOMEM
- * we can continue by zeroing the caller's buffer.
- */
- page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
- __GFP_NOWARN);
- if (IS_ERR(page)) {
- error = PTR_ERR(page);
- if (error != -ENOMEM)
- break;
-
- memset(buf, 0, len);
- goto advance;
- }
-
- if (PageUptodate(page)) {
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ SGP_READ) < 0)
+ break;
+ if (!folio) {
/*
- * xfile pages must never be mapped into userspace, so
- * we skip the dcache flush.
+ * No data stored at this offset, just zero the output
+ * buffer until the next page boundary.
*/
- kaddr = kmap_local_page(page);
- p = kaddr + offset_in_page(pos);
- memcpy(buf, p, len);
- kunmap_local(kaddr);
- } else {
+ len = min_t(ssize_t, count,
+ PAGE_SIZE - offset_in_page(pos));
memset(buf, 0, len);
- }
- put_page(page);
+ } else {
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ break;
+ }
+
+ offset = offset_in_folio(folio, pos);
+ len = min_t(ssize_t, count, folio_size(folio) - offset);
+ memcpy(buf, folio_address(folio) + offset, len);
-advance:
+ folio_unlock(folio);
+ folio_put(folio);
+ }
count -= len;
pos += len;
buf += len;
- read += len;
}
memalloc_nofs_restore(pflags);
- if (read > 0)
- return read;
- return error;
+ if (count)
+ return -ENOMEM;
+ return 0;
}
/*
- * Write a memory object directly to the xfile's page cache. Unlike regular
- * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
- * high an offset, instead of truncating the write. Otherwise, we return
- * bytes written or an error code, like regular pwrite.
+ * Store an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
*/
-ssize_t
-xfile_pwrite(
+int
+xfile_store(
struct xfile *xf,
const void *buf,
size_t count,
loff_t pos)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- struct page *page = NULL;
- ssize_t written = 0;
unsigned int pflags;
- int error = 0;
if (count > MAX_RW_COUNT)
- return -E2BIG;
+ return -ENOMEM;
if (inode->i_sb->s_maxbytes - pos < count)
- return -EFBIG;
+ return -ENOMEM;
- trace_xfile_pwrite(xf, pos, count);
+ trace_xfile_store(xf, pos, count);
+
+ /*
+ * Increase the file size first so that shmem_get_folio(..., SGP_CACHE),
+ * actually allocates a folio instead of erroring out.
+ */
+ if (pos + count > i_size_read(inode))
+ i_size_write(inode, pos + count);
pflags = memalloc_nofs_save();
while (count > 0) {
- void *fsdata = NULL;
- void *p, *kaddr;
+ struct folio *folio;
unsigned int len;
- int ret;
-
- len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
-
- /*
- * We call write_begin directly here to avoid all the freezer
- * protection lock-taking that happens in the normal path.
- * shmem doesn't support fs freeze, but lockdep doesn't know
- * that and will trip over that.
- */
- error = aops->write_begin(NULL, mapping, pos, len, &page,
- &fsdata);
- if (error)
- break;
+ unsigned int offset;
- /*
- * xfile pages must never be mapped into userspace, so we skip
- * the dcache flush. If the page is not uptodate, zero it
- * before writing data.
- */
- kaddr = kmap_local_page(page);
- if (!PageUptodate(page)) {
- memset(kaddr, 0, PAGE_SIZE);
- SetPageUptodate(page);
- }
- p = kaddr + offset_in_page(pos);
- memcpy(p, buf, len);
- kunmap_local(kaddr);
-
- ret = aops->write_end(NULL, mapping, pos, len, len, page,
- fsdata);
- if (ret < 0) {
- error = ret;
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ SGP_CACHE) < 0)
+ break;
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
- written += ret;
- if (ret != len)
- break;
+ offset = offset_in_folio(folio, pos);
+ len = min_t(ssize_t, count, folio_size(folio) - offset);
+ memcpy(folio_address(folio) + offset, buf, len);
+
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
- count -= ret;
- pos += ret;
- buf += ret;
+ count -= len;
+ pos += len;
+ buf += len;
}
memalloc_nofs_restore(pflags);
- if (written > 0)
- return written;
- return error;
+ if (count)
+ return -ENOMEM;
+ return 0;
}
/* Find the next written area in the xfile data for a given offset. */
@@ -291,129 +237,76 @@ xfile_seek_data(
return ret;
}
-/* Query stat information for an xfile. */
-int
-xfile_stat(
- struct xfile *xf,
- struct xfile_stat *statbuf)
-{
- struct kstat ks;
- int error;
-
- error = vfs_getattr_nosec(&xf->file->f_path, &ks,
- STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
- if (error)
- return error;
-
- statbuf->size = ks.size;
- statbuf->bytes = ks.blocks << SECTOR_SHIFT;
- return 0;
-}
-
/*
- * Grab the (locked) page for a memory object. The object cannot span a page
- * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we
- * cannot grab the page, or the usual negative errno.
+ * Grab the (locked) folio for a memory object. The object cannot span a folio
+ * boundary. Returns the locked folio if successful, NULL if there was no
+ * folio or it didn't cover the range requested, or an ERR_PTR on failure.
*/
-int
-xfile_get_page(
+struct folio *
+xfile_get_folio(
struct xfile *xf,
loff_t pos,
- unsigned int len,
- struct xfile_page *xfpage)
+ size_t len,
+ unsigned int flags)
{
struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- struct page *page = NULL;
- void *fsdata = NULL;
- loff_t key = round_down(pos, PAGE_SIZE);
+ struct folio *folio = NULL;
unsigned int pflags;
int error;
if (inode->i_sb->s_maxbytes - pos < len)
- return -ENOMEM;
- if (len > PAGE_SIZE - offset_in_page(pos))
- return -ENOTBLK;
-
- trace_xfile_get_page(xf, pos, len);
+ return ERR_PTR(-ENOMEM);
- pflags = memalloc_nofs_save();
+ trace_xfile_get_folio(xf, pos, len);
/*
- * We call write_begin directly here to avoid all the freezer
- * protection lock-taking that happens in the normal path. shmem
- * doesn't support fs freeze, but lockdep doesn't know that and will
- * trip over that.
+ * Increase the file size first so that shmem_get_folio(..., SGP_CACHE),
+ * actually allocates a folio instead of erroring out.
*/
- error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
- &fsdata);
+ if ((flags & XFILE_ALLOC) && pos + len > i_size_read(inode))
+ i_size_write(inode, pos + len);
+
+ pflags = memalloc_nofs_save();
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ (flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
+ memalloc_nofs_restore(pflags);
if (error)
- goto out_pflags;
+ return ERR_PTR(error);
- /* We got the page, so make sure we push out EOF. */
- if (i_size_read(inode) < pos + len)
- i_size_write(inode, pos + len);
+ if (!folio)
+ return NULL;
- /*
- * If the page isn't up to date, fill it with zeroes before we hand it
- * to the caller and make sure the backing store will hold on to them.
- */
- if (!PageUptodate(page)) {
- void *kaddr;
+ if (len > folio_size(folio) - offset_in_folio(folio, pos)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return NULL;
+ }
- kaddr = kmap_local_page(page);
- memset(kaddr, 0, PAGE_SIZE);
- kunmap_local(kaddr);
- SetPageUptodate(page);
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(-EIO);
}
/*
- * Mark each page dirty so that the contents are written to some
- * backing store when we drop this buffer, and take an extra reference
- * to prevent the xfile page from being swapped or removed from the
- * page cache by reclaim if the caller unlocks the page.
+ * Mark the folio dirty so that it won't be reclaimed once we drop the
+ * (potentially last) reference in xfile_put_folio.
*/
- set_page_dirty(page);
- get_page(page);
-
- xfpage->page = page;
- xfpage->fsdata = fsdata;
- xfpage->pos = key;
-out_pflags:
- memalloc_nofs_restore(pflags);
- return error;
+ if (flags & XFILE_ALLOC)
+ folio_set_dirty(folio);
+ return folio;
}
/*
- * Release the (locked) page for a memory object. Returns 0 or a negative
- * errno.
+ * Release the (locked) folio for a memory object.
*/
-int
-xfile_put_page(
+void
+xfile_put_folio(
struct xfile *xf,
- struct xfile_page *xfpage)
+ struct folio *folio)
{
- struct inode *inode = file_inode(xf->file);
- struct address_space *mapping = inode->i_mapping;
- const struct address_space_operations *aops = mapping->a_ops;
- unsigned int pflags;
- int ret;
-
- trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);
-
- /* Give back the reference that we took in xfile_get_page. */
- put_page(xfpage->page);
+ trace_xfile_put_folio(xf, folio_pos(folio), folio_size(folio));
- pflags = memalloc_nofs_save();
- ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
- xfpage->page, xfpage->fsdata);
- memalloc_nofs_restore(pflags);
- memset(xfpage, 0, sizeof(struct xfile_page));
-
- if (ret < 0)
- return ret;
- if (ret != PAGE_SIZE)
- return -EIO;
- return 0;
+ folio_unlock(folio);
+ folio_put(folio);
}
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
index d56643b0f429..76d78dba7e34 100644
--- a/fs/xfs/scrub/xfile.h
+++ b/fs/xfs/scrub/xfile.h
@@ -6,22 +6,6 @@
#ifndef __XFS_SCRUB_XFILE_H__
#define __XFS_SCRUB_XFILE_H__
-struct xfile_page {
- struct page *page;
- void *fsdata;
- loff_t pos;
-};
-
-static inline bool xfile_page_cached(const struct xfile_page *xfpage)
-{
- return xfpage->page != NULL;
-}
-
-static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage)
-{
- return xfpage->page->index;
-}
-
struct xfile {
struct file *file;
};
@@ -29,49 +13,17 @@ struct xfile {
int xfile_create(const char *description, loff_t isize, struct xfile **xfilep);
void xfile_destroy(struct xfile *xf);
-ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos);
-ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count,
+int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos);
+int xfile_store(struct xfile *xf, const void *buf, size_t count,
loff_t pos);
-/*
- * Load an object. Since we're treating this file as "memory", any error or
- * short IO is treated as a failure to allocate memory.
- */
-static inline int
-xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos)
-{
- ssize_t ret = xfile_pread(xf, buf, count, pos);
-
- if (ret < 0 || ret != count)
- return -ENOMEM;
- return 0;
-}
-
-/*
- * Store an object. Since we're treating this file as "memory", any error or
- * short IO is treated as a failure to allocate memory.
- */
-static inline int
-xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos)
-{
- ssize_t ret = xfile_pwrite(xf, buf, count, pos);
-
- if (ret < 0 || ret != count)
- return -ENOMEM;
- return 0;
-}
-
loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
-struct xfile_stat {
- loff_t size;
- unsigned long long bytes;
-};
-
-int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf);
+#define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER)
-int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len,
- struct xfile_page *xbuf);
-int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf);
+#define XFILE_ALLOC (1 << 0) /* allocate folio if not present */
+struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len,
+ unsigned int flags);
+void xfile_put_folio(struct xfile *xf, struct folio *folio);
#endif /* __XFS_SCRUB_XFILE_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 6b840301817a..4bf69c9c088e 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -167,7 +167,7 @@ xfs_get_acl(struct inode *inode, int type, bool rcu)
acl = ERR_PTR(error);
}
- kmem_free(args.value);
+ kvfree(args.value);
return acl;
}
@@ -204,7 +204,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
}
error = xfs_attr_change(&args);
- kmem_free(args.value);
+ kvfree(args.value);
/*
* If the attribute didn't exist to start with that's fine.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 465d7630bb21..3f428620ebf2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -112,7 +112,7 @@ xfs_end_ioend(
* longer dirty. If we don't remove delalloc blocks here, they become
* stale and can corrupt free space accounting on unmount.
*/
- error = blk_status_to_errno(ioend->io_bio->bi_status);
+ error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
@@ -179,7 +179,7 @@ STATIC void
xfs_end_bio(
struct bio *bio)
{
- struct iomap_ioend *ioend = bio->bi_private;
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
unsigned long flags;
@@ -276,7 +276,8 @@ static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
struct inode *inode,
- loff_t offset)
+ loff_t offset,
+ unsigned int len)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -444,7 +445,7 @@ xfs_prepare_ioend(
/* send ioends that might require a transaction to the completion wq */
if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED))
- ioend->io_bio->bi_end_io = xfs_end_bio;
+ ioend->io_bio.bi_end_io = xfs_end_bio;
return status;
}
@@ -502,13 +503,6 @@ xfs_vm_writepages(
{
struct xfs_writepage_ctx wpc = { };
- /*
- * Writing back data in a transaction context can result in recursive
- * transactions. This is bad, so issue a warning and get out of here.
- */
- if (WARN_ON_ONCE(current->journal_info))
- return 0;
-
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
@@ -584,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
.bmap = xfs_vm_bmap,
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = xfs_iomap_swapfile_activate,
};
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 89c7a9f4f930..24fb12986a56 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -23,6 +23,7 @@
#include "xfs_quota.h"
#include "xfs_dir2.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/*
* Invalidate any incore buffers associated with this remote attribute value
@@ -147,6 +148,7 @@ xfs_attr3_node_inactive(
if (level > XFS_DA_NODE_MAXDEPTH) {
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -197,6 +199,7 @@ xfs_attr3_node_inactive(
default:
xfs_buf_mark_corrupt(child_bp);
xfs_trans_brelse(*trans, child_bp);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
break;
}
@@ -286,6 +289,7 @@ xfs_attr3_root_inactive(
error = xfs_attr3_leaf_inactive(trans, dp, bp);
break;
default:
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp);
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 36fe2abb16e6..9b4c61e1c22e 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -33,8 +33,6 @@ struct kmem_cache *xfs_attrd_cache;
static const struct xfs_item_ops xfs_attri_item_ops;
static const struct xfs_item_ops xfs_attrd_item_ops;
-static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp,
- struct xfs_attri_log_item *attrip);
static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip)
{
@@ -110,7 +108,7 @@ STATIC void
xfs_attri_item_free(
struct xfs_attri_log_item *attrip)
{
- kmem_free(attrip->attri_item.li_lv_shadow);
+ kvfree(attrip->attri_item.li_lv_shadow);
xfs_attri_log_nameval_put(attrip->attri_nameval);
kmem_cache_free(xfs_attri_cache, attrip);
}
@@ -228,7 +226,7 @@ xfs_attri_init(
{
struct xfs_attri_log_item *attrip;
- attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL);
+ attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_KERNEL | __GFP_NOFAIL);
/*
* Grab an extra reference to the name/value buffer for this log item.
@@ -253,7 +251,7 @@ static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
STATIC void
xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
{
- kmem_free(attrdp->attrd_item.li_lv_shadow);
+ kvfree(attrdp->attrd_item.li_lv_shadow);
kmem_cache_free(xfs_attrd_cache, attrdp);
}
@@ -310,47 +308,6 @@ xfs_attrd_item_intent(
return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
}
-/*
- * Performs one step of an attribute update intent and marks the attrd item
- * dirty.. An attr operation may be a set or a remove. Note that the
- * transaction is marked dirty regardless of whether the operation succeeds or
- * fails to support the ATTRI/ATTRD lifecycle rules.
- */
-STATIC int
-xfs_xattri_finish_update(
- struct xfs_attr_intent *attr,
- struct xfs_attrd_log_item *attrdp)
-{
- struct xfs_da_args *args = attr->xattri_da_args;
- int error;
-
- if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
- error = -EIO;
- goto out;
- }
-
- error = xfs_attr_set_iter(attr);
- if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
- error = -EAGAIN;
-out:
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the ATTRI and frees the ATTRD
- * 2.) shuts down the filesystem
- */
- args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
-
- /*
- * attr intent/done items are null when logged attributes are disabled
- */
- if (attrdp)
- set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
-
- return error;
-}
-
/* Log an attr to the intent item. */
STATIC void
xfs_attr_log_item(
@@ -360,9 +317,6 @@ xfs_attr_log_item(
{
struct xfs_attri_log_format *attrp;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
-
/*
* At this point the xfs_attr_intent has been constructed, and we've
* created the log intent. Fill in the attri log item and log format
@@ -419,7 +373,6 @@ xfs_attr_create_intent(
}
attrip = xfs_attri_init(mp, attr->xattri_nameval);
- xfs_trans_add_item(tp, &attrip->attri_item);
xfs_attr_log_item(tp, attrip, attr);
return &attrip->attri_item;
@@ -433,11 +386,16 @@ xfs_attr_free_item(
xfs_da_state_free(attr->xattri_da_state);
xfs_attri_log_nameval_put(attr->xattri_nameval);
if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY)
- kmem_free(attr);
+ kfree(attr);
else
kmem_cache_free(xfs_attr_intent_cache, attr);
}
+static inline struct xfs_attr_intent *attri_entry(const struct list_head *e)
+{
+ return list_entry(e, struct xfs_attr_intent, xattri_list);
+}
+
/* Process an attr. */
STATIC int
xfs_attr_finish_item(
@@ -446,24 +404,33 @@ xfs_attr_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_attr_intent *attr;
- struct xfs_attrd_log_item *done_item = NULL;
+ struct xfs_attr_intent *attr = attri_entry(item);
+ struct xfs_da_args *args;
int error;
- attr = container_of(item, struct xfs_attr_intent, xattri_list);
- if (done)
- done_item = ATTRD_ITEM(done);
+ args = attr->xattri_da_args;
- /*
- * Always reset trans after EAGAIN cycle
- * since the transaction is new
- */
- attr->xattri_da_args->trans = tp;
+ /* Reset trans after EAGAIN cycle since the transaction is new */
+ args->trans = tp;
- error = xfs_xattri_finish_update(attr, done_item);
- if (error != -EAGAIN)
- xfs_attr_free_item(attr);
+ if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ error = -EIO;
+ goto out;
+ }
+
+ /* If an attr removal is trivially complete, we're done. */
+ if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE &&
+ !xfs_inode_hasattr(args->dp)) {
+ error = 0;
+ goto out;
+ }
+ error = xfs_attr_set_iter(attr);
+ if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
+ return -EAGAIN;
+
+out:
+ xfs_attr_free_item(attr);
return error;
}
@@ -480,9 +447,8 @@ STATIC void
xfs_attr_cancel_item(
struct list_head *item)
{
- struct xfs_attr_intent *attr;
+ struct xfs_attr_intent *attr = attri_entry(item);
- attr = container_of(item, struct xfs_attr_intent, xattri_list);
xfs_attr_free_item(attr);
}
@@ -532,44 +498,25 @@ xfs_attri_validate(
return xfs_verify_ino(mp, attrp->alfi_ino);
}
-/*
- * Process an attr intent item that was recovered from the log. We need to
- * delete the attr that it describes.
- */
-STATIC int
-xfs_attri_item_recover(
- struct xfs_log_item *lip,
- struct list_head *capture_list)
+static inline struct xfs_attr_intent *
+xfs_attri_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_attri_log_format *attrp,
+ struct xfs_inode **ipp,
+ struct xfs_attri_log_nameval *nv)
{
- struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
struct xfs_attr_intent *attr;
- struct xfs_mount *mp = lip->li_log->l_mp;
- struct xfs_inode *ip;
struct xfs_da_args *args;
- struct xfs_trans *tp;
- struct xfs_trans_res resv;
- struct xfs_attri_log_format *attrp;
- struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
- int error;
- int total;
int local;
- struct xfs_attrd_log_item *done_item = NULL;
-
- /*
- * First check the validity of the attr described by the ATTRI. If any
- * are bad, then assume that all are bad and just toss the ATTRI.
- */
- attrp = &attrip->attri_format;
- if (!xfs_attri_validate(mp, attrp) ||
- !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
- return -EFSCORRUPTED;
+ int error;
- error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ error = xlog_recover_iget(mp, attrp->alfi_ino, ipp);
if (error)
- return error;
+ return ERR_PTR(error);
- attr = kmem_zalloc(sizeof(struct xfs_attr_intent) +
- sizeof(struct xfs_da_args), KM_NOFS);
+ attr = kzalloc(sizeof(struct xfs_attr_intent) +
+ sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL);
args = (struct xfs_da_args *)(attr + 1);
attr->xattri_da_args = args;
@@ -584,7 +531,7 @@ xfs_attri_item_recover(
attr->xattri_nameval = xfs_attri_log_nameval_get(nv);
ASSERT(attr->xattri_nameval);
- args->dp = ip;
+ args->dp = *ipp;
args->geo = mp->m_attr_geo;
args->whichfork = XFS_ATTR_FORK;
args->name = nv->name.i_addr;
@@ -608,43 +555,65 @@ xfs_attri_item_recover(
attr->xattri_dela_state = xfs_attr_init_add_state(args);
break;
case XFS_ATTRI_OP_FLAGS_REMOVE:
- if (!xfs_inode_hasattr(args->dp))
- goto out;
attr->xattri_dela_state = xfs_attr_init_remove_state(args);
break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- goto out;
}
+ xfs_defer_add_item(dfp, &attr->xattri_list);
+ return attr;
+}
+
+/*
+ * Process an attr intent item that was recovered from the log. We need to
+ * delete the attr that it describes.
+ */
+STATIC int
+xfs_attr_recover_work(
+ struct xfs_defer_pending *dfp,
+ struct list_head *capture_list)
+{
+ struct xfs_log_item *lip = dfp->dfp_intent;
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attr_intent *attr;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_inode *ip;
+ struct xfs_da_args *args;
+ struct xfs_trans *tp;
+ struct xfs_trans_res resv;
+ struct xfs_attri_log_format *attrp;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+ int error;
+ int total;
+
+ /*
+ * First check the validity of the attr described by the ATTRI. If any
+ * are bad, then assume that all are bad and just toss the ATTRI.
+ */
+ attrp = &attrip->attri_format;
+ if (!xfs_attri_validate(mp, attrp) ||
+ !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+ return -EFSCORRUPTED;
+
+ attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv);
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+ args = attr->xattri_da_args;
+
xfs_init_attr_trans(args, &resv, &total);
resv = xlog_recover_resv(&resv);
error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp);
if (error)
- goto out;
-
+ return error;
args->trans = tp;
- done_item = xfs_trans_get_attrd(tp, attrip);
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_xattri_finish_update(attr, done_item);
- if (error == -EAGAIN) {
- /*
- * There's more work to do, so add the intent item to this
- * transaction so that we can continue it later.
- */
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
- error = xfs_defer_ops_capture_and_commit(tp, capture_list);
- if (error)
- goto out_unlock;
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
- return 0;
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &attrip->attri_format,
+ sizeof(attrip->attri_format));
if (error) {
xfs_trans_cancel(tp);
goto out_unlock;
@@ -654,18 +623,16 @@ xfs_attri_item_recover(
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
-out:
- xfs_attr_free_item(attr);
return error;
}
/* Re-log an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_attri_item_relog(
+xfs_attr_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_attrd_log_item *attrdp;
struct xfs_attri_log_item *old_attrip;
struct xfs_attri_log_item *new_attrip;
struct xfs_attri_log_format *new_attrp;
@@ -674,10 +641,6 @@ xfs_attri_item_relog(
old_attrip = ATTRI_ITEM(intent);
old_attrp = &old_attrip->attri_format;
- tp->t_flags |= XFS_TRANS_DIRTY;
- attrdp = xfs_trans_get_attrd(tp, old_attrip);
- set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
-
/*
* Create a new log item that shares the same name/value buffer as the
* old log item.
@@ -691,12 +654,43 @@ xfs_attri_item_relog(
new_attrp->alfi_name_len = old_attrp->alfi_name_len;
new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter;
- xfs_trans_add_item(tp, &new_attrip->attri_item);
- set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags);
-
return &new_attrip->attri_item;
}
+/* Get an ATTRD so we can process all the attrs. */
+static struct xfs_log_item *
+xfs_attr_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attrd_log_item *attrdp;
+
+ attrip = ATTRI_ITEM(intent);
+
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_KERNEL | __GFP_NOFAIL);
+
+ xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
+ &xfs_attrd_item_ops);
+ attrdp->attrd_attrip = attrip;
+ attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
+
+ return &attrdp->attrd_item;
+}
+
+const struct xfs_defer_op_type xfs_attr_defer_type = {
+ .name = "attr",
+ .max_items = 1,
+ .create_intent = xfs_attr_create_intent,
+ .abort_intent = xfs_attr_abort_intent,
+ .create_done = xfs_attr_create_done,
+ .finish_item = xfs_attr_finish_item,
+ .cancel_item = xfs_attr_cancel_item,
+ .recover_work = xfs_attr_recover_work,
+ .relog_intent = xfs_attr_relog_intent,
+};
+
STATIC int
xlog_recover_attri_commit_pass2(
struct xlog *log,
@@ -767,63 +761,13 @@ xlog_recover_attri_commit_pass2(
attrip = xfs_attri_init(mp, nv);
memcpy(&attrip->attri_format, attri_formatp, len);
- /*
- * The ATTRI has two references. One for the ATTRD and one for ATTRI to
- * ensure it makes it into the AIL. Insert the ATTRI into the AIL
- * directly and drop the ATTRI reference. Note that
- * xfs_trans_ail_update() drops the AIL lock.
- */
- xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn);
- xfs_attri_release(attrip);
+ xlog_recover_intent_item(log, &attrip->attri_item, lsn,
+ &xfs_attr_defer_type);
xfs_attri_log_nameval_put(nv);
return 0;
}
/*
- * This routine is called to allocate an "attr free done" log item.
- */
-static struct xfs_attrd_log_item *
-xfs_trans_get_attrd(struct xfs_trans *tp,
- struct xfs_attri_log_item *attrip)
-{
- struct xfs_attrd_log_item *attrdp;
-
- ASSERT(tp != NULL);
-
- attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
-
- xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
- &xfs_attrd_item_ops);
- attrdp->attrd_attrip = attrip;
- attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
-
- xfs_trans_add_item(tp, &attrdp->attrd_item);
- return attrdp;
-}
-
-/* Get an ATTRD so we can process all the attrs. */
-static struct xfs_log_item *
-xfs_attr_create_done(
- struct xfs_trans *tp,
- struct xfs_log_item *intent,
- unsigned int count)
-{
- if (!intent)
- return NULL;
-
- return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item;
-}
-
-const struct xfs_defer_op_type xfs_attr_defer_type = {
- .max_items = 1,
- .create_intent = xfs_attr_create_intent,
- .abort_intent = xfs_attr_abort_intent,
- .create_done = xfs_attr_create_done,
- .finish_item = xfs_attr_finish_item,
- .cancel_item = xfs_attr_cancel_item,
-};
-
-/*
* This routine is called when an ATTRD format structure is found in a committed
* transaction in the log. Its purpose is to cancel the corresponding ATTRI if
* it was still in the log. To do this it searches the AIL for the ATTRI with
@@ -857,9 +801,7 @@ static const struct xfs_item_ops xfs_attri_item_ops = {
.iop_format = xfs_attri_item_format,
.iop_unpin = xfs_attri_item_unpin,
.iop_release = xfs_attri_item_release,
- .iop_recover = xfs_attri_item_recover,
.iop_match = xfs_attri_item_match,
- .iop_relog = xfs_attri_item_relog,
};
const struct xlog_recover_item_ops xlog_attri_item_ops = {
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 99bbbe1a0e44..a6819a642cc0 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -22,6 +22,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_dir2.h"
+#include "xfs_health.h"
STATIC int
xfs_attr_shortform_compare(const void *a, const void *b)
@@ -56,14 +57,13 @@ xfs_attr_shortform_list(
struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
struct xfs_inode *dp = context->dp;
struct xfs_attr_sf_sort *sbuf, *sbp;
- struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
struct xfs_attr_sf_entry *sfe;
int sbsize, nsbuf, count, i;
int error = 0;
- sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
ASSERT(sf != NULL);
- if (!sf->hdr.count)
+ if (!sf->count)
return 0;
trace_xfs_attr_list_sf(context);
@@ -79,12 +79,14 @@ xfs_attr_shortform_list(
*/
if (context->bufsize == 0 ||
(XFS_ISRESET_CURSOR(cursor) &&
- (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
- for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ (dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) {
+ for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sfe->nameval,
- sfe->namelen)))
+ sfe->namelen))) {
+ xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
context->put_listent(context,
sfe->flags,
sfe->nameval,
@@ -109,15 +111,15 @@ xfs_attr_shortform_list(
/*
* It didn't all fit, so we have to sort everything on hashval.
*/
- sbsize = sf->hdr.count * sizeof(*sbuf);
- sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
+ sbsize = sf->count * sizeof(*sbuf);
+ sbp = sbuf = kmalloc(sbsize, GFP_KERNEL | __GFP_NOFAIL);
/*
* Scan the attribute list for the rest of the entries, storing
* the relevant info from only those that match into a buffer.
*/
nsbuf = 0;
- for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (unlikely(
((char *)sfe < (char *)sf) ||
((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
@@ -125,7 +127,8 @@ xfs_attr_shortform_list(
XFS_ERRLEVEL_LOW,
context->dp->i_mount, sfe,
sizeof(*sfe));
- kmem_free(sbuf);
+ kfree(sbuf);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -176,6 +179,7 @@ xfs_attr_shortform_list(
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sbp->name,
sbp->namelen))) {
+ xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
error = -EFSCORRUPTED;
goto out;
}
@@ -189,7 +193,7 @@ xfs_attr_shortform_list(
cursor->offset++;
}
out:
- kmem_free(sbuf);
+ kfree(sbuf);
return error;
}
@@ -263,8 +267,10 @@ xfs_attr_node_list_lookup(
return 0;
/* We can't point back to the root. */
- if (XFS_IS_CORRUPT(mp, cursor->blkno == 0))
+ if (XFS_IS_CORRUPT(mp, cursor->blkno == 0)) {
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
}
if (expected_level != 0)
@@ -276,6 +282,7 @@ xfs_attr_node_list_lookup(
out_corruptbuf:
xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(tp, bp);
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
}
@@ -305,6 +312,8 @@ xfs_attr_node_list(
if (cursor->blkno > 0) {
error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp,
XFS_ATTR_FORK);
+ if (xfs_metadata_is_sick(error))
+ xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
if ((error != 0) && (error != -EFSCORRUPTED))
return error;
if (bp) {
@@ -465,8 +474,10 @@ xfs_attr3_leaf_list_int(
}
if (XFS_IS_CORRUPT(context->dp->i_mount,
- !xfs_attr_namecheck(name, namelen)))
+ !xfs_attr_namecheck(name, namelen))) {
+ xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
return -EFSCORRUPTED;
+ }
context->put_listent(context, entry->flags,
name, namelen, valuelen);
if (context->seen_enough)
@@ -505,7 +516,7 @@ xfs_attr_list_ilocked(
{
struct xfs_inode *dp = context->dp;
- ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
/*
* Decide on what work routines to call based on the inode size.
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index e736a0844c89..d27859a684aa 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -25,6 +25,7 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
#include "xfs_ag.h"
+#include "xfs_trace.h"
struct kmem_cache *xfs_bui_cache;
struct kmem_cache *xfs_bud_cache;
@@ -40,7 +41,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
- kmem_free(buip->bui_item.li_lv_shadow);
+ kvfree(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_cache, buip);
}
@@ -201,7 +202,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
- kmem_free(budp->bud_item.li_lv_shadow);
+ kvfree(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_cache, budp);
}
@@ -221,49 +222,9 @@ static const struct xfs_item_ops xfs_bud_item_ops = {
.iop_intent = xfs_bud_item_intent,
};
-static struct xfs_bud_log_item *
-xfs_trans_get_bud(
- struct xfs_trans *tp,
- struct xfs_bui_log_item *buip)
+static inline struct xfs_bmap_intent *bi_entry(const struct list_head *e)
{
- struct xfs_bud_log_item *budp;
-
- budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
- &xfs_bud_item_ops);
- budp->bud_buip = buip;
- budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
-
- xfs_trans_add_item(tp, &budp->bud_item);
- return budp;
-}
-
-/*
- * Finish an bmap update and log it to the BUD. Note that the
- * transaction is marked dirty regardless of whether the bmap update
- * succeeds or fails to support the BUI/BUD lifecycle rules.
- */
-static int
-xfs_trans_log_finish_bmap_update(
- struct xfs_trans *tp,
- struct xfs_bud_log_item *budp,
- struct xfs_bmap_intent *bi)
-{
- int error;
-
- error = xfs_bmap_finish_one(tp, bi);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the BUI and frees the BUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
-
- return error;
+ return list_entry(e, struct xfs_bmap_intent, bi_list);
}
/* Sort bmap intents by inode. */
@@ -273,37 +234,12 @@ xfs_bmap_update_diff_items(
const struct list_head *a,
const struct list_head *b)
{
- struct xfs_bmap_intent *ba;
- struct xfs_bmap_intent *bb;
+ struct xfs_bmap_intent *ba = bi_entry(a);
+ struct xfs_bmap_intent *bb = bi_entry(b);
- ba = container_of(a, struct xfs_bmap_intent, bi_list);
- bb = container_of(b, struct xfs_bmap_intent, bi_list);
return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
}
-/* Set the map extent flags for this mapping. */
-static void
-xfs_trans_set_bmap_flags(
- struct xfs_map_extent *map,
- enum xfs_bmap_intent_type type,
- int whichfork,
- xfs_exntst_t state)
-{
- map->me_flags = 0;
- switch (type) {
- case XFS_BMAP_MAP:
- case XFS_BMAP_UNMAP:
- map->me_flags = type;
- break;
- default:
- ASSERT(0);
- }
- if (state == XFS_EXT_UNWRITTEN)
- map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
- if (whichfork == XFS_ATTR_FORK)
- map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
-}
-
/* Log bmap updates in the intent item. */
STATIC void
xfs_bmap_update_log_item(
@@ -314,9 +250,6 @@ xfs_bmap_update_log_item(
uint next_extent;
struct xfs_map_extent *map;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -329,8 +262,21 @@ xfs_bmap_update_log_item(
map->me_startblock = bi->bi_bmap.br_startblock;
map->me_startoff = bi->bi_bmap.br_startoff;
map->me_len = bi->bi_bmap.br_blockcount;
- xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork,
- bi->bi_bmap.br_state);
+
+ switch (bi->bi_type) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+ map->me_flags = bi->bi_type;
+ break;
+ default:
+ ASSERT(0);
+ }
+ if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN)
+ map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+ if (bi->bi_whichfork == XFS_ATTR_FORK)
+ map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+ if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+ map->me_flags |= XFS_BMAP_EXTENT_REALTIME;
}
static struct xfs_log_item *
@@ -346,7 +292,6 @@ xfs_bmap_update_create_intent(
ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
- xfs_trans_add_item(tp, &buip->bui_item);
if (sort)
list_sort(mp, items, xfs_bmap_update_diff_items);
list_for_each_entry(bi, items, bi_list)
@@ -354,24 +299,36 @@ xfs_bmap_update_create_intent(
return &buip->bui_item;
}
-/* Get an BUD so we can process all the deferred rmap updates. */
+/* Get an BUD so we can process all the deferred bmap updates. */
static struct xfs_log_item *
xfs_bmap_update_create_done(
struct xfs_trans *tp,
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item;
+ struct xfs_bui_log_item *buip = BUI_ITEM(intent);
+ struct xfs_bud_log_item *budp;
+
+ budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
+ &xfs_bud_item_ops);
+ budp->bud_buip = buip;
+ budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
+
+ return &budp->bud_item;
}
/* Take a passive ref to the AG containing the space we're mapping. */
-void
+static inline void
xfs_bmap_update_get_group(
struct xfs_mount *mp,
struct xfs_bmap_intent *bi)
{
xfs_agnumber_t agno;
+ if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+ return;
+
agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
/*
@@ -384,15 +341,41 @@ xfs_bmap_update_get_group(
bi->bi_pag = xfs_perag_intent_get(mp, agno);
}
+/* Add this deferred BUI to the transaction. */
+void
+xfs_bmap_defer_add(
+ struct xfs_trans *tp,
+ struct xfs_bmap_intent *bi)
+{
+ trace_xfs_bmap_defer(bi);
+
+ xfs_bmap_update_get_group(tp->t_mountp, bi);
+ xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
+}
+
/* Release a passive AG ref after finishing mapping work. */
static inline void
xfs_bmap_update_put_group(
struct xfs_bmap_intent *bi)
{
+ if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+ return;
+
xfs_perag_intent_put(bi->bi_pag);
}
-/* Process a deferred rmap update. */
+/* Cancel a deferred bmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_bmap_intent *bi = bi_entry(item);
+
+ xfs_bmap_update_put_group(bi);
+ kmem_cache_free(xfs_bmap_intent_cache, bi);
+}
+
+/* Process a deferred bmap update. */
STATIC int
xfs_bmap_update_finish_item(
struct xfs_trans *tp,
@@ -400,19 +383,16 @@ xfs_bmap_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_bmap_intent *bi;
+ struct xfs_bmap_intent *bi = bi_entry(item);
int error;
- bi = container_of(item, struct xfs_bmap_intent, bi_list);
-
- error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi);
+ error = xfs_bmap_finish_one(tp, bi);
if (!error && bi->bi_bmap.br_blockcount > 0) {
ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
return -EAGAIN;
}
- xfs_bmap_update_put_group(bi);
- kmem_cache_free(xfs_bmap_intent_cache, bi);
+ xfs_bmap_update_cancel_item(item);
return error;
}
@@ -424,28 +404,6 @@ xfs_bmap_update_abort_intent(
xfs_bui_release(BUI_ITEM(intent));
}
-/* Cancel a deferred bmap update. */
-STATIC void
-xfs_bmap_update_cancel_item(
- struct list_head *item)
-{
- struct xfs_bmap_intent *bi;
-
- bi = container_of(item, struct xfs_bmap_intent, bi_list);
-
- xfs_bmap_update_put_group(bi);
- kmem_cache_free(xfs_bmap_intent_cache, bi);
-}
-
-const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
- .max_items = XFS_BUI_MAX_FAST_EXTENTS,
- .create_intent = xfs_bmap_update_create_intent,
- .abort_intent = xfs_bmap_update_abort_intent,
- .create_done = xfs_bmap_update_create_done,
- .finish_item = xfs_bmap_update_finish_item,
- .cancel_item = xfs_bmap_update_cancel_item,
-};
-
/* Is this recovered BUI ok? */
static inline bool
xfs_bui_validate(
@@ -477,26 +435,60 @@ xfs_bui_validate(
if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len))
return false;
+ if (map->me_flags & XFS_BMAP_EXTENT_REALTIME)
+ return xfs_verify_rtbext(mp, map->me_startblock, map->me_len);
+
return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
+static inline struct xfs_bmap_intent *
+xfs_bui_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_inode **ipp,
+ struct xfs_map_extent *map)
+{
+ struct xfs_bmap_intent *bi;
+ int error;
+
+ error = xlog_recover_iget(mp, map->me_owner, ipp);
+ if (error)
+ return ERR_PTR(error);
+
+ bi = kmem_cache_zalloc(xfs_bmap_intent_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+ bi->bi_bmap.br_startblock = map->me_startblock;
+ bi->bi_bmap.br_startoff = map->me_startoff;
+ bi->bi_bmap.br_blockcount = map->me_len;
+ bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ bi->bi_owner = *ipp;
+ xfs_bmap_update_get_group(mp, bi);
+
+ xfs_defer_add_item(dfp, &bi->bi_list);
+ return bi;
+}
+
/*
* Process a bmap update intent item that was recovered from the log.
* We need to update some inode's bmbt.
*/
STATIC int
-xfs_bui_item_recover(
- struct xfs_log_item *lip,
+xfs_bmap_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
- struct xfs_bmap_intent fake = { };
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_map_extent *map;
- struct xfs_bud_log_item *budp;
+ struct xfs_bmap_intent *work;
int iext_delta;
int error = 0;
@@ -507,13 +499,9 @@ xfs_bui_item_recover(
}
map = &buip->bui_format.bui_extents[0];
- fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
-
- error = xlog_recover_iget(mp, map->me_owner, &ip);
- if (error)
- return error;
+ work = xfs_bui_recover_work(mp, dfp, &ip, map);
+ if (IS_ERR(work))
+ return PTR_ERR(work);
/* Allocate transaction and do the work. */
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
@@ -522,42 +510,33 @@ xfs_bui_item_recover(
if (error)
goto err_rele;
- budp = xfs_trans_get_bud(tp, buip);
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- if (fake.bi_type == XFS_BMAP_MAP)
+ if (!!(map->me_flags & XFS_BMAP_EXTENT_REALTIME) !=
+ xfs_ifork_is_realtime(ip, work->bi_whichfork)) {
+ error = -EFSCORRUPTED;
+ goto err_cancel;
+ }
+
+ if (work->bi_type == XFS_BMAP_MAP)
iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
else
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
- error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta);
+ error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto err_cancel;
- fake.bi_owner = ip;
- fake.bi_bmap.br_startblock = map->me_startblock;
- fake.bi_bmap.br_startoff = map->me_startoff;
- fake.bi_bmap.br_blockcount = map->me_len;
- fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
-
- xfs_bmap_update_get_group(mp, &fake);
- error = xfs_trans_log_finish_bmap_update(tp, budp, &fake);
+ error = xlog_recover_finish_intent(tp, dfp);
if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map,
- sizeof(*map));
- xfs_bmap_update_put_group(&fake);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &buip->bui_format, sizeof(buip->bui_format));
if (error)
goto err_cancel;
- if (fake.bi_bmap.br_blockcount > 0) {
- ASSERT(fake.bi_type == XFS_BMAP_UNMAP);
- xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap);
- }
-
/*
* Commit transaction, which frees the transaction and saves the inode
* for later replay activities.
@@ -579,21 +558,13 @@ err_rele:
return error;
}
-STATIC bool
-xfs_bui_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return BUI_ITEM(lip)->bui_format.bui_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_bui_item_relog(
+xfs_bmap_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_bud_log_item *budp;
struct xfs_bui_log_item *buip;
struct xfs_map_extent *map;
unsigned int count;
@@ -601,27 +572,40 @@ xfs_bui_item_relog(
count = BUI_ITEM(intent)->bui_format.bui_nextents;
map = BUI_ITEM(intent)->bui_format.bui_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- budp = xfs_trans_get_bud(tp, BUI_ITEM(intent));
- set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
-
buip = xfs_bui_init(tp->t_mountp);
memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map));
atomic_set(&buip->bui_next_extent, count);
- xfs_trans_add_item(tp, &buip->bui_item);
- set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+
return &buip->bui_item;
}
+const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+ .name = "bmap",
+ .max_items = XFS_BUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_bmap_update_create_intent,
+ .abort_intent = xfs_bmap_update_abort_intent,
+ .create_done = xfs_bmap_update_create_done,
+ .finish_item = xfs_bmap_update_finish_item,
+ .cancel_item = xfs_bmap_update_cancel_item,
+ .recover_work = xfs_bmap_recover_work,
+ .relog_intent = xfs_bmap_relog_intent,
+};
+
+STATIC bool
+xfs_bui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return BUI_ITEM(lip)->bui_format.bui_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_bui_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
.iop_unpin = xfs_bui_item_unpin,
.iop_release = xfs_bui_item_release,
- .iop_recover = xfs_bui_item_recover,
.iop_match = xfs_bui_item_match,
- .iop_relog = xfs_bui_item_relog,
};
static inline void
@@ -681,12 +665,9 @@ xlog_recover_bui_commit_pass2(
buip = xfs_bui_init(mp);
xfs_bui_copy_format(&buip->bui_format, bui_formatp);
atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn);
- xfs_bui_release(buip);
+
+ xlog_recover_intent_item(log, &buip->bui_item, lsn,
+ &xfs_bmap_update_defer_type);
return 0;
}
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 3fafd3881a0b..6fee6a508343 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -68,4 +68,8 @@ struct xfs_bud_log_item {
extern struct kmem_cache *xfs_bui_cache;
extern struct kmem_cache *xfs_bud_cache;
+struct xfs_bmap_intent;
+
+void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
+
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 40e0a1f1f753..19e11d1da660 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -28,6 +28,7 @@
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include "xfs_rtbitmap.h"
/* Kernel only BMAP related definitions and functions */
@@ -65,157 +66,9 @@ xfs_zero_extent(
return blkdev_issue_zeroout(target->bt_bdev,
block << (mp->m_super->s_blocksize_bits - 9),
count_fsb << (mp->m_super->s_blocksize_bits - 9),
- GFP_NOFS, 0);
+ GFP_KERNEL, 0);
}
-#ifdef CONFIG_XFS_RT
-int
-xfs_bmap_rtalloc(
- struct xfs_bmalloca *ap)
-{
- struct xfs_mount *mp = ap->ip->i_mount;
- xfs_fileoff_t orig_offset = ap->offset;
- xfs_rtblock_t rtb;
- xfs_extlen_t prod = 0; /* product factor for allocators */
- xfs_extlen_t mod = 0; /* product factor for allocators */
- xfs_extlen_t ralen = 0; /* realtime allocation length */
- xfs_extlen_t align; /* minimum allocation alignment */
- xfs_extlen_t orig_length = ap->length;
- xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
- xfs_extlen_t raminlen;
- bool rtlocked = false;
- bool ignore_locality = false;
- int error;
-
- align = xfs_get_extsz_hint(ap->ip);
-retry:
- prod = align / mp->m_sb.sb_rextsize;
- error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
- align, 1, ap->eof, 0,
- ap->conv, &ap->offset, &ap->length);
- if (error)
- return error;
- ASSERT(ap->length);
- ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
-
- /*
- * If we shifted the file offset downward to satisfy an extent size
- * hint, increase minlen by that amount so that the allocator won't
- * give us an allocation that's too short to cover at least one of the
- * blocks that the caller asked for.
- */
- if (ap->offset != orig_offset)
- minlen += orig_offset - ap->offset;
-
- /*
- * If the offset & length are not perfectly aligned
- * then kill prod, it will just get us in trouble.
- */
- div_u64_rem(ap->offset, align, &mod);
- if (mod || ap->length % align)
- prod = 1;
- /*
- * Set ralen to be the actual requested length in rtextents.
- */
- ralen = ap->length / mp->m_sb.sb_rextsize;
- /*
- * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
- * we rounded up to it, cut it back so it's valid again.
- * Note that if it's a really large request (bigger than
- * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
- * adjust the starting point to match it.
- */
- if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
- ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
-
- /*
- * Lock out modifications to both the RT bitmap and summary inodes
- */
- if (!rtlocked) {
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
- rtlocked = true;
- }
-
- /*
- * If it's an allocation to an empty file at offset 0,
- * pick an extent that will space things out in the rt area.
- */
- if (ap->eof && ap->offset == 0) {
- xfs_rtblock_t rtx; /* realtime extent no */
-
- error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
- if (error)
- return error;
- ap->blkno = rtx * mp->m_sb.sb_rextsize;
- } else {
- ap->blkno = 0;
- }
-
- xfs_bmap_adjacent(ap);
-
- /*
- * Realtime allocation, done through xfs_rtallocate_extent.
- */
- if (ignore_locality)
- ap->blkno = 0;
- else
- do_div(ap->blkno, mp->m_sb.sb_rextsize);
- rtb = ap->blkno;
- ap->length = ralen;
- raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize);
- error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length,
- &ralen, ap->wasdel, prod, &rtb);
- if (error)
- return error;
-
- if (rtb != NULLRTBLOCK) {
- ap->blkno = rtb * mp->m_sb.sb_rextsize;
- ap->length = ralen * mp->m_sb.sb_rextsize;
- ap->ip->i_nblocks += ap->length;
- xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
- if (ap->wasdel)
- ap->ip->i_delayed_blks -= ap->length;
- /*
- * Adjust the disk quota also. This was reserved
- * earlier.
- */
- xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
- ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
- XFS_TRANS_DQ_RTBCOUNT, ap->length);
- return 0;
- }
-
- if (align > mp->m_sb.sb_rextsize) {
- /*
- * We previously enlarged the request length to try to satisfy
- * an extent size hint. The allocator didn't return anything,
- * so reset the parameters to the original values and try again
- * without alignment criteria.
- */
- ap->offset = orig_offset;
- ap->length = orig_length;
- minlen = align = mp->m_sb.sb_rextsize;
- goto retry;
- }
-
- if (!ignore_locality && ap->blkno != 0) {
- /*
- * If we can't allocate near a specific rt extent, try again
- * without locality criteria.
- */
- ignore_locality = true;
- goto retry;
- }
-
- ap->blkno = NULLFSBLOCK;
- ap->length = 0;
- return 0;
-}
-#endif /* CONFIG_XFS_RT */
-
/*
* Extent tree block counting routines.
*/
@@ -655,8 +508,8 @@ xfs_can_free_eofblocks(
* Caller must either hold the exclusive io lock; or be inactivating
* the inode, which guarantees there are no other users of the inode.
*/
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) ||
- (VFS_I(ip)->i_state & I_FREEING));
+ if (!(VFS_I(ip)->i_state & I_FREEING))
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
/* prealloc/delalloc exists only on regular files */
if (!S_ISREG(VFS_I(ip)->i_mode))
@@ -690,7 +543,7 @@ xfs_can_free_eofblocks(
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
- end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
+ end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
return false;
@@ -780,12 +633,10 @@ xfs_alloc_file_space(
{
xfs_mount_t *mp = ip->i_mount;
xfs_off_t count;
- xfs_filblks_t allocated_fsb;
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
xfs_fileoff_t startoffset_fsb;
xfs_fileoff_t endoffset_fsb;
- int nimaps;
int rt;
xfs_trans_t *tp;
xfs_bmbt_irec_t imaps[1], *imapp;
@@ -808,7 +659,6 @@ xfs_alloc_file_space(
count = len;
imapp = &imaps[0];
- nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
allocatesize_fsb = endoffset_fsb - startoffset_fsb;
@@ -819,6 +669,7 @@ xfs_alloc_file_space(
while (allocatesize_fsb && !error) {
xfs_fileoff_t s, e;
unsigned int dblocks, rblocks, resblks;
+ int nimaps = 1;
/*
* Determine space reservations for data/realtime.
@@ -884,15 +735,19 @@ xfs_alloc_file_space(
if (error)
break;
- allocated_fsb = imapp->br_blockcount;
-
- if (nimaps == 0) {
- error = -ENOSPC;
- break;
+ /*
+ * If the allocator cannot find a single free extent large
+ * enough to cover the start block of the requested range,
+ * xfs_bmapi_write will return 0 but leave *nimaps set to 0.
+ *
+ * In that case we simply need to keep looping with the same
+ * startoffset_fsb so that one of the following allocations
+ * will eventually reach the requested range.
+ */
+ if (nimaps) {
+ startoffset_fsb += imapp->br_blockcount;
+ allocatesize_fsb -= imapp->br_blockcount;
}
-
- startoffset_fsb += allocated_fsb;
- allocatesize_fsb -= allocated_fsb;
}
return error;
@@ -989,10 +844,8 @@ xfs_free_file_space(
/* We can only free complete realtime extents. */
if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
- startoffset_fsb = roundup_64(startoffset_fsb,
- mp->m_sb.sb_rextsize);
- endoffset_fsb = rounddown_64(endoffset_fsb,
- mp->m_sb.sb_rextsize);
+ startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb);
+ endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb);
}
/*
@@ -1112,8 +965,7 @@ xfs_collapse_file_space(
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
bool done = false;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
trace_xfs_collapse_file_space(ip);
@@ -1182,8 +1034,7 @@ xfs_insert_file_space(
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
bool done = false;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
trace_xfs_insert_file_space(ip);
@@ -1454,16 +1305,16 @@ xfs_swap_extent_rmap(
}
/* Remove the mapping from the donor file. */
- xfs_bmap_unmap_extent(tp, tip, &uirec);
+ xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec);
/* Remove the mapping from the source file. */
- xfs_bmap_unmap_extent(tp, ip, &irec);
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec);
/* Map the donor file's blocks into the source file. */
- xfs_bmap_map_extent(tp, ip, &uirec);
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec);
/* Map the source file's blocks into the donor file. */
- xfs_bmap_map_extent(tp, tip, &irec);
+ xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec);
error = xfs_defer_finish(tpp);
tp = *tpp;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 6888078f5c31..77ecbb753ef2 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -47,7 +47,7 @@ int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
int rt, int eof, int delay, int convert,
xfs_fileoff_t *offp, xfs_extlen_t *lenp);
-void xfs_bmap_adjacent(struct xfs_bmalloca *ap);
+bool xfs_bmap_adjacent(struct xfs_bmalloca *ap);
int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, struct xfs_bmbt_irec *rec,
int *is_empty);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 545c7991b9b5..f0fa02264eda 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -21,6 +21,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_ag.h"
+#include "xfs_buf_mem.h"
struct kmem_cache *xfs_buf_cache;
@@ -60,6 +61,11 @@ xfs_buf_submit(
return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
}
+static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
+{
+ return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
+}
+
static inline int
xfs_buf_is_vmapped(
struct xfs_buf *bp)
@@ -169,7 +175,7 @@ xfs_buf_stale(
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
- (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+ (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
atomic_dec(&bp->b_hold);
ASSERT(atomic_read(&bp->b_hold) >= 1);
@@ -189,8 +195,8 @@ xfs_buf_get_maps(
return 0;
}
- bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
- KM_NOFS);
+ bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
if (!bp->b_maps)
return -ENOMEM;
return 0;
@@ -204,7 +210,7 @@ xfs_buf_free_maps(
struct xfs_buf *bp)
{
if (bp->b_maps != &bp->__b_map) {
- kmem_free(bp->b_maps);
+ kfree(bp->b_maps);
bp->b_maps = NULL;
}
}
@@ -222,7 +228,8 @@ _xfs_buf_alloc(
int i;
*bpp = NULL;
- bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
+ bp = kmem_cache_zalloc(xfs_buf_cache,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
/*
* We don't want certain flags to appear in b_flags unless they are
@@ -289,7 +296,7 @@ xfs_buf_free_pages(
mm_account_reclaimed_pages(bp->b_page_count);
if (bp->b_pages != bp->b_page_array)
- kmem_free(bp->b_pages);
+ kfree(bp->b_pages);
bp->b_pages = NULL;
bp->b_flags &= ~_XBF_PAGES;
}
@@ -312,10 +319,12 @@ xfs_buf_free(
ASSERT(list_empty(&bp->b_lru));
- if (bp->b_flags & _XBF_PAGES)
+ if (xfs_buftarg_is_mem(bp->b_target))
+ xmbuf_unmap_page(bp);
+ else if (bp->b_flags & _XBF_PAGES)
xfs_buf_free_pages(bp);
else if (bp->b_flags & _XBF_KMEM)
- kmem_free(bp->b_addr);
+ kfree(bp->b_addr);
call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
@@ -325,21 +334,21 @@ xfs_buf_alloc_kmem(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
- xfs_km_flags_t kmflag_mask = KM_NOFS;
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
size_t size = BBTOB(bp->b_length);
/* Assure zeroed buffer for non-read cases. */
if (!(flags & XBF_READ))
- kmflag_mask |= KM_ZERO;
+ gfp_mask |= __GFP_ZERO;
- bp->b_addr = kmem_alloc(size, kmflag_mask);
+ bp->b_addr = kmalloc(size, gfp_mask);
if (!bp->b_addr)
return -ENOMEM;
if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
((unsigned long)bp->b_addr & PAGE_MASK)) {
/* b_addr spans two pages - use alloc_page instead */
- kmem_free(bp->b_addr);
+ kfree(bp->b_addr);
bp->b_addr = NULL;
return -ENOMEM;
}
@@ -356,13 +365,11 @@ xfs_buf_alloc_pages(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
- gfp_t gfp_mask = __GFP_NOWARN;
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
long filled = 0;
if (flags & XBF_READ_AHEAD)
gfp_mask |= __GFP_NORETRY;
- else
- gfp_mask |= GFP_NOFS;
/* Make sure that we have a page list */
bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
@@ -429,11 +436,18 @@ _xfs_buf_map_pages(
/*
* vm_map_ram() will allocate auxiliary structures (e.g.
- * pagetables) with GFP_KERNEL, yet we are likely to be under
- * GFP_NOFS context here. Hence we need to tell memory reclaim
- * that we are in such a context via PF_MEMALLOC_NOFS to prevent
- * memory reclaim re-entering the filesystem here and
- * potentially deadlocking.
+ * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
+ * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
+ * from the same call site that can be run from both above and
+ * below memory reclaim causes lockdep false positives. Hence we
+ * always need to force this allocation to nofs context because
+ * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
+ * prevent false positive lockdep reports.
+ *
+ * XXX(dgc): I think dquot reclaim is the only place we can get
+ * to this function from memory reclaim context now. If we fix
+ * that like we've fixed inode reclaim to avoid writeback from
+ * reclaim, this nofs wrapping can go away.
*/
nofs_flag = memalloc_nofs_save();
do {
@@ -499,18 +513,18 @@ static const struct rhashtable_params xfs_buf_hash_params = {
};
int
-xfs_buf_hash_init(
- struct xfs_perag *pag)
+xfs_buf_cache_init(
+ struct xfs_buf_cache *bch)
{
- spin_lock_init(&pag->pag_buf_lock);
- return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+ spin_lock_init(&bch->bc_lock);
+ return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
}
void
-xfs_buf_hash_destroy(
- struct xfs_perag *pag)
+xfs_buf_cache_destroy(
+ struct xfs_buf_cache *bch)
{
- rhashtable_destroy(&pag->pag_buf_hash);
+ rhashtable_destroy(&bch->bc_hash);
}
static int
@@ -573,7 +587,7 @@ xfs_buf_find_lock(
static inline int
xfs_buf_lookup(
- struct xfs_perag *pag,
+ struct xfs_buf_cache *bch,
struct xfs_buf_map *map,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
@@ -582,7 +596,7 @@ xfs_buf_lookup(
int error;
rcu_read_lock();
- bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
+ bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
rcu_read_unlock();
return -ENOENT;
@@ -607,6 +621,7 @@ xfs_buf_lookup(
static int
xfs_buf_find_insert(
struct xfs_buftarg *btp,
+ struct xfs_buf_cache *bch,
struct xfs_perag *pag,
struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
@@ -622,31 +637,33 @@ xfs_buf_find_insert(
if (error)
goto out_drop_pag;
- /*
- * For buffers that fit entirely within a single page, first attempt to
- * allocate the memory from the heap to minimise memory usage. If we
- * can't get heap memory for these small buffers, we fall back to using
- * the page allocator.
- */
- if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
- xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+ if (xfs_buftarg_is_mem(new_bp->b_target)) {
+ error = xmbuf_map_page(new_bp);
+ } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
+ xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+ /*
+ * For buffers that fit entirely within a single page, first
+ * attempt to allocate the memory from the heap to minimise
+ * memory usage. If we can't get heap memory for these small
+ * buffers, we fall back to using the page allocator.
+ */
error = xfs_buf_alloc_pages(new_bp, flags);
- if (error)
- goto out_free_buf;
}
+ if (error)
+ goto out_free_buf;
- spin_lock(&pag->pag_buf_lock);
- bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
+ spin_lock(&bch->bc_lock);
+ bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
error = PTR_ERR(bp);
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
goto out_free_buf;
}
if (bp) {
/* found an existing buffer */
atomic_inc(&bp->b_hold);
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
error = xfs_buf_find_lock(bp, flags);
if (error)
xfs_buf_rele(bp);
@@ -657,17 +674,40 @@ xfs_buf_find_insert(
/* The new buffer keeps the perag reference until it is freed. */
new_bp->b_pag = pag;
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
*bpp = new_bp;
return 0;
out_free_buf:
xfs_buf_free(new_bp);
out_drop_pag:
- xfs_perag_put(pag);
+ if (pag)
+ xfs_perag_put(pag);
return error;
}
+static inline struct xfs_perag *
+xfs_buftarg_get_pag(
+ struct xfs_buftarg *btp,
+ const struct xfs_buf_map *map)
+{
+ struct xfs_mount *mp = btp->bt_mount;
+
+ if (xfs_buftarg_is_mem(btp))
+ return NULL;
+ return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
+}
+
+static inline struct xfs_buf_cache *
+xfs_buftarg_buf_cache(
+ struct xfs_buftarg *btp,
+ struct xfs_perag *pag)
+{
+ if (pag)
+ return &pag->pag_bcache;
+ return btp->bt_cache;
+}
+
/*
* Assembles a buffer covering the specified range. The code is optimised for
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -681,6 +721,7 @@ xfs_buf_get_map(
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
+ struct xfs_buf_cache *bch;
struct xfs_perag *pag;
struct xfs_buf *bp = NULL;
struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
@@ -696,10 +737,10 @@ xfs_buf_get_map(
if (error)
return error;
- pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
+ pag = xfs_buftarg_get_pag(btp, &cmap);
+ bch = xfs_buftarg_buf_cache(btp, pag);
- error = xfs_buf_lookup(pag, &cmap, flags, &bp);
+ error = xfs_buf_lookup(bch, &cmap, flags, &bp);
if (error && error != -ENOENT)
goto out_put_perag;
@@ -711,13 +752,14 @@ xfs_buf_get_map(
goto out_put_perag;
/* xfs_buf_find_insert() consumes the perag reference. */
- error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
+ error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
flags, &bp);
if (error)
return error;
} else {
XFS_STATS_INC(btp->bt_mount, xb_get_locked);
- xfs_perag_put(pag);
+ if (pag)
+ xfs_perag_put(pag);
}
/* We do not hold a perag reference anymore. */
@@ -745,7 +787,8 @@ xfs_buf_get_map(
return 0;
out_put_perag:
- xfs_perag_put(pag);
+ if (pag)
+ xfs_perag_put(pag);
return error;
}
@@ -892,6 +935,13 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
+ /*
+ * Currently we don't have a good means or justification for performing
+ * xmbuf_map_page asynchronously, so we don't do readahead.
+ */
+ if (xfs_buftarg_is_mem(target))
+ return;
+
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
__this_address);
@@ -957,7 +1007,10 @@ xfs_buf_get_uncached(
if (error)
return error;
- error = xfs_buf_alloc_pages(bp, flags);
+ if (xfs_buftarg_is_mem(bp->b_target))
+ error = xmbuf_map_page(bp);
+ else
+ error = xfs_buf_alloc_pages(bp, flags);
if (error)
goto fail_free_buf;
@@ -990,29 +1043,29 @@ xfs_buf_hold(
atomic_inc(&bp->b_hold);
}
-/*
- * Release a hold on the specified buffer. If the hold count is 1, the buffer is
- * placed on LRU or freed (depending on b_lru_ref).
- */
-void
-xfs_buf_rele(
+static void
+xfs_buf_rele_uncached(
struct xfs_buf *bp)
{
+ ASSERT(list_empty(&bp->b_lru));
+ if (atomic_dec_and_test(&bp->b_hold)) {
+ xfs_buf_ioacct_dec(bp);
+ xfs_buf_free(bp);
+ }
+}
+
+static void
+xfs_buf_rele_cached(
+ struct xfs_buf *bp)
+{
+ struct xfs_buftarg *btp = bp->b_target;
struct xfs_perag *pag = bp->b_pag;
+ struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
bool release;
bool freebuf = false;
trace_xfs_buf_rele(bp, _RET_IP_);
- if (!pag) {
- ASSERT(list_empty(&bp->b_lru));
- if (atomic_dec_and_test(&bp->b_hold)) {
- xfs_buf_ioacct_dec(bp);
- xfs_buf_free(bp);
- }
- return;
- }
-
ASSERT(atomic_read(&bp->b_hold) > 0);
/*
@@ -1026,7 +1079,7 @@ xfs_buf_rele(
* leading to a use-after-free scenario.
*/
spin_lock(&bp->b_lock);
- release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
@@ -1047,11 +1100,11 @@ xfs_buf_rele(
* buffer for the LRU and clear the (now stale) dispose list
* state flag
*/
- if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+ if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
- spin_unlock(&pag->pag_buf_lock);
+ spin_unlock(&bch->bc_lock);
} else {
/*
* most of the time buffers will already be removed from the
@@ -1060,16 +1113,17 @@ xfs_buf_rele(
* was on was the disposal list
*/
if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+ list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
} else {
ASSERT(list_empty(&bp->b_lru));
}
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
- xfs_buf_hash_params);
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
+ rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
+ xfs_buf_hash_params);
+ spin_unlock(&bch->bc_lock);
+ if (pag)
+ xfs_perag_put(pag);
freebuf = true;
}
@@ -1080,6 +1134,19 @@ out_unlock:
xfs_buf_free(bp);
}
+/*
+ * Release a hold on the specified buffer.
+ */
+void
+xfs_buf_rele(
+ struct xfs_buf *bp)
+{
+ trace_xfs_buf_rele(bp, _RET_IP_);
+ if (xfs_buf_is_uncached(bp))
+ xfs_buf_rele_uncached(bp);
+ else
+ xfs_buf_rele_cached(bp);
+}
/*
* Lock a buffer object, if it is not already locked.
@@ -1585,6 +1652,12 @@ _xfs_buf_ioapply(
/* we only use the buffer cache for meta-data */
op |= REQ_META;
+ /* in-memory targets are directly mapped, no IO required. */
+ if (xfs_buftarg_is_mem(bp->b_target)) {
+ xfs_buf_ioend(bp);
+ return;
+ }
+
/*
* Walk all the vectors issuing IO on them. Set up the initial offset
* into the buffer and the desired IO size before we start -
@@ -1940,25 +2013,30 @@ xfs_buftarg_shrink_count(
}
void
-xfs_free_buftarg(
+xfs_destroy_buftarg(
struct xfs_buftarg *btp)
{
shrinker_free(btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
+}
+void
+xfs_free_buftarg(
+ struct xfs_buftarg *btp)
+{
+ xfs_destroy_buftarg(btp);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
- bdev_release(btp->bt_bdev_handle);
-
- kmem_free(btp);
+ bdev_fput(btp->bt_bdev_file);
+ kfree(btp);
}
int
xfs_setsize_buftarg(
- xfs_buftarg_t *btp,
+ struct xfs_buftarg *btp,
unsigned int sectorsize)
{
/* Set up metadata sector size info */
@@ -1972,83 +2050,93 @@ xfs_setsize_buftarg(
return -EINVAL;
}
- /* Set up device logical sector size mask */
- btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
- btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
-
return 0;
}
-/*
- * When allocating the initial buffer target we have not yet
- * read in the superblock, so don't know what sized sectors
- * are being used at this early stage. Play safe.
- */
-STATIC int
-xfs_setsize_buftarg_early(
- xfs_buftarg_t *btp)
+int
+xfs_init_buftarg(
+ struct xfs_buftarg *btp,
+ size_t logical_sectorsize,
+ const char *descr)
{
- return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev));
+ /* Set up device logical sector size mask */
+ btp->bt_logical_sectorsize = logical_sectorsize;
+ btp->bt_logical_sectormask = logical_sectorsize - 1;
+
+ /*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages
+ * per 30 seconds so as to not spam logs too much on repeated errors.
+ */
+ ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
+ DEFAULT_RATELIMIT_BURST);
+
+ if (list_lru_init(&btp->bt_lru))
+ return -ENOMEM;
+ if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+ goto out_destroy_lru;
+
+ btp->bt_shrinker =
+ shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr);
+ if (!btp->bt_shrinker)
+ goto out_destroy_io_count;
+ btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
+ btp->bt_shrinker->private_data = btp;
+ shrinker_register(btp->bt_shrinker);
+ return 0;
+
+out_destroy_io_count:
+ percpu_counter_destroy(&btp->bt_io_count);
+out_destroy_lru:
+ list_lru_destroy(&btp->bt_lru);
+ return -ENOMEM;
}
struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct bdev_handle *bdev_handle)
+ struct file *bdev_file)
{
- xfs_buftarg_t *btp;
+ struct xfs_buftarg *btp;
const struct dax_holder_operations *ops = NULL;
#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
ops = &xfs_dax_holder_operations;
#endif
- btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
+ btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL);
btp->bt_mount = mp;
- btp->bt_bdev_handle = bdev_handle;
- btp->bt_dev = bdev_handle->bdev->bd_dev;
- btp->bt_bdev = bdev_handle->bdev;
+ btp->bt_bdev_file = bdev_file;
+ btp->bt_bdev = file_bdev(bdev_file);
+ btp->bt_dev = btp->bt_bdev->bd_dev;
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
/*
- * Buffer IO error rate limiting. Limit it to no more than 10 messages
- * per 30 seconds so as to not spam logs too much on repeated errors.
+ * When allocating the buftargs we have not yet read the super block and
+ * thus don't know the file system sector size yet.
*/
- ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
- DEFAULT_RATELIMIT_BURST);
-
- if (xfs_setsize_buftarg_early(btp))
+ if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
goto error_free;
-
- if (list_lru_init(&btp->bt_lru))
+ if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
+ mp->m_super->s_id))
goto error_free;
- if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
- goto error_lru;
-
- btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s",
- mp->m_super->s_id);
- if (!btp->bt_shrinker)
- goto error_pcpu;
-
- btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
- btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
- btp->bt_shrinker->private_data = btp;
-
- shrinker_register(btp->bt_shrinker);
-
return btp;
-error_pcpu:
- percpu_counter_destroy(&btp->bt_io_count);
-error_lru:
- list_lru_destroy(&btp->bt_lru);
error_free:
- kmem_free(btp);
+ kfree(btp);
return NULL;
}
+static inline void
+xfs_buf_list_del(
+ struct xfs_buf *bp)
+{
+ list_del_init(&bp->b_list);
+ wake_up_var(&bp->b_list);
+}
+
/*
* Cancel a delayed write list.
*
@@ -2066,7 +2154,7 @@ xfs_buf_delwri_cancel(
xfs_buf_lock(bp);
bp->b_flags &= ~_XBF_DELWRI_Q;
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
xfs_buf_relse(bp);
}
}
@@ -2120,6 +2208,34 @@ xfs_buf_delwri_queue(
}
/*
+ * Queue a buffer to this delwri list as part of a data integrity operation.
+ * If the buffer is on any other delwri list, we'll wait for that to clear
+ * so that the caller can submit the buffer for IO and wait for the result.
+ * Callers must ensure the buffer is not already on the list.
+ */
+void
+xfs_buf_delwri_queue_here(
+ struct xfs_buf *bp,
+ struct list_head *buffer_list)
+{
+ /*
+ * We need this buffer to end up on the /caller's/ delwri list, not any
+ * old list. This can happen if the buffer is marked stale (which
+ * clears DELWRI_Q) after the AIL queues the buffer to its list but
+ * before the AIL has a chance to submit the list.
+ */
+ while (!list_empty(&bp->b_list)) {
+ xfs_buf_unlock(bp);
+ wait_var_event(&bp->b_list, list_empty(&bp->b_list));
+ xfs_buf_lock(bp);
+ }
+
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+
+ xfs_buf_delwri_queue(bp, buffer_list);
+}
+
+/*
* Compare function is more complex than it needs to be because
* the return value is only 32 bits and we are doing comparisons
* on 64 bit values
@@ -2181,7 +2297,7 @@ xfs_buf_delwri_submit_buffers(
* reference and remove it from the list here.
*/
if (!(bp->b_flags & _XBF_DELWRI_Q)) {
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
xfs_buf_relse(bp);
continue;
}
@@ -2201,7 +2317,7 @@ xfs_buf_delwri_submit_buffers(
list_move_tail(&bp->b_list, wait_list);
} else {
bp->b_flags |= XBF_ASYNC;
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
}
__xfs_buf_submit(bp, false);
}
@@ -2255,7 +2371,7 @@ xfs_buf_delwri_submit(
while (!list_empty(&wait_list)) {
bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
/*
* Wait on the locked buffer, check for errors and unlock and
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c86e16419656..b1580644501f 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -83,6 +83,14 @@ typedef unsigned int xfs_buf_flags_t;
#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */
+struct xfs_buf_cache {
+ spinlock_t bc_lock;
+ struct rhashtable bc_hash;
+};
+
+int xfs_buf_cache_init(struct xfs_buf_cache *bch);
+void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
+
/*
* The xfs_buftarg contains 2 notions of "sector size" -
*
@@ -96,11 +104,12 @@ typedef unsigned int xfs_buf_flags_t;
* The latter is derived from the underlying device, and controls direct IO
* alignment constraints.
*/
-typedef struct xfs_buftarg {
+struct xfs_buftarg {
dev_t bt_dev;
- struct bdev_handle *bt_bdev_handle;
+ struct file *bt_bdev_file;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
+ struct file *bt_file;
u64 bt_dax_part_off;
struct xfs_mount *bt_mount;
unsigned int bt_meta_sectorsize;
@@ -114,7 +123,10 @@ typedef struct xfs_buftarg {
struct percpu_counter bt_io_count;
struct ratelimit_state bt_ioerror_rl;
-} xfs_buftarg_t;
+
+ /* built-in cache, if we're not using the perag one */
+ struct xfs_buf_cache bt_cache[];
+};
#define XB_PAGES 2
@@ -319,6 +331,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
extern void xfs_buf_delwri_cancel(struct list_head *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
@@ -365,7 +378,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
* Handling of buftargs.
*/
struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
- struct bdev_handle *bdev_handle);
+ struct file *bdev_file);
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
@@ -378,4 +391,9 @@ int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
+/* for xfs_buf_mem.c only: */
+int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize,
+ const char *descr);
+void xfs_destroy_buftarg(struct xfs_buftarg *btp);
+
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 023d4e0385dd..43031842341a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -805,8 +805,8 @@ xfs_buf_item_get_format(
return;
}
- bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
- 0);
+ bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
+ GFP_KERNEL | __GFP_NOFAIL);
}
STATIC void
@@ -814,7 +814,7 @@ xfs_buf_item_free_format(
struct xfs_buf_log_item *bip)
{
if (bip->bli_formats != &bip->__bli_format) {
- kmem_free(bip->bli_formats);
+ kfree(bip->bli_formats);
bip->bli_formats = NULL;
}
}
@@ -1044,7 +1044,7 @@ xfs_buf_item_free(
struct xfs_buf_log_item *bip)
{
xfs_buf_item_free_format(bip);
- kmem_free(bip->bli_item.li_lv_shadow);
+ kvfree(bip->bli_item.li_lv_shadow);
kmem_cache_free(xfs_buf_item_cache, bip);
}
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 43167f543afc..09e893cf563c 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -85,7 +85,7 @@ xlog_add_buffer_cancelled(
return false;
}
- bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
+ bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL);
bcp->bc_blkno = blkno;
bcp->bc_len = len;
bcp->bc_refcount = 1;
@@ -129,7 +129,7 @@ xlog_put_buffer_cancelled(
if (--bcp->bc_refcount == 0) {
list_del(&bcp->bc_list);
- kmem_free(bcp);
+ kfree(bcp);
}
return true;
}
@@ -1062,10 +1062,10 @@ xlog_free_buf_cancel_table(
&log->l_buf_cancel_table[i],
struct xfs_buf_cancel, bc_list))) {
list_del(&bc->bc_list);
- kmem_free(bc);
+ kfree(bc);
}
}
- kmem_free(log->l_buf_cancel_table);
+ kfree(log->l_buf_cancel_table);
log->l_buf_cancel_table = NULL;
}
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
new file mode 100644
index 000000000000..9bb2d24de709
--- /dev/null
+++ b/fs/xfs/xfs_buf_mem.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_buf.h"
+#include "xfs_buf_mem.h"
+#include "xfs_trace.h"
+#include <linux/shmem_fs.h>
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_error.h"
+
+/*
+ * Buffer Cache for In-Memory Files
+ * ================================
+ *
+ * Online fsck wants to create ephemeral ordered recordsets. The existing
+ * btree infrastructure can do this, but we need the buffer cache to target
+ * memory instead of block devices.
+ *
+ * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
+ * requirements. Therefore, the xmbuf mechanism uses an unlinked shmem file to
+ * store our staging data. This file is not installed in the file descriptor
+ * table so that user programs cannot access the data, which means that the
+ * xmbuf must be freed with xmbuf_destroy.
+ *
+ * xmbufs assume that the caller will handle all required concurrency
+ * management; standard vfs locks (freezer and inode) are not taken. Reads
+ * and writes are satisfied directly from the page cache.
+ *
+ * The only supported block size is PAGE_SIZE, and we cannot use highmem.
+ */
+
+/*
+ * shmem files used to back an in-memory buffer cache must not be exposed to
+ * userspace. Upper layers must coordinate access to the one handle returned
+ * by the constructor, so establish a separate lock class for xmbufs to avoid
+ * confusing lockdep.
+ */
+static struct lock_class_key xmbuf_i_mutex_key;
+
+/*
+ * Allocate a buffer cache target for a memory-backed file and set up the
+ * buffer target.
+ */
+int
+xmbuf_alloc(
+ struct xfs_mount *mp,
+ const char *descr,
+ struct xfs_buftarg **btpp)
+{
+ struct file *file;
+ struct inode *inode;
+ struct xfs_buftarg *btp;
+ int error;
+
+ btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
+ if (!btp)
+ return -ENOMEM;
+
+ file = shmem_kernel_file_setup(descr, 0, 0);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto out_free_btp;
+ }
+ inode = file_inode(file);
+
+ /* private file, private locking */
+ lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key);
+
+ /*
+ * We don't want to bother with kmapping data during repair, so don't
+ * allow highmem pages to back this mapping.
+ */
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+
+ /* ensure all writes are below EOF to avoid pagecache zeroing */
+ i_size_write(inode, inode->i_sb->s_maxbytes);
+
+ error = xfs_buf_cache_init(btp->bt_cache);
+ if (error)
+ goto out_file;
+
+ /* Initialize buffer target */
+ btp->bt_mount = mp;
+ btp->bt_dev = (dev_t)-1U;
+ btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */
+ btp->bt_file = file;
+ btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE;
+ btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1;
+
+ error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
+ if (error)
+ goto out_bcache;
+
+ trace_xmbuf_create(btp);
+
+ *btpp = btp;
+ return 0;
+
+out_bcache:
+ xfs_buf_cache_destroy(btp->bt_cache);
+out_file:
+ fput(file);
+out_free_btp:
+ kfree(btp);
+ return error;
+}
+
+/* Free a buffer cache target for a memory-backed buffer cache. */
+void
+xmbuf_free(
+ struct xfs_buftarg *btp)
+{
+ ASSERT(xfs_buftarg_is_mem(btp));
+ ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+
+ trace_xmbuf_free(btp);
+
+ xfs_destroy_buftarg(btp);
+ xfs_buf_cache_destroy(btp->bt_cache);
+ fput(btp->bt_file);
+ kfree(btp);
+}
+
+/* Directly map a shmem page into the buffer cache. */
+int
+xmbuf_map_page(
+ struct xfs_buf *bp)
+{
+ struct inode *inode = file_inode(bp->b_target->bt_file);
+ struct folio *folio = NULL;
+ struct page *page;
+ loff_t pos = BBTOB(xfs_buf_daddr(bp));
+ int error;
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ if (bp->b_map_count != 1)
+ return -ENOMEM;
+ if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE)
+ return -ENOMEM;
+ if (offset_in_page(pos) != 0) {
+ ASSERT(offset_in_page(pos));
+ return -ENOMEM;
+ }
+
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
+ if (error)
+ return error;
+
+ if (filemap_check_wb_err(inode->i_mapping, 0)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return -EIO;
+ }
+
+ page = folio_file_page(folio, pos >> PAGE_SHIFT);
+
+ /*
+ * Mark the page dirty so that it won't be reclaimed once we drop the
+ * (potentially last) reference in xmbuf_unmap_page.
+ */
+ set_page_dirty(page);
+ unlock_page(page);
+
+ bp->b_addr = page_address(page);
+ bp->b_pages = bp->b_page_array;
+ bp->b_pages[0] = page;
+ bp->b_page_count = 1;
+ return 0;
+}
+
+/* Unmap a shmem page that was mapped into the buffer cache. */
+void
+xmbuf_unmap_page(
+ struct xfs_buf *bp)
+{
+ struct page *page = bp->b_pages[0];
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ put_page(page);
+
+ bp->b_addr = NULL;
+ bp->b_pages[0] = NULL;
+ bp->b_pages = NULL;
+ bp->b_page_count = 0;
+}
+
+/* Is this a valid daddr within the buftarg? */
+bool
+xmbuf_verify_daddr(
+ struct xfs_buftarg *btp,
+ xfs_daddr_t daddr)
+{
+ struct inode *inode = file_inode(btp->bt_file);
+
+ ASSERT(xfs_buftarg_is_mem(btp));
+
+ return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
+}
+
+/* Discard the page backing this buffer. */
+static void
+xmbuf_stale(
+ struct xfs_buf *bp)
+{
+ struct inode *inode = file_inode(bp->b_target->bt_file);
+ loff_t pos;
+
+ ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+ pos = BBTOB(xfs_buf_daddr(bp));
+ shmem_truncate_range(inode, pos, pos + BBTOB(bp->b_length) - 1);
+}
+
+/*
+ * Finalize a buffer -- discard the backing page if it's stale, or run the
+ * write verifier to detect problems.
+ */
+int
+xmbuf_finalize(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+ int error = 0;
+
+ if (bp->b_flags & XBF_STALE) {
+ xmbuf_stale(bp);
+ return 0;
+ }
+
+ /*
+ * Although this btree is ephemeral, validate the buffer structure so
+ * that we can detect memory corruption errors and software bugs.
+ */
+ fa = bp->b_ops->verify_struct(bp);
+ if (fa) {
+ error = -EFSCORRUPTED;
+ xfs_verifier_error(bp, error, fa);
+ }
+
+ return error;
+}
+
+/*
+ * Detach this xmbuf buffer from the transaction by any means necessary.
+ * All buffers are direct-mapped, so they do not need bwrite.
+ */
+void
+xmbuf_trans_bdetach(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bli = bp->b_log_item;
+
+ ASSERT(bli != NULL);
+
+ bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED |
+ XFS_BLI_LOGGED | XFS_BLI_STALE);
+ clear_bit(XFS_LI_DIRTY, &bli->bli_item.li_flags);
+
+ while (bp->b_log_item != NULL)
+ xfs_trans_bdetach(tp, bp);
+}
diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h
new file mode 100644
index 000000000000..eed4a7b63232
--- /dev/null
+++ b/fs/xfs/xfs_buf_mem.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_BUF_MEM_H__
+#define __XFS_BUF_MEM_H__
+
+#define XMBUF_BLOCKSIZE (PAGE_SIZE)
+#define XMBUF_BLOCKSHIFT (PAGE_SHIFT)
+
+#ifdef CONFIG_XFS_MEMORY_BUFS
+static inline bool xfs_buftarg_is_mem(const struct xfs_buftarg *btp)
+{
+ return btp->bt_bdev == NULL;
+}
+
+int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
+ struct xfs_buftarg **btpp);
+void xmbuf_free(struct xfs_buftarg *btp);
+
+int xmbuf_map_page(struct xfs_buf *bp);
+void xmbuf_unmap_page(struct xfs_buf *bp);
+bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
+void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
+int xmbuf_finalize(struct xfs_buf *bp);
+#else
+# define xfs_buftarg_is_mem(...) (false)
+# define xmbuf_map_page(...) (-ENOMEM)
+# define xmbuf_unmap_page(...) ((void)0)
+# define xmbuf_verify_daddr(...) (false)
+#endif /* CONFIG_XFS_MEMORY_BUFS */
+
+#endif /* __XFS_BUF_MEM_H__ */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 9f3ceb461515..cf9296b7e06f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -18,6 +18,7 @@
#include "xfs_bmap.h"
#include "xfs_trans.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/*
* Directory file type support functions
@@ -51,7 +52,7 @@ xfs_dir2_sf_getdents(
struct xfs_mount *mp = dp->i_mount;
xfs_dir2_dataptr_t off; /* current entry's offset */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
xfs_dir2_dataptr_t dot_offset;
xfs_dir2_dataptr_t dotdot_offset;
xfs_ino_t ino;
@@ -59,9 +60,7 @@ xfs_dir2_sf_getdents(
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
-
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
/*
* If the block number in the offset is out of range, we're done.
@@ -119,8 +118,10 @@ xfs_dir2_sf_getdents(
ctx->pos = off & 0x7fffffff;
if (XFS_IS_CORRUPT(dp->i_mount,
!xfs_dir2_namecheck(sfep->name,
- sfep->namelen)))
+ sfep->namelen))) {
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
return -EFSCORRUPTED;
+ }
if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
xfs_dir3_get_dtype(mp, filetype)))
return 0;
@@ -212,6 +213,7 @@ xfs_dir2_block_getdents(
if (XFS_IS_CORRUPT(dp->i_mount,
!xfs_dir2_namecheck(dep->name,
dep->namelen))) {
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_rele;
}
@@ -466,6 +468,7 @@ xfs_dir2_leaf_getdents(
if (XFS_IS_CORRUPT(dp->i_mount,
!xfs_dir2_namecheck(dep->name,
dep->namelen))) {
+ xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
error = -EFSCORRUPTED;
break;
}
@@ -519,9 +522,11 @@ xfs_readdir(
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d5787991bb5b..268bb734dc0a 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -8,6 +8,7 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
@@ -18,6 +19,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
+#include "xfs_health.h"
/*
* Notes on an efficient, low latency fstrim algorithm
@@ -79,7 +81,7 @@ xfs_discard_endio_work(
container_of(work, struct xfs_busy_extents, endio_work);
xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
- kmem_free(extents->owner);
+ kfree(extents->owner);
}
/*
@@ -120,7 +122,7 @@ xfs_discard_extents(
error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
- GFP_NOFS, &bio);
+ GFP_KERNEL, &bio);
if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
"discard failed for extent [0x%llx,%u], error %d",
@@ -155,6 +157,7 @@ xfs_trim_gather_extents(
uint64_t *blocks_trimmed)
{
struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_trans *tp;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
int error;
@@ -168,11 +171,15 @@ xfs_trim_gather_extents(
*/
xfs_log_force(mp, XFS_LOG_SYNC);
- error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
+ error = xfs_trans_alloc_empty(mp, &tp);
if (error)
return error;
- cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error)
+ goto out_trans_cancel;
+
+ cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
/*
* Look up the extent length requested in the AGF and start with it.
@@ -204,6 +211,7 @@ xfs_trim_gather_extents(
if (error)
break;
if (XFS_IS_CORRUPT(mp, i != 1)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
break;
}
@@ -279,7 +287,8 @@ next_extent:
xfs_extent_busy_clear(mp, &extents->extent_list, false);
out_del_cursor:
xfs_btree_del_cursor(cur, error);
- xfs_buf_relse(agbp);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index ac6ba646624d..c98cb468c357 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -24,6 +24,7 @@
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/*
* Lock order:
@@ -44,6 +45,29 @@ static struct kmem_cache *xfs_dquot_cache;
static struct lock_class_key xfs_dquot_group_class;
static struct lock_class_key xfs_dquot_project_class;
+/* Record observations of quota corruption with the health tracking system. */
+static void
+xfs_dquot_mark_sick(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+
+ switch (dqp->q_type) {
+ case XFS_DQTYPE_USER:
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_UQUOTA);
+ break;
+ case XFS_DQTYPE_GROUP:
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA);
+ break;
+ case XFS_DQTYPE_PROJ:
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
/*
* This is called to free all the memory associated with a dquot
*/
@@ -53,7 +77,7 @@ xfs_qm_dqdestroy(
{
ASSERT(list_empty(&dqp->q_lru));
- kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
+ kvfree(dqp->q_logitem.qli_item.li_lv_shadow);
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
@@ -172,14 +196,14 @@ xfs_qm_adjust_dqtimers(
/*
* initialize a buffer full of dquots and log the whole thing
*/
-STATIC void
+void
xfs_qm_init_dquot_blk(
struct xfs_trans *tp,
- struct xfs_mount *mp,
xfs_dqid_t id,
xfs_dqtype_t type,
struct xfs_buf *bp)
{
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_dqblk *d;
xfs_dqid_t curid;
@@ -353,7 +377,7 @@ xfs_dquot_disk_alloc(
* Make a chunk of dquots out of this buffer and log
* the entire thing.
*/
- xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp);
+ xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp);
xfs_buf_set_ref(bp, XFS_DQUOT_REF);
/*
@@ -451,6 +475,8 @@ xfs_dquot_disk_read(
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
&xfs_dquot_buf_ops);
+ if (xfs_metadata_is_sick(error))
+ xfs_dquot_mark_sick(dqp);
if (error) {
ASSERT(bp == NULL);
return error;
@@ -562,7 +588,8 @@ xfs_dquot_from_disk(
struct xfs_dquot *dqp,
struct xfs_buf *bp)
{
- struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
+ struct xfs_dqblk *dqb = xfs_buf_offset(bp, dqp->q_bufoffset);
+ struct xfs_disk_dquot *ddqp = &dqb->dd_diskdq;
/*
* Ensure that we got the type and ID we were looking for.
@@ -573,6 +600,7 @@ xfs_dquot_from_disk(
"Metadata corruption detected at %pS, quota %u",
__this_address, dqp->q_id);
xfs_alert(bp->b_mount, "Unmount and run xfs_repair");
+ xfs_dquot_mark_sick(dqp);
return -EFSCORRUPTED;
}
@@ -783,6 +811,12 @@ restart:
* caller should throw away the dquot and start over. Otherwise, the dquot
* is returned locked (and held by the cache) as if there had been a cache
* hit.
+ *
+ * The insert needs to be done under memalloc_nofs context because the radix
+ * tree can do memory allocation during insert. The qi->qi_tree_lock is taken in
+ * memory reclaim when freeing unused dquots, so we cannot have the radix tree
+ * node allocation recursing into filesystem reclaim whilst we hold the
+ * qi_tree_lock.
*/
static int
xfs_qm_dqget_cache_insert(
@@ -792,25 +826,27 @@ xfs_qm_dqget_cache_insert(
xfs_dqid_t id,
struct xfs_dquot *dqp)
{
+ unsigned int nofs_flags;
int error;
+ nofs_flags = memalloc_nofs_save();
mutex_lock(&qi->qi_tree_lock);
error = radix_tree_insert(tree, id, dqp);
if (unlikely(error)) {
/* Duplicate found! Caller must try again. */
- mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_dup(dqp);
- return error;
+ goto out_unlock;
}
/* Return a locked dquot to the caller, with a reference taken. */
xfs_dqlock(dqp);
dqp->q_nrefs = 1;
-
qi->qi_dquots++;
- mutex_unlock(&qi->qi_tree_lock);
- return 0;
+out_unlock:
+ mutex_unlock(&qi->qi_tree_lock);
+ memalloc_nofs_restore(nofs_flags);
+ return error;
}
/* Check our input parameters. */
@@ -949,7 +985,7 @@ xfs_qm_dqget_inode(
if (error)
return error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(xfs_inode_dquot(ip, type) == NULL);
id = xfs_qm_id_for_quotatype(ip, type);
@@ -1006,7 +1042,7 @@ restart:
}
dqret:
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
trace_xfs_dqget_miss(dqp);
*O_dqpp = dqp;
return 0;
@@ -1064,7 +1100,7 @@ xfs_qm_dqput(
struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
trace_xfs_dqput_free(dqp);
- if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
+ if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru))
XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
}
xfs_dqunlock(dqp);
@@ -1237,6 +1273,8 @@ xfs_qm_dqflush(
&bp, &xfs_dquot_buf_ops);
if (error == -EAGAIN)
goto out_unlock;
+ if (xfs_metadata_is_sick(error))
+ xfs_dquot_mark_sick(dqp);
if (error)
goto out_abort;
@@ -1245,12 +1283,13 @@ xfs_qm_dqflush(
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
dqp->q_id, fa);
xfs_buf_relse(bp);
+ xfs_dquot_mark_sick(dqp);
error = -EFSCORRUPTED;
goto out_abort;
}
/* Flush the incore dquot to the ondisk buffer. */
- dqblk = bp->b_addr + dqp->q_bufoffset;
+ dqblk = xfs_buf_offset(bp, dqp->q_bufoffset);
xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp);
/*
@@ -1361,34 +1400,3 @@ xfs_qm_exit(void)
kmem_cache_destroy(xfs_dqtrx_cache);
kmem_cache_destroy(xfs_dquot_cache);
}
-
-/*
- * Iterate every dquot of a particular type. The caller must ensure that the
- * particular quota type is active. iter_fn can return negative error codes,
- * or -ECANCELED to indicate that it wants to stop iterating.
- */
-int
-xfs_qm_dqiterate(
- struct xfs_mount *mp,
- xfs_dqtype_t type,
- xfs_qm_dqiterate_fn iter_fn,
- void *priv)
-{
- struct xfs_dquot *dq;
- xfs_dqid_t id = 0;
- int error;
-
- do {
- error = xfs_qm_dqget_next(mp, id, type, &dq);
- if (error == -ENOENT)
- return 0;
- if (error)
- return error;
-
- error = iter_fn(dq, type, priv);
- id = dq->q_id + 1;
- xfs_qm_dqput(dq);
- } while (error == 0 && id != 0);
-
- return error;
-}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 80c8f851a2f3..956272d9b302 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -234,12 +234,10 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
return dqp;
}
-typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq,
- xfs_dqtype_t type, void *priv);
-int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type,
- xfs_qm_dqiterate_fn iter_fn, void *priv);
-
time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout);
time64_t xfs_dquot_set_grace_period(time64_t grace);
+void xfs_qm_init_dquot_blk(struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t
+ type, struct xfs_buf *bp);
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
index 8966ba842395..2c2720ce6923 100644
--- a/fs/xfs/xfs_dquot_item_recover.c
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -19,6 +19,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_error.h"
STATIC void
xlog_recover_dquot_ra_pass2(
@@ -65,6 +66,7 @@ xlog_recover_dquot_commit_pass2(
{
struct xfs_mount *mp = log->l_mp;
struct xfs_buf *bp;
+ struct xfs_dqblk *dqb;
struct xfs_disk_dquot *ddq, *recddq;
struct xfs_dq_logformat *dq_f;
xfs_failaddr_t fa;
@@ -130,14 +132,14 @@ xlog_recover_dquot_commit_pass2(
return error;
ASSERT(bp);
- ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
+ dqb = xfs_buf_offset(bp, dq_f->qlf_boffset);
+ ddq = &dqb->dd_diskdq;
/*
* If the dquot has an LSN in it, recover the dquot only if it's less
* than the lsn of the transaction we are replaying.
*/
if (xfs_has_crc(mp)) {
- struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
@@ -147,10 +149,23 @@ xlog_recover_dquot_commit_pass2(
memcpy(ddq, recddq, item->ri_buf[1].i_len);
if (xfs_has_crc(mp)) {
- xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
+ /* Validate the recovered dquot. */
+ fa = xfs_dqblk_verify(log->l_mp, dqb, dq_f->qlf_id);
+ if (fa) {
+ XFS_CORRUPTION_ERROR("Bad dquot after recovery",
+ XFS_ERRLEVEL_LOW, mp, dqb,
+ sizeof(struct xfs_dqblk));
+ xfs_alert(mp,
+ "Metadata corruption detected at %pS, dquot 0x%x",
+ fa, dq_f->qlf_id);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_mount == mp);
bp->b_flags |= _XBF_LOGRECOVERY;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index b2cbbba3e15a..7ad0e92c6b5b 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -240,15 +240,15 @@ xfs_errortag_init(
{
int ret;
- mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
- KM_MAYFAIL);
+ mp->m_errortag = kzalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!mp->m_errortag)
return -ENOMEM;
ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
&mp->m_kobj, "errortag");
if (ret)
- kmem_free(mp->m_errortag);
+ kfree(mp->m_errortag);
return ret;
}
@@ -257,7 +257,7 @@ xfs_errortag_del(
struct xfs_mount *mp)
{
xfs_sysfs_del(&mp->m_errortag_kobj);
- kmem_free(mp->m_errortag);
+ kfree(mp->m_errortag);
}
static bool
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 9ecfdcdc752f..56cfa1498571 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -32,7 +32,8 @@ xfs_extent_busy_insert_list(
struct rb_node **rbp;
struct rb_node *parent = NULL;
- new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
+ new = kzalloc(sizeof(struct xfs_extent_busy),
+ GFP_KERNEL | __GFP_NOFAIL);
new->agno = pag->pag_agno;
new->bno = bno;
new->length = len;
@@ -530,7 +531,7 @@ xfs_extent_busy_clear_one(
}
list_del_init(&busyp->list);
- kmem_free(busyp);
+ kfree(busyp);
}
static void
@@ -678,3 +679,16 @@ xfs_extent_busy_ag_cmp(
diff = b1->bno - b2->bno;
return diff;
}
+
+/* Are there any busy extents in this AG? */
+bool
+xfs_extent_busy_list_empty(
+ struct xfs_perag *pag)
+{
+ bool res;
+
+ spin_lock(&pag->pagb_lock);
+ res = RB_EMPTY_ROOT(&pag->pagb_tree);
+ spin_unlock(&pag->pagb_lock);
+ return res;
+}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 0639aab336f3..470032de3139 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -85,4 +85,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
list_sort(NULL, list, xfs_extent_busy_ag_cmp);
}
+bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
+
#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 3fa8789820ad..8c382f092332 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,9 +40,9 @@ STATIC void
xfs_efi_item_free(
struct xfs_efi_log_item *efip)
{
- kmem_free(efip->efi_item.li_lv_shadow);
+ kvfree(efip->efi_item.li_lv_shadow);
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
- kmem_free(efip);
+ kfree(efip);
else
kmem_cache_free(xfs_efi_cache, efip);
}
@@ -229,9 +229,9 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
STATIC void
xfs_efd_item_free(struct xfs_efd_log_item *efdp)
{
- kmem_free(efdp->efd_item.li_lv_shadow);
+ kvfree(efdp->efd_item.li_lv_shadow);
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
- kmem_free(efdp);
+ kfree(efdp);
else
kmem_cache_free(xfs_efd_cache, efdp);
}
@@ -304,39 +304,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = {
};
/*
- * Allocate an "extent free done" log item that will hold nextents worth of
- * extents. The caller must use all nextents extents, because we are not
- * flexible about this at all.
- */
-static struct xfs_efd_log_item *
-xfs_trans_get_efd(
- struct xfs_trans *tp,
- struct xfs_efi_log_item *efip,
- unsigned int nextents)
-{
- struct xfs_efd_log_item *efdp;
-
- ASSERT(nextents > 0);
-
- if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
- efdp = kzalloc(xfs_efd_log_item_sizeof(nextents),
- GFP_KERNEL | __GFP_NOFAIL);
- } else {
- efdp = kmem_cache_zalloc(xfs_efd_cache,
- GFP_KERNEL | __GFP_NOFAIL);
- }
-
- xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
- &xfs_efd_item_ops);
- efdp->efd_efip = efip;
- efdp->efd_format.efd_nextents = nextents;
- efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
-
- xfs_trans_add_item(tp, &efdp->efd_item);
- return efdp;
-}
-
-/*
* Fill the EFD with all extents from the EFI when we need to roll the
* transaction and continue with a new EFI.
*
@@ -364,69 +331,6 @@ xfs_efd_from_efi(
efdp->efd_next_extent = efip->efi_format.efi_nextents;
}
-/*
- * Free an extent and log it to the EFD. Note that the transaction is marked
- * dirty regardless of whether the extent free succeeds or fails to support the
- * EFI/EFD lifecycle rules.
- */
-static int
-xfs_trans_free_extent(
- struct xfs_trans *tp,
- struct xfs_efd_log_item *efdp,
- struct xfs_extent_free_item *xefi)
-{
- struct xfs_owner_info oinfo = { };
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_extent *extp;
- uint next_extent;
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
- xefi->xefi_startblock);
- int error;
-
- oinfo.oi_owner = xefi->xefi_owner;
- if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
- if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
-
- trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
- agbno, xefi->xefi_blockcount);
-
- error = __xfs_free_extent(tp, xefi->xefi_pag, agbno,
- xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv,
- xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the EFI and frees the EFD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
- /*
- * If we need a new transaction to make progress, the caller will log a
- * new EFI with the current contents. It will also log an EFD to cancel
- * the existing EFI, and so we need to copy all the unprocessed extents
- * in this EFI to the EFD so this works correctly.
- */
- if (error == -EAGAIN) {
- xfs_efd_from_efi(efdp);
- return error;
- }
-
- next_extent = efdp->efd_next_extent;
- ASSERT(next_extent < efdp->efd_format.efd_nextents);
- extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = xefi->xefi_startblock;
- extp->ext_len = xefi->xefi_blockcount;
- efdp->efd_next_extent++;
-
- return error;
-}
-
/* Sort bmap items by AG. */
static int
xfs_extent_free_diff_items(
@@ -453,9 +357,6 @@ xfs_extent_free_log_item(
uint next_extent;
struct xfs_extent *extp;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -481,7 +382,6 @@ xfs_extent_free_create_intent(
ASSERT(count > 0);
- xfs_trans_add_item(tp, &efip->efi_item);
if (sort)
list_sort(mp, items, xfs_extent_free_diff_items);
list_for_each_entry(xefi, items, xefi_list)
@@ -496,7 +396,26 @@ xfs_extent_free_create_done(
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item;
+ struct xfs_efi_log_item *efip = EFI_ITEM(intent);
+ struct xfs_efd_log_item *efdp;
+
+ ASSERT(count > 0);
+
+ if (count > XFS_EFD_MAX_FAST_EXTENTS) {
+ efdp = kzalloc(xfs_efd_log_item_sizeof(count),
+ GFP_KERNEL | __GFP_NOFAIL);
+ } else {
+ efdp = kmem_cache_zalloc(xfs_efd_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ }
+
+ xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
+ &xfs_efd_item_ops);
+ efdp->efd_efip = efip;
+ efdp->efd_format.efd_nextents = count;
+ efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+
+ return &efdp->efd_item;
}
/* Take a passive ref to the AG containing the space we're freeing. */
@@ -527,19 +446,49 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_extent_free_item *xefi;
- int error;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(done);
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_extent *extp;
+ uint next_extent;
+ xfs_agblock_t agbno;
+ int error = 0;
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+ agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
+
+ oinfo.oi_owner = xefi->xefi_owner;
+ if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
- error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
+ trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
+ agbno, xefi->xefi_blockcount);
/*
- * Don't free the XEFI if we need a new transaction to complete
- * processing of it.
+ * If we need a new transaction to make progress, the caller will log a
+ * new EFI with the current contents. It will also log an EFD to cancel
+ * the existing EFI, and so we need to copy all the unprocessed extents
+ * in this EFI to the EFD so this works correctly.
*/
- if (error == -EAGAIN)
+ if (!(xefi->xefi_flags & XFS_EFI_CANCELLED))
+ error = __xfs_free_extent(tp, xefi->xefi_pag, agbno,
+ xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv,
+ xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
+ if (error == -EAGAIN) {
+ xfs_efd_from_efi(efdp);
return error;
+ }
+
+ /* Add the work we finished to the EFD, even though nobody uses that */
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = xefi->xefi_startblock;
+ extp->ext_len = xefi->xefi_blockcount;
+ efdp->efd_next_extent++;
xfs_extent_free_put_group(xefi);
kmem_cache_free(xfs_extfree_item_cache, xefi);
@@ -567,15 +516,6 @@ xfs_extent_free_cancel_item(
kmem_cache_free(xfs_extfree_item_cache, xefi);
}
-const struct xfs_defer_op_type xfs_extent_free_defer_type = {
- .max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .create_intent = xfs_extent_free_create_intent,
- .abort_intent = xfs_extent_free_abort_intent,
- .create_done = xfs_extent_free_create_done,
- .finish_item = xfs_extent_free_finish_item,
- .cancel_item = xfs_extent_free_cancel_item,
-};
-
/*
* AGFL blocks are accounted differently in the reserve pools and are not
* inserted into the busy extent list.
@@ -610,16 +550,6 @@ xfs_agfl_free_finish_item(
error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno,
agbno, agbp, &oinfo);
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the EFI and frees the EFD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
next_extent = efdp->efd_next_extent;
ASSERT(next_extent < efdp->efd_format.efd_nextents);
extp = &(efdp->efd_format.efd_extents[next_extent]);
@@ -632,16 +562,6 @@ xfs_agfl_free_finish_item(
return error;
}
-/* sub-type with special handling for AGFL deferred frees */
-const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
- .max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .create_intent = xfs_extent_free_create_intent,
- .abort_intent = xfs_extent_free_abort_intent,
- .create_done = xfs_extent_free_create_done,
- .finish_item = xfs_agfl_free_finish_item,
- .cancel_item = xfs_extent_free_cancel_item,
-};
-
/* Is this recovered EFI ok? */
static inline bool
xfs_efi_validate_ext(
@@ -651,23 +571,41 @@ xfs_efi_validate_ext(
return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len);
}
+static inline void
+xfs_efi_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_extent *extp)
+{
+ struct xfs_extent_free_item *xefi;
+
+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ xefi->xefi_startblock = extp->ext_start;
+ xefi->xefi_blockcount = extp->ext_len;
+ xefi->xefi_agresv = XFS_AG_RESV_NONE;
+ xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN;
+ xfs_extent_free_get_group(mp, xefi);
+
+ xfs_defer_add_item(dfp, &xefi->xefi_list);
+}
+
/*
* Process an extent free intent item that was recovered from
* the log. We need to free the extents that it describes.
*/
STATIC int
-xfs_efi_item_recover(
- struct xfs_log_item *lip,
+xfs_extent_free_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
struct xfs_mount *mp = lip->li_log->l_mp;
- struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
int i;
int error = 0;
- bool requeue_only = false;
/*
* First check the validity of the extents described by the
@@ -682,55 +620,22 @@ xfs_efi_item_recover(
sizeof(efip->efi_format));
return -EFSCORRUPTED;
}
+
+ xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]);
}
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp);
if (error)
return error;
- efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
-
- for (i = 0; i < efip->efi_format.efi_nextents; i++) {
- struct xfs_extent_free_item fake = {
- .xefi_owner = XFS_RMAP_OWN_UNKNOWN,
- .xefi_agresv = XFS_AG_RESV_NONE,
- };
- struct xfs_extent *extp;
- extp = &efip->efi_format.efi_extents[i];
-
- fake.xefi_startblock = extp->ext_start;
- fake.xefi_blockcount = extp->ext_len;
-
- if (!requeue_only) {
- xfs_extent_free_get_group(mp, &fake);
- error = xfs_trans_free_extent(tp, efdp, &fake);
- xfs_extent_free_put_group(&fake);
- }
-
- /*
- * If we can't free the extent without potentially deadlocking,
- * requeue the rest of the extents to a new so that they get
- * run again later with a new transaction context.
- */
- if (error == -EAGAIN || requeue_only) {
- error = xfs_free_extent_later(tp, fake.xefi_startblock,
- fake.xefi_blockcount,
- &XFS_RMAP_OINFO_ANY_OWNER,
- fake.xefi_agresv);
- if (!error) {
- requeue_only = true;
- continue;
- }
- }
-
- if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- extp, sizeof(*extp));
- if (error)
- goto abort_error;
-
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &efip->efi_format,
+ sizeof(efip->efi_format));
+ if (error)
+ goto abort_error;
return xfs_defer_ops_capture_and_commit(tp, capture_list);
@@ -739,21 +644,14 @@ abort_error:
return error;
}
-STATIC bool
-xfs_efi_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return EFI_ITEM(lip)->efi_format.efi_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_efi_item_relog(
+xfs_extent_free_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_efd_log_item *efdp;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(done_item);
struct xfs_efi_log_item *efip;
struct xfs_extent *extp;
unsigned int count;
@@ -761,29 +659,56 @@ xfs_efi_item_relog(
count = EFI_ITEM(intent)->efi_format.efi_nextents;
extp = EFI_ITEM(intent)->efi_format.efi_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
efdp->efd_next_extent = count;
memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
efip = xfs_efi_init(tp->t_mountp, count);
memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
atomic_set(&efip->efi_next_extent, count);
- xfs_trans_add_item(tp, &efip->efi_item);
- set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+
return &efip->efi_item;
}
+const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+ .name = "extent_free",
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_extent_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+ .recover_work = xfs_extent_free_recover_work,
+ .relog_intent = xfs_extent_free_relog_intent,
+};
+
+/* sub-type with special handling for AGFL deferred frees */
+const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+ .name = "agfl_free",
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_agfl_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+ .recover_work = xfs_extent_free_recover_work,
+ .relog_intent = xfs_extent_free_relog_intent,
+};
+
+STATIC bool
+xfs_efi_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return EFI_ITEM(lip)->efi_format.efi_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_efi_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
.iop_unpin = xfs_efi_item_unpin,
.iop_release = xfs_efi_item_release,
- .iop_recover = xfs_efi_item_recover,
.iop_match = xfs_efi_item_match,
- .iop_relog = xfs_efi_item_relog,
};
/*
@@ -820,12 +745,9 @@ xlog_recover_efi_commit_pass2(
return error;
}
atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn);
- xfs_efi_release(efip);
+
+ xlog_recover_intent_item(log, &efip->efi_item, lsn,
+ &xfs_extent_free_defer_type);
return 0;
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203700278ddb..2ce302b4885f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -214,6 +214,43 @@ xfs_ilock_iocb(
return 0;
}
+static int
+xfs_ilock_iocb_for_write(
+ struct kiocb *iocb,
+ unsigned int *lock_mode)
+{
+ ssize_t ret;
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+
+ ret = xfs_ilock_iocb(iocb, *lock_mode);
+ if (ret)
+ return ret;
+
+ if (*lock_mode == XFS_IOLOCK_EXCL)
+ return 0;
+ if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+ return 0;
+
+ xfs_iunlock(ip, *lock_mode);
+ *lock_mode = XFS_IOLOCK_EXCL;
+ return xfs_ilock_iocb(iocb, *lock_mode);
+}
+
+static unsigned int
+xfs_ilock_for_write_fault(
+ struct xfs_inode *ip)
+{
+ /* get a shared lock if no remapping in progress */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+ return XFS_MMAPLOCK_SHARED;
+
+ /* wait for remapping to complete */
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ return XFS_MMAPLOCK_EXCL;
+}
+
STATIC ssize_t
xfs_file_dio_read(
struct kiocb *iocb,
@@ -551,7 +588,7 @@ xfs_file_dio_write_aligned(
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
@@ -618,7 +655,7 @@ retry_exclusive:
flags = IOMAP_DIO_FORCE_WAIT;
}
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
@@ -842,7 +879,7 @@ xfs_break_dax_layouts(
{
struct page *page;
- ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
page = dax_layout_busy_page(inode->i_mapping);
if (!page)
@@ -863,7 +900,7 @@ xfs_break_layouts(
bool retry;
int error;
- ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
do {
retry = false;
@@ -1180,7 +1217,7 @@ xfs_file_remap_range(
if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
xfs_log_force_inode(dest);
out_unlock:
- xfs_iunlock2_io_mmap(src, dest);
+ xfs_iunlock2_remapping(src, dest);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return remapped > 0 ? remapped : ret;
@@ -1193,8 +1230,7 @@ xfs_file_open(
{
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
- FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
+ file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
return generic_file_open(inode, file);
}
@@ -1207,7 +1243,9 @@ xfs_dir_open(
unsigned int mode;
int error;
- error = xfs_file_open(inode, file);
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
+ error = generic_file_open(inode, file);
if (error)
return error;
@@ -1328,6 +1366,7 @@ __xfs_filemap_fault(
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
vm_fault_t ret;
+ unsigned int lock_mode = 0;
trace_xfs_filemap_fault(ip, order, write_fault);
@@ -1336,25 +1375,24 @@ __xfs_filemap_fault(
file_update_time(vmf->vma->vm_file);
}
+ if (IS_DAX(inode) || write_fault)
+ lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
+
if (IS_DAX(inode)) {
pfn_t pfn;
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, order, pfn);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else if (write_fault) {
+ ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
} else {
- if (write_fault) {
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = iomap_page_mkwrite(vmf,
- &xfs_page_mkwrite_iomap_ops);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- } else {
- ret = filemap_fault(vmf);
- }
+ ret = filemap_fault(vmf);
}
+ if (lock_mode)
+ xfs_iunlock(XFS_I(inode), lock_mode);
+
if (write_fault)
sb_end_pagefault(inode->i_sb);
return ret;
@@ -1453,7 +1491,6 @@ const struct file_operations xfs_file_operations = {
.compat_ioctl = xfs_file_compat_ioctl,
#endif
.mmap = xfs_file_mmap,
- .mmap_supported_flags = MAP_SYNC,
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
@@ -1461,6 +1498,8 @@ const struct file_operations xfs_file_operations = {
.fallocate = xfs_file_fallocate,
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
+ .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 2fc98d313708..e3aaa0555597 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -44,7 +44,7 @@ xfs_fstrm_free_func(
atomic_dec(&pag->pagf_fstrms);
xfs_perag_rele(pag);
- kmem_free(item);
+ kfree(item);
}
/*
@@ -313,7 +313,7 @@ xfs_filestream_create_association(
* we return a referenced AG, the allocation can still go ahead just
* fine.
*/
- item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
+ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!item)
goto out_put_fstrms;
@@ -326,7 +326,7 @@ xfs_filestream_create_association(
out_free_item:
xfs_perag_rele(item->pag);
- kmem_free(item);
+ kfree(item);
out_put_fstrms:
atomic_dec(&args->pag->pagf_fstrms);
return 0;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 736e5545f584..de59eec74765 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -23,7 +23,7 @@
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc_btree.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_ag.h"
/* Convert an xfs_fsmap to an fsmap. */
@@ -483,11 +483,11 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
xfs_rtblock_t rtbno;
xfs_daddr_t rec_daddr, len_daddr;
- rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
rec_daddr = XFS_FSB_TO_BB(mp, rtbno);
irec.rm_startblock = rtbno;
- rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount);
len_daddr = XFS_FSB_TO_BB(mp, rtbno);
irec.rm_blockcount = rtbno;
@@ -514,7 +514,7 @@ xfs_getfsmap_rtdev_rtbitmap(
uint64_t eofs;
int error;
- eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize);
+ eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents));
if (keys[0].fmr_physical >= eofs)
return 0;
start_rtb = XFS_BB_TO_FSBT(mp,
@@ -539,11 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap(
* Set up query parameters to return free rtextents covering the range
* we want.
*/
- alow.ar_startext = start_rtb;
- ahigh.ar_startext = end_rtb;
- do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
- if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
- ahigh.ar_startext++;
+ alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb);
+ ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb);
error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
@@ -766,8 +763,8 @@ xfs_getfsmap_datadev_bnobt_query(
return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info);
/* Allocate cursor for this AG and query_range it. */
- *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
- info->pag, XFS_BTNUM_BNO);
+ *curpp = xfs_bnobt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+ info->pag);
key->ar_startblock = info->low.rm_startblock;
key[1].ar_startblock = info->high.rm_startblock;
return xfs_alloc_query_range(*curpp, key, &key[1],
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 7cb75cb6b8e9..83f708f62ed9 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -134,6 +134,10 @@ xfs_growfs_data_private(
if (delta < 0 && nagcount < 2)
return -EINVAL;
+ /* No work to do */
+ if (delta == 0)
+ return 0;
+
oagcount = mp->m_sb.sb_agcount;
/* allocate the new per-ag structures */
if (nagcount > oagcount) {
@@ -153,7 +157,7 @@ xfs_growfs_data_private(
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0,
0, &tp);
if (error)
- return error;
+ goto out_free_unused_perag;
last_pag = xfs_perag_get(mp, oagcount - 1);
if (delta > 0) {
@@ -227,6 +231,9 @@ xfs_growfs_data_private(
out_trans_cancel:
xfs_trans_cancel(tp);
+out_free_unused_perag:
+ if (nagcount > oagcount)
+ xfs_free_unused_perag_range(mp, oagcount, nagcount);
return error;
}
@@ -344,59 +351,20 @@ xfs_growfs_log(
}
/*
- * exported through ioctl XFS_IOC_FSCOUNTS
- */
-
-void
-xfs_fs_counts(
- xfs_mount_t *mp,
- xfs_fsop_counts_t *cnt)
-{
- cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
- cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
- cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- xfs_fdblocks_unavailable(mp);
- cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
-}
-
-/*
- * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
- *
- * xfs_reserve_blocks is called to set m_resblks
- * in the in-core mount table. The number of unused reserved blocks
- * is kept in m_resblks_avail.
- *
* Reserve the requested number of blocks if available. Otherwise return
* as many as possible to satisfy the request. The actual number
- * reserved are returned in outval
- *
- * A null inval pointer indicates that only the current reserved blocks
- * available should be returned no settings are changed.
+ * reserved are returned in outval.
*/
-
int
xfs_reserve_blocks(
- xfs_mount_t *mp,
- uint64_t *inval,
- xfs_fsop_resblks_t *outval)
+ struct xfs_mount *mp,
+ uint64_t request)
{
int64_t lcounter, delta;
int64_t fdblks_delta = 0;
- uint64_t request;
int64_t free;
int error = 0;
- /* If inval is null, report current values and return */
- if (inval == (uint64_t *)NULL) {
- if (!outval)
- return -EINVAL;
- outval->resblks = mp->m_resblks;
- outval->resblks_avail = mp->m_resblks_avail;
- return 0;
- }
-
- request = *inval;
-
/*
* With per-cpu counters, this becomes an interesting problem. we need
* to work out if we are freeing or allocation blocks first, then we can
@@ -466,11 +434,6 @@ xfs_reserve_blocks(
spin_lock(&mp->m_sb_lock);
}
out:
- if (outval) {
- outval->resblks = mp->m_resblks;
- outval->resblks_avail = mp->m_resblks_avail;
- }
-
spin_unlock(&mp->m_sb_lock);
return error;
}
@@ -482,9 +445,9 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- if (!freeze_bdev(mp->m_super->s_bdev)) {
+ if (!bdev_freeze(mp->m_super->s_bdev)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
- thaw_bdev(mp->m_super->s_bdev);
+ bdev_thaw(mp->m_super->s_bdev);
}
break;
}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 2cffe51a31e8..44457b0a0593 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -6,14 +6,12 @@
#ifndef __XFS_FSOPS_H__
#define __XFS_FSOPS_H__
-extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
-extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
-extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
-extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
- xfs_fsop_resblks_t *outval);
-extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags);
+int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
+int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
+int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
+int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
-extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
-extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
+int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 9edc1f2bc939..f18fec0adf66 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -44,4 +44,16 @@ struct xfs_globals xfs_globals = {
.pwork_threads = -1, /* automatic thread detection */
.larp = false, /* log attribute replay */
#endif
+
+ /*
+ * Leave this many record slots empty when bulk loading btrees. By
+ * default we load new btree leaf blocks 75% full.
+ */
+ .bload_leaf_slack = -1,
+
+ /*
+ * Leave this many key/ptr slots empty when bulk loading btrees. By
+ * default we load new btree node blocks 75% full.
+ */
+ .bload_node_slack = -1,
};
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 72a075bb2c10..b39f959146bc 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -14,6 +14,10 @@
#include "xfs_trace.h"
#include "xfs_health.h"
#include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_quota_defs.h"
/*
* Warn about metadata corruption that we detected but haven't fixed, and
@@ -93,11 +97,25 @@ xfs_fs_mark_sick(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_FS_ALL));
trace_xfs_fs_mark_sick(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_fs_sick |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark per-fs metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_fs_mark_corrupt(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_FS_ALL));
+ trace_xfs_fs_mark_corrupt(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_fs_sick |= mask;
mp->m_fs_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -108,11 +126,13 @@ xfs_fs_mark_healthy(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_FS_ALL));
trace_xfs_fs_mark_healthy(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_fs_sick &= ~mask;
+ if (!(mp->m_fs_sick & XFS_SICK_FS_PRIMARY))
+ mp->m_fs_sick &= ~XFS_SICK_FS_SECONDARY;
mp->m_fs_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -136,11 +156,25 @@ xfs_rt_mark_sick(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_RT_ALL));
trace_xfs_rt_mark_sick(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_rt_sick |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark realtime metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_rt_mark_corrupt(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_RT_ALL));
+ trace_xfs_rt_mark_corrupt(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_rt_sick |= mask;
mp->m_rt_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -151,11 +185,13 @@ xfs_rt_mark_healthy(
struct xfs_mount *mp,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_RT_ALL));
trace_xfs_rt_mark_healthy(mp, mask);
spin_lock(&mp->m_sb_lock);
mp->m_rt_sick &= ~mask;
+ if (!(mp->m_rt_sick & XFS_SICK_RT_PRIMARY))
+ mp->m_rt_sick &= ~XFS_SICK_RT_SECONDARY;
mp->m_rt_checked |= mask;
spin_unlock(&mp->m_sb_lock);
}
@@ -173,17 +209,48 @@ xfs_rt_measure_sickness(
spin_unlock(&mp->m_sb_lock);
}
+/* Mark unhealthy per-ag metadata given a raw AG number. */
+void
+xfs_agno_mark_sick(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ unsigned int mask)
+{
+ struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+ /* per-ag structure not set up yet? */
+ if (!pag)
+ return;
+
+ xfs_ag_mark_sick(pag, mask);
+ xfs_perag_put(pag);
+}
+
/* Mark unhealthy per-ag metadata. */
void
xfs_ag_mark_sick(
struct xfs_perag *pag,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_AG_ALL));
trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask);
spin_lock(&pag->pag_state_lock);
pag->pag_sick |= mask;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark per-ag metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_ag_mark_corrupt(
+ struct xfs_perag *pag,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_AG_ALL));
+ trace_xfs_ag_mark_corrupt(pag->pag_mount, pag->pag_agno, mask);
+
+ spin_lock(&pag->pag_state_lock);
+ pag->pag_sick |= mask;
pag->pag_checked |= mask;
spin_unlock(&pag->pag_state_lock);
}
@@ -194,11 +261,13 @@ xfs_ag_mark_healthy(
struct xfs_perag *pag,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_AG_ALL));
trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask);
spin_lock(&pag->pag_state_lock);
pag->pag_sick &= ~mask;
+ if (!(pag->pag_sick & XFS_SICK_AG_PRIMARY))
+ pag->pag_sick &= ~XFS_SICK_AG_SECONDARY;
pag->pag_checked |= mask;
spin_unlock(&pag->pag_state_lock);
}
@@ -222,11 +291,34 @@ xfs_inode_mark_sick(
struct xfs_inode *ip,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_INO_ALL));
trace_xfs_inode_mark_sick(ip, mask);
spin_lock(&ip->i_flags_lock);
ip->i_sick |= mask;
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Keep this inode around so we don't lose the sickness report. Scrub
+ * grabs inodes with DONTCACHE assuming that most inode are ok, which
+ * is not the case here.
+ */
+ spin_lock(&VFS_I(ip)->i_lock);
+ VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ spin_unlock(&VFS_I(ip)->i_lock);
+}
+
+/* Mark inode metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_inode_mark_corrupt(
+ struct xfs_inode *ip,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_INO_ALL));
+ trace_xfs_inode_mark_corrupt(ip, mask);
+
+ spin_lock(&ip->i_flags_lock);
+ ip->i_sick |= mask;
ip->i_checked |= mask;
spin_unlock(&ip->i_flags_lock);
@@ -246,11 +338,13 @@ xfs_inode_mark_healthy(
struct xfs_inode *ip,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ ASSERT(!(mask & ~XFS_SICK_INO_ALL));
trace_xfs_inode_mark_healthy(ip, mask);
spin_lock(&ip->i_flags_lock);
ip->i_sick &= ~mask;
+ if (!(ip->i_sick & XFS_SICK_INO_PRIMARY))
+ ip->i_sick &= ~XFS_SICK_INO_SECONDARY;
ip->i_checked |= mask;
spin_unlock(&ip->i_flags_lock);
}
@@ -280,6 +374,8 @@ static const struct ioctl_sick_map fs_map[] = {
{ XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA },
{ XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA },
{ XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA },
+ { XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK },
+ { XFS_SICK_FS_NLINKS, XFS_FSOP_GEOM_SICK_NLINKS },
{ 0, 0 },
};
@@ -335,6 +431,7 @@ static const struct ioctl_sick_map ag_map[] = {
{ XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT },
{ XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT },
{ XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT },
+ { XFS_SICK_AG_INODES, XFS_AG_GEOM_SICK_INODES },
{ 0, 0 },
};
@@ -369,6 +466,10 @@ static const struct ioctl_sick_map ino_map[] = {
{ XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR },
{ XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK },
{ XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT },
+ { XFS_SICK_INO_BMBTD_ZAPPED, XFS_BS_SICK_BMBTD },
+ { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA },
+ { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR },
+ { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK },
{ 0, 0 },
};
@@ -393,3 +494,92 @@ xfs_bulkstat_health(
bs->bs_sick |= m->ioctl_mask;
}
}
+
+/* Mark a block mapping sick. */
+void
+xfs_bmap_mark_sick(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ unsigned int mask;
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ mask = XFS_SICK_INO_BMBTD;
+ break;
+ case XFS_ATTR_FORK:
+ mask = XFS_SICK_INO_BMBTA;
+ break;
+ case XFS_COW_FORK:
+ mask = XFS_SICK_INO_BMBTC;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ xfs_inode_mark_sick(ip, mask);
+}
+
+/* Record observations of btree corruption with the health tracking system. */
+void
+xfs_btree_mark_sick(
+ struct xfs_btree_cur *cur)
+{
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_MEM:
+ /* no health state tracking for ephemeral btrees */
+ return;
+ case XFS_BTREE_TYPE_AG:
+ ASSERT(cur->bc_ops->sick_mask);
+ xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask);
+ return;
+ case XFS_BTREE_TYPE_INODE:
+ if (xfs_btree_is_bmap(cur->bc_ops)) {
+ xfs_bmap_mark_sick(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
+ return;
+ }
+ fallthrough;
+ default:
+ ASSERT(0);
+ return;
+ }
+}
+
+/*
+ * Record observations of dir/attr btree corruption with the health tracking
+ * system.
+ */
+void
+xfs_dirattr_mark_sick(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ unsigned int mask;
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ mask = XFS_SICK_INO_DIR;
+ break;
+ case XFS_ATTR_FORK:
+ mask = XFS_SICK_INO_XATTR;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ xfs_inode_mark_sick(ip, mask);
+}
+
+/*
+ * Record observations of dir/attr btree corruption with the health tracking
+ * system.
+ */
+void
+xfs_da_mark_sick(
+ struct xfs_da_args *args)
+{
+ xfs_dirattr_mark_sick(args->dp, args->whichfork);
+}
diff --git a/fs/xfs/xfs_hooks.c b/fs/xfs/xfs_hooks.c
new file mode 100644
index 000000000000..a58d1de2d37d
--- /dev/null
+++ b/fs/xfs/xfs_hooks.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_trace.h"
+
+/* Initialize a notifier chain. */
+void
+xfs_hooks_init(
+ struct xfs_hooks *chain)
+{
+ BLOCKING_INIT_NOTIFIER_HEAD(&chain->head);
+}
+
+/* Make it so a function gets called whenever we hit a certain hook point. */
+int
+xfs_hooks_add(
+ struct xfs_hooks *chain,
+ struct xfs_hook *hook)
+{
+ ASSERT(hook->nb.notifier_call != NULL);
+ BUILD_BUG_ON(offsetof(struct xfs_hook, nb) != 0);
+
+ return blocking_notifier_chain_register(&chain->head, &hook->nb);
+}
+
+/* Remove a previously installed hook. */
+void
+xfs_hooks_del(
+ struct xfs_hooks *chain,
+ struct xfs_hook *hook)
+{
+ blocking_notifier_chain_unregister(&chain->head, &hook->nb);
+}
+
+/* Call a hook. Returns the NOTIFY_* value returned by the last hook. */
+int
+xfs_hooks_call(
+ struct xfs_hooks *chain,
+ unsigned long val,
+ void *priv)
+{
+ return blocking_notifier_call_chain(&chain->head, val, priv);
+}
diff --git a/fs/xfs/xfs_hooks.h b/fs/xfs/xfs_hooks.h
new file mode 100644
index 000000000000..60b8a5831536
--- /dev/null
+++ b/fs/xfs/xfs_hooks.h
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef XFS_HOOKS_H_
+#define XFS_HOOKS_H_
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+struct xfs_hooks {
+ struct blocking_notifier_head head;
+};
+
+/*
+ * If jump labels are enabled in Kconfig, the static key uses nop sleds and
+ * code patching to eliminate the overhead of taking the rwsem in
+ * blocking_notifier_call_chain when there are no hooks configured. If not,
+ * the static key per-call overhead is an atomic read. Most arches that can
+ * handle XFS also support jump labels.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) \
+ static DEFINE_STATIC_KEY_FALSE(name)
+# define xfs_hooks_switch_on(name) static_branch_inc(name)
+# define xfs_hooks_switch_off(name) static_branch_dec(name)
+# define xfs_hooks_switched_on(name) static_branch_unlikely(name)
+
+struct xfs_hook {
+ /* This must come at the start of the structure. */
+ struct notifier_block nb;
+};
+
+typedef int (*xfs_hook_fn_t)(struct xfs_hook *hook, unsigned long action,
+ void *data);
+
+void xfs_hooks_init(struct xfs_hooks *chain);
+int xfs_hooks_add(struct xfs_hooks *chain, struct xfs_hook *hook);
+void xfs_hooks_del(struct xfs_hooks *chain, struct xfs_hook *hook);
+int xfs_hooks_call(struct xfs_hooks *chain, unsigned long action,
+ void *priv);
+
+static inline void xfs_hook_setup(struct xfs_hook *hook, notifier_fn_t fn)
+{
+ hook->nb.notifier_call = fn;
+ hook->nb.priority = 0;
+}
+
+#else
+
+struct xfs_hooks { /* empty */ };
+
+# define DEFINE_STATIC_XFS_HOOK_SWITCH(name)
+# define xfs_hooks_switch_on(name) ((void)0)
+# define xfs_hooks_switch_off(name) ((void)0)
+# define xfs_hooks_switched_on(name) (false)
+
+# define xfs_hooks_init(chain) ((void)0)
+# define xfs_hooks_call(chain, val, priv) (NOTIFY_DONE)
+#endif
+
+#endif /* XFS_HOOKS_H_ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index dba514a2c84d..74f1812b03cb 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -24,6 +24,7 @@
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
+#include "xfs_health.h"
#include <linux/iversion.h>
@@ -415,6 +416,9 @@ xfs_iget_check_free_state(
xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
ip->i_ino, VFS_I(ip)->i_mode);
+ xfs_agno_mark_sick(ip->i_mount,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
@@ -422,6 +426,9 @@ xfs_iget_check_free_state(
xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
ip->i_ino);
+ xfs_agno_mark_sick(ip->i_mount,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
return 0;
@@ -640,6 +647,8 @@ xfs_iget_cache_miss(
xfs_buf_offset(bp, ip->i_imap.im_boffset));
if (!error)
xfs_buf_set_ref(bp, XFS_INO_REF);
+ else
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
xfs_trans_brelse(tp, bp);
if (error)
@@ -659,10 +668,9 @@ xfs_iget_cache_miss(
/*
* Preload the radix tree so we can insert safely under the
* write spinlock. Note that we cannot sleep inside the preload
- * region. Since we can be called from transaction context, don't
- * recurse into the file system.
+ * region.
*/
- if (radix_tree_preload(GFP_NOFS)) {
+ if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) {
error = -EAGAIN;
goto out_destroy;
}
@@ -2031,8 +2039,10 @@ xfs_inodegc_want_queue_work(
* - Memory shrinkers queued the inactivation worker and it hasn't finished.
* - The queue depth exceeds the maximum allowable percpu backlog.
*
- * Note: If the current thread is running a transaction, we don't ever want to
- * wait for other transactions because that could introduce a deadlock.
+ * Note: If we are in a NOFS context here (e.g. current thread is running a
+ * transaction) the we don't want to block here as inodegc progress may require
+ * filesystem resources we hold to make progress and that could result in a
+ * deadlock. Hence we skip out of here if we are in a scoped NOFS context.
*/
static inline bool
xfs_inodegc_want_flush_work(
@@ -2040,7 +2050,7 @@ xfs_inodegc_want_flush_work(
unsigned int items,
unsigned int shrinker_hits)
{
- if (current->journal_info)
+ if (current->flags & PF_MEMALLOC_NOFS)
return false;
if (shrinker_hits > 0)
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index b05314d48176..4345db501714 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,7 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
- kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
+ kvfree(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 36f5cf802c07..d55b42b2480d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -37,15 +37,10 @@
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_inode_cache;
-/*
- * Used in xfs_itruncate_extents(). This is the maximum number of extents
- * freed from a file in a single transaction.
- */
-#define XFS_ITRUNC_MAX_EXTENTS 2
-
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
struct xfs_inode *);
@@ -208,9 +203,9 @@ xfs_ilock(
}
if (lock_flags & XFS_ILOCK_EXCL)
- mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
else if (lock_flags & XFS_ILOCK_SHARED)
- mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
/*
@@ -251,10 +246,10 @@ xfs_ilock_nowait(
}
if (lock_flags & XFS_ILOCK_EXCL) {
- if (!mrtryupdate(&ip->i_lock))
+ if (!down_write_trylock(&ip->i_lock))
goto out_undo_mmaplock;
} else if (lock_flags & XFS_ILOCK_SHARED) {
- if (!mrtryaccess(&ip->i_lock))
+ if (!down_read_trylock(&ip->i_lock))
goto out_undo_mmaplock;
}
return 1;
@@ -303,9 +298,9 @@ xfs_iunlock(
up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_ILOCK_EXCL)
- mrunlock_excl(&ip->i_lock);
+ up_write(&ip->i_lock);
else if (lock_flags & XFS_ILOCK_SHARED)
- mrunlock_shared(&ip->i_lock);
+ up_read(&ip->i_lock);
trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}
@@ -324,7 +319,7 @@ xfs_ilock_demote(
~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
if (lock_flags & XFS_ILOCK_EXCL)
- mrdemote(&ip->i_lock);
+ downgrade_write(&ip->i_lock);
if (lock_flags & XFS_MMAPLOCK_EXCL)
downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_IOLOCK_EXCL)
@@ -333,52 +328,30 @@ xfs_ilock_demote(
trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
-#if defined(DEBUG) || defined(XFS_WARN)
-static inline bool
-__xfs_rwsem_islocked(
- struct rw_semaphore *rwsem,
- bool shared)
-{
- if (!debug_locks)
- return rwsem_is_locked(rwsem);
-
- if (!shared)
- return lockdep_is_held_type(rwsem, 0);
-
- /*
- * We are checking that the lock is held at least in shared
- * mode but don't care that it might be held exclusively
- * (i.e. shared | excl). Hence we check if the lock is held
- * in any mode rather than an explicit shared mode.
- */
- return lockdep_is_held_type(rwsem, -1);
-}
-
-bool
-xfs_isilocked(
+void
+xfs_assert_ilocked(
struct xfs_inode *ip,
uint lock_flags)
{
- if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
- if (!(lock_flags & XFS_ILOCK_SHARED))
- return !!ip->i_lock.mr_writer;
- return rwsem_is_locked(&ip->i_lock.mr_lock);
- }
-
- if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
- (lock_flags & XFS_MMAPLOCK_SHARED));
- }
+ /*
+ * Sometimes we assert the ILOCK is held exclusively, but we're in
+ * a workqueue, so lockdep doesn't know we're the owner.
+ */
+ if (lock_flags & XFS_ILOCK_SHARED)
+ rwsem_assert_held(&ip->i_lock);
+ else if (lock_flags & XFS_ILOCK_EXCL)
+ rwsem_assert_held_write_nolockdep(&ip->i_lock);
- if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
- return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
- (lock_flags & XFS_IOLOCK_SHARED));
- }
+ if (lock_flags & XFS_MMAPLOCK_SHARED)
+ rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock);
+ else if (lock_flags & XFS_MMAPLOCK_EXCL)
+ rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock);
- ASSERT(0);
- return false;
+ if (lock_flags & XFS_IOLOCK_SHARED)
+ rwsem_assert_held(&VFS_I(ip)->i_rwsem);
+ else if (lock_flags & XFS_IOLOCK_EXCL)
+ rwsem_assert_held_write(&VFS_I(ip)->i_rwsem);
}
-#endif
/*
* xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
@@ -661,6 +634,8 @@ xfs_lookup(
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
if (error)
@@ -674,7 +649,7 @@ xfs_lookup(
out_free_name:
if (ci_name)
- kmem_free(ci_name->name);
+ kfree(ci_name->name);
out_unlock:
*ipp = NULL;
return error;
@@ -805,6 +780,8 @@ xfs_init_new_inode(
*/
if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+ xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
+ XFS_SICK_AG_INOBT);
return -EFSCORRUPTED;
}
@@ -875,7 +852,7 @@ xfs_init_new_inode(
case S_IFLNK:
ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
ip->i_df.if_bytes = 0;
- ip->i_df.if_u1.if_root = NULL;
+ ip->i_df.if_data = NULL;
break;
default:
ASSERT(0);
@@ -918,6 +895,13 @@ xfs_droplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
{
+ if (VFS_I(ip)->i_nlink == 0) {
+ xfs_alert(ip->i_mount,
+ "%s: Attempt to drop inode (%llu) with nlink zero.",
+ __func__, ip->i_ino);
+ return -EFSCORRUPTED;
+ }
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
drop_nlink(VFS_I(ip));
@@ -943,6 +927,81 @@ xfs_bumplink(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of directory live update hooks.
+ * If the compiler supports jump labels, the static branch will be replaced by
+ * a nop sled when there are no hook users. Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
+
+void
+xfs_dir_hook_disable(void)
+{
+ xfs_hooks_switch_off(&xfs_dir_hooks_switch);
+}
+
+void
+xfs_dir_hook_enable(void)
+{
+ xfs_hooks_switch_on(&xfs_dir_hooks_switch);
+}
+
+/* Call hooks for a directory update relating to a child dirent update. */
+inline void
+xfs_dir_update_hook(
+ struct xfs_inode *dp,
+ struct xfs_inode *ip,
+ int delta,
+ const struct xfs_name *name)
+{
+ if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
+ struct xfs_dir_update_params p = {
+ .dp = dp,
+ .ip = ip,
+ .delta = delta,
+ .name = name,
+ };
+ struct xfs_mount *mp = ip->i_mount;
+
+ xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
+ }
+}
+
+/* Call the specified function during a directory update. */
+int
+xfs_dir_hook_add(
+ struct xfs_mount *mp,
+ struct xfs_dir_hook *hook)
+{
+ return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Stop calling the specified function during a directory update. */
+void
+xfs_dir_hook_del(
+ struct xfs_mount *mp,
+ struct xfs_dir_hook *hook)
+{
+ xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Configure directory update hook functions. */
+void
+xfs_dir_hook_setup(
+ struct xfs_dir_hook *hook,
+ notifier_fn_t mod_fn)
+{
+ xfs_hook_setup(&hook->dirent_hook, mod_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
int
xfs_create(
struct mnt_idmap *idmap,
@@ -971,6 +1030,8 @@ xfs_create(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
prid = xfs_get_initial_prid(dp);
@@ -1052,6 +1113,12 @@ xfs_create(
}
/*
+ * Create ip with a reference from dp, and add '.' and '..' references
+ * if it's a directory.
+ */
+ xfs_dir_update_hook(dp, ip, 1, name);
+
+ /*
* If this is a synchronous mount, make sure that the
* create transaction goes to disk before returning to
* the user.
@@ -1210,6 +1277,8 @@ xfs_link(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(tdp, XFS_DATA_FORK))
+ return -EIO;
error = xfs_qm_dqattach(sip);
if (error)
@@ -1232,8 +1301,19 @@ xfs_link(
*/
if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
tdp->i_projid != sip->i_projid)) {
- error = -EXDEV;
- goto error_return;
+ /*
+ * Project quota setup skips special files which can
+ * leave inodes in a PROJINHERIT directory without a
+ * project ID set. We need to allow links to be made
+ * to these "project-less" inodes because userspace
+ * expects them to succeed after project ID setup,
+ * but everything else should be rejected.
+ */
+ if (!special_file(VFS_I(sip)->i_mode) ||
+ sip->i_projid != 0) {
+ error = -EXDEV;
+ goto error_return;
+ }
}
if (!resblks) {
@@ -1263,6 +1343,7 @@ xfs_link(
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
xfs_bumplink(tp, sip);
+ xfs_dir_update_hook(tdp, sip, 1, target_name);
/*
* If this is a synchronous mount, make sure that the
@@ -1332,12 +1413,11 @@ xfs_itruncate_extents_flags(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
xfs_fileoff_t first_unmap_block;
- xfs_filblks_t unmap_len;
int error = 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
- xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+ if (atomic_read(&VFS_I(ip)->i_count))
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(ip->i_itemp != NULL);
@@ -1364,19 +1444,10 @@ xfs_itruncate_extents_flags(
return 0;
}
- unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
- while (unmap_len > 0) {
- ASSERT(tp->t_highest_agno == NULLAGNUMBER);
- error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
- flags, XFS_ITRUNC_MAX_EXTENTS);
- if (error)
- goto out;
-
- /* free the just unmapped extents */
- error = xfs_defer_finish(&tp);
- if (error)
- goto out;
- }
+ error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
+ XFS_MAX_FILEOFF);
+ if (error)
+ goto out;
if (whichfork == XFS_DATA_FORK) {
/* Remove all pending CoW reservations. */
@@ -1598,7 +1669,7 @@ xfs_inactive_ifree(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
error = xfs_ifree(tp, ip);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (error) {
/*
* If we fail to free the inode, shut down. The cancel
@@ -1679,6 +1750,39 @@ xfs_inode_needs_inactive(
}
/*
+ * Save health status somewhere, if we're dumping an inode with uncorrected
+ * errors and online repair isn't running.
+ */
+static inline void
+xfs_inactive_health(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ unsigned int sick;
+ unsigned int checked;
+
+ xfs_inode_measure_sickness(ip, &sick, &checked);
+ if (!sick)
+ return;
+
+ trace_xfs_inode_unfixed_corruption(ip, sick);
+
+ if (sick & XFS_SICK_INO_FORGET)
+ return;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ if (!pag) {
+ /* There had better still be a perag structure! */
+ ASSERT(0);
+ return;
+ }
+
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES);
+ xfs_perag_put(pag);
+}
+
+/*
* xfs_inactive
*
* This is called when the vnode reference count for the vnode
@@ -1706,6 +1810,8 @@ xfs_inactive(
mp = ip->i_mount;
ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
+ xfs_inactive_health(ip);
+
/*
* If this is a read-only mount, don't do this (would generate I/O)
* unless we're in log recovery and cleaning the iunlinked list.
@@ -1912,6 +2018,7 @@ xfs_iunlink_update_bucket(
*/
if (old_value == new_agino) {
xfs_buf_mark_corrupt(agibp);
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return -EFSCORRUPTED;
}
@@ -1961,11 +2068,14 @@ xfs_iunlink_reload_next(
*/
ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
- if (error)
+ if (error) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return error;
+ }
/* If this is not an unlinked inode, something is very wrong. */
if (VFS_I(next_ip)->i_nlink != 0) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
error = -EFSCORRUPTED;
goto rele;
}
@@ -2003,6 +2113,7 @@ xfs_iunlink_insert_inode(
if (next_agino == agino ||
!xfs_verify_agino_or_null(pag, next_agino)) {
xfs_buf_mark_corrupt(agibp);
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return -EFSCORRUPTED;
}
@@ -2090,6 +2201,7 @@ xfs_iunlink_remove_inode(
if (!xfs_verify_agino(pag, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return -EFSCORRUPTED;
}
@@ -2118,8 +2230,10 @@ xfs_iunlink_remove_inode(
struct xfs_inode *prev_ip;
prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
- if (!prev_ip)
+ if (!prev_ip) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return -EFSCORRUPTED;
+ }
error = xfs_iunlink_log_inode(tp, prev_ip, pag,
ip->i_next_unlinked);
@@ -2352,7 +2466,7 @@ xfs_ifree(
struct xfs_inode_log_item *iip = ip->i_itemp;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(ip->i_df.if_nextents == 0);
ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
@@ -2380,8 +2494,8 @@ xfs_ifree(
* already been freed by xfs_attr_inactive.
*/
if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- kmem_free(ip->i_df.if_u1.if_data);
- ip->i_df.if_u1.if_data = NULL;
+ kfree(ip->i_df.if_data);
+ ip->i_df.if_data = NULL;
ip->i_df.if_bytes = 0;
}
@@ -2421,7 +2535,7 @@ static void
xfs_iunpin(
struct xfs_inode *ip)
{
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
@@ -2499,6 +2613,8 @@ xfs_remove(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
error = xfs_qm_dqattach(dp);
if (error)
@@ -2585,6 +2701,12 @@ xfs_remove(
}
/*
+ * Drop the link from dp to ip, and if ip was a directory, remove the
+ * '.' and '..' references since we freed the directory.
+ */
+ xfs_dir_update_hook(dp, ip, -1, name);
+
+ /*
* If this is a synchronous mount, make sure that the
* remove transaction goes to disk before returning to
* the user.
@@ -2774,6 +2896,20 @@ xfs_cross_rename(
}
xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+
+ /*
+ * Inform our hook clients that we've finished an exchange operation as
+ * follows: removed the source and target files from their directories;
+ * added the target to the source directory; and added the source to
+ * the target directory. All inodes are locked, so it's ok to model a
+ * rename this way so long as we say we deleted entries before we add
+ * new ones.
+ */
+ xfs_dir_update_hook(dp1, ip1, -1, name1);
+ xfs_dir_update_hook(dp2, ip2, -1, name2);
+ xfs_dir_update_hook(dp1, ip2, 1, name1);
+ xfs_dir_update_hook(dp2, ip1, 1, name2);
+
return xfs_finish_rename(tp);
out_trans_abort:
@@ -3157,6 +3293,21 @@ retry:
if (new_parent)
xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+ /*
+ * Inform our hook clients that we've finished a rename operation as
+ * follows: removed the source and target files from their directories;
+ * that we've added the source to the target directory; and finally
+ * that we've added the whiteout, if there was one. All inodes are
+ * locked, so it's ok to model a rename this way so long as we say we
+ * deleted entries before we add new ones.
+ */
+ if (target_ip)
+ xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
+ xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
+ xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
+ if (wip)
+ xfs_dir_update_hook(src_dp, wip, 1, src_name);
+
error = xfs_finish_rename(tp);
if (wip)
xfs_irele(wip);
@@ -3182,7 +3333,7 @@ xfs_iflush(
struct xfs_mount *mp = ip->i_mount;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
@@ -3317,6 +3468,8 @@ flush_out:
/* generate the checksum. */
xfs_dinode_calc_crc(mp, dip);
+ if (error)
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
return error;
}
@@ -3621,6 +3774,23 @@ xfs_iunlock2_io_mmap(
inode_unlock(VFS_I(ip1));
}
+/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
+void
+xfs_iunlock2_remapping(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ xfs_iflags_clear(ip1, XFS_IREMAPPING);
+
+ if (ip1 != ip2)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+
+ if (ip1 != ip2)
+ inode_unlock_shared(VFS_I(ip1));
+ inode_unlock(VFS_I(ip2));
+}
+
/*
* Reload the incore inode list for this inode. Caller should ensure that
* the link count cannot change, either by taking ILOCK_SHARED or otherwise
@@ -3734,3 +3904,45 @@ xfs_inode_reload_unlinked(
return error;
}
+
+/* Has this inode fork been zapped by repair? */
+bool
+xfs_ifork_zapped(
+ const struct xfs_inode *ip,
+ int whichfork)
+{
+ unsigned int datamask = 0;
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ switch (ip->i_vnode.i_mode & S_IFMT) {
+ case S_IFDIR:
+ datamask = XFS_SICK_INO_DIR_ZAPPED;
+ break;
+ case S_IFLNK:
+ datamask = XFS_SICK_INO_SYMLINK_ZAPPED;
+ break;
+ }
+ return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask);
+ case XFS_ATTR_FORK:
+ return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED;
+ default:
+ return false;
+ }
+}
+
+/* Compute the number of data and realtime blocks used by a file. */
+void
+xfs_inode_count_blocks(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_filblks_t *dblocks,
+ xfs_filblks_t *rblocks)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+ *rblocks = 0;
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_bmap_count_leaves(ifp, rblocks);
+ *dblocks = ip->i_nblocks - *rblocks;
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0c5bdb91152e..ab46ffb3ac19 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -39,7 +39,7 @@ typedef struct xfs_inode {
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
- mrlock_t i_lock; /* inode lock */
+ struct rw_semaphore i_lock; /* inode lock */
atomic_t i_pincount; /* inode pin count */
struct llist_node i_gclist; /* deferred inactivation list */
@@ -171,6 +171,12 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
return &ip->i_vnode;
}
+/* convert from const xfs inode to const vfs inode */
+static inline const struct inode *VFS_IC(const struct xfs_inode *ip)
+{
+ return &ip->i_vnode;
+}
+
/*
* For regular files we only update the on-disk filesize when actually
* writing data back to disk. Until then only the copy in the VFS inode
@@ -347,6 +353,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
/* Quotacheck is running but inode has not been added to quota counts. */
#define XFS_IQUOTAUNCHECKED (1 << 14)
+/*
+ * Remap in progress. Callers that wish to update file data while
+ * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
+ * the lock in exclusive mode. Relocking the file will block until
+ * IREMAPPING is cleared.
+ */
+#define XFS_IREMAPPING (1U << 15)
+
/* All inode state flags related to inode reclaim. */
#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
XFS_IRECLAIM | \
@@ -515,7 +529,7 @@ void xfs_ilock(xfs_inode_t *, uint);
int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
void xfs_ilock_demote(xfs_inode_t *, uint);
-bool xfs_isilocked(struct xfs_inode *, uint);
+void xfs_assert_ilocked(struct xfs_inode *, uint);
uint xfs_ilock_data_map_shared(struct xfs_inode *);
uint xfs_ilock_attr_map_shared(struct xfs_inode *);
@@ -561,6 +575,14 @@ extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+static inline void xfs_update_stable_writes(struct xfs_inode *ip)
+{
+ if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
+ mapping_set_stable_writes(VFS_I(ip)->i_mapping);
+ else
+ mapping_clear_stable_writes(VFS_I(ip)->i_mapping);
+}
+
/*
* When setting up a newly allocated inode, we need to call
* xfs_finish_inode_setup() once the inode is fully instantiated at
@@ -595,6 +617,7 @@ void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
static inline bool
xfs_inode_unlinked_incomplete(
@@ -605,4 +628,33 @@ xfs_inode_unlinked_incomplete(
int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip);
int xfs_inode_reload_unlinked(struct xfs_inode *ip);
+bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork);
+void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_filblks_t *dblocks, xfs_filblks_t *rblocks);
+
+struct xfs_dir_update_params {
+ const struct xfs_inode *dp;
+ const struct xfs_inode *ip;
+ const struct xfs_name *name;
+ int delta;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip,
+ int delta, const struct xfs_name *name);
+
+struct xfs_dir_hook {
+ struct xfs_hook dirent_hook;
+};
+
+void xfs_dir_hook_disable(void);
+void xfs_dir_hook_enable(void);
+
+int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn);
+#else
+# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 17c51804f9c6..f28d653300d1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -19,6 +19,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"
+#include "xfs_rtbitmap.h"
#include <linux/iversion.h>
@@ -107,7 +108,7 @@ xfs_inode_item_precommit(
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
- (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
XFS_DIFLAG_EXTSZINHERIT);
ip->i_extsize = 0;
@@ -351,11 +352,10 @@ xfs_inode_item_format_data_fork(
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- ASSERT(ip->i_df.if_u1.if_data != NULL);
+ ASSERT(ip->i_df.if_data != NULL);
ASSERT(ip->i_disk_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
- ip->i_df.if_u1.if_data,
- ip->i_df.if_bytes);
+ ip->i_df.if_data, ip->i_df.if_bytes);
ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
ilf->ilf_size++;
} else {
@@ -430,10 +430,9 @@ xfs_inode_item_format_attr_fork(
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_af.if_bytes > 0) {
- ASSERT(ip->i_af.if_u1.if_data != NULL);
+ ASSERT(ip->i_af.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
- ip->i_af.if_u1.if_data,
- ip->i_af.if_bytes);
+ ip->i_af.if_data, ip->i_af.if_bytes);
ilf->ilf_asize = (unsigned)ip->i_af.if_bytes;
ilf->ilf_size++;
} else {
@@ -556,6 +555,9 @@ xfs_inode_to_log_dinode(
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_v3_pad = 0;
+
+ /* dummy value for initialisation */
+ to->di_crc = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
@@ -648,7 +650,7 @@ xfs_inode_item_pin(
{
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(lip->li_buf);
trace_xfs_inode_pin(ip, _RET_IP_);
@@ -754,7 +756,7 @@ xfs_inode_item_release(
unsigned short lock_flags;
ASSERT(ip->i_itemp != NULL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
lock_flags = iip->ili_lock_flags;
iip->ili_lock_flags = 0;
@@ -854,7 +856,7 @@ xfs_inode_item_destroy(
ASSERT(iip->ili_item.li_buf == NULL);
ip->i_itemp = NULL;
- kmem_free(iip->ili_item.li_lv_shadow);
+ kvfree(iip->ili_item.li_lv_shadow);
kmem_cache_free(xfs_ili_cache, iip);
}
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 0e5dba2343ea..dbdab4ce7c44 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -286,11 +286,13 @@ xlog_recover_inode_commit_pass2(
struct xfs_log_dinode *ldip;
uint isize;
int need_free = 0;
+ xfs_failaddr_t fa;
if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
} else {
- in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
+ in_f = kmalloc(sizeof(struct xfs_inode_log_format),
+ GFP_KERNEL | __GFP_NOFAIL);
need_free = 1;
error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
if (error)
@@ -369,24 +371,26 @@ xlog_recover_inode_commit_pass2(
* superblock flag to determine whether we need to look at di_flushiter
* to skip replay when the on disk inode is newer than the log one
*/
- if (!xfs_has_v3inodes(mp) &&
- ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
- /*
- * Deal with the wrap case, DI_MAX_FLUSH is less
- * than smaller numbers
- */
- if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
- ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
- /* do nothing */
- } else {
- trace_xfs_log_recover_inode_skip(log, in_f);
- error = 0;
- goto out_release;
+ if (!xfs_has_v3inodes(mp)) {
+ if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * Deal with the wrap case, DI_MAX_FLUSH is less
+ * than smaller numbers
+ */
+ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+ ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ /* do nothing */
+ } else {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_release;
+ }
}
+
+ /* Take the opportunity to reset the flush iteration count */
+ ldip->di_flushiter = 0;
}
- /* Take the opportunity to reset the flush iteration count */
- ldip->di_flushiter = 0;
if (unlikely(S_ISREG(ldip->di_mode))) {
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -528,8 +532,19 @@ out_owner_change:
(dip->di_mode != 0))
error = xfs_recover_inode_owner_change(mp, dip, in_f,
buffer_list);
- /* re-generate the checksum. */
+ /* re-generate the checksum and validate the recovered inode. */
xfs_dinode_calc_crc(log->l_mp, dip);
+ fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip);
+ if (fa) {
+ XFS_CORRUPTION_ERROR(
+ "Bad dinode after recovery",
+ XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip));
+ xfs_alert(mp,
+ "Metadata corruption detected at %pS, inode 0x%llx",
+ fa, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
ASSERT(bp->b_mount == mp);
bp->b_flags |= _XBF_LOGRECOVERY;
@@ -539,7 +554,7 @@ out_release:
xfs_buf_relse(bp);
error:
if (need_free)
- kmem_free(in_f);
+ kfree(in_f);
return error;
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 55bb01173cde..d0e2cec6210d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -38,6 +38,7 @@
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
+#include "xfs_rtbitmap.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -434,7 +435,7 @@ xfs_ioc_attr_list(
copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
error = -EFAULT;
out_free:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -492,7 +493,7 @@ xfs_attrmulti_attr_get(
error = -EFAULT;
out_kfree:
- kmem_free(args.value);
+ kvfree(args.value);
return error;
}
@@ -1004,7 +1005,7 @@ xfs_fill_fsxattr(
* later.
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- ip->i_extsize % mp->m_sb.sb_rextsize > 0) {
+ xfs_extlen_to_rtxmod(mp, ip->i_extsize) > 0) {
fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE |
FS_XFLAG_EXTSZINHERIT);
fa->fsx_extsize = 0;
@@ -1120,23 +1121,25 @@ xfs_ioctl_setattr_xflags(
struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
+ bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
uint64_t i_flags2;
- /* Can't change realtime flag if any extents are allocated. */
- if ((ip->i_df.if_nextents || ip->i_delayed_blks) &&
- XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))
- return -EINVAL;
+ if (rtflag != XFS_IS_REALTIME_INODE(ip)) {
+ /* Can't change realtime flag if any extents are allocated. */
+ if (ip->i_df.if_nextents || ip->i_delayed_blks)
+ return -EINVAL;
+ }
- /* If realtime flag is set then must have realtime device */
- if (fa->fsx_xflags & FS_XFLAG_REALTIME) {
+ if (rtflag) {
+ /* If realtime flag is set then must have realtime device */
if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
- (ip->i_extsize % mp->m_sb.sb_rextsize))
+ xfs_extlen_to_rtxmod(mp, ip->i_extsize))
return -EINVAL;
- }
- /* Clear reflink if we are actually able to set the rt flag. */
- if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
- ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ /* Clear reflink if we are actually able to set the rt flag. */
+ if (xfs_is_reflink_inode(ip))
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ }
/* diflags2 only valid for v3 inodes. */
i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
@@ -1147,6 +1150,14 @@ xfs_ioctl_setattr_xflags(
ip->i_diflags2 = i_flags2;
xfs_diflags_to_iflags(ip, false);
+
+ /*
+ * Make the stable writes flag match that of the device the inode
+ * resides on when flipping the RT flag.
+ */
+ if (rtflag != XFS_IS_REALTIME_INODE(ip) && S_ISREG(VFS_I(ip)->i_mode))
+ xfs_update_stable_writes(ip);
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
@@ -1495,7 +1506,7 @@ xfs_ioc_getbmap(
error = 0;
out_free_buf:
- kmem_free(buf);
+ kvfree(buf);
return error;
}
@@ -1625,7 +1636,7 @@ xfs_ioc_getfsmap(
}
out_free:
- kmem_free(recs);
+ kvfree(recs);
return error;
}
@@ -1861,6 +1872,63 @@ xfs_fs_eofblocks_from_user(
return 0;
}
+static int
+xfs_ioctl_getset_resblocks(
+ struct file *filp,
+ unsigned int cmd,
+ void __user *arg)
+{
+ struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount;
+ struct xfs_fsop_resblks fsop = { };
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (cmd == XFS_IOC_SET_RESBLKS) {
+ if (xfs_is_readonly(mp))
+ return -EROFS;
+
+ if (copy_from_user(&fsop, arg, sizeof(fsop)))
+ return -EFAULT;
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_reserve_blocks(mp, fsop.resblks);
+ mnt_drop_write_file(filp);
+ if (error)
+ return error;
+ }
+
+ spin_lock(&mp->m_sb_lock);
+ fsop.resblks = mp->m_resblks;
+ fsop.resblks_avail = mp->m_resblks_avail;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (copy_to_user(arg, &fsop, sizeof(fsop)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+xfs_ioctl_fs_counts(
+ struct xfs_mount *mp,
+ struct xfs_fsop_counts __user *uarg)
+{
+ struct xfs_fsop_counts out = {
+ .allocino = percpu_counter_read_positive(&mp->m_icount),
+ .freeino = percpu_counter_read_positive(&mp->m_ifree),
+ .freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
+ xfs_fdblocks_unavailable(mp),
+ .freertx = percpu_counter_read_positive(&mp->m_frextents),
+ };
+
+ if (copy_to_user(uarg, &out, sizeof(out)))
+ return -EFAULT;
+ return 0;
+}
+
/*
* These long-unused ioctls were removed from the official ioctl API in 5.17,
* but retain these definitions so that we can log warnings about them.
@@ -1997,60 +2065,12 @@ xfs_file_ioctl(
return error;
}
- case XFS_IOC_FSCOUNTS: {
- xfs_fsop_counts_t out;
-
- xfs_fs_counts(mp, &out);
-
- if (copy_to_user(arg, &out, sizeof(out)))
- return -EFAULT;
- return 0;
- }
-
- case XFS_IOC_SET_RESBLKS: {
- xfs_fsop_resblks_t inout;
- uint64_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (xfs_is_readonly(mp))
- return -EROFS;
-
- if (copy_from_user(&inout, arg, sizeof(inout)))
- return -EFAULT;
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
-
- /* input parameter is passed in resblks field of structure */
- in = inout.resblks;
- error = xfs_reserve_blocks(mp, &in, &inout);
- mnt_drop_write_file(filp);
- if (error)
- return error;
-
- if (copy_to_user(arg, &inout, sizeof(inout)))
- return -EFAULT;
- return 0;
- }
-
- case XFS_IOC_GET_RESBLKS: {
- xfs_fsop_resblks_t out;
+ case XFS_IOC_FSCOUNTS:
+ return xfs_ioctl_fs_counts(mp, arg);
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- error = xfs_reserve_blocks(mp, NULL, &out);
- if (error)
- return error;
-
- if (copy_to_user(arg, &out, sizeof(out)))
- return -EFAULT;
-
- return 0;
- }
+ case XFS_IOC_SET_RESBLKS:
+ case XFS_IOC_GET_RESBLKS:
+ return xfs_ioctl_getset_resblocks(filp, cmd, arg);
case XFS_IOC_FSGROWFSDATA: {
struct xfs_growfs_data in;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 18c8f168b153..4087af7f3c9f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -27,6 +27,7 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
+#include "xfs_health.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -45,6 +46,7 @@ xfs_alert_fsblock_zero(
(unsigned long long)imap->br_startoff,
(unsigned long long)imap->br_blockcount,
imap->br_state);
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
return -EFSCORRUPTED;
}
@@ -99,8 +101,10 @@ xfs_bmbt_to_iomap(
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
- if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
return xfs_alert_fsblock_zero(ip, imap);
+ }
if (imap->br_startblock == HOLESTARTBLOCK) {
iomap->addr = IOMAP_NULL_ADDR;
@@ -325,8 +329,10 @@ xfs_iomap_write_direct(
goto out_unlock;
}
- if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = xfs_alert_fsblock_zero(ip, imap);
+ }
out_unlock:
*seq = xfs_iomap_inode_sequence(ip, 0);
@@ -639,8 +645,10 @@ xfs_iomap_write_unwritten(
if (error)
return error;
- if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
+ if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
return xfs_alert_fsblock_zero(ip, &imap);
+ }
if ((numblks_fsb = imap.br_blockcount) == 0) {
/*
@@ -986,6 +994,7 @@ xfs_buffered_write_iomap_begin(
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
}
@@ -1323,7 +1332,7 @@ xfs_seek_iomap_begin(
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
- xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb);
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
IOMAP_F_SHARED, seq);
@@ -1348,7 +1357,7 @@ xfs_seek_iomap_begin(
imap.br_state = XFS_EXT_NORM;
done:
seq = xfs_iomap_inode_sequence(ip, 0);
- xfs_trim_extent(&imap, offset_fsb, end_fsb);
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
out_unlock:
xfs_iunlock(ip, lockmode);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index fdfda4fba12b..66f8c47642e8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -346,7 +346,7 @@ xfs_vn_ci_lookup(
dname.name = ci_name.name;
dname.len = ci_name.len;
dentry = d_add_ci(dentry, VFS_I(ip), &dname);
- kmem_free(ci_name.name);
+ kfree(ci_name.name);
return dentry;
}
@@ -796,8 +796,7 @@ xfs_setattr_size(
uint lock_flags = 0;
bool did_zeroing = false;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
@@ -1285,9 +1284,9 @@ xfs_setup_inode(
*/
lockdep_set_class(&inode->i_rwsem,
&inode->i_sb->s_type->i_mutex_dir_key);
- lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
+ lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class);
} else {
- lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+ lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class);
}
/*
@@ -1299,6 +1298,13 @@ xfs_setup_inode(
mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
/*
+ * For real-time inodes update the stable write flags to that of the RT
+ * device instead of the data device.
+ */
+ if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip))
+ xfs_update_stable_writes(ip);
+
+ /*
* If there is no attribute fork no ACL can exist on this inode,
* and it can't have any file capabilities attached to it either.
*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 14462614fcc8..95fc31b9f87d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -197,8 +197,8 @@ xfs_bulkstat_one(
ASSERT(breq->icount == 1);
- bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
- KM_MAYFAIL);
+ bc.buf = kzalloc(sizeof(struct xfs_bulkstat),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!bc.buf)
return -ENOMEM;
@@ -214,7 +214,7 @@ xfs_bulkstat_one(
breq->startino, &bc);
xfs_trans_cancel(tp);
out:
- kmem_free(bc.buf);
+ kfree(bc.buf);
/*
* If we reported one inode to userspace then we abort because we hit
@@ -289,8 +289,8 @@ xfs_bulkstat(
if (xfs_bulkstat_already_done(breq->mp, breq->startino))
return 0;
- bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
- KM_MAYFAIL);
+ bc.buf = kzalloc(sizeof(struct xfs_bulkstat),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!bc.buf)
return -ENOMEM;
@@ -309,7 +309,7 @@ xfs_bulkstat(
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
out:
- kmem_free(bc.buf);
+ kfree(bc.buf);
/*
* We found some inodes, so clear the error status and return them.
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index b3275e8d47b6..01b55f03a102 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -22,6 +22,7 @@
#include "xfs_trans.h"
#include "xfs_pwork.h"
#include "xfs_ag.h"
+#include "xfs_bit.h"
/*
* Walking Inodes in the Filesystem
@@ -99,6 +100,7 @@ xfs_iwalk_ichunk_ra(
struct xfs_inobt_rec_incore *irec)
{
struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agnumber_t agno = pag->pag_agno;
xfs_agblock_t agbno;
struct blk_plug plug;
int i; /* inode chunk index */
@@ -111,8 +113,9 @@ xfs_iwalk_ichunk_ra(
imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
if (imask & ~irec->ir_free) {
- xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
- igeo->blocks_per_cluster,
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, agbno),
+ igeo->blocks_per_cluster * mp->m_bsize,
&xfs_inode_buf_ops);
}
agbno += igeo->blocks_per_cluster;
@@ -131,21 +134,11 @@ xfs_iwalk_adjust_start(
struct xfs_inobt_rec_incore *irec) /* btree record */
{
int idx; /* index into inode chunk */
- int i;
idx = agino - irec->ir_startino;
- /*
- * We got a right chunk with some left inodes allocated at it. Grab
- * the chunk record. Mark all the uninteresting inodes free because
- * they're before our start point.
- */
- for (i = 0; i < idx; i++) {
- if (XFS_INOBT_MASK(i) & ~irec->ir_free)
- irec->ir_freecount++;
- }
-
irec->ir_free |= xfs_inobt_maskn(0, idx);
+ irec->ir_freecount = hweight64(irec->ir_free);
}
/* Allocate memory for a walk. */
@@ -160,7 +153,7 @@ xfs_iwalk_alloc(
/* Allocate a prefetch buffer for inobt records. */
size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
- iwag->recs = kmem_alloc(size, KM_MAYFAIL);
+ iwag->recs = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (iwag->recs == NULL)
return -ENOMEM;
@@ -172,7 +165,7 @@ STATIC void
xfs_iwalk_free(
struct xfs_iwalk_ag *iwag)
{
- kmem_free(iwag->recs);
+ kfree(iwag->recs);
iwag->recs = NULL;
}
@@ -275,9 +268,10 @@ xfs_iwalk_ag_start(
/* Set up a fresh cursor and empty the inobt cache. */
iwag->nr_recs = 0;
- error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
if (error)
return error;
+ *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp);
/* Starting at the beginning of the AG? That's easy! */
if (agino == 0)
@@ -306,8 +300,10 @@ xfs_iwalk_ag_start(
error = xfs_inobt_get_rec(*curpp, irec, has_more);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, *has_more != 1))
+ if (XFS_IS_CORRUPT(mp, *has_more != 1)) {
+ xfs_btree_mark_sick(*curpp);
return -EFSCORRUPTED;
+ }
iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
@@ -390,11 +386,10 @@ xfs_iwalk_run_callbacks(
}
/* ...and recreate the cursor just past where we left off. */
- error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp,
- agi_bpp);
+ error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp);
if (error)
return error;
-
+ *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp);
return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
}
@@ -434,6 +429,7 @@ xfs_iwalk_ag(
rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
if (iwag->lastino != NULLFSINO &&
XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
+ xfs_btree_mark_sick(cur);
error = -EFSCORRUPTED;
goto out;
}
@@ -627,7 +623,7 @@ xfs_iwalk_ag_work(
xfs_iwalk_free(iwag);
out:
xfs_perag_put(iwag->pag);
- kmem_free(iwag);
+ kfree(iwag);
return error;
}
@@ -663,7 +659,8 @@ xfs_iwalk_threaded(
if (xfs_pwork_ctl_want_abort(&pctl))
break;
- iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
+ iwag = kzalloc(sizeof(struct xfs_iwalk_ag),
+ GFP_KERNEL | __GFP_NOFAIL);
iwag->mp = mp;
/*
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index e9d317a3dafe..8f07c9f6157f 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -21,15 +21,13 @@ typedef __u32 xfs_nlink_t;
#include "xfs_types.h"
-#include "kmem.h"
-#include "mrlock.h"
-
#include <linux/semaphore.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <linux/crc32c.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -51,6 +49,7 @@ typedef __u32 xfs_nlink_t;
#include <linux/notifier.h>
#include <linux/delay.h>
#include <linux/log2.h>
+#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/ctype.h>
@@ -82,6 +81,7 @@ typedef __u32 xfs_nlink_t;
#include "xfs_buf.h"
#include "xfs_message.h"
#include "xfs_drain.h"
+#include "xfs_hooks.h"
#ifdef __BIG_ENDIAN
#define XFS_NATIVE_HOST 1
@@ -198,6 +198,18 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
return x;
}
+/* If @b is a power of 2, return log2(b). Else return -1. */
+static inline int8_t log2_if_power2(unsigned long b)
+{
+ return is_power_of_2(b) ? ilog2(b) : -1;
+}
+
+/* If @b is a power of 2, return a mask of the lower bits, else return zero. */
+static inline unsigned long long mask64_if_power2(unsigned long b)
+{
+ return is_power_of_2(b) ? b - 1 : 0;
+}
+
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
char *data, enum req_op op);
@@ -257,4 +269,15 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
# define PTR_FMT "%p"
#endif
+/*
+ * Helper for IO routines to grab backing pages from allocated kernel memory.
+ */
+static inline struct page *
+kmem_to_page(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ return vmalloc_to_page(addr);
+ return virt_to_page(addr);
+}
+
#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 51c100c86177..5004f23d344e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -633,14 +633,14 @@ xlog_state_release_iclog(
*/
int
xfs_log_mount(
- xfs_mount_t *mp,
- xfs_buftarg_t *log_target,
- xfs_daddr_t blk_offset,
- int num_bblks)
+ xfs_mount_t *mp,
+ struct xfs_buftarg *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks)
{
- struct xlog *log;
- int error = 0;
- int min_logfsbs;
+ struct xlog *log;
+ int error = 0;
+ int min_logfsbs;
if (!xfs_has_norecovery(mp)) {
xfs_notice(mp, "Mounting V%d Filesystem %pU",
@@ -1528,7 +1528,7 @@ xlog_alloc_log(
int error = -ENOMEM;
uint log2_size = 0;
- log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
+ log = kzalloc(sizeof(struct xlog), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!log) {
xfs_warn(mp, "Log allocation failed: No memory!");
goto out;
@@ -1542,6 +1542,7 @@ xlog_alloc_log(
log->l_covered_state = XLOG_STATE_COVER_IDLE;
set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
+ INIT_LIST_HEAD(&log->r_dfops);
log->l_prev_block = -1;
/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -1604,7 +1605,8 @@ xlog_alloc_log(
size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
sizeof(struct bio_vec);
- iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
+ iclog = kzalloc(sizeof(*iclog) + bvec_size,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!iclog)
goto out_free_iclog;
@@ -1660,13 +1662,13 @@ out_destroy_workqueue:
out_free_iclog:
for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
prev_iclog = iclog->ic_next;
- kmem_free(iclog->ic_data);
- kmem_free(iclog);
+ kvfree(iclog->ic_data);
+ kfree(iclog);
if (prev_iclog == log->l_iclog)
break;
}
out_free_log:
- kmem_free(log);
+ kfree(log);
out:
return ERR_PTR(error);
} /* xlog_alloc_log */
@@ -1893,9 +1895,7 @@ xlog_write_iclog(
* the buffer manually, the code needs to be kept in sync
* with the I/O completion path.
*/
- xlog_state_done_syncing(iclog);
- up(&iclog->ic_sema);
- return;
+ goto sync;
}
/*
@@ -1925,20 +1925,17 @@ xlog_write_iclog(
* avoid shutdown re-entering this path and erroring out again.
*/
if (log->l_targ != log->l_mp->m_ddev_targp &&
- blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) {
- xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return;
- }
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev))
+ goto shutdown;
}
if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
iclog->ic_bio.bi_opf |= REQ_FUA;
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
- xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return;
- }
+ if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
+ goto shutdown;
+
if (is_vmalloc_addr(iclog->ic_data))
flush_kernel_vmap_range(iclog->ic_data, count);
@@ -1959,6 +1956,12 @@ xlog_write_iclog(
}
submit_bio(&iclog->ic_bio);
+ return;
+shutdown:
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+sync:
+ xlog_state_done_syncing(iclog);
+ up(&iclog->ic_sema);
}
/*
@@ -2116,14 +2119,14 @@ xlog_dealloc_log(
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
next_iclog = iclog->ic_next;
- kmem_free(iclog->ic_data);
- kmem_free(iclog);
+ kvfree(iclog->ic_data);
+ kfree(iclog);
iclog = next_iclog;
}
log->l_mp->m_log = NULL;
destroy_workqueue(log->l_ioend_workqueue);
- kmem_free(log);
+ kfree(log);
}
/*
@@ -3515,7 +3518,8 @@ xlog_ticket_alloc(
struct xlog_ticket *tic;
int unit_res;
- tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
+ tic = kmem_cache_zalloc(xfs_log_ticket_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 67a99d94701e..73f5b7f628f4 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -100,7 +100,7 @@ xlog_cil_ctx_alloc(void)
{
struct xfs_cil_ctx *ctx;
- ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents.extent_list);
INIT_LIST_HEAD(&ctx->log_items);
@@ -339,7 +339,7 @@ xlog_cil_alloc_shadow_bufs(
* the buffer, only the log vector header and the iovec
* storage.
*/
- kmem_free(lip->li_lv_shadow);
+ kvfree(lip->li_lv_shadow);
lv = xlog_kvmalloc(buf_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
@@ -703,7 +703,7 @@ xlog_cil_free_logvec(
while (!list_empty(lv_chain)) {
lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list);
list_del_init(&lv->lv_list);
- kmem_free(lv);
+ kvfree(lv);
}
}
@@ -753,7 +753,7 @@ xlog_cil_committed(
return;
}
- kmem_free(ctx);
+ kfree(ctx);
}
void
@@ -1116,11 +1116,18 @@ xlog_cil_cleanup_whiteouts(
* same sequence twice. If we get a race between multiple pushes for the same
* sequence they will block on the first one and then abort, hence avoiding
* needless pushes.
+ *
+ * This runs from a workqueue so it does not inherent any specific memory
+ * allocation context. However, we do not want to block on memory reclaim
+ * recursing back into the filesystem because this push may have been triggered
+ * by memory reclaim itself. Hence we really need to run under full GFP_NOFS
+ * contraints here.
*/
static void
xlog_cil_push_work(
struct work_struct *work)
{
+ unsigned int nofs_flags = memalloc_nofs_save();
struct xfs_cil_ctx *ctx =
container_of(work, struct xfs_cil_ctx, push_work);
struct xfs_cil *cil = ctx->cil;
@@ -1334,12 +1341,14 @@ xlog_cil_push_work(
spin_unlock(&log->l_icloglock);
xlog_cil_cleanup_whiteouts(&whiteouts);
xfs_log_ticket_ungrant(log, ticket);
+ memalloc_nofs_restore(nofs_flags);
return;
out_skip:
up_write(&cil->xc_ctx_lock);
xfs_log_ticket_put(new_ctx->ticket);
- kmem_free(new_ctx);
+ kfree(new_ctx);
+ memalloc_nofs_restore(nofs_flags);
return;
out_abort_free_ticket:
@@ -1348,6 +1357,7 @@ out_abort_free_ticket:
if (!ctx->commit_iclog) {
xfs_log_ticket_ungrant(log, ctx->ticket);
xlog_cil_committed(ctx);
+ memalloc_nofs_restore(nofs_flags);
return;
}
spin_lock(&log->l_icloglock);
@@ -1356,6 +1366,7 @@ out_abort_free_ticket:
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
xfs_log_ticket_ungrant(log, ticket);
+ memalloc_nofs_restore(nofs_flags);
}
/*
@@ -1533,7 +1544,7 @@ xlog_cil_process_intents(
set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
trace_xfs_cil_whiteout_mark(ilip);
len += ilip->li_lv->lv_bytes;
- kmem_free(ilip->li_lv);
+ kvfree(ilip->li_lv);
ilip->li_lv = NULL;
xfs_trans_del_item(lip);
@@ -1747,7 +1758,7 @@ xlog_cil_init(
struct xlog_cil_pcp *cilpcp;
int cpu;
- cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
+ cil = kzalloc(sizeof(*cil), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!cil)
return -ENOMEM;
/*
@@ -1786,7 +1797,7 @@ xlog_cil_init(
out_destroy_wq:
destroy_workqueue(cil->xc_push_wq);
out_destroy_cil:
- kmem_free(cil);
+ kfree(cil);
return -ENOMEM;
}
@@ -1799,12 +1810,12 @@ xlog_cil_destroy(
if (cil->xc_ctx) {
if (cil->xc_ctx->ticket)
xfs_log_ticket_put(cil->xc_ctx->ticket);
- kmem_free(cil->xc_ctx);
+ kfree(cil->xc_ctx);
}
ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
free_percpu(cil->xc_pcp);
destroy_workqueue(cil->xc_push_wq);
- kmem_free(cil);
+ kfree(cil);
}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fa3ad1d7b31c..e30c06ec20e3 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -407,6 +407,7 @@ struct xlog {
long l_opstate; /* operational state */
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
struct list_head *l_buf_cancel_table;
+ struct list_head r_dfops; /* recovered log intent items */
int l_iclog_hsize; /* size of iclog header */
int l_iclog_heads; /* # of iclog header sectors */
uint l_sectBBsize; /* sector size in BBs (2^n) */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13b94d2e605b..13f1d2e91540 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -361,7 +361,7 @@ xlog_find_verify_cycle(
*new_blk = -1;
out:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -477,7 +477,7 @@ xlog_find_verify_log_record(
*last_blk = i;
out:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -731,7 +731,7 @@ validate_head:
goto out_free_buffer;
}
- kmem_free(buffer);
+ kvfree(buffer);
if (head_blk == log_bbnum)
*return_head_blk = 0;
else
@@ -745,7 +745,7 @@ validate_head:
return 0;
out_free_buffer:
- kmem_free(buffer);
+ kvfree(buffer);
if (error)
xfs_warn(log->l_mp, "failed to find log head");
return error;
@@ -999,7 +999,7 @@ xlog_verify_tail(
"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
orig_tail, *tail_blk);
out:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -1046,7 +1046,7 @@ xlog_verify_head(
error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
XLOG_MAX_ICLOGS, tmp_buffer,
&tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
- kmem_free(tmp_buffer);
+ kvfree(tmp_buffer);
if (error < 0)
return error;
@@ -1365,7 +1365,7 @@ xlog_find_tail(
error = xlog_clear_stale_blocks(log, tail_lsn);
done:
- kmem_free(buffer);
+ kvfree(buffer);
if (error)
xfs_warn(log->l_mp, "failed to locate log tail");
@@ -1399,6 +1399,7 @@ xlog_find_zeroed(
xfs_daddr_t new_blk, last_blk, start_blk;
xfs_daddr_t num_scan_bblks;
int error, log_bbnum = log->l_logBBsize;
+ int ret = 1;
*blk_no = 0;
@@ -1413,8 +1414,7 @@ xlog_find_zeroed(
first_cycle = xlog_get_cycle(offset);
if (first_cycle == 0) { /* completely zeroed log */
*blk_no = 0;
- kmem_free(buffer);
- return 1;
+ goto out_free_buffer;
}
/* check partially zeroed log */
@@ -1424,8 +1424,8 @@ xlog_find_zeroed(
last_cycle = xlog_get_cycle(offset);
if (last_cycle != 0) { /* log completely written to */
- kmem_free(buffer);
- return 0;
+ ret = 0;
+ goto out_free_buffer;
}
/* we have a partially zeroed log */
@@ -1471,10 +1471,10 @@ xlog_find_zeroed(
*blk_no = last_blk;
out_free_buffer:
- kmem_free(buffer);
+ kvfree(buffer);
if (error)
return error;
- return 1;
+ return ret;
}
/*
@@ -1583,7 +1583,7 @@ xlog_write_log_records(
}
out_free_buffer:
- kmem_free(buffer);
+ kvfree(buffer);
return error;
}
@@ -1723,30 +1723,24 @@ xlog_clear_stale_blocks(
*/
void
xlog_recover_release_intent(
- struct xlog *log,
- unsigned short intent_type,
- uint64_t intent_id)
+ struct xlog *log,
+ unsigned short intent_type,
+ uint64_t intent_id)
{
- struct xfs_ail_cursor cur;
- struct xfs_log_item *lip;
- struct xfs_ail *ailp = log->l_ailp;
+ struct xfs_defer_pending *dfp, *n;
+
+ list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
+ struct xfs_log_item *lip = dfp->dfp_intent;
- spin_lock(&ailp->ail_lock);
- for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
- lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
if (lip->li_type != intent_type)
continue;
if (!lip->li_ops->iop_match(lip, intent_id))
continue;
- spin_unlock(&ailp->ail_lock);
- lip->li_ops->iop_release(lip);
- spin_lock(&ailp->ail_lock);
- break;
- }
+ ASSERT(xlog_item_is_intent(lip));
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
+ xfs_defer_cancel_recovery(log->l_mp, dfp);
+ }
}
int
@@ -1939,6 +1933,29 @@ xlog_buf_readahead(
xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
}
+/*
+ * Create a deferred work structure for resuming and tracking the progress of a
+ * log intent item that was found during recovery.
+ */
+void
+xlog_recover_intent_item(
+ struct xlog *log,
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn,
+ const struct xfs_defer_op_type *ops)
+{
+ ASSERT(xlog_item_is_intent(lip));
+
+ xfs_defer_start_recovery(lip, &log->r_dfops, ops);
+
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, lip, lsn);
+ lip->li_ops->iop_unpin(lip, 0);
+}
+
STATIC int
xlog_recover_items_pass2(
struct xlog *log,
@@ -2040,7 +2057,8 @@ xlog_recover_add_item(
{
struct xlog_recover_item *item;
- item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
+ item = kzalloc(sizeof(struct xlog_recover_item),
+ GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&item->ri_list);
list_add_tail(&item->ri_list, head);
}
@@ -2143,7 +2161,7 @@ xlog_recover_add_to_trans(
return 0;
}
- ptr = kmem_alloc(len, 0);
+ ptr = xlog_kvmalloc(len);
memcpy(ptr, dp, len);
in_f = (struct xfs_inode_log_format *)ptr;
@@ -2165,14 +2183,13 @@ xlog_recover_add_to_trans(
"bad number of regions (%d) in inode log format",
in_f->ilf_size);
ASSERT(0);
- kmem_free(ptr);
+ kvfree(ptr);
return -EFSCORRUPTED;
}
item->ri_total = in_f->ilf_size;
- item->ri_buf =
- kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
- 0);
+ item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+ GFP_KERNEL | __GFP_NOFAIL);
}
if (item->ri_total <= item->ri_cnt) {
@@ -2180,7 +2197,7 @@ xlog_recover_add_to_trans(
"log item region count (%d) overflowed size (%d)",
item->ri_cnt, item->ri_total);
ASSERT(0);
- kmem_free(ptr);
+ kvfree(ptr);
return -EFSCORRUPTED;
}
@@ -2210,13 +2227,13 @@ xlog_recover_free_trans(
/* Free the regions in the item. */
list_del(&item->ri_list);
for (i = 0; i < item->ri_cnt; i++)
- kmem_free(item->ri_buf[i].i_addr);
+ kvfree(item->ri_buf[i].i_addr);
/* Free the item itself */
- kmem_free(item->ri_buf);
- kmem_free(item);
+ kfree(item->ri_buf);
+ kfree(item);
}
/* Free the transaction recover structure */
- kmem_free(trans);
+ kfree(trans);
}
/*
@@ -2315,7 +2332,7 @@ xlog_recover_ophdr_to_trans(
* This is a new transaction so allocate a new recovery container to
* hold the recovery ops that will follow.
*/
- trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
+ trans = kzalloc(sizeof(struct xlog_recover), GFP_KERNEL | __GFP_NOFAIL);
trans->r_log_tid = tid;
trans->r_lsn = be64_to_cpu(rhead->h_lsn);
INIT_LIST_HEAD(&trans->r_itemq);
@@ -2511,7 +2528,7 @@ xlog_abort_defer_ops(
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_capture_free(mp, dfc);
+ xfs_defer_ops_capture_abort(mp, dfc);
}
}
@@ -2533,36 +2550,26 @@ xlog_abort_defer_ops(
*/
STATIC int
xlog_recover_process_intents(
- struct xlog *log)
+ struct xlog *log)
{
LIST_HEAD(capture_list);
- struct xfs_ail_cursor cur;
- struct xfs_log_item *lip;
- struct xfs_ail *ailp;
- int error = 0;
+ struct xfs_defer_pending *dfp, *n;
+ int error = 0;
#if defined(DEBUG) || defined(XFS_WARN)
- xfs_lsn_t last_lsn;
-#endif
+ xfs_lsn_t last_lsn;
- ailp = log->l_ailp;
- spin_lock(&ailp->ail_lock);
-#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
- for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- lip != NULL;
- lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
- const struct xfs_item_ops *ops;
- if (!xlog_item_is_intent(lip))
- break;
+ list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
+ ASSERT(xlog_item_is_intent(dfp->dfp_intent));
/*
* We should never see a redo item with a LSN higher than
* the last transaction we found in the log at the start
* of recovery.
*/
- ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
+ ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0);
/*
* NOTE: If your intent processing routine can create more
@@ -2571,21 +2578,14 @@ xlog_recover_process_intents(
* replayed in the wrong order!
*
* The recovery function can free the log item, so we must not
- * access lip after it returns.
+ * access dfp->dfp_intent after it returns. It must dispose of
+ * @dfp if it returns 0.
*/
- spin_unlock(&ailp->ail_lock);
- ops = lip->li_ops;
- error = ops->iop_recover(lip, &capture_list);
- spin_lock(&ailp->ail_lock);
- if (error) {
- trace_xlog_intent_recovery_failed(log->l_mp, error,
- ops->iop_recover);
+ error = xfs_defer_finish_recovery(log->l_mp, dfp,
+ &capture_list);
+ if (error)
break;
- }
}
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
if (error)
goto err;
@@ -2606,27 +2606,34 @@ err:
*/
STATIC void
xlog_recover_cancel_intents(
- struct xlog *log)
+ struct xlog *log)
{
- struct xfs_log_item *lip;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp;
-
- ailp = log->l_ailp;
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (!xlog_item_is_intent(lip))
- break;
+ struct xfs_defer_pending *dfp, *n;
- spin_unlock(&ailp->ail_lock);
- lip->li_ops->iop_release(lip);
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
+ ASSERT(xlog_item_is_intent(dfp->dfp_intent));
+
+ xfs_defer_cancel_recovery(log->l_mp, dfp);
}
+}
+
+/*
+ * Transfer ownership of the recovered pending work to the recovery transaction
+ * and try to finish the work. If there is more work to be done, the dfp will
+ * remain attached to the transaction. If not, the dfp is freed.
+ */
+int
+xlog_recover_finish_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ int error;
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
+ list_move(&dfp->dfp_list, &tp->t_dfops);
+ error = xfs_defer_finish_one(tp, dfp);
+ if (error == -EAGAIN)
+ return 0;
+ return error;
}
/*
@@ -3017,7 +3024,7 @@ xlog_do_recovery_pass(
hblks = xlog_logrec_hblks(log, rhead);
if (hblks != 1) {
- kmem_free(hbp);
+ kvfree(hbp);
hbp = xlog_alloc_buffer(log, hblks);
}
} else {
@@ -3031,7 +3038,7 @@ xlog_do_recovery_pass(
return -ENOMEM;
dbp = xlog_alloc_buffer(log, BTOBB(h_size));
if (!dbp) {
- kmem_free(hbp);
+ kvfree(hbp);
return -ENOMEM;
}
@@ -3192,16 +3199,33 @@ xlog_do_recovery_pass(
}
bread_err2:
- kmem_free(dbp);
+ kvfree(dbp);
bread_err1:
- kmem_free(hbp);
+ kvfree(hbp);
/*
- * Submit buffers that have been added from the last record processed,
- * regardless of error status.
+ * Submit buffers that have been dirtied by the last record recovered.
*/
- if (!list_empty(&buffer_list))
+ if (!list_empty(&buffer_list)) {
+ if (error) {
+ /*
+ * If there has been an item recovery error then we
+ * cannot allow partial checkpoint writeback to
+ * occur. We might have multiple checkpoints with the
+ * same start LSN in this buffer list, and partial
+ * writeback of a checkpoint in this situation can
+ * prevent future recovery of all the changes in the
+ * checkpoints at this start LSN.
+ *
+ * Note: Shutting down the filesystem will result in the
+ * delwri submission marking all the buffers stale,
+ * completing them and cleaning up _XBF_LOGRECOVERY
+ * state without doing any IO.
+ */
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ }
error2 = xfs_buf_delwri_submit(&buffer_list);
+ }
if (error && first_bad)
*first_bad = rhead_blk;
@@ -3436,12 +3460,19 @@ xlog_recover(
* part of recovery so that the root and real-time bitmap inodes can be read in
* from disk in between the two stages. This is necessary so that we can free
* space in the real-time portion of the file system.
+ *
+ * We run this whole process under GFP_NOFS allocation context. We do a
+ * combination of non-transactional and transactional work, yet we really don't
+ * want to recurse into the filesystem from direct reclaim during any of this
+ * processing. This allows all the recovery code run here not to care about the
+ * memory allocation context it is running in.
*/
int
xlog_recover_finish(
struct xlog *log)
{
- int error;
+ unsigned int nofs_flags = memalloc_nofs_save();
+ int error;
error = xlog_recover_process_intents(log);
if (error) {
@@ -3455,7 +3486,7 @@ xlog_recover_finish(
xlog_recover_cancel_intents(log);
xfs_alert(log->l_mp, "Failed to recover intents");
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return error;
+ goto out_error;
}
/*
@@ -3476,7 +3507,7 @@ xlog_recover_finish(
if (error < 0) {
xfs_alert(log->l_mp,
"Failed to clear log incompat features on recovery");
- return error;
+ goto out_error;
}
}
@@ -3501,9 +3532,13 @@ xlog_recover_finish(
* and AIL.
*/
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ error = 0;
+ goto out_error;
}
- return 0;
+out_error:
+ memalloc_nofs_restore(nofs_flags);
+ return error;
}
void
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aed5be5508fe..df370eb5dc15 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,7 @@ xfs_uuid_table_free(void)
{
if (xfs_uuid_table_size == 0)
return;
- kmem_free(xfs_uuid_table);
+ kfree(xfs_uuid_table);
xfs_uuid_table = NULL;
xfs_uuid_table_size = 0;
}
@@ -62,7 +62,7 @@ xfs_uuid_mount(
int hole, i;
/* Publish UUID in struct super_block */
- uuid_copy(&mp->m_super->s_uuid, uuid);
+ super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid));
if (xfs_has_nouuid(mp))
return 0;
@@ -637,7 +637,6 @@ xfs_mountfs(
struct xfs_sb *sbp = &(mp->m_sb);
struct xfs_inode *rip;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
- uint64_t resblks;
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
@@ -707,6 +706,8 @@ xfs_mountfs(
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
+ super_set_sysfs_name_id(mp->m_super);
+
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
NULL, mp->m_super->s_id);
if (error)
@@ -974,8 +975,7 @@ xfs_mountfs(
* we were already there on the last unmount. Warn if this occurs.
*/
if (!xfs_is_readonly(mp)) {
- resblks = xfs_default_resblks(mp);
- error = xfs_reserve_blocks(mp, &resblks, NULL);
+ error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
@@ -1053,7 +1053,6 @@ void
xfs_unmountfs(
struct xfs_mount *mp)
{
- uint64_t resblks;
int error;
/*
@@ -1090,8 +1089,7 @@ xfs_unmountfs(
* we only every apply deltas to the superblock and hence the incore
* value does not matter....
*/
- resblks = 0;
- error = xfs_reserve_blocks(mp, &resblks, NULL);
+ error = xfs_reserve_blocks(mp, 0);
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 219681d29fbc..e880aa48de68 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -94,16 +94,16 @@ typedef struct xfs_mount {
struct xfs_inode *m_rsumip; /* pointer to summary inode */
struct xfs_inode *m_rootip; /* pointer to root directory */
struct xfs_quotainfo *m_quotainfo; /* disk quota information */
- xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
- xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
- xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
+ struct xfs_buftarg *m_ddev_targp; /* data device */
+ struct xfs_buftarg *m_logdev_targp;/* log device */
+ struct xfs_buftarg *m_rtdev_targp; /* rt device */
void __percpu *m_inodegc; /* percpu inodegc structures */
/*
* Optional cache of rt summary level per bitmap block with the
- * invariant that m_rsum_cache[bbno] <= the minimum i for which
- * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
- * inode lock.
+ * invariant that m_rsum_cache[bbno] > the maximum i for which
+ * rsum[i][bbno] != 0, or 0 if rsum[i][bbno] == 0 for all i.
+ * Reads and writes are serialized by the rsumip inode lock.
*/
uint8_t *m_rsum_cache;
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
@@ -119,6 +119,7 @@ typedef struct xfs_mount {
uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
uint8_t m_agno_log; /* log #ag's */
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
+ int8_t m_rtxblklog; /* log2 of rextsize, if possible */
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -152,6 +153,7 @@ typedef struct xfs_mount {
uint64_t m_features; /* active filesystem features */
uint64_t m_low_space[XFS_LOWSP_MAX];
uint64_t m_low_rtexts[XFS_LOWSP_MAX];
+ uint64_t m_rtxblkmask; /* rt extent block mask */
struct xfs_ino_geometry m_ino_geo; /* inode geometry */
struct xfs_trans_resv m_resv; /* precomputed res values */
/* low free space thresholds */
@@ -250,6 +252,9 @@ typedef struct xfs_mount {
/* cpus that have inodes queued for inactivation */
struct cpumask m_inodegc_cpumask;
+
+ /* Hook to feed dirent updates to an active online repair. */
+ struct xfs_hooks m_dir_update_hooks;
} xfs_mount_t;
#define M_IGEO(mp) (&(mp)->m_ino_geo)
@@ -500,9 +505,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
-int xfs_buf_hash_init(struct xfs_perag *pag);
-void xfs_buf_hash_destroy(struct xfs_perag *pag);
-
extern void xfs_uuid_table_free(void);
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f85e3b07ab44..7443debaffd6 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,13 +333,14 @@ xfs_mru_cache_create(
if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
return -EINVAL;
- if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
+ mru = kzalloc(sizeof(*mru), GFP_KERNEL | __GFP_NOFAIL);
+ if (!mru)
return -ENOMEM;
/* An extra list is needed to avoid reaping up to a grp_time early. */
mru->grp_count = grp_count + 1;
- mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
-
+ mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists),
+ GFP_KERNEL | __GFP_NOFAIL);
if (!mru->lists) {
err = -ENOMEM;
goto exit;
@@ -364,9 +365,9 @@ xfs_mru_cache_create(
exit:
if (err && mru && mru->lists)
- kmem_free(mru->lists);
+ kfree(mru->lists);
if (err && mru)
- kmem_free(mru);
+ kfree(mru);
return err;
}
@@ -406,8 +407,8 @@ xfs_mru_cache_destroy(
xfs_mru_cache_flush(mru);
- kmem_free(mru->lists);
- kmem_free(mru);
+ kfree(mru->lists);
+ kfree(mru);
}
/*
@@ -427,7 +428,7 @@ xfs_mru_cache_insert(
if (!mru || !mru->lists)
return -EINVAL;
- if (radix_tree_preload(GFP_NOFS))
+ if (radix_tree_preload(GFP_KERNEL))
return -ENOMEM;
INIT_LIST_HEAD(&elem->list_node);
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index a7daa522e00f..fa50e5308292 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/dax.h>
+#include <linux/fs.h>
struct xfs_failure_info {
xfs_agblock_t startblock;
@@ -73,10 +74,16 @@ xfs_dax_failure_fn(
struct xfs_mount *mp = cur->bc_mp;
struct xfs_inode *ip;
struct xfs_failure_info *notify = data;
+ struct address_space *mapping;
+ pgoff_t pgoff;
+ unsigned long pgcnt;
int error = 0;
if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+ /* Continue the query because this isn't a failure. */
+ if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+ return 0;
notify->want_shutdown = true;
return 0;
}
@@ -92,15 +99,61 @@ xfs_dax_failure_fn(
return 0;
}
- error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
- xfs_failure_pgoff(mp, rec, notify),
- xfs_failure_pgcnt(mp, rec, notify),
- notify->mf_flags);
+ mapping = VFS_I(ip)->i_mapping;
+ pgoff = xfs_failure_pgoff(mp, rec, notify);
+ pgcnt = xfs_failure_pgcnt(mp, rec, notify);
+
+ /* Continue the rmap query if the inode isn't a dax file. */
+ if (dax_mapping(mapping))
+ error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
+ notify->mf_flags);
+
+ /* Invalidate the cache in dax pages. */
+ if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+ invalidate_inode_pages2_range(mapping, pgoff,
+ pgoff + pgcnt - 1);
+
xfs_irele(ip);
return error;
}
static int
+xfs_dax_notify_failure_freeze(
+ struct xfs_mount *mp)
+{
+ struct super_block *sb = mp->m_super;
+ int error;
+
+ error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
+ if (error)
+ xfs_emerg(mp, "already frozen by kernel, err=%d", error);
+
+ return error;
+}
+
+static void
+xfs_dax_notify_failure_thaw(
+ struct xfs_mount *mp,
+ bool kernel_frozen)
+{
+ struct super_block *sb = mp->m_super;
+ int error;
+
+ if (kernel_frozen) {
+ error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
+ if (error)
+ xfs_emerg(mp, "still frozen after notify failure, err=%d",
+ error);
+ }
+
+ /*
+ * Also thaw userspace call anyway because the device is about to be
+ * removed immediately.
+ */
+ thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+}
+
+static int
xfs_dax_notify_ddev_failure(
struct xfs_mount *mp,
xfs_daddr_t daddr,
@@ -112,15 +165,29 @@ xfs_dax_notify_ddev_failure(
struct xfs_btree_cur *cur = NULL;
struct xfs_buf *agf_bp = NULL;
int error = 0;
+ bool kernel_frozen = false;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr);
xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp,
daddr + bblen - 1);
xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
+ if (mf_flags & MF_MEM_PRE_REMOVE) {
+ xfs_info(mp, "Device is about to be removed!");
+ /*
+ * Freeze fs to prevent new mappings from being created.
+ * - Keep going on if others already hold the kernel forzen.
+ * - Keep going on if other errors too because this device is
+ * starting to fail.
+ * - If kernel frozen state is hold successfully here, thaw it
+ * here as well at the end.
+ */
+ kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
+ }
+
error = xfs_trans_alloc_empty(mp, &tp);
if (error)
- return error;
+ goto out;
for (; agno <= end_agno; agno++) {
struct xfs_rmap_irec ri_low = { };
@@ -165,11 +232,26 @@ xfs_dax_notify_ddev_failure(
}
xfs_trans_cancel(tp);
- if (error || notify.want_shutdown) {
+
+ /*
+ * Shutdown fs from a force umount in pre-remove case which won't fail,
+ * so errors can be ignored. Otherwise, shutdown the filesystem with
+ * CORRUPT flag if error occured or notify.want_shutdown was set during
+ * RMAP querying.
+ */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+ else if (error || notify.want_shutdown) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
if (!error)
error = -EFSCORRUPTED;
}
+
+out:
+ /* Thaw the fs if it has been frozen before. */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ xfs_dax_notify_failure_thaw(mp, kernel_frozen);
+
return error;
}
@@ -197,6 +279,14 @@ xfs_dax_notify_failure(
if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
mp->m_logdev_targp != mp->m_ddev_targp) {
+ /*
+ * In the pre-remove case the failure notification is attempting
+ * to trigger a force unmount. The expectation is that the
+ * device is still present, but its removal is in progress and
+ * can not be cancelled, proceed with accessing the log device.
+ */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ return 0;
xfs_err(mp, "ondisk log corrupt, shutting down fs!");
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
return -EFSCORRUPTED;
@@ -210,6 +300,12 @@ xfs_dax_notify_failure(
ddev_start = mp->m_ddev_targp->bt_dax_part_off;
ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
+ /* Notify failure on the whole device. */
+ if (offset == 0 && len == U64_MAX) {
+ offset = ddev_start;
+ len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
+ }
+
/* Ignore the range out of filesystem area */
if (offset + len - 1 < ddev_start)
return -ENXIO;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 94a7932ac570..0f4cf4170c35 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -26,6 +26,7 @@
#include "xfs_ag.h"
#include "xfs_ialloc.h"
#include "xfs_log_priv.h"
+#include "xfs_health.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -171,7 +172,7 @@ xfs_qm_dqpurge(
* hits zero, so it really should be on the freelist here.
*/
ASSERT(!list_empty(&dqp->q_lru));
- list_lru_del(&qi->qi_lru, &dqp->q_lru);
+ list_lru_del_obj(&qi->qi_lru, &dqp->q_lru);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
@@ -254,7 +255,7 @@ xfs_qm_dqattach_one(
struct xfs_dquot *dqp;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
error = 0;
/*
@@ -322,7 +323,7 @@ xfs_qm_dqattach_locked(
if (!xfs_qm_need_dqattach(ip))
return 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
@@ -353,7 +354,7 @@ done:
* Don't worry about the dquots that we may have attached before any
* error - they'll get detached later if it has not already been done.
*/
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -628,7 +629,8 @@ xfs_qm_init_quotainfo(
ASSERT(XFS_IS_QUOTA_ON(mp));
- qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0);
+ qinf = mp->m_quotainfo = kzalloc(sizeof(struct xfs_quotainfo),
+ GFP_KERNEL | __GFP_NOFAIL);
error = list_lru_init(&qinf->qi_lru);
if (error)
@@ -642,9 +644,9 @@ xfs_qm_init_quotainfo(
if (error)
goto out_free_lru;
- INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
- INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
- INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_KERNEL);
+ INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_KERNEL);
+ INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_KERNEL);
mutex_init(&qinf->qi_tree_lock);
/* mutex used to serialize quotaoffs */
@@ -691,6 +693,9 @@ xfs_qm_init_quotainfo(
shrinker_register(qinf->qi_shrinker);
+ xfs_hooks_init(&qinf->qi_mod_ino_dqtrx_hooks);
+ xfs_hooks_init(&qinf->qi_apply_dqtrx_hooks);
+
return 0;
out_free_inos:
@@ -700,7 +705,7 @@ out_free_inos:
out_free_lru:
list_lru_destroy(&qinf->qi_lru);
out_free_qinf:
- kmem_free(qinf);
+ kfree(qinf);
mp->m_quotainfo = NULL;
return error;
}
@@ -724,7 +729,7 @@ xfs_qm_destroy_quotainfo(
xfs_qm_destroy_quotainos(qi);
mutex_destroy(&qi->qi_tree_lock);
mutex_destroy(&qi->qi_quotaofflock);
- kmem_free(qi);
+ kfree(qi);
mp->m_quotainfo = NULL;
}
@@ -758,14 +763,18 @@ xfs_qm_qino_alloc(
(mp->m_sb.sb_gquotino != NULLFSINO)) {
ino = mp->m_sb.sb_gquotino;
if (XFS_IS_CORRUPT(mp,
- mp->m_sb.sb_pquotino != NULLFSINO))
+ mp->m_sb.sb_pquotino != NULLFSINO)) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA);
return -EFSCORRUPTED;
+ }
} else if ((flags & XFS_QMOPT_GQUOTA) &&
(mp->m_sb.sb_pquotino != NULLFSINO)) {
ino = mp->m_sb.sb_pquotino;
if (XFS_IS_CORRUPT(mp,
- mp->m_sb.sb_gquotino != NULLFSINO))
+ mp->m_sb.sb_gquotino != NULLFSINO)) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA);
return -EFSCORRUPTED;
+ }
}
if (ino != NULLFSINO) {
error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
@@ -996,7 +1005,8 @@ xfs_qm_reset_dqcounts_buf(
if (qip->i_nblocks == 0)
return 0;
- map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
+ map = kmalloc(XFS_DQITER_MAP_SIZE * sizeof(*map),
+ GFP_KERNEL | __GFP_NOFAIL);
lblkno = 0;
maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
@@ -1058,7 +1068,7 @@ xfs_qm_reset_dqcounts_buf(
} while (nmaps > 0);
out:
- kmem_free(map);
+ kfree(map);
return error;
}
@@ -1406,8 +1416,12 @@ error_return:
xfs_warn(mp,
"Quotacheck: Failed to reset quota flags.");
}
- } else
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK);
+ } else {
xfs_notice(mp, "Quotacheck: Done.");
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_QUOTACHECK);
+ }
+
return error;
error_purge:
@@ -1809,7 +1823,7 @@ xfs_qm_vop_chown(
XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
/* old dquot */
@@ -1817,12 +1831,12 @@ xfs_qm_vop_chown(
ASSERT(prevdq);
ASSERT(prevdq != newdq);
- xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_nblocks));
- xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
+ xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks));
+ xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
/* the sparkling new dquot */
- xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_nblocks);
- xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
+ xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks);
+ xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
* Back when we made quota reservations for the chown, we reserved the
@@ -1897,29 +1911,28 @@ xfs_qm_vop_create_dqattach(
if (!XFS_IS_QUOTA_ON(mp))
return;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (udqp && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(ip->i_udquot == NULL);
ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id);
ip->i_udquot = xfs_qm_dqhold(udqp);
- xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
ASSERT(ip->i_gdquot == NULL);
ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id);
ip->i_gdquot = xfs_qm_dqhold(gdqp);
- xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
ASSERT(ip->i_pdquot == NULL);
ASSERT(ip->i_projid == pdqp->q_id);
ip->i_pdquot = xfs_qm_dqhold(pdqp);
- xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
+
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, 1);
}
/* Decide if this inode's dquot is near an enforcement boundary. */
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index d5c9fc4ba591..f5993012bf98 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -68,6 +68,10 @@ struct xfs_quotainfo {
/* Minimum and maximum quota expiration timestamp values. */
time64_t qi_expiry_min;
time64_t qi_expiry_max;
+
+ /* Hook to feed quota counter updates to an active online repair. */
+ struct xfs_hooks qi_mod_ino_dqtrx_hooks;
+ struct xfs_hooks qi_apply_dqtrx_hooks;
};
static inline struct radix_tree_root *
@@ -104,6 +108,18 @@ xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type)
return NULL;
}
+/*
+ * Parameters for tracking dqtrx changes on behalf of an inode. The hook
+ * function arg parameter is the field being updated.
+ */
+struct xfs_mod_ino_dqtrx_params {
+ uintptr_t tx_id;
+ xfs_ino_t ino;
+ xfs_dqtype_t q_type;
+ xfs_dqid_t q_id;
+ int64_t delta;
+};
+
extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp,
uint field, int64_t delta);
extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index b77673dd0558..271c1021c733 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -9,6 +9,7 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index dcc785fdd345..85a4ae1a17f6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -74,6 +74,22 @@ struct xfs_dqtrx {
int64_t qt_icount_delta; /* dquot inode count changes */
};
+enum xfs_apply_dqtrx_type {
+ XFS_APPLY_DQTRX_COMMIT = 0,
+ XFS_APPLY_DQTRX_UNRESERVE,
+};
+
+/*
+ * Parameters for applying dqtrx changes to a dquot. The hook function arg
+ * parameter is enum xfs_apply_dqtrx_type.
+ */
+struct xfs_apply_dqtrx_params {
+ uintptr_t tx_id;
+ xfs_ino_t ino;
+ xfs_dqtype_t q_type;
+ xfs_dqid_t q_id;
+};
+
#ifdef CONFIG_XFS_QUOTA
extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
extern void xfs_trans_free_dqinfo(struct xfs_trans *);
@@ -114,6 +130,30 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
}
bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
+
+# ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_trans_mod_ino_dquot(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_dquot *dqp, unsigned int field, int64_t delta);
+
+struct xfs_quotainfo;
+
+struct xfs_dqtrx_hook {
+ struct xfs_hook mod_hook;
+ struct xfs_hook apply_hook;
+};
+
+void xfs_dqtrx_hook_disable(void);
+void xfs_dqtrx_hook_enable(void);
+
+int xfs_dqtrx_hook_add(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook);
+void xfs_dqtrx_hook_del(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook);
+void xfs_dqtrx_hook_setup(struct xfs_dqtrx_hook *hook, notifier_fn_t mod_fn,
+ notifier_fn_t apply_fn);
+# else
+# define xfs_trans_mod_ino_dquot(tp, ip, dqp, field, delta) \
+ xfs_trans_mod_dquot((tp), (dqp), (field), (delta))
+# endif /* CONFIG_XFS_LIVE_HOOKS */
+
#else
static inline int
xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
@@ -127,7 +167,10 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
}
#define xfs_trans_dup_dqinfo(tp, tp2)
#define xfs_trans_free_dqinfo(tp)
-#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0)
+static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp,
+ struct xfs_inode *ip, uint field, int64_t delta)
+{
+}
#define xfs_trans_apply_dquot_deltas(tp)
#define xfs_trans_unreserve_and_mod_dquots(tp)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
@@ -170,6 +213,12 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
#define xfs_qm_unmount(mp)
#define xfs_qm_unmount_quotas(mp)
#define xfs_inode_near_dquot_enforcement(ip, type) (false)
+
+# ifdef CONFIG_XFS_LIVE_HOOKS
+# define xfs_dqtrx_hook_enable() ((void)0)
+# define xfs_dqtrx_hook_disable() ((void)0)
+# endif /* CONFIG_XFS_LIVE_HOOKS */
+
#endif /* CONFIG_XFS_QUOTA */
static inline int
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 2d4444d61e98..14919b33e4fe 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -36,9 +36,9 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
- kmem_free(cuip->cui_item.li_lv_shadow);
+ kvfree(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
- kmem_free(cuip);
+ kfree(cuip);
else
kmem_cache_free(xfs_cui_cache, cuip);
}
@@ -143,8 +143,8 @@ xfs_cui_init(
ASSERT(nextents > 0);
if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
- cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
- 0);
+ cuip = kzalloc(xfs_cui_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
else
cuip = kmem_cache_zalloc(xfs_cui_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -207,7 +207,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
- kmem_free(cudp->cud_item.li_lv_shadow);
+ kvfree(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_cache, cudp);
}
@@ -227,52 +227,6 @@ static const struct xfs_item_ops xfs_cud_item_ops = {
.iop_intent = xfs_cud_item_intent,
};
-static struct xfs_cud_log_item *
-xfs_trans_get_cud(
- struct xfs_trans *tp,
- struct xfs_cui_log_item *cuip)
-{
- struct xfs_cud_log_item *cudp;
-
- cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
- &xfs_cud_item_ops);
- cudp->cud_cuip = cuip;
- cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
-
- xfs_trans_add_item(tp, &cudp->cud_item);
- return cudp;
-}
-
-/*
- * Finish an refcount update and log it to the CUD. Note that the
- * transaction is marked dirty regardless of whether the refcount
- * update succeeds or fails to support the CUI/CUD lifecycle rules.
- */
-static int
-xfs_trans_log_finish_refcount_update(
- struct xfs_trans *tp,
- struct xfs_cud_log_item *cudp,
- struct xfs_refcount_intent *ri,
- struct xfs_btree_cur **pcur)
-{
- int error;
-
- error = xfs_refcount_finish_one(tp, ri, pcur);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the CUI and frees the CUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
-
- return error;
-}
-
/* Sort refcount intents by AG. */
static int
xfs_refcount_update_diff_items(
@@ -318,9 +272,6 @@ xfs_refcount_update_log_item(
uint next_extent;
struct xfs_phys_extent *pmap;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -347,7 +298,6 @@ xfs_refcount_update_create_intent(
ASSERT(count > 0);
- xfs_trans_add_item(tp, &cuip->cui_item);
if (sort)
list_sort(mp, items, xfs_refcount_update_diff_items);
list_for_each_entry(ri, items, ri_list)
@@ -362,7 +312,16 @@ xfs_refcount_update_create_done(
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item;
+ struct xfs_cui_log_item *cuip = CUI_ITEM(intent);
+ struct xfs_cud_log_item *cudp;
+
+ cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
+ &xfs_cud_item_ops);
+ cudp->cud_cuip = cuip;
+ cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
+
+ return &cudp->cud_item;
}
/* Take a passive ref to the AG containing the space we're refcounting. */
@@ -397,10 +356,9 @@ xfs_refcount_update_finish_item(
int error;
ri = container_of(item, struct xfs_refcount_intent, ri_list);
- error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri,
- state);
/* Did we run out of reservation? Requeue what we didn't finish. */
+ error = xfs_refcount_finish_one(tp, ri, state);
if (!error && ri->ri_blockcount > 0) {
ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
ri->ri_type == XFS_REFCOUNT_DECREASE);
@@ -433,16 +391,6 @@ xfs_refcount_update_cancel_item(
kmem_cache_free(xfs_refcount_intent_cache, ri);
}
-const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
- .max_items = XFS_CUI_MAX_FAST_EXTENTS,
- .create_intent = xfs_refcount_update_create_intent,
- .abort_intent = xfs_refcount_update_abort_intent,
- .create_done = xfs_refcount_update_create_done,
- .finish_item = xfs_refcount_update_finish_item,
- .finish_cleanup = xfs_refcount_finish_one_cleanup,
- .cancel_item = xfs_refcount_update_cancel_item,
-};
-
/* Is this recovered CUI ok? */
static inline bool
xfs_cui_validate_phys(
@@ -468,23 +416,38 @@ xfs_cui_validate_phys(
return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len);
}
+static inline void
+xfs_cui_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_phys_extent *pmap)
+{
+ struct xfs_refcount_intent *ri;
+
+ ri = kmem_cache_alloc(xfs_refcount_intent_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
+ ri->ri_startblock = pmap->pe_startblock;
+ ri->ri_blockcount = pmap->pe_len;
+ xfs_refcount_update_get_group(mp, ri);
+
+ xfs_defer_add_item(dfp, &ri->ri_list);
+}
+
/*
* Process a refcount update intent item that was recovered from the log.
* We need to update the refcountbt.
*/
STATIC int
-xfs_cui_item_recover(
- struct xfs_log_item *lip,
+xfs_refcount_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
- struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
- struct xfs_btree_cur *rcur = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
- unsigned int refc_type;
- bool requeue_only = false;
int i;
int error = 0;
@@ -501,6 +464,8 @@ xfs_cui_item_recover(
sizeof(cuip->cui_format));
return -EFSCORRUPTED;
}
+
+ xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]);
}
/*
@@ -521,100 +486,28 @@ xfs_cui_item_recover(
if (error)
return error;
- cudp = xfs_trans_get_cud(tp, cuip);
-
- for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
- struct xfs_refcount_intent fake = { };
- struct xfs_phys_extent *pmap;
-
- pmap = &cuip->cui_format.cui_extents[i];
- refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
- switch (refc_type) {
- case XFS_REFCOUNT_INCREASE:
- case XFS_REFCOUNT_DECREASE:
- case XFS_REFCOUNT_ALLOC_COW:
- case XFS_REFCOUNT_FREE_COW:
- fake.ri_type = refc_type;
- break;
- default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- &cuip->cui_format,
- sizeof(cuip->cui_format));
- error = -EFSCORRUPTED;
- goto abort_error;
- }
-
- fake.ri_startblock = pmap->pe_startblock;
- fake.ri_blockcount = pmap->pe_len;
-
- if (!requeue_only) {
- xfs_refcount_update_get_group(mp, &fake);
- error = xfs_trans_log_finish_refcount_update(tp, cudp,
- &fake, &rcur);
- xfs_refcount_update_put_group(&fake);
- }
- if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- &cuip->cui_format,
- sizeof(cuip->cui_format));
- if (error)
- goto abort_error;
-
- /* Requeue what we didn't finish. */
- if (fake.ri_blockcount > 0) {
- struct xfs_bmbt_irec irec = {
- .br_startblock = fake.ri_startblock,
- .br_blockcount = fake.ri_blockcount,
- };
-
- switch (fake.ri_type) {
- case XFS_REFCOUNT_INCREASE:
- xfs_refcount_increase_extent(tp, &irec);
- break;
- case XFS_REFCOUNT_DECREASE:
- xfs_refcount_decrease_extent(tp, &irec);
- break;
- case XFS_REFCOUNT_ALLOC_COW:
- xfs_refcount_alloc_cow_extent(tp,
- irec.br_startblock,
- irec.br_blockcount);
- break;
- case XFS_REFCOUNT_FREE_COW:
- xfs_refcount_free_cow_extent(tp,
- irec.br_startblock,
- irec.br_blockcount);
- break;
- default:
- ASSERT(0);
- }
- requeue_only = true;
- }
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
+ if (error)
+ goto abort_error;
- xfs_refcount_finish_one_cleanup(tp, rcur, error);
return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
- xfs_refcount_finish_one_cleanup(tp, rcur, error);
xfs_trans_cancel(tp);
return error;
}
-STATIC bool
-xfs_cui_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_cui_item_relog(
+xfs_refcount_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_cud_log_item *cudp;
struct xfs_cui_log_item *cuip;
struct xfs_phys_extent *pmap;
unsigned int count;
@@ -622,27 +515,41 @@ xfs_cui_item_relog(
count = CUI_ITEM(intent)->cui_format.cui_nextents;
pmap = CUI_ITEM(intent)->cui_format.cui_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
- set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
-
cuip = xfs_cui_init(tp->t_mountp, count);
memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap));
atomic_set(&cuip->cui_next_extent, count);
- xfs_trans_add_item(tp, &cuip->cui_item);
- set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+
return &cuip->cui_item;
}
+const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+ .name = "refcount",
+ .max_items = XFS_CUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_refcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+ .create_done = xfs_refcount_update_create_done,
+ .finish_item = xfs_refcount_update_finish_item,
+ .finish_cleanup = xfs_refcount_finish_one_cleanup,
+ .cancel_item = xfs_refcount_update_cancel_item,
+ .recover_work = xfs_refcount_recover_work,
+ .relog_intent = xfs_refcount_relog_intent,
+};
+
+STATIC bool
+xfs_cui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_cui_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
.iop_unpin = xfs_cui_item_unpin,
.iop_release = xfs_cui_item_release,
- .iop_recover = xfs_cui_item_recover,
.iop_match = xfs_cui_item_match,
- .iop_relog = xfs_cui_item_relog,
};
static inline void
@@ -696,12 +603,9 @@ xlog_recover_cui_commit_pass2(
cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn);
- xfs_cui_release(cuip);
+
+ xlog_recover_intent_item(log, &cuip->cui_item, lsn,
+ &xfs_refcount_update_defer_type);
return 0;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index eb9102453aff..7da0e8f961d3 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -29,6 +29,7 @@
#include "xfs_iomap.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+#include "xfs_health.h"
/*
* Copy on Write of Shared Blocks
@@ -527,7 +528,7 @@ xfs_reflink_allocate_cow(
int error;
bool found;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (!ip->i_cowfp) {
ASSERT(!xfs_is_reflink_inode(ip));
xfs_ifork_init_cow(ip);
@@ -618,7 +619,7 @@ xfs_reflink_cancel_cow_blocks(
error = xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
break;
@@ -784,6 +785,7 @@ xfs_reflink_end_cow_extent(
}
}
del = got;
+ xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb);
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
@@ -804,7 +806,7 @@ xfs_reflink_end_cow_extent(
* If the extent we're remapping is backed by storage (written
* or not), unmap the extent and drop its refcount.
*/
- xfs_bmap_unmap_extent(tp, ip, &data);
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
xfs_refcount_decrease_extent(tp, &data);
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
-data.br_blockcount);
@@ -828,7 +830,7 @@ xfs_reflink_end_cow_extent(
xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
/* Map the new blocks into the data fork. */
- xfs_bmap_map_extent(tp, ip, &del);
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
/* Charge this new data fork mapping to the on-disk quota. */
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
@@ -1226,8 +1228,10 @@ xfs_reflink_remap_extent(
* extent if they're both holes or both the same physical extent.
*/
if (dmap->br_startblock == smap.br_startblock) {
- if (dmap->br_state != smap.br_state)
+ if (dmap->br_state != smap.br_state) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
+ }
goto out_cancel;
}
@@ -1290,7 +1294,7 @@ xfs_reflink_remap_extent(
* If the extent we're unmapping is backed by storage (written
* or not), unmap the extent and drop its refcount.
*/
- xfs_bmap_unmap_extent(tp, ip, &smap);
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
xfs_refcount_decrease_extent(tp, &smap);
qdelta -= smap.br_blockcount;
} else if (smap.br_startblock == DELAYSTARTBLOCK) {
@@ -1315,7 +1319,7 @@ xfs_reflink_remap_extent(
*/
if (dmap_written) {
xfs_refcount_increase_extent(tp, dmap);
- xfs_bmap_map_extent(tp, ip, dmap);
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
qdelta += dmap->br_blockcount;
}
@@ -1390,6 +1394,7 @@ xfs_reflink_remap_blocks(
ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
if (imap.br_startblock == DELAYSTARTBLOCK) {
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ xfs_bmap_mark_sick(src, XFS_DATA_FORK);
error = -EFSCORRUPTED;
break;
}
@@ -1540,6 +1545,10 @@ xfs_reflink_remap_prep(
if (ret)
goto out_unlock;
+ xfs_iflags_set(src, XFS_IREMAPPING);
+ if (inode_in != inode_out)
+ xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
return 0;
out_unlock:
xfs_iunlock2_io_mmap(src, dest);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 0e0e747028da..e473124e29cc 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -36,9 +36,9 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
- kmem_free(ruip->rui_item.li_lv_shadow);
+ kvfree(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
- kmem_free(ruip);
+ kfree(ruip);
else
kmem_cache_free(xfs_rui_cache, ruip);
}
@@ -142,7 +142,8 @@ xfs_rui_init(
ASSERT(nextents > 0);
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
- ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
+ ruip = kzalloc(xfs_rui_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
else
ruip = kmem_cache_zalloc(xfs_rui_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -205,7 +206,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
- kmem_free(rudp->rud_item.li_lv_shadow);
+ kvfree(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_cache, rudp);
}
@@ -225,23 +226,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = {
.iop_intent = xfs_rud_item_intent,
};
-static struct xfs_rud_log_item *
-xfs_trans_get_rud(
- struct xfs_trans *tp,
- struct xfs_rui_log_item *ruip)
-{
- struct xfs_rud_log_item *rudp;
-
- rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
- &xfs_rud_item_ops);
- rudp->rud_ruip = ruip;
- rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
-
- xfs_trans_add_item(tp, &rudp->rud_item);
- return rudp;
-}
-
/* Set the map extent flags for this reverse mapping. */
static void
xfs_trans_set_rmap_flags(
@@ -285,35 +269,6 @@ xfs_trans_set_rmap_flags(
}
}
-/*
- * Finish an rmap update and log it to the RUD. Note that the transaction is
- * marked dirty regardless of whether the rmap update succeeds or fails to
- * support the RUI/RUD lifecycle rules.
- */
-static int
-xfs_trans_log_finish_rmap_update(
- struct xfs_trans *tp,
- struct xfs_rud_log_item *rudp,
- struct xfs_rmap_intent *ri,
- struct xfs_btree_cur **pcur)
-{
- int error;
-
- error = xfs_rmap_finish_one(tp, ri, pcur);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the RUI and frees the RUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
-
- return error;
-}
-
/* Sort rmap intents by AG. */
static int
xfs_rmap_update_diff_items(
@@ -340,9 +295,6 @@ xfs_rmap_update_log_item(
uint next_extent;
struct xfs_map_extent *map;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -372,7 +324,6 @@ xfs_rmap_update_create_intent(
ASSERT(count > 0);
- xfs_trans_add_item(tp, &ruip->rui_item);
if (sort)
list_sort(mp, items, xfs_rmap_update_diff_items);
list_for_each_entry(ri, items, ri_list)
@@ -387,7 +338,16 @@ xfs_rmap_update_create_done(
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item;
+ struct xfs_rui_log_item *ruip = RUI_ITEM(intent);
+ struct xfs_rud_log_item *rudp;
+
+ rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
+ &xfs_rud_item_ops);
+ rudp->rud_ruip = ruip;
+ rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
+
+ return &rudp->rud_item;
}
/* Take a passive ref to the AG containing the space we're rmapping. */
@@ -423,8 +383,7 @@ xfs_rmap_update_finish_item(
ri = container_of(item, struct xfs_rmap_intent, ri_list);
- error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri,
- state);
+ error = xfs_rmap_finish_one(tp, ri, state);
xfs_rmap_update_put_group(ri);
kmem_cache_free(xfs_rmap_intent_cache, ri);
@@ -452,16 +411,6 @@ xfs_rmap_update_cancel_item(
kmem_cache_free(xfs_rmap_intent_cache, ri);
}
-const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
- .max_items = XFS_RUI_MAX_FAST_EXTENTS,
- .create_intent = xfs_rmap_update_create_intent,
- .abort_intent = xfs_rmap_update_abort_intent,
- .create_done = xfs_rmap_update_create_done,
- .finish_item = xfs_rmap_update_finish_item,
- .finish_cleanup = xfs_rmap_finish_one_cleanup,
- .cancel_item = xfs_rmap_update_cancel_item,
-};
-
/* Is this recovered RUI ok? */
static inline bool
xfs_rui_validate_map(
@@ -498,20 +447,72 @@ xfs_rui_validate_map(
return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
+static inline void
+xfs_rui_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ const struct xfs_map_extent *map)
+{
+ struct xfs_rmap_intent *ri;
+
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
+
+ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+ case XFS_RMAP_EXTENT_MAP:
+ ri->ri_type = XFS_RMAP_MAP;
+ break;
+ case XFS_RMAP_EXTENT_MAP_SHARED:
+ ri->ri_type = XFS_RMAP_MAP_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_UNMAP:
+ ri->ri_type = XFS_RMAP_UNMAP;
+ break;
+ case XFS_RMAP_EXTENT_UNMAP_SHARED:
+ ri->ri_type = XFS_RMAP_UNMAP_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_CONVERT:
+ ri->ri_type = XFS_RMAP_CONVERT;
+ break;
+ case XFS_RMAP_EXTENT_CONVERT_SHARED:
+ ri->ri_type = XFS_RMAP_CONVERT_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_ALLOC:
+ ri->ri_type = XFS_RMAP_ALLOC;
+ break;
+ case XFS_RMAP_EXTENT_FREE:
+ ri->ri_type = XFS_RMAP_FREE;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ ri->ri_owner = map->me_owner;
+ ri->ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ ri->ri_bmap.br_startblock = map->me_startblock;
+ ri->ri_bmap.br_startoff = map->me_startoff;
+ ri->ri_bmap.br_blockcount = map->me_len;
+ ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ xfs_rmap_update_get_group(mp, ri);
+
+ xfs_defer_add_item(dfp, &ri->ri_list);
+}
+
/*
* Process an rmap update intent item that was recovered from the log.
* We need to update the rmapbt.
*/
STATIC int
-xfs_rui_item_recover(
- struct xfs_log_item *lip,
+xfs_rmap_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
- struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
- struct xfs_btree_cur *rcur = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
int i;
int error = 0;
@@ -529,6 +530,8 @@ xfs_rui_item_recover(
sizeof(ruip->rui_format));
return -EFSCORRUPTED;
}
+
+ xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]);
}
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
@@ -536,91 +539,29 @@ xfs_rui_item_recover(
XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- rudp = xfs_trans_get_rud(tp, ruip);
- for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
- struct xfs_rmap_intent fake = { };
- struct xfs_map_extent *map;
-
- map = &ruip->rui_format.rui_extents[i];
- switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
- case XFS_RMAP_EXTENT_MAP:
- fake.ri_type = XFS_RMAP_MAP;
- break;
- case XFS_RMAP_EXTENT_MAP_SHARED:
- fake.ri_type = XFS_RMAP_MAP_SHARED;
- break;
- case XFS_RMAP_EXTENT_UNMAP:
- fake.ri_type = XFS_RMAP_UNMAP;
- break;
- case XFS_RMAP_EXTENT_UNMAP_SHARED:
- fake.ri_type = XFS_RMAP_UNMAP_SHARED;
- break;
- case XFS_RMAP_EXTENT_CONVERT:
- fake.ri_type = XFS_RMAP_CONVERT;
- break;
- case XFS_RMAP_EXTENT_CONVERT_SHARED:
- fake.ri_type = XFS_RMAP_CONVERT_SHARED;
- break;
- case XFS_RMAP_EXTENT_ALLOC:
- fake.ri_type = XFS_RMAP_ALLOC;
- break;
- case XFS_RMAP_EXTENT_FREE:
- fake.ri_type = XFS_RMAP_FREE;
- break;
- default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- &ruip->rui_format,
- sizeof(ruip->rui_format));
- error = -EFSCORRUPTED;
- goto abort_error;
- }
-
- fake.ri_owner = map->me_owner;
- fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- fake.ri_bmap.br_startblock = map->me_startblock;
- fake.ri_bmap.br_startoff = map->me_startoff;
- fake.ri_bmap.br_blockcount = map->me_len;
- fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
-
- xfs_rmap_update_get_group(mp, &fake);
- error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake,
- &rcur);
- if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- map, sizeof(*map));
- xfs_rmap_update_put_group(&fake);
- if (error)
- goto abort_error;
-
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &ruip->rui_format,
+ sizeof(ruip->rui_format));
+ if (error)
+ goto abort_error;
- xfs_rmap_finish_one_cleanup(tp, rcur, error);
return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
- xfs_rmap_finish_one_cleanup(tp, rcur, error);
xfs_trans_cancel(tp);
return error;
}
-STATIC bool
-xfs_rui_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_rui_item_relog(
+xfs_rmap_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_rud_log_item *rudp;
struct xfs_rui_log_item *ruip;
struct xfs_map_extent *map;
unsigned int count;
@@ -628,27 +569,41 @@ xfs_rui_item_relog(
count = RUI_ITEM(intent)->rui_format.rui_nextents;
map = RUI_ITEM(intent)->rui_format.rui_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
- set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
-
ruip = xfs_rui_init(tp->t_mountp, count);
memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map));
atomic_set(&ruip->rui_next_extent, count);
- xfs_trans_add_item(tp, &ruip->rui_item);
- set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+
return &ruip->rui_item;
}
+const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
+ .name = "rmap",
+ .max_items = XFS_RUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_rmap_update_create_intent,
+ .abort_intent = xfs_rmap_update_abort_intent,
+ .create_done = xfs_rmap_update_create_done,
+ .finish_item = xfs_rmap_update_finish_item,
+ .finish_cleanup = xfs_rmap_finish_one_cleanup,
+ .cancel_item = xfs_rmap_update_cancel_item,
+ .recover_work = xfs_rmap_recover_work,
+ .relog_intent = xfs_rmap_relog_intent,
+};
+
+STATIC bool
+xfs_rui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_rui_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
.iop_unpin = xfs_rui_item_unpin,
.iop_release = xfs_rui_item_release,
- .iop_recover = xfs_rui_item_recover,
.iop_match = xfs_rui_item_match,
- .iop_relog = xfs_rui_item_relog,
};
static inline void
@@ -702,12 +657,9 @@ xlog_recover_rui_commit_pass2(
ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn);
- xfs_rui_release(ruip);
+
+ xlog_recover_intent_item(log, &ruip->rui_item, lsn,
+ &xfs_rmap_update_defer_type);
return 0;
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 2e1a4e5cd03d..e66f9bd5de5c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -14,62 +14,51 @@
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
+#include "xfs_bmap_util.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
#include "xfs_sb.h"
-
-/*
- * Read and return the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- */
-static int
-xfs_rtget_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
-{
- return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
-}
+#include "xfs_rtbitmap.h"
+#include "xfs_quota.h"
+#include "xfs_log_priv.h"
+#include "xfs_health.h"
/*
* Return whether there are any free extents in the size range given
* by low and high, for the bitmap block bbno.
*/
-STATIC int /* error */
+STATIC int
xfs_rtany_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int low, /* low log2 extent size */
- int high, /* high log2 extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- int *stat) /* out: any good extents here? */
+ struct xfs_rtalloc_args *args,
+ int low, /* low log2 extent size */
+ int high, /* high log2 extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int *maxlog) /* out: max log2 extent size free */
{
- int error; /* error value */
- int log; /* loop counter, log2 of ext. size */
- xfs_suminfo_t sum; /* summary data */
-
- /* There are no extents at levels < m_rsum_cache[bbno]. */
- if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno])
- low = mp->m_rsum_cache[bbno];
+ struct xfs_mount *mp = args->mp;
+ int error;
+ int log; /* loop counter, log2 of ext. size */
+ xfs_suminfo_t sum; /* summary data */
+
+ /* There are no extents at levels >= m_rsum_cache[bbno]. */
+ if (mp->m_rsum_cache) {
+ high = min(high, mp->m_rsum_cache[bbno] - 1);
+ if (low > high) {
+ *maxlog = -1;
+ return 0;
+ }
+ }
/*
* Loop over logs of extent sizes.
*/
- for (log = low; log <= high; log++) {
+ for (log = high; log >= low; log--) {
/*
* Get one summary datum.
*/
- error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+ error = xfs_rtget_summary(args, log, bbno, &sum);
if (error) {
return error;
}
@@ -77,18 +66,18 @@ xfs_rtany_summary(
* If there are any, return success.
*/
if (sum) {
- *stat = 1;
+ *maxlog = log;
goto out;
}
}
/*
* Found nothing, return failure.
*/
- *stat = 0;
+ *maxlog = -1;
out:
- /* There were no extents at levels < log. */
- if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log;
+ /* There were no extents at levels > log. */
+ if (mp->m_rsum_cache && log + 1 < mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log + 1;
return 0;
}
@@ -97,60 +86,54 @@ out:
* Copy and transform the summary file, given the old and new
* parameters in the mount structures.
*/
-STATIC int /* error */
+STATIC int
xfs_rtcopy_summary(
- xfs_mount_t *omp, /* old file system mount point */
- xfs_mount_t *nmp, /* new file system mount point */
- xfs_trans_t *tp) /* transaction pointer */
+ struct xfs_rtalloc_args *oargs,
+ struct xfs_rtalloc_args *nargs)
{
- xfs_rtblock_t bbno; /* bitmap block number */
- struct xfs_buf *bp; /* summary buffer */
- int error; /* error return value */
- int log; /* summary level number (log length) */
- xfs_suminfo_t sum; /* summary data */
- xfs_fsblock_t sumbno; /* summary block number */
+ xfs_fileoff_t bbno; /* bitmap block number */
+ int error;
+ int log; /* summary level number (log length) */
+ xfs_suminfo_t sum; /* summary data */
- bp = NULL;
- for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
- for (bbno = omp->m_sb.sb_rbmblocks - 1;
+ for (log = oargs->mp->m_rsumlevels - 1; log >= 0; log--) {
+ for (bbno = oargs->mp->m_sb.sb_rbmblocks - 1;
(xfs_srtblock_t)bbno >= 0;
bbno--) {
- error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
- &sumbno, &sum);
+ error = xfs_rtget_summary(oargs, log, bbno, &sum);
if (error)
- return error;
+ goto out;
if (sum == 0)
continue;
- error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
- &bp, &sumbno);
+ error = xfs_rtmodify_summary(oargs, log, bbno, -sum);
if (error)
- return error;
- error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
- &bp, &sumbno);
+ goto out;
+ error = xfs_rtmodify_summary(nargs, log, bbno, sum);
if (error)
- return error;
+ goto out;
ASSERT(sum > 0);
}
}
+ error = 0;
+out:
+ xfs_rtbuf_cache_relse(oargs);
return 0;
}
/*
* Mark an extent specified by start and len allocated.
* Updates all the summary information as well as the bitmap.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* start block to allocate */
- xfs_extlen_t len, /* length to allocate */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* start rtext to allocate */
+ xfs_rtxlen_t len) /* in/out: summary block number */
{
- xfs_rtblock_t end; /* end of the allocated extent */
- int error; /* error value */
- xfs_rtblock_t postblock = 0; /* first block allocated > end */
- xfs_rtblock_t preblock = 0; /* first block allocated < start */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t end; /* end of the allocated rtext */
+ int error;
+ xfs_rtxnum_t postblock = 0; /* first rtext allocated > end */
+ xfs_rtxnum_t preblock = 0; /* first rtext allocated < start */
end = start + len - 1;
/*
@@ -158,119 +141,128 @@ xfs_rtallocate_range(
* We need to find the beginning and end of the extent so we can
* properly update the summary.
*/
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
- if (error) {
+ error = xfs_rtfind_back(args, start, 0, &preblock);
+ if (error)
return error;
- }
+
/*
* Find the next allocated block (end of free extent).
*/
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
- if (error) {
+ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
+ if (error)
return error;
- }
+
/*
* Decrement the summary information corresponding to the entire
* (old) free extent.
*/
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
- if (error) {
+ error = xfs_rtmodify_summary(args,
+ xfs_highbit64(postblock + 1 - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), -1);
+ if (error)
return error;
- }
+
/*
* If there are blocks not being allocated at the front of the
* old extent, add summary data for them to be free.
*/
if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
- if (error) {
+ error = xfs_rtmodify_summary(args,
+ xfs_highbit64(start - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), 1);
+ if (error)
return error;
- }
}
+
/*
* If there are blocks not being allocated at the end of the
* old extent, add summary data for them to be free.
*/
if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
- if (error) {
+ error = xfs_rtmodify_summary(args,
+ xfs_highbit64(postblock - end),
+ xfs_rtx_to_rbmblock(mp, end + 1), 1);
+ if (error)
return error;
- }
}
+
/*
* Modify the bitmap to mark this extent allocated.
*/
- error = xfs_rtmodify_range(mp, tp, start, len, 0);
- return error;
+ return xfs_rtmodify_range(args, start, len, 0);
+}
+
+/*
+ * Make sure we don't run off the end of the rt volume. Be careful that
+ * adjusting maxlen downwards doesn't cause us to fail the alignment checks.
+ */
+static inline xfs_rtxlen_t
+xfs_rtallocate_clamp_len(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t startrtx,
+ xfs_rtxlen_t rtxlen,
+ xfs_rtxlen_t prod)
+{
+ xfs_rtxlen_t ret;
+
+ ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx;
+ return rounddown(ret, prod);
}
/*
* Attempt to allocate an extent minlen<=len<=maxlen starting from
* bitmap block bbno. If we don't get maxlen then use prod to trim
- * the length, if given. Returns error; returns starting block in *rtblock.
+ * the length, if given. Returns error; returns starting block in *rtx.
* The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_block(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bbno, /* bitmap block number */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- xfs_rtblock_t *nextp, /* out: next block to try */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t bbno, /* bitmap block number */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxnum_t *nextp, /* out: next rtext to try */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- xfs_rtblock_t besti; /* best rtblock found so far */
- xfs_rtblock_t bestlen; /* best length found so far */
- xfs_rtblock_t end; /* last rtblock in chunk */
- int error; /* error value */
- xfs_rtblock_t i; /* current rtblock trying */
- xfs_rtblock_t next; /* next rtblock to try */
- int stat; /* status from internal calls */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t besti; /* best rtext found so far */
+ xfs_rtxnum_t bestlen;/* best length found so far */
+ xfs_rtxnum_t end; /* last rtext in chunk */
+ int error;
+ xfs_rtxnum_t i; /* current rtext trying */
+ xfs_rtxnum_t next; /* next rtext to try */
+ int stat; /* status from internal calls */
/*
* Loop over all the extents starting in this bitmap block,
* looking for one that's long enough.
*/
- for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
- end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+ for (i = xfs_rbmblock_to_rtx(mp, bbno), besti = -1, bestlen = 0,
+ end = xfs_rbmblock_to_rtx(mp, bbno + 1) - 1;
i <= end;
i++) {
/* Make sure we don't scan off the end of the rt volume. */
- maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i;
+ maxlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod);
/*
* See if there's a free extent of maxlen starting at i.
* If it's not so then next will contain the first non-free.
*/
- error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
- if (error) {
+ error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat);
+ if (error)
return error;
- }
if (stat) {
/*
* i for maxlen is all free, allocate and return that.
*/
- error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
- rsb);
- if (error) {
- return error;
- }
- *len = maxlen;
- *rtblock = i;
- return 0;
+ bestlen = maxlen;
+ besti = i;
+ goto allocate;
}
+
/*
* In the case where we have a variable-sized allocation
* request, figure out how big this free piece is,
@@ -278,7 +270,7 @@ xfs_rtallocate_extent_block(
* so far, remember it.
*/
if (minlen < maxlen) {
- xfs_rtblock_t thislen; /* this extent size */
+ xfs_rtxnum_t thislen; /* this extent size */
thislen = next - i;
if (thislen >= minlen && thislen > bestlen) {
@@ -289,187 +281,157 @@ xfs_rtallocate_extent_block(
/*
* If not done yet, find the start of the next free space.
*/
- if (next < end) {
- error = xfs_rtfind_forw(mp, tp, next, end, &i);
- if (error) {
- return error;
- }
- } else
+ if (next >= end)
break;
+ error = xfs_rtfind_forw(args, next, end, &i);
+ if (error)
+ return error;
}
+
/*
* Searched the whole thing & didn't find a maxlen free extent.
*/
- if (minlen < maxlen && besti != -1) {
- xfs_extlen_t p; /* amount to trim length by */
-
+ if (minlen > maxlen || besti == -1) {
/*
- * If size should be a multiple of prod, make that so.
+ * Allocation failed. Set *nextp to the next block to try.
*/
- if (prod > 1) {
- div_u64_rem(bestlen, prod, &p);
- if (p)
- bestlen -= p;
- }
+ *nextp = next;
+ return -ENOSPC;
+ }
- /*
- * Allocate besti for bestlen & return that.
- */
- error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
- if (error) {
- return error;
- }
- *len = bestlen;
- *rtblock = besti;
- return 0;
+ /*
+ * If size should be a multiple of prod, make that so.
+ */
+ if (prod > 1) {
+ xfs_rtxlen_t p; /* amount to trim length by */
+
+ div_u64_rem(bestlen, prod, &p);
+ if (p)
+ bestlen -= p;
}
+
/*
- * Allocation failed. Set *nextp to the next block to try.
+ * Allocate besti for bestlen & return that.
*/
- *nextp = next;
- *rtblock = NULLRTBLOCK;
+allocate:
+ error = xfs_rtallocate_range(args, besti, bestlen);
+ if (error)
+ return error;
+ *len = bestlen;
+ *rtx = besti;
return 0;
}
/*
* Allocate an extent of length minlen<=len<=maxlen, starting at block
* bno. If we don't get maxlen then use prod to trim the length, if given.
- * Returns error; returns starting block in *rtblock.
+ * Returns error; returns starting block in *rtx.
* The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_exact(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int error; /* error value */
- xfs_extlen_t i; /* extent length trimmed due to prod */
- int isfree; /* extent is free */
- xfs_rtblock_t next; /* next block to try (dummy) */
+ int error;
+ xfs_rtxlen_t i; /* extent length trimmed due to prod */
+ int isfree; /* extent is free */
+ xfs_rtxnum_t next; /* next rtext to try (dummy) */
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
/*
* Check if the range in question (for maxlen) is free.
*/
- error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
- if (error) {
+ error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree);
+ if (error)
return error;
- }
- if (isfree) {
+
+ if (!isfree) {
/*
- * If it is, allocate it and return success.
+ * If not, allocate what there is, if it's at least minlen.
*/
- error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
- if (error) {
- return error;
- }
- *len = maxlen;
- *rtblock = bno;
- return 0;
- }
- /*
- * If not, allocate what there is, if it's at least minlen.
- */
- maxlen = next - bno;
- if (maxlen < minlen) {
+ maxlen = next - start;
+ if (maxlen < minlen)
+ return -ENOSPC;
+
/*
- * Failed, return failure status.
+ * Trim off tail of extent, if prod is specified.
*/
- *rtblock = NULLRTBLOCK;
- return 0;
- }
- /*
- * Trim off tail of extent, if prod is specified.
- */
- if (prod > 1 && (i = maxlen % prod)) {
- maxlen -= i;
- if (maxlen < minlen) {
- /*
- * Now we can't do it, return failure status.
- */
- *rtblock = NULLRTBLOCK;
- return 0;
+ if (prod > 1 && (i = maxlen % prod)) {
+ maxlen -= i;
+ if (maxlen < minlen)
+ return -ENOSPC;
}
}
+
/*
* Allocate what we can and return it.
*/
- error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
- if (error) {
+ error = xfs_rtallocate_range(args, start, maxlen);
+ if (error)
return error;
- }
*len = maxlen;
- *rtblock = bno;
+ *rtx = start;
return 0;
}
/*
* Allocate an extent of length minlen<=len<=maxlen, starting as near
- * to bno as possible. If we don't get maxlen then use prod to trim
+ * to start as possible. If we don't get maxlen then use prod to trim
* the length, if given. The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_near(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int any; /* any useful extents from summary */
- xfs_rtblock_t bbno; /* bitmap block number */
- int error; /* error value */
- int i; /* bitmap block offset (loop control) */
- int j; /* secondary loop control */
- int log2len; /* log2 of minlen */
- xfs_rtblock_t n; /* next block to try */
- xfs_rtblock_t r; /* result block */
-
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ struct xfs_mount *mp = args->mp;
+ int maxlog; /* max useful extent from summary */
+ xfs_fileoff_t bbno; /* bitmap block number */
+ int error;
+ int i; /* bitmap block offset (loop control) */
+ int j; /* secondary loop control */
+ int log2len; /* log2 of minlen */
+ xfs_rtxnum_t n; /* next rtext to try */
+
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
+
/*
* If the block number given is off the end, silently set it to
* the last block.
*/
- if (bno >= mp->m_sb.sb_rextents)
- bno = mp->m_sb.sb_rextents - 1;
+ if (start >= mp->m_sb.sb_rextents)
+ start = mp->m_sb.sb_rextents - 1;
/* Make sure we don't run off the end of the rt volume. */
- maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno;
- if (maxlen < minlen) {
- *rtblock = NULLRTBLOCK;
- return 0;
- }
+ maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod);
+ if (maxlen < minlen)
+ return -ENOSPC;
/*
* Try the exact allocation first.
*/
- error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
- rbpp, rsb, prod, &r);
- if (error) {
+ error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len,
+ prod, rtx);
+ if (error != -ENOSPC)
return error;
- }
- /*
- * If the exact allocation worked, return that.
- */
- if (r != NULLRTBLOCK) {
- *rtblock = r;
- return 0;
- }
- bbno = XFS_BITTOBLOCK(mp, bno);
+
+
+ bbno = xfs_rtx_to_rbmblock(mp, start);
i = 0;
+ j = -1;
ASSERT(minlen != 0);
log2len = xfs_highbit32(minlen);
/*
@@ -480,16 +442,19 @@ xfs_rtallocate_extent_near(
* Get summary information of extents of all useful levels
* starting in this bitmap block.
*/
- error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
- bbno + i, rbpp, rsb, &any);
- if (error) {
+ error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1,
+ bbno + i, &maxlog);
+ if (error)
return error;
- }
+
/*
* If there are any useful extents starting here, try
* allocating one.
*/
- if (any) {
+ if (maxlog >= 0) {
+ xfs_extlen_t maxavail =
+ min_t(xfs_rtblock_t, maxlen,
+ (1ULL << (maxlog + 1)) - 1);
/*
* On the positive side of the starting location.
*/
@@ -498,85 +463,47 @@ xfs_rtallocate_extent_near(
* Try to allocate an extent starting in
* this block.
*/
- error = xfs_rtallocate_extent_block(mp, tp,
- bbno + i, minlen, maxlen, len, &n, rbpp,
- rsb, prod, &r);
- if (error) {
+ error = xfs_rtallocate_extent_block(args,
+ bbno + i, minlen, maxavail, len,
+ &n, prod, rtx);
+ if (error != -ENOSPC)
return error;
- }
- /*
- * If it worked, return it.
- */
- if (r != NULLRTBLOCK) {
- *rtblock = r;
- return 0;
- }
}
/*
* On the negative side of the starting location.
*/
else { /* i < 0 */
+ int maxblocks;
+
/*
- * Loop backwards through the bitmap blocks from
- * the starting point-1 up to where we are now.
- * There should be an extent which ends in this
- * bitmap block and is long enough.
- */
- for (j = -1; j > i; j--) {
- /*
- * Grab the summary information for
- * this bitmap block.
- */
- error = xfs_rtany_summary(mp, tp,
- log2len, mp->m_rsumlevels - 1,
- bbno + j, rbpp, rsb, &any);
- if (error) {
- return error;
- }
- /*
- * If there's no extent given in the
- * summary that means the extent we
- * found must carry over from an
- * earlier block. If there is an
- * extent given, we've already tried
- * that allocation, don't do it again.
- */
- if (any)
- continue;
- error = xfs_rtallocate_extent_block(mp,
- tp, bbno + j, minlen, maxlen,
- len, &n, rbpp, rsb, prod, &r);
- if (error) {
- return error;
- }
- /*
- * If it works, return the extent.
- */
- if (r != NULLRTBLOCK) {
- *rtblock = r;
- return 0;
- }
- }
- /*
- * There weren't intervening bitmap blocks
- * with a long enough extent, or the
- * allocation didn't work for some reason
- * (i.e. it's a little * too short).
- * Try to allocate from the summary block
- * that we found.
+ * Loop backwards to find the end of the extent
+ * we found in the realtime summary.
+ *
+ * maxblocks is the maximum possible number of
+ * bitmap blocks from the start of the extent
+ * to the end of the extent.
*/
- error = xfs_rtallocate_extent_block(mp, tp,
- bbno + i, minlen, maxlen, len, &n, rbpp,
- rsb, prod, &r);
- if (error) {
- return error;
- }
+ if (maxlog == 0)
+ maxblocks = 0;
+ else if (maxlog < mp->m_blkbit_log)
+ maxblocks = 1;
+ else
+ maxblocks = 2 << (maxlog - mp->m_blkbit_log);
+
/*
- * If it works, return the extent.
+ * We need to check bbno + i + maxblocks down to
+ * bbno + i. We already checked bbno down to
+ * bbno + j + 1, so we don't need to check those
+ * again.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
- return 0;
+ j = min(i + maxblocks, j);
+ for (; j >= i; j--) {
+ error = xfs_rtallocate_extent_block(args,
+ bbno + j, minlen,
+ maxavail, len, &n, prod,
+ rtx);
+ if (error != -ENOSPC)
+ return error;
}
}
}
@@ -610,8 +537,53 @@ xfs_rtallocate_extent_near(
else
break;
}
- *rtblock = NULLRTBLOCK;
- return 0;
+ return -ENOSPC;
+}
+
+static int
+xfs_rtalloc_sumlevel(
+ struct xfs_rtalloc_args *args,
+ int l, /* level number */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
+{
+ xfs_fileoff_t i; /* bitmap block number */
+
+ for (i = 0; i < args->mp->m_sb.sb_rbmblocks; i++) {
+ xfs_suminfo_t sum; /* summary information for extents */
+ xfs_rtxnum_t n; /* next rtext to be tried */
+ int error;
+
+ error = xfs_rtget_summary(args, l, i, &sum);
+ if (error)
+ return error;
+
+ /*
+ * Nothing there, on to the next block.
+ */
+ if (!sum)
+ continue;
+
+ /*
+ * Try allocating the extent.
+ */
+ error = xfs_rtallocate_extent_block(args, i, minlen, maxlen,
+ len, &n, prod, rtx);
+ if (error != -ENOSPC)
+ return error;
+
+ /*
+ * If the "next block to try" returned from the allocator is
+ * beyond the next bitmap block, skip to that bitmap block.
+ */
+ if (xfs_rtx_to_rbmblock(args->mp, n) > i + 1)
+ i = xfs_rtx_to_rbmblock(args->mp, n) - 1;
+ }
+
+ return -ENOSPC;
}
/*
@@ -619,145 +591,64 @@ xfs_rtallocate_extent_near(
* specified. If we don't get maxlen then use prod to trim
* the length, if given. The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_size(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int error; /* error value */
- int i; /* bitmap block number */
- int l; /* level number (loop control) */
- xfs_rtblock_t n; /* next block to be tried */
- xfs_rtblock_t r; /* result block number */
- xfs_suminfo_t sum; /* summary information for extents */
-
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ int error;
+ int l; /* level number (loop control) */
+
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
ASSERT(maxlen != 0);
/*
* Loop over all the levels starting with maxlen.
- * At each level, look at all the bitmap blocks, to see if there
- * are extents starting there that are long enough (>= maxlen).
- * Note, only on the initial level can the allocation fail if
- * the summary says there's an extent.
+ *
+ * At each level, look at all the bitmap blocks, to see if there are
+ * extents starting there that are long enough (>= maxlen).
+ *
+ * Note, only on the initial level can the allocation fail if the
+ * summary says there's an extent.
*/
- for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) {
- /*
- * Loop over all the bitmap blocks.
- */
- for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
- /*
- * Get the summary for this level/block.
- */
- error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
- &sum);
- if (error) {
- return error;
- }
- /*
- * Nothing there, on to the next block.
- */
- if (!sum)
- continue;
- /*
- * Try allocating the extent.
- */
- error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
- maxlen, len, &n, rbpp, rsb, prod, &r);
- if (error) {
- return error;
- }
- /*
- * If it worked, return that.
- */
- if (r != NULLRTBLOCK) {
- *rtblock = r;
- return 0;
- }
- /*
- * If the "next block to try" returned from the
- * allocator is beyond the next bitmap block,
- * skip to that bitmap block.
- */
- if (XFS_BITTOBLOCK(mp, n) > i + 1)
- i = XFS_BITTOBLOCK(mp, n) - 1;
- }
+ for (l = xfs_highbit32(maxlen); l < args->mp->m_rsumlevels; l++) {
+ error = xfs_rtalloc_sumlevel(args, l, minlen, maxlen, prod, len,
+ rtx);
+ if (error != -ENOSPC)
+ return error;
}
+
/*
- * Didn't find any maxlen blocks. Try smaller ones, unless
- * we're asking for a fixed size extent.
+ * Didn't find any maxlen blocks. Try smaller ones, unless we are
+ * looking for a fixed size extent.
*/
- if (minlen > --maxlen) {
- *rtblock = NULLRTBLOCK;
- return 0;
- }
+ if (minlen > --maxlen)
+ return -ENOSPC;
ASSERT(minlen != 0);
ASSERT(maxlen != 0);
/*
* Loop over sizes, from maxlen down to minlen.
- * This time, when we do the allocations, allow smaller ones
- * to succeed.
+ *
+ * This time, when we do the allocations, allow smaller ones to succeed,
+ * but make sure the specified minlen/maxlen are in the possible range
+ * for this summary level.
*/
for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
- /*
- * Loop over all the bitmap blocks, try an allocation
- * starting in that block.
- */
- for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
- /*
- * Get the summary information for this level/block.
- */
- error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
- &sum);
- if (error) {
- return error;
- }
- /*
- * If nothing there, go on to next.
- */
- if (!sum)
- continue;
- /*
- * Try the allocation. Make sure the specified
- * minlen/maxlen are in the possible range for
- * this summary level.
- */
- error = xfs_rtallocate_extent_block(mp, tp, i,
- XFS_RTMAX(minlen, 1 << l),
- XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
- len, &n, rbpp, rsb, prod, &r);
- if (error) {
- return error;
- }
- /*
- * If it worked, return that extent.
- */
- if (r != NULLRTBLOCK) {
- *rtblock = r;
- return 0;
- }
- /*
- * If the "next block to try" returned from the
- * allocator is beyond the next bitmap block,
- * skip to that bitmap block.
- */
- if (XFS_BITTOBLOCK(mp, n) > i + 1)
- i = XFS_BITTOBLOCK(mp, n) - 1;
- }
+ error = xfs_rtalloc_sumlevel(args, l,
+ max_t(xfs_rtxlen_t, minlen, 1 << l),
+ min_t(xfs_rtxlen_t, maxlen, (1 << (l + 1)) - 1),
+ prod, len, rtx);
+ if (error != -ENOSPC)
+ return error;
}
- /*
- * Got nothing, return failure.
- */
- *rtblock = NULLRTBLOCK;
- return 0;
+
+ return -ENOSPC;
}
/*
@@ -886,12 +777,14 @@ xfs_alloc_rsum_cache(
xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */
{
/*
- * The rsum cache is initialized to all zeroes, which is trivially a
- * lower bound on the minimum level with any free extents. We can
- * continue without the cache if it couldn't be allocated.
+ * The rsum cache is initialized to the maximum value, which is
+ * trivially an upper bound on the maximum level with any free extents.
+ * We can continue without the cache if it couldn't be allocated.
*/
- mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL);
- if (!mp->m_rsum_cache)
+ mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL);
+ if (mp->m_rsum_cache)
+ memset(mp->m_rsum_cache, -1, rbmblocks);
+ else
xfs_warn(mp, "could not allocate realtime summary cache");
}
@@ -907,13 +800,13 @@ xfs_growfs_rt(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_rt_t *in) /* growfs rt input struct */
{
- xfs_rtblock_t bmbno; /* bitmap block number */
+ xfs_fileoff_t bmbno; /* bitmap block number */
struct xfs_buf *bp; /* temporary buffer */
int error; /* error return value */
xfs_mount_t *nmp; /* new (fake) mount structure */
xfs_rfsblock_t nrblocks; /* new number of realtime blocks */
xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
- xfs_rtblock_t nrextents; /* new number of realtime extents */
+ xfs_rtxnum_t nrextents; /* new number of realtime extents */
uint8_t nrextslog; /* new log2 of sb_rextents */
xfs_extlen_t nrsumblocks; /* new number of summary blocks */
uint nrsumlevels; /* new rt summary levels */
@@ -922,7 +815,6 @@ xfs_growfs_rt(
xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */
xfs_extlen_t rsumblocks; /* current number of rt summary blks */
xfs_sb_t *sbp; /* old superblock */
- xfs_fsblock_t sumbno; /* summary block number */
uint8_t *rsum_cache; /* old summary cache */
sbp = &mp->m_sb;
@@ -954,7 +846,7 @@ xfs_growfs_rt(
return -EINVAL;
/* Unsupported realtime features. */
- if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp))
+ if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp))
return -EOPNOTSUPP;
nrblocks = in->newblocks;
@@ -976,11 +868,12 @@ xfs_growfs_rt(
*/
nrextents = nrblocks;
do_div(nrextents, in->extsize);
- nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
- nrextslog = xfs_highbit32(nrextents);
+ if (!xfs_validate_rtextents(nrextents))
+ return -EINVAL;
+ nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents);
+ nrextslog = xfs_compute_rextslog(nrextents);
nrsumlevels = nrextslog + 1;
- nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
- nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks);
nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
/*
* New summary size can't be more than half the size of
@@ -1012,7 +905,7 @@ xfs_growfs_rt(
/*
* Allocate a new (fake) mount/sb.
*/
- nmp = kmem_alloc(sizeof(*nmp), 0);
+ nmp = kmalloc(sizeof(*nmp), GFP_KERNEL | __GFP_NOFAIL);
/*
* Loop over the bitmap blocks.
* We will do everything one bitmap block at a time.
@@ -1023,6 +916,12 @@ xfs_growfs_rt(
((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
bmbno < nrbmblocks;
bmbno++) {
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ };
+ struct xfs_rtalloc_args nargs = {
+ .mp = nmp,
+ };
struct xfs_trans *tp;
xfs_rfsblock_t nrblocks_step;
@@ -1032,20 +931,21 @@ xfs_growfs_rt(
* Calculate new sb and mount fields for this round.
*/
nsbp->sb_rextsize = in->extsize;
+ nmp->m_rtxblklog = -1; /* don't use shift or masking */
nsbp->sb_rbmblocks = bmbno + 1;
nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize *
nsbp->sb_rextsize;
nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
- nsbp->sb_rextents = nsbp->sb_rblocks;
- do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+ nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks);
ASSERT(nsbp->sb_rextents != 0);
- nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
+ nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents);
nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
- nrsumsize =
- (uint)sizeof(xfs_suminfo_t) * nrsumlevels *
- nsbp->sb_rbmblocks;
- nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels,
+ nsbp->sb_rbmblocks);
nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+ /* recompute growfsrt reservation from new rsumsize */
+ xfs_trans_resv_calc(nmp, &nmp->m_resv);
+
/*
* Start a transaction, get the log reservation.
*/
@@ -1053,6 +953,9 @@ xfs_growfs_rt(
&tp);
if (error)
break;
+ args.tp = tp;
+ nargs.tp = tp;
+
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
@@ -1086,7 +989,7 @@ xfs_growfs_rt(
*/
if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
mp->m_rsumlevels != nmp->m_rsumlevels) {
- error = xfs_rtcopy_summary(mp, nmp, tp);
+ error = xfs_rtcopy_summary(&args, &nargs);
if (error)
goto error_cancel;
}
@@ -1111,9 +1014,9 @@ xfs_growfs_rt(
/*
* Free new extent.
*/
- bp = NULL;
- error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
- nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
+ error = xfs_rtfree_range(&nargs, sbp->sb_rextents,
+ nsbp->sb_rextents - sbp->sb_rextents);
+ xfs_rtbuf_cache_relse(&nargs);
if (error) {
error_cancel:
xfs_trans_cancel(tp);
@@ -1129,6 +1032,8 @@ error_cancel:
*/
mp->m_rsumlevels = nrsumlevels;
mp->m_rsumsize = nrsumsize;
+ /* recompute growfsrt reservation from new rsumsize */
+ xfs_trans_resv_calc(mp, &mp->m_resv);
error = xfs_trans_commit(tp);
if (error)
@@ -1147,7 +1052,7 @@ out_free:
/*
* Free the fake mp structure.
*/
- kmem_free(nmp);
+ kfree(nmp);
/*
* If we had to allocate a new rsum_cache, we either need to free the
@@ -1156,10 +1061,10 @@ out_free:
*/
if (rsum_cache != mp->m_rsum_cache) {
if (error) {
- kmem_free(mp->m_rsum_cache);
+ kvfree(mp->m_rsum_cache);
mp->m_rsum_cache = rsum_cache;
} else {
- kmem_free(rsum_cache);
+ kvfree(rsum_cache);
}
}
@@ -1167,80 +1072,6 @@ out_free:
}
/*
- * Allocate an extent in the realtime subvolume, with the usual allocation
- * parameters. The length units are all in realtime extents, as is the
- * result block number.
- */
-int /* error */
-xfs_rtallocate_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- int wasdel, /* was a delayed allocation extent */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
-{
- xfs_mount_t *mp = tp->t_mountp;
- int error; /* error value */
- xfs_rtblock_t r; /* result allocated block */
- xfs_fsblock_t sb; /* summary file block number */
- struct xfs_buf *sumbp; /* summary file block buffer */
-
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
- ASSERT(minlen > 0 && minlen <= maxlen);
-
- /*
- * If prod is set then figure out what to do to minlen and maxlen.
- */
- if (prod > 1) {
- xfs_extlen_t i;
-
- if ((i = maxlen % prod))
- maxlen -= i;
- if ((i = minlen % prod))
- minlen += prod - i;
- if (maxlen < minlen) {
- *rtblock = NULLRTBLOCK;
- return 0;
- }
- }
-
-retry:
- sumbp = NULL;
- if (bno == 0) {
- error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
- &sumbp, &sb, prod, &r);
- } else {
- error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
- len, &sumbp, &sb, prod, &r);
- }
-
- if (error)
- return error;
-
- /*
- * If it worked, update the superblock.
- */
- if (r != NULLRTBLOCK) {
- long slen = (long)*len;
-
- ASSERT(*len >= minlen && *len <= maxlen);
- if (wasdel)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
- else
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
- } else if (prod > 1) {
- prod = 1;
- goto retry;
- }
-
- *rtblock = r;
- return 0;
-}
-
-/*
* Initialize realtime fields in the mount structure.
*/
int /* error */
@@ -1250,6 +1081,7 @@ xfs_rtmount_init(
struct xfs_buf *bp; /* buffer for last block of subvolume */
struct xfs_sb *sbp; /* filesystem superblock copy in mount */
xfs_daddr_t d; /* address of last block of subvolume */
+ unsigned int rsumblocks;
int error;
sbp = &mp->m_sb;
@@ -1261,10 +1093,9 @@ xfs_rtmount_init(
return -ENODEV;
}
mp->m_rsumlevels = sbp->sb_rextslog + 1;
- mp->m_rsumsize =
- (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
- sbp->sb_rbmblocks;
- mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
+ rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels,
+ mp->m_sb.sb_rbmblocks);
+ mp->m_rsumsize = XFS_FSB_TO_B(mp, rsumblocks);
mp->m_rbmip = mp->m_rsumip = NULL;
/*
* Check that the realtime section is an ok size.
@@ -1373,6 +1204,8 @@ xfs_rtmount_inodes(
sbp = &mp->m_sb;
error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
+ if (xfs_metadata_is_sick(error))
+ xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP);
if (error)
return error;
ASSERT(mp->m_rbmip != NULL);
@@ -1382,6 +1215,8 @@ xfs_rtmount_inodes(
goto out_rele_bitmap;
error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
+ if (xfs_metadata_is_sick(error))
+ xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY);
if (error)
goto out_rele_bitmap;
ASSERT(mp->m_rsumip != NULL);
@@ -1404,7 +1239,7 @@ void
xfs_rtunmount_inodes(
struct xfs_mount *mp)
{
- kmem_free(mp->m_rsum_cache);
+ kvfree(mp->m_rsum_cache);
if (mp->m_rbmip)
xfs_irele(mp->m_rbmip);
if (mp->m_rsumip)
@@ -1418,27 +1253,27 @@ xfs_rtunmount_inodes(
* of rtextents and the fraction.
* The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
*/
-int /* error */
+static int
xfs_rtpick_extent(
xfs_mount_t *mp, /* file system mount point */
xfs_trans_t *tp, /* transaction pointer */
- xfs_extlen_t len, /* allocation length (rtextents) */
- xfs_rtblock_t *pick) /* result rt extent */
- {
- xfs_rtblock_t b; /* result block */
+ xfs_rtxlen_t len, /* allocation length (rtextents) */
+ xfs_rtxnum_t *pick) /* result rt extent */
+{
+ xfs_rtxnum_t b; /* result rtext */
int log2; /* log of sequence number */
uint64_t resid; /* residual after log removed */
uint64_t seq; /* sequence number of file creation */
- struct timespec64 ts; /* temporary timespec64 storage */
+ struct timespec64 ts; /* timespec in inode */
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
+ ts = inode_get_atime(VFS_I(mp->m_rbmip));
if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
seq = 0;
} else {
- ts = inode_get_atime(VFS_I(mp->m_rbmip));
- seq = (uint64_t)ts.tv_sec;
+ seq = ts.tv_sec;
}
if ((log2 = xfs_highbit64(seq)) == -1)
b = 0;
@@ -1451,9 +1286,183 @@ xfs_rtpick_extent(
if (b + len > mp->m_sb.sb_rextents)
b = mp->m_sb.sb_rextents - len;
}
- ts.tv_sec = (time64_t)seq + 1;
+ ts.tv_sec = seq + 1;
inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
*pick = b;
return 0;
}
+
+static void
+xfs_rtalloc_align_minmax(
+ xfs_rtxlen_t *raminlen,
+ xfs_rtxlen_t *ramaxlen,
+ xfs_rtxlen_t *prod)
+{
+ xfs_rtxlen_t newmaxlen = *ramaxlen;
+ xfs_rtxlen_t newminlen = *raminlen;
+ xfs_rtxlen_t slack;
+
+ slack = newmaxlen % *prod;
+ if (slack)
+ newmaxlen -= slack;
+ slack = newminlen % *prod;
+ if (slack)
+ newminlen += *prod - slack;
+
+ /*
+ * If adjusting for extent size hint alignment produces an invalid
+ * min/max len combination, go ahead without it.
+ */
+ if (newmaxlen < newminlen) {
+ *prod = 1;
+ return;
+ }
+ *ramaxlen = newmaxlen;
+ *raminlen = newminlen;
+}
+
+int
+xfs_bmap_rtalloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_fileoff_t orig_offset = ap->offset;
+ xfs_rtxnum_t start; /* allocation hint rtextent no */
+ xfs_rtxnum_t rtx; /* actually allocated rtextent no */
+ xfs_rtxlen_t prod = 0; /* product factor for allocators */
+ xfs_extlen_t mod = 0; /* product factor for allocators */
+ xfs_rtxlen_t ralen = 0; /* realtime allocation length */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_extlen_t orig_length = ap->length;
+ xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
+ xfs_rtxlen_t raminlen;
+ bool rtlocked = false;
+ bool ignore_locality = false;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = ap->tp,
+ };
+ int error;
+
+ align = xfs_get_extsz_hint(ap->ip);
+retry:
+ error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+ align, 1, ap->eof, 0,
+ ap->conv, &ap->offset, &ap->length);
+ if (error)
+ return error;
+ ASSERT(ap->length);
+ ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0);
+
+ /*
+ * If we shifted the file offset downward to satisfy an extent size
+ * hint, increase minlen by that amount so that the allocator won't
+ * give us an allocation that's too short to cover at least one of the
+ * blocks that the caller asked for.
+ */
+ if (ap->offset != orig_offset)
+ minlen += orig_offset - ap->offset;
+
+ /*
+ * Set ralen to be the actual requested length in rtextents.
+ *
+ * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
+ * we rounded up to it, cut it back so it's valid again.
+ * Note that if it's a really large request (bigger than
+ * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
+ * adjust the starting point to match it.
+ */
+ ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN));
+ raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen));
+ ASSERT(raminlen > 0);
+ ASSERT(raminlen <= ralen);
+
+ /*
+ * Lock out modifications to both the RT bitmap and summary inodes
+ */
+ if (!rtlocked) {
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
+ xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ rtlocked = true;
+ }
+
+ if (ignore_locality) {
+ start = 0;
+ } else if (xfs_bmap_adjacent(ap)) {
+ start = xfs_rtb_to_rtx(mp, ap->blkno);
+ } else if (ap->eof && ap->offset == 0) {
+ /*
+ * If it's an allocation to an empty file at offset 0, pick an
+ * extent that will space things out in the rt area.
+ */
+ error = xfs_rtpick_extent(mp, ap->tp, ralen, &start);
+ if (error)
+ return error;
+ } else {
+ start = 0;
+ }
+
+ /*
+ * Only bother calculating a real prod factor if offset & length are
+ * perfectly aligned, otherwise it will just get us in trouble.
+ */
+ div_u64_rem(ap->offset, align, &mod);
+ if (mod || ap->length % align) {
+ prod = 1;
+ } else {
+ prod = xfs_extlen_to_rtxlen(mp, align);
+ if (prod > 1)
+ xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod);
+ }
+
+ if (start) {
+ error = xfs_rtallocate_extent_near(&args, start, raminlen,
+ ralen, &ralen, prod, &rtx);
+ } else {
+ error = xfs_rtallocate_extent_size(&args, raminlen,
+ ralen, &ralen, prod, &rtx);
+ }
+ xfs_rtbuf_cache_relse(&args);
+
+ if (error == -ENOSPC) {
+ if (align > mp->m_sb.sb_rextsize) {
+ /*
+ * We previously enlarged the request length to try to
+ * satisfy an extent size hint. The allocator didn't
+ * return anything, so reset the parameters to the
+ * original values and try again without alignment
+ * criteria.
+ */
+ ap->offset = orig_offset;
+ ap->length = orig_length;
+ minlen = align = mp->m_sb.sb_rextsize;
+ goto retry;
+ }
+
+ if (!ignore_locality && start != 0) {
+ /*
+ * If we can't allocate near a specific rt extent, try
+ * again without locality criteria.
+ */
+ ignore_locality = true;
+ goto retry;
+ }
+
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ return 0;
+ }
+ if (error)
+ return error;
+
+ xfs_trans_mod_sb(ap->tp, ap->wasdel ?
+ XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS,
+ -(long)ralen);
+ ap->blkno = xfs_rtx_to_rtb(mp, rtx);
+ ap->length = xfs_rtxlen_to_extlen(mp, ralen);
+ xfs_bmap_alloc_account(ap);
+ return 0;
+}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 62c7ad79cbb6..a6836da9bebe 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -11,54 +11,8 @@
struct xfs_mount;
struct xfs_trans;
-/*
- * XXX: Most of the realtime allocation functions deal in units of realtime
- * extents, not realtime blocks. This looks funny when paired with the type
- * name and screams for a larger cleanup.
- */
-struct xfs_rtalloc_rec {
- xfs_rtblock_t ar_startext;
- xfs_rtblock_t ar_extcount;
-};
-
-typedef int (*xfs_rtalloc_query_range_fn)(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *rec,
- void *priv);
-
#ifdef CONFIG_XFS_RT
/*
- * Function prototypes for exported functions.
- */
-
-/*
- * Allocate an extent in the realtime subvolume, with the usual allocation
- * parameters. The length units are all in realtime extents, as is the
- * result block number.
- */
-int /* error */
-xfs_rtallocate_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- int wasdel, /* was a delayed allocation extent */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock); /* out: start block allocated */
-
-/*
- * Free an extent in the realtime subvolume. Length is expressed in
- * realtime extents, as is the block number.
- */
-int /* error */
-xfs_rtfree_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len); /* length of extent freed */
-
-/*
* Initialize realtime fields in the mount structure.
*/
int /* error */
@@ -77,20 +31,6 @@ xfs_rtmount_inodes(
struct xfs_mount *mp); /* file system mount structure */
/*
- * Pick an extent for allocation at the start of a new realtime file.
- * Use the sequence number stored in the atime field of the bitmap inode.
- * Translate this to a fraction of the rtextents, and return the product
- * of rtextents and the fraction.
- * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
- */
-int /* error */
-xfs_rtpick_extent(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_extlen_t len, /* allocation length (rtextents) */
- xfs_rtblock_t *pick); /* result rt extent */
-
-/*
* Grow the realtime area of the filesystem.
*/
int
@@ -98,55 +38,10 @@ xfs_growfs_rt(
struct xfs_mount *mp, /* file system mount structure */
xfs_growfs_rt_t *in); /* user supplied growfs struct */
-/*
- * From xfs_rtbitmap.c
- */
-int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
-int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len, int val,
- xfs_rtblock_t *new, int *stat);
-int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_rtblock_t limit,
- xfs_rtblock_t *rtblock);
-int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_rtblock_t limit,
- xfs_rtblock_t *rtblock);
-int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len, int val);
-int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
- int log, xfs_rtblock_t bbno, int delta,
- struct xfs_buf **rbpp, xfs_fsblock_t *rsb,
- xfs_suminfo_t *sum);
-int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
- xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp,
- xfs_fsblock_t *rsb);
-int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len,
- struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *low_rec,
- const struct xfs_rtalloc_rec *high_rec,
- xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtalloc_query_range_fn fn,
- void *priv);
-bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
-int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len,
- bool *is_free);
int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
-# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
-# define xfs_rtfree_extent(t,b,l) (ENOSYS)
-# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
-# define xfs_growfs_rt(mp,in) (ENOSYS)
-# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
-# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS)
-# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
-# define xfs_verify_rtbno(m, r) (false)
-# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS)
-# define xfs_rtalloc_reinit_frextents(m) (0)
+# define xfs_growfs_rt(mp,in) (-ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
@@ -157,7 +52,7 @@ xfs_rtmount_init(
xfs_warn(mp, "Not built with CONFIG_XFS_RT");
return -ENOSYS;
}
-# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
# define xfs_rtunmount_inodes(m)
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 90a77cd3ebad..ed97d72caa66 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -50,7 +50,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{ "ibt2", xfsstats_offset(xs_fibt_2) },
{ "fibt2", xfsstats_offset(xs_rmap_2) },
{ "rmapbt", xfsstats_offset(xs_refcbt_2) },
- { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)},
+ { "refcntbt", xfsstats_offset(xs_rmap_mem_2) },
+ { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) },
+ { "rcbagbt", xfsstats_offset(xs_qm_dqreclaims)},
/* we print both series of quota information together */
{ "qm", xfsstats_offset(xs_xstrat_bytes)},
};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 43ffba74f045..a61fb56ed2e6 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -125,6 +125,8 @@ struct __xfsstats {
uint32_t xs_fibt_2[__XBTS_MAX];
uint32_t xs_rmap_2[__XBTS_MAX];
uint32_t xs_refcbt_2[__XBTS_MAX];
+ uint32_t xs_rmap_mem_2[__XBTS_MAX];
+ uint32_t xs_rcbag_2[__XBTS_MAX];
uint32_t xs_qm_dqreclaims;
uint32_t xs_qm_dqreclaim_misses;
uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f0ae07828153..bce020374c5e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -42,7 +42,9 @@
#include "xfs_xattr.h"
#include "xfs_iunlink_item.h"
#include "xfs_dahash_test.h"
+#include "xfs_rtbitmap.h"
#include "scrub/stats.h"
+#include "scrub/rcbag_btree.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -349,7 +351,6 @@ xfs_setup_dax_always(
return -EINVAL;
}
- xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
return 0;
disable_dax:
@@ -361,15 +362,16 @@ STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
const char *name,
- struct bdev_handle **handlep)
+ struct file **bdev_filep)
{
int error = 0;
- *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
- mp->m_super, &fs_holder_ops);
- if (IS_ERR(*handlep)) {
- error = PTR_ERR(*handlep);
- *handlep = NULL;
+ *bdev_filep = bdev_file_open_by_path(name,
+ BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
+ mp->m_super, &fs_holder_ops);
+ if (IS_ERR(*bdev_filep)) {
+ error = PTR_ERR(*bdev_filep);
+ *bdev_filep = NULL;
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
}
@@ -434,32 +436,26 @@ xfs_open_devices(
{
struct super_block *sb = mp->m_super;
struct block_device *ddev = sb->s_bdev;
- struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL;
+ struct file *logdev_file = NULL, *rtdev_file = NULL;
int error;
/*
- * blkdev_put() can't be called under s_umount, see the comment
- * in get_tree_bdev() for more details
- */
- up_write(&sb->s_umount);
-
- /*
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
- error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
+ error = xfs_blkdev_get(mp, mp->m_logname, &logdev_file);
if (error)
- goto out_relock;
+ return error;
}
if (mp->m_rtname) {
- error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle);
+ error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_file);
if (error)
goto out_close_logdev;
- if (rtdev_handle->bdev == ddev ||
- (logdev_handle &&
- rtdev_handle->bdev == logdev_handle->bdev)) {
+ if (file_bdev(rtdev_file) == ddev ||
+ (logdev_file &&
+ file_bdev(rtdev_file) == file_bdev(logdev_file))) {
xfs_warn(mp,
"Cannot mount filesystem with identical rtdev and ddev/logdev.");
error = -EINVAL;
@@ -471,31 +467,28 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
- if (rtdev_handle) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle);
+ if (rtdev_file) {
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
- if (logdev_handle && logdev_handle->bdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle);
+ if (logdev_file && file_bdev(logdev_file) != ddev) {
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
mp->m_logdev_targp = mp->m_ddev_targp;
/* Handle won't be used, drop it */
- if (logdev_handle)
- bdev_release(logdev_handle);
+ if (logdev_file)
+ bdev_fput(logdev_file);
}
- error = 0;
-out_relock:
- down_write(&sb->s_umount);
- return error;
+ return 0;
out_free_rtdev_targ:
if (mp->m_rtdev_targp)
@@ -503,12 +496,12 @@ out_relock:
out_free_ddev_targ:
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
- if (rtdev_handle)
- bdev_release(rtdev_handle);
+ if (rtdev_file)
+ bdev_fput(rtdev_file);
out_close_logdev:
- if (logdev_handle)
- bdev_release(logdev_handle);
- goto out_relock;
+ if (logdev_file)
+ bdev_fput(logdev_file);
+ return error;
}
/*
@@ -723,9 +716,7 @@ xfs_fs_inode_init_once(
/* xfs inode */
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
-
- mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
- "xfsino", ip->i_ino);
+ init_rwsem(&ip->i_lock);
}
/*
@@ -758,10 +749,6 @@ static void
xfs_mount_free(
struct xfs_mount *mp)
{
- /*
- * Free the buftargs here because blkdev_put needs to be called outside
- * of sb->s_umount, which is held around the call to ->put_super.
- */
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_logdev_targp);
if (mp->m_rtdev_targp)
@@ -772,7 +759,7 @@ xfs_mount_free(
debugfs_remove(mp->m_debugfs);
kfree(mp->m_rtname);
kfree(mp->m_logname);
- kmem_free(mp);
+ kfree(mp);
}
STATIC int
@@ -896,7 +883,7 @@ xfs_fs_statfs(
statp->f_blocks = sbp->sb_rblocks;
freertx = percpu_counter_sum_positive(&mp->m_frextents);
- statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
+ statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx);
}
return 0;
@@ -905,10 +892,8 @@ xfs_fs_statfs(
STATIC void
xfs_save_resvblks(struct xfs_mount *mp)
{
- uint64_t resblks = 0;
-
mp->m_resblks_save = mp->m_resblks;
- xfs_reserve_blocks(mp, &resblks, NULL);
+ xfs_reserve_blocks(mp, 0);
}
STATIC void
@@ -922,7 +907,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
} else
resblks = xfs_default_resblks(mp);
- xfs_reserve_blocks(mp, &resblks, NULL);
+ xfs_reserve_blocks(mp, resblks);
}
/*
@@ -1509,6 +1494,18 @@ xfs_fs_fill_super(
mp->m_super = sb;
+ /*
+ * Copy VFS mount flags from the context now that all parameter parsing
+ * is guaranteed to have been completed by either the old mount API or
+ * the newer fsopen/fsconfig API.
+ */
+ if (fc->sb_flags & SB_RDONLY)
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+ if (fc->sb_flags & SB_DIRSYNC)
+ mp->m_features |= XFS_FEAT_DIRSYNC;
+ if (fc->sb_flags & SB_SYNCHRONOUS)
+ mp->m_features |= XFS_FEAT_WSYNC;
+
error = xfs_fs_validate_params(mp);
if (error)
return error;
@@ -1978,12 +1975,17 @@ static const struct fs_context_operations xfs_context_ops = {
.free = xfs_fs_free,
};
+/*
+ * WARNING: do not initialise any parameters in this function that depend on
+ * mount option parsing having already been performed as this can be called from
+ * fsopen() before any parameters have been set.
+ */
static int xfs_init_fs_context(
struct fs_context *fc)
{
struct xfs_mount *mp;
- mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO);
+ mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL);
if (!mp)
return -ENOMEM;
@@ -2009,15 +2011,7 @@ static int xfs_init_fs_context(
mp->m_logbsize = -1;
mp->m_allocsize_log = 16; /* 64k */
- /*
- * Copy binary VFS mount flags we are interested in.
- */
- if (fc->sb_flags & SB_RDONLY)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
- if (fc->sb_flags & SB_DIRSYNC)
- mp->m_features |= XFS_FEAT_DIRSYNC;
- if (fc->sb_flags & SB_SYNCHRONOUS)
- mp->m_features |= XFS_FEAT_WSYNC;
+ xfs_hooks_init(&mp->m_dir_update_hooks);
fc->s_fs_info = mp;
fc->ops = &xfs_context_ops;
@@ -2050,8 +2044,7 @@ xfs_init_caches(void)
xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
SLAB_HWCACHE_ALIGN |
- SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT,
NULL);
if (!xfs_buf_cache)
goto out;
@@ -2066,10 +2059,14 @@ xfs_init_caches(void)
if (error)
goto out_destroy_log_ticket_cache;
- error = xfs_defer_init_item_caches();
+ error = rcbagbt_init_cur_cache();
if (error)
goto out_destroy_btree_cur_cache;
+ error = xfs_defer_init_item_caches();
+ if (error)
+ goto out_destroy_rcbagbt_cur_cache;
+
xfs_da_state_cache = kmem_cache_create("xfs_da_state",
sizeof(struct xfs_da_state),
0, 0, NULL);
@@ -2116,14 +2113,14 @@ xfs_init_caches(void)
sizeof(struct xfs_inode), 0,
(SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ SLAB_ACCOUNT),
xfs_fs_inode_init_once);
if (!xfs_inode_cache)
goto out_destroy_efi_cache;
xfs_ili_cache = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT,
NULL);
if (!xfs_ili_cache)
goto out_destroy_inode_cache;
@@ -2226,6 +2223,8 @@ xfs_init_caches(void)
kmem_cache_destroy(xfs_da_state_cache);
out_destroy_defer_item_cache:
xfs_defer_destroy_item_caches();
+ out_destroy_rcbagbt_cur_cache:
+ rcbagbt_destroy_cur_cache();
out_destroy_btree_cur_cache:
xfs_btree_destroy_cur_caches();
out_destroy_log_ticket_cache:
@@ -2263,6 +2262,7 @@ xfs_destroy_caches(void)
kmem_cache_destroy(xfs_ifork_cache);
kmem_cache_destroy(xfs_da_state_cache);
xfs_defer_destroy_item_caches();
+ rcbagbt_destroy_cur_cache();
xfs_btree_destroy_cur_caches();
kmem_cache_destroy(xfs_log_ticket_cache);
kmem_cache_destroy(xfs_buf_cache);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 85e433df6a3f..3e376d24c7c1 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -23,77 +23,8 @@
#include "xfs_trans.h"
#include "xfs_ialloc.h"
#include "xfs_error.h"
-
-/* ----- Kernel only functions below ----- */
-int
-xfs_readlink_bmap_ilocked(
- struct xfs_inode *ip,
- char *link)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
- struct xfs_buf *bp;
- xfs_daddr_t d;
- char *cur_chunk;
- int pathlen = ip->i_disk_size;
- int nmaps = XFS_SYMLINK_MAPS;
- int byte_cnt;
- int n;
- int error = 0;
- int fsblocks = 0;
- int offset;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-
- fsblocks = xfs_symlink_blocks(mp, pathlen);
- error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
- if (error)
- goto out;
-
- offset = 0;
- for (n = 0; n < nmaps; n++) {
- d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
- byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
-
- error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
- &bp, &xfs_symlink_buf_ops);
- if (error)
- return error;
- byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
- if (pathlen < byte_cnt)
- byte_cnt = pathlen;
-
- cur_chunk = bp->b_addr;
- if (xfs_has_crc(mp)) {
- if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
- byte_cnt, bp)) {
- error = -EFSCORRUPTED;
- xfs_alert(mp,
-"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
- offset, byte_cnt, ip->i_ino);
- xfs_buf_relse(bp);
- goto out;
-
- }
-
- cur_chunk += sizeof(struct xfs_dsymlink_hdr);
- }
-
- memcpy(link + offset, cur_chunk, byte_cnt);
-
- pathlen -= byte_cnt;
- offset += byte_cnt;
-
- xfs_buf_relse(bp);
- }
- ASSERT(pathlen == 0);
-
- link[ip->i_disk_size] = '\0';
- error = 0;
-
- out:
- return error;
-}
+#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
int
xfs_readlink(
@@ -102,25 +33,27 @@ xfs_readlink(
{
struct xfs_mount *mp = ip->i_mount;
xfs_fsize_t pathlen;
- int error = -EFSCORRUPTED;
+ int error;
trace_xfs_readlink(ip);
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(ip, XFS_DATA_FORK))
+ return -EIO;
xfs_ilock(ip, XFS_ILOCK_SHARED);
pathlen = ip->i_disk_size;
if (!pathlen)
- goto out;
+ goto out_corrupt;
if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) {
xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
__func__, (unsigned long long) ip->i_ino,
(long long) pathlen);
ASSERT(0);
- goto out;
+ goto out_corrupt;
}
if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
@@ -128,18 +61,21 @@ xfs_readlink(
* The VFS crashes on a NULL pointer, so return -EFSCORRUPTED
* if if_data is junk.
*/
- if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data))
- goto out;
+ if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data))
+ goto out_corrupt;
- memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1);
+ memcpy(link, ip->i_df.if_data, pathlen + 1);
error = 0;
} else {
- error = xfs_readlink_bmap_ilocked(ip, link);
+ error = xfs_symlink_remote_read(ip, link);
}
- out:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return error;
+ out_corrupt:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+ return -EFSCORRUPTED;
}
int
@@ -157,15 +93,7 @@ xfs_symlink(
int error = 0;
int pathlen;
bool unlock_dp_on_error = false;
- xfs_fileoff_t first_fsb;
xfs_filblks_t fs_blocks;
- int nmaps;
- struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
- xfs_daddr_t d;
- const char *cur_chunk;
- int byte_cnt;
- int n;
- struct xfs_buf *bp;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -253,62 +181,11 @@ xfs_symlink(
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
resblks -= XFS_IALLOC_SPACE_RES(mp);
- /*
- * If the symlink will fit into the inode, write it inline.
- */
- if (pathlen <= xfs_inode_data_fork_size(ip)) {
- xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
-
- ip->i_disk_size = pathlen;
- ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
- } else {
- int offset;
-
- first_fsb = 0;
- nmaps = XFS_SYMLINK_MAPS;
-
- error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
- XFS_BMAPI_METADATA, resblks, mval, &nmaps);
- if (error)
- goto out_trans_cancel;
-
- resblks -= fs_blocks;
- ip->i_disk_size = pathlen;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- cur_chunk = target_path;
- offset = 0;
- for (n = 0; n < nmaps; n++) {
- char *buf;
-
- d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
- byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- BTOBB(byte_cnt), 0, &bp);
- if (error)
- goto out_trans_cancel;
- bp->b_ops = &xfs_symlink_buf_ops;
-
- byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
- byte_cnt = min(byte_cnt, pathlen);
-
- buf = bp->b_addr;
- buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
- byte_cnt, bp);
-
- memcpy(buf, cur_chunk, byte_cnt);
-
- cur_chunk += byte_cnt;
- pathlen -= byte_cnt;
- offset += byte_cnt;
-
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
- xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
- (char *)bp->b_addr);
- }
- ASSERT(pathlen == 0);
- }
+ error = xfs_symlink_write_target(tp, ip, target_path, pathlen,
+ fs_blocks, resblks);
+ if (error)
+ goto out_trans_cancel;
+ resblks -= fs_blocks;
i_size_write(VFS_I(ip), ip->i_disk_size);
/*
@@ -319,6 +196,7 @@ xfs_symlink(
goto out_trans_cancel;
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ xfs_dir_update_hook(dp, ip, 1, link_name);
/*
* If this is a synchronous mount, make sure that the
@@ -493,6 +371,7 @@ xfs_inactive_symlink(
__func__, (unsigned long long)ip->i_ino, pathlen);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
ASSERT(0);
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index d1ca1ce62a93..0d29a50e66fd 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -10,7 +10,6 @@
int xfs_symlink(struct mnt_idmap *idmap, struct xfs_inode *dp,
struct xfs_name *link_name, const char *target_path,
umode_t mode, struct xfs_inode **ipp);
-int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link);
int xfs_readlink(struct xfs_inode *ip, char *link);
int xfs_inactive_symlink(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index fade33735393..a191f6560f98 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -206,8 +206,6 @@ static struct ctl_table xfs_table[] = {
.extra2 = &xfs_params.stats_clear.max
},
#endif /* CONFIG_PROC_FS */
-
- {}
};
int
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index f78ad6b10ea5..276696a07040 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,8 @@ struct xfs_globals {
int pwork_threads; /* parallel workqueue threads */
bool larp; /* log attribute replay */
#endif
+ int bload_leaf_slack; /* btree bulk load leaf slack */
+ int bload_node_slack; /* btree bulk load node slack */
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index a3c6b1548723..d2391eec37fe 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -193,7 +193,6 @@ always_cow_show(
}
XFS_SYSFS_ATTR_RW(always_cow);
-#ifdef DEBUG
/*
* Override how many threads the parallel work queue is allowed to create.
* This has to be a debug-only global (instead of an errortag) because one of
@@ -229,6 +228,15 @@ pwork_threads_show(
}
XFS_SYSFS_ATTR_RW(pwork_threads);
+/*
+ * The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob
+ * sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on
+ * V5 filesystems. As a result, the intermediate progress of all setxattr and
+ * removexattr operations are tracked via the log and can be restarted during
+ * recovery. This is useful for testing xattr recovery prior to merging of the
+ * parent pointer feature which requires it to maintain consistency, and may be
+ * enabled for userspace xattrs in the future.
+ */
static ssize_t
larp_store(
struct kobject *kobject,
@@ -251,17 +259,68 @@ larp_show(
return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp);
}
XFS_SYSFS_ATTR_RW(larp);
-#endif /* DEBUG */
+
+STATIC ssize_t
+bload_leaf_slack_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ xfs_globals.bload_leaf_slack = val;
+ return count;
+}
+
+STATIC ssize_t
+bload_leaf_slack_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_leaf_slack);
+
+STATIC ssize_t
+bload_node_slack_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ xfs_globals.bload_node_slack = val;
+ return count;
+}
+
+STATIC ssize_t
+bload_node_slack_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_node_slack);
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
ATTR_LIST(mount_delay),
ATTR_LIST(always_cow),
-#ifdef DEBUG
ATTR_LIST(pwork_threads),
ATTR_LIST(larp),
-#endif
+ ATTR_LIST(bload_leaf_slack),
+ ATTR_LIST(bload_node_slack),
NULL,
};
ATTRIBUTE_GROUPS(xfs_dbg);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 8a5dc1538aa8..1a963382e5e9 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -36,6 +36,9 @@
#include "xfs_error.h"
#include <linux/iomap.h>
#include "xfs_iomap.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bmap.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 3926cf7f2a6e..aea97fc074f8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -67,6 +67,7 @@ struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
struct xfs_btree_cur;
+struct xfs_defer_op_type;
struct xfs_refcount_irec;
struct xfs_fsmap;
struct xfs_rmap_irec;
@@ -78,6 +79,9 @@ union xfs_btree_ptr;
struct xfs_dqtrx;
struct xfs_icwalk;
struct xfs_perag;
+struct xfbtree;
+struct xfs_btree_ops;
+struct xfs_bmap_intent;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -145,21 +149,23 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
TRACE_EVENT(xlog_intent_recovery_failed,
- TP_PROTO(struct xfs_mount *mp, int error, void *function),
- TP_ARGS(mp, error, function),
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
+ int error),
+ TP_ARGS(mp, ops, error),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __string(name, ops->name)
__field(int, error)
- __field(void *, function)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
+ __assign_str(name, ops->name);
__entry->error = error;
- __entry->function = function;
),
- TP_printk("dev %d:%d error %d function %pS",
+ TP_printk("dev %d:%d optype %s error %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->error, __entry->function)
+ __get_str(name),
+ __entry->error)
);
DECLARE_EVENT_CLASS(xfs_perag_class,
@@ -637,6 +643,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bdetach);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
@@ -1707,12 +1714,10 @@ DECLARE_EVENT_CLASS(xfs_agf_class,
__entry->agno = be32_to_cpu(agf->agf_seqno),
__entry->flags = flags;
__entry->length = be32_to_cpu(agf->agf_length),
- __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
- __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
- __entry->bno_level =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
- __entry->cnt_level =
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
+ __entry->bno_root = be32_to_cpu(agf->agf_bno_root),
+ __entry->cnt_root = be32_to_cpu(agf->agf_cnt_root),
+ __entry->bno_level = be32_to_cpu(agf->agf_bno_level),
+ __entry->cnt_level = be32_to_cpu(agf->agf_cnt_level),
__entry->flfirst = be32_to_cpu(agf->agf_flfirst),
__entry->fllast = be32_to_cpu(agf->agf_fllast),
__entry->flcount = be32_to_cpu(agf->agf_flcount),
@@ -1887,28 +1892,28 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_near_bno);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_finish);
TRACE_EVENT(xfs_alloc_cur_check,
- TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno,
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, xfs_extlen_t diff, bool new),
- TP_ARGS(mp, btnum, bno, len, diff, new),
+ TP_ARGS(cur, bno, len, diff, new),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agblock_t, bno)
__field(xfs_extlen_t, len)
__field(xfs_extlen_t, diff)
__field(bool, new)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->btnum = btnum;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __assign_str(name, cur->bc_ops->name);
__entry->bno = bno;
__entry->len = len;
__entry->diff = diff;
__entry->new = new;
),
- TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
+ TP_printk("dev %d:%d %sbt agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->bno, __entry->len, __entry->diff, __entry->new)
)
@@ -2449,21 +2454,12 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
DEFINE_DISCARD_EVENT(xfs_discard_exclude);
DEFINE_DISCARD_EVENT(xfs_discard_busy);
-/* btree cursor events */
-TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
-
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
TP_ARGS(cur, level, bp),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(int, level)
__field(int, nlevels)
__field(int, ptr)
@@ -2471,15 +2467,15 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_levels[level].ptr;
__entry->daddr = bp ? xfs_buf_daddr(bp) : -1;
),
- TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
+ TP_printk("dev %d:%d %sbt level %d/%d ptr %d daddr 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->nlevels,
__entry->ptr,
@@ -2493,6 +2489,90 @@ DEFINE_EVENT(xfs_btree_cur_class, name, \
DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys);
DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);
+TRACE_EVENT(xfs_btree_alloc_block,
+ TP_PROTO(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, int stat,
+ int error),
+ TP_ARGS(cur, ptr, stat, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __string(name, cur->bc_ops->name)
+ __field(int, error)
+ __field(xfs_agblock_t, agbno)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_INODE:
+ __entry->agno = 0;
+ __entry->ino = cur->bc_ino.ip->i_ino;
+ break;
+ case XFS_BTREE_TYPE_AG:
+ __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->ino = 0;
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ __entry->agno = 0;
+ __entry->ino = 0;
+ break;
+ }
+ __assign_str(name, cur->bc_ops->name);
+ __entry->error = error;
+ if (!error && stat) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+ xfs_fsblock_t fsb = be64_to_cpu(ptr->l);
+
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp,
+ fsb);
+ __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp,
+ fsb);
+ } else {
+ __entry->agbno = be32_to_cpu(ptr->s);
+ }
+ } else {
+ __entry->agbno = NULLAGBLOCK;
+ }
+ ),
+ TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __get_str(name),
+ __entry->agno,
+ __entry->ino,
+ __entry->agbno,
+ __entry->error)
+);
+
+TRACE_EVENT(xfs_btree_free_block,
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_buf *bp),
+ TP_ARGS(cur, bp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __string(name, cur->bc_ops->name)
+ __field(xfs_agblock_t, agbno)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = xfs_daddr_to_agno(cur->bc_mp,
+ xfs_buf_daddr(bp));
+ if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
+ __entry->ino = cur->bc_ino.ip->i_ino;
+ else
+ __entry->ino = 0;
+ __assign_str(name, cur->bc_ops->name);
+ __entry->agbno = xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp));
+ ),
+ TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __get_str(name),
+ __entry->agno,
+ __entry->ino,
+ __entry->agbno)
+);
+
/* deferred ops */
struct xfs_defer_pending;
@@ -2549,22 +2629,25 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
TP_ARGS(mp, dfp),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(int, type)
+ __string(name, dfp->dfp_ops->name)
__field(void *, intent)
+ __field(unsigned int, flags)
__field(char, committed)
__field(int, nr)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __entry->type = dfp->dfp_type;
+ __assign_str(name, dfp->dfp_ops->name);
__entry->intent = dfp->dfp_intent;
+ __entry->flags = dfp->dfp_flags;
__entry->committed = dfp->dfp_done != NULL;
__entry->nr = dfp->dfp_count;
),
- TP_printk("dev %d:%d optype %d intent %p committed %d nr %d",
+ TP_printk("dev %d:%d optype %s intent %p flags %s committed %d nr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __get_str(name),
__entry->intent,
+ __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS),
__entry->committed,
__entry->nr)
)
@@ -2573,7 +2656,25 @@ DEFINE_EVENT(xfs_defer_pending_class, name, \
TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \
TP_ARGS(mp, dfp))
-DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
+DEFINE_DEFER_EVENT(xfs_defer_cancel);
+DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
+DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
+DEFINE_DEFER_EVENT(xfs_defer_finish);
+DEFINE_DEFER_EVENT(xfs_defer_finish_done);
+
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
+
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
+
+DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
int type, xfs_agblock_t agbno, xfs_extlen_t len),
TP_ARGS(mp, agno, type, agbno, len),
@@ -2598,89 +2699,17 @@ DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
__entry->agbno,
__entry->len)
);
-#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \
-DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \
+#define DEFINE_FREE_EXTENT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_free_extent_deferred_class, name, \
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
int type, \
xfs_agblock_t bno, \
xfs_extlen_t len), \
TP_ARGS(mp, agno, type, bno, len))
-
-DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int op,
- xfs_agblock_t agbno,
- xfs_ino_t ino,
- int whichfork,
- xfs_fileoff_t offset,
- xfs_filblks_t len,
- xfs_exntst_t state),
- TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_agnumber_t, agno)
- __field(xfs_ino_t, ino)
- __field(xfs_agblock_t, agbno)
- __field(int, whichfork)
- __field(xfs_fileoff_t, l_loff)
- __field(xfs_filblks_t, l_len)
- __field(xfs_exntst_t, l_state)
- __field(int, op)
- ),
- TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->ino = ino;
- __entry->agbno = agbno;
- __entry->whichfork = whichfork;
- __entry->l_loff = offset;
- __entry->l_len = len;
- __entry->l_state = state;
- __entry->op = op;
- ),
- TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->op,
- __entry->agno,
- __entry->agbno,
- __entry->ino,
- __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
- __entry->l_loff,
- __entry->l_len,
- __entry->l_state)
-);
-#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \
-DEFINE_EVENT(xfs_map_extent_deferred_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- int op, \
- xfs_agblock_t agbno, \
- xfs_ino_t ino, \
- int whichfork, \
- xfs_fileoff_t offset, \
- xfs_filblks_t len, \
- xfs_exntst_t state), \
- TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
-
-DEFINE_DEFER_EVENT(xfs_defer_cancel);
-DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
-DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
-DEFINE_DEFER_EVENT(xfs_defer_finish);
-DEFINE_DEFER_EVENT(xfs_defer_finish_done);
-
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
-
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
-
-#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_defer);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_deferred);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred);
DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,
@@ -2688,25 +2717,28 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
TP_ARGS(mp, dfp, item),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(int, type)
+ __string(name, dfp->dfp_ops->name)
__field(void *, intent)
__field(void *, item)
__field(char, committed)
+ __field(unsigned int, flags)
__field(int, nr)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __entry->type = dfp->dfp_type;
+ __assign_str(name, dfp->dfp_ops->name);
__entry->intent = dfp->dfp_intent;
__entry->item = item;
__entry->committed = dfp->dfp_done != NULL;
+ __entry->flags = dfp->dfp_flags;
__entry->nr = dfp->dfp_count;
),
- TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d",
+ TP_printk("dev %d:%d optype %s intent %p item %p flags %s committed %d nr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __get_str(name),
__entry->intent,
__entry->item,
+ __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS),
__entry->committed,
__entry->nr)
)
@@ -2842,12 +2874,63 @@ DEFINE_EVENT(xfs_rmapbt_class, name, \
uint64_t owner, uint64_t offset, unsigned int flags), \
TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
-#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT
+DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int op,
+ xfs_agblock_t agbno,
+ xfs_ino_t ino,
+ int whichfork,
+ xfs_fileoff_t offset,
+ xfs_filblks_t len,
+ xfs_exntst_t state),
+ TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agblock_t, agbno)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, l_loff)
+ __field(xfs_filblks_t, l_len)
+ __field(xfs_exntst_t, l_state)
+ __field(int, op)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->ino = ino;
+ __entry->agbno = agbno;
+ __entry->whichfork = whichfork;
+ __entry->l_loff = offset;
+ __entry->l_len = len;
+ __entry->l_state = state;
+ __entry->op = op;
+ ),
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->op,
+ __entry->agno,
+ __entry->agbno,
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->l_loff,
+ __entry->l_len,
+ __entry->l_state)
+);
+#define DEFINE_RMAP_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_rmap_deferred_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ int op, \
+ xfs_agblock_t agbno, \
+ xfs_ino_t ino, \
+ int whichfork, \
+ xfs_fileoff_t offset, \
+ xfs_filblks_t len, \
+ xfs_exntst_t state), \
+ TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer);
DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);
-DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block);
-DEFINE_BUSY_EVENT(xfs_rmapbt_free_block);
DEFINE_RMAPBT_EVENT(xfs_rmap_update);
DEFINE_RMAPBT_EVENT(xfs_rmap_insert);
DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
@@ -2864,7 +2947,66 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
/* deferred bmbt updates */
-#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT
+TRACE_DEFINE_ENUM(XFS_BMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_BMAP_UNMAP);
+
+DECLARE_EVENT_CLASS(xfs_bmap_deferred_class,
+ TP_PROTO(struct xfs_bmap_intent *bi),
+ TP_ARGS(bi),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, opdev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_fsblock_t, rtbno)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, l_loff)
+ __field(xfs_filblks_t, l_len)
+ __field(xfs_exntst_t, l_state)
+ __field(int, op)
+ ),
+ TP_fast_assign(
+ struct xfs_inode *ip = bi->bi_owner;
+
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ if (xfs_ifork_is_realtime(ip, bi->bi_whichfork)) {
+ __entry->agno = 0;
+ __entry->agbno = 0;
+ __entry->rtbno = bi->bi_bmap.br_startblock;
+ __entry->opdev = ip->i_mount->m_rtdev_targp->bt_dev;
+ } else {
+ __entry->agno = XFS_FSB_TO_AGNO(ip->i_mount,
+ bi->bi_bmap.br_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount,
+ bi->bi_bmap.br_startblock);
+ __entry->rtbno = 0;
+ __entry->opdev = __entry->dev;
+ }
+ __entry->ino = ip->i_ino;
+ __entry->whichfork = bi->bi_whichfork;
+ __entry->l_loff = bi->bi_bmap.br_startoff;
+ __entry->l_len = bi->bi_bmap.br_blockcount;
+ __entry->l_state = bi->bi_bmap.br_state;
+ __entry->op = bi->bi_type;
+ ),
+ TP_printk("dev %d:%d op %s opdev %d:%d ino 0x%llx agno 0x%x agbno 0x%x rtbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS),
+ MAJOR(__entry->opdev), MINOR(__entry->opdev),
+ __entry->ino,
+ __entry->agno,
+ __entry->agbno,
+ __entry->rtbno,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->l_loff,
+ __entry->l_len,
+ __entry->l_state)
+);
+#define DEFINE_BMAP_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_bmap_deferred_class, name, \
+ TP_PROTO(struct xfs_bmap_intent *bi), \
+ TP_ARGS(bi))
DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer);
DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred);
@@ -3205,8 +3347,6 @@ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
TP_ARGS(mp, agno, i1, i2, i3))
/* refcount btree tracepoints */
-DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block);
-DEFINE_BUSY_EVENT(xfs_refcountbt_free_block);
DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
@@ -3243,7 +3383,39 @@ DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
-#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+
+DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int type, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, type, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, type)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->type = type;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+#define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_deferred_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ int type, \
+ xfs_agblock_t bno, \
+ xfs_extlen_t len), \
+ TP_ARGS(mp, agno, type, bno, len))
DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
@@ -3914,9 +4086,11 @@ DEFINE_EVENT(xfs_fs_corrupt_class, name, \
TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
TP_ARGS(mp, flags))
DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_corrupt);
DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_corrupt);
DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
@@ -3943,6 +4117,7 @@ DEFINE_EVENT(xfs_ag_corrupt_class, name, \
unsigned int flags), \
TP_ARGS(mp, agno, flags))
DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_corrupt);
DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
@@ -3968,7 +4143,9 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
TP_ARGS(ip, flags))
DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt);
DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_unfixed_corruption);
TRACE_EVENT(xfs_iwalk_ag,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -4028,31 +4205,6 @@ TRACE_EVENT(xfs_pwork_init,
__entry->nr_threads, __entry->pid)
)
-DECLARE_EVENT_CLASS(xfs_kmem_class,
- TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
- TP_ARGS(size, flags, caller_ip),
- TP_STRUCT__entry(
- __field(ssize_t, size)
- __field(int, flags)
- __field(unsigned long, caller_ip)
- ),
- TP_fast_assign(
- __entry->size = size;
- __entry->flags = flags;
- __entry->caller_ip = caller_ip;
- ),
- TP_printk("size %zd flags 0x%x caller %pS",
- __entry->size,
- __entry->flags,
- (char *)__entry->caller_ip)
-)
-
-#define DEFINE_KMEM_EVENT(name) \
-DEFINE_EVENT(xfs_kmem_class, name, \
- TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
- TP_ARGS(size, flags, caller_ip))
-DEFINE_KMEM_EVENT(kmem_alloc);
-
TRACE_EVENT(xfs_check_new_dalign,
TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
TP_ARGS(mp, new_dalign, calc_rootino),
@@ -4079,7 +4231,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
TP_ARGS(cur),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(unsigned int, levels)
@@ -4087,15 +4239,15 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->agno = cur->bc_ag.pag->pag_agno;
__entry->agbno = cur->bc_ag.afake->af_root;
__entry->levels = cur->bc_ag.afake->af_levels;
__entry->blocks = cur->bc_ag.afake->af_blocks;
),
- TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u",
+ TP_printk("dev %d:%d %sbt agno 0x%x levels %u blocks %u root %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->agno,
__entry->levels,
__entry->blocks,
@@ -4107,7 +4259,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
TP_ARGS(cur),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(xfs_agnumber_t, agno)
__field(xfs_agino_t, agino)
__field(unsigned int, levels)
@@ -4116,7 +4268,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
cur->bc_ino.ip->i_ino);
__entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
@@ -4125,9 +4277,9 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
__entry->blocks = cur->bc_ino.ifake->if_blocks;
__entry->whichfork = cur->bc_ino.whichfork;
),
- TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
+ TP_printk("dev %d:%d %sbt agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->agno,
__entry->agino,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
@@ -4144,7 +4296,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
blocks_with_extra),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(unsigned int, level)
__field(unsigned int, nlevels)
__field(uint64_t, nr_this_level)
@@ -4155,7 +4307,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->nr_this_level = nr_this_level;
@@ -4164,9 +4316,9 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
__entry->blocks = blocks;
__entry->blocks_with_extra = blocks_with_extra;
),
- TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
+ TP_printk("dev %d:%d %sbt level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->nlevels,
__entry->nr_this_level,
@@ -4183,7 +4335,7 @@ TRACE_EVENT(xfs_btree_bload_block,
TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_btnum_t, btnum)
+ __string(name, cur->bc_ops->name)
__field(unsigned int, level)
__field(unsigned long long, block_idx)
__field(unsigned long long, nr_blocks)
@@ -4193,11 +4345,11 @@ TRACE_EVENT(xfs_btree_bload_block,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->btnum = cur->bc_btnum;
+ __assign_str(name, cur->bc_ops->name);
__entry->level = level;
__entry->block_idx = block_idx;
__entry->nr_blocks = nr_blocks;
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
xfs_fsblock_t fsb = be64_to_cpu(ptr->l);
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
@@ -4208,9 +4360,9 @@ TRACE_EVENT(xfs_btree_bload_block,
}
__entry->nr_records = nr_records;
),
- TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
+ TP_printk("dev %d:%d %sbt level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __get_str(name),
__entry->level,
__entry->block_idx,
__entry->nr_blocks,
@@ -4399,8 +4551,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
-DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace);
-DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove);
TRACE_EVENT(xfs_force_shutdown,
@@ -4462,6 +4612,164 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
#endif /* CONFIG_XFS_DRAIN_INTENTS */
+#ifdef CONFIG_XFS_MEMORY_BUFS
+TRACE_EVENT(xmbuf_create,
+ TP_PROTO(struct xfs_buftarg *btp),
+ TP_ARGS(btp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, ino)
+ __array(char, pathname, 256)
+ ),
+ TP_fast_assign(
+ char pathname[257];
+ char *path;
+ struct file *file = btp->bt_file;
+
+ __entry->dev = btp->bt_mount->m_super->s_dev;
+ __entry->ino = file_inode(file)->i_ino;
+ memset(pathname, 0, sizeof(pathname));
+ path = file_path(file, pathname, sizeof(pathname) - 1);
+ if (IS_ERR(path))
+ path = "(unknown)";
+ strncpy(__entry->pathname, path, sizeof(__entry->pathname));
+ ),
+ TP_printk("dev %d:%d xmino 0x%lx path '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pathname)
+);
+
+TRACE_EVENT(xmbuf_free,
+ TP_PROTO(struct xfs_buftarg *btp),
+ TP_ARGS(btp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, ino)
+ __field(unsigned long long, bytes)
+ __field(loff_t, size)
+ ),
+ TP_fast_assign(
+ struct file *file = btp->bt_file;
+ struct inode *inode = file_inode(file);
+
+ __entry->dev = btp->bt_mount->m_super->s_dev;
+ __entry->size = i_size_read(inode);
+ __entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes;
+ __entry->ino = inode->i_ino;
+ ),
+ TP_printk("dev %d:%d xmino 0x%lx mem_bytes 0x%llx isize 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->bytes,
+ __entry->size)
+);
+#endif /* CONFIG_XFS_MEMORY_BUFS */
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+TRACE_EVENT(xfbtree_init,
+ TP_PROTO(struct xfs_mount *mp, struct xfbtree *xfbt,
+ const struct xfs_btree_ops *ops),
+ TP_ARGS(mp, xfbt, ops),
+ TP_STRUCT__entry(
+ __field(const void *, btree_ops)
+ __field(unsigned long, xfino)
+ __field(unsigned int, leaf_mxr)
+ __field(unsigned int, leaf_mnr)
+ __field(unsigned int, node_mxr)
+ __field(unsigned int, node_mnr)
+ __field(unsigned long long, owner)
+ ),
+ TP_fast_assign(
+ __entry->btree_ops = ops;
+ __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+ __entry->leaf_mxr = xfbt->maxrecs[0];
+ __entry->node_mxr = xfbt->maxrecs[1];
+ __entry->leaf_mnr = xfbt->minrecs[0];
+ __entry->node_mnr = xfbt->minrecs[1];
+ __entry->owner = xfbt->owner;
+ ),
+ TP_printk("xfino 0x%lx btree_ops %pS owner 0x%llx leaf_mxr %u leaf_mnr %u node_mxr %u node_mnr %u",
+ __entry->xfino,
+ __entry->btree_ops,
+ __entry->owner,
+ __entry->leaf_mxr,
+ __entry->leaf_mnr,
+ __entry->node_mxr,
+ __entry->node_mnr)
+);
+
+DECLARE_EVENT_CLASS(xfbtree_buf_class,
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp),
+ TP_ARGS(xfbt, bp),
+ TP_STRUCT__entry(
+ __field(unsigned long, xfino)
+ __field(xfs_daddr_t, bno)
+ __field(int, nblks)
+ __field(int, hold)
+ __field(int, pincount)
+ __field(unsigned int, lockval)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+ __entry->bno = xfs_buf_daddr(bp);
+ __entry->nblks = bp->b_length;
+ __entry->hold = atomic_read(&bp->b_hold);
+ __entry->pincount = atomic_read(&bp->b_pin_count);
+ __entry->lockval = bp->b_sema.count;
+ __entry->flags = bp->b_flags;
+ ),
+ TP_printk("xfino 0x%lx daddr 0x%llx bbcount 0x%x hold %d pincount %d lock %d flags %s",
+ __entry->xfino,
+ (unsigned long long)__entry->bno,
+ __entry->nblks,
+ __entry->hold,
+ __entry->pincount,
+ __entry->lockval,
+ __print_flags(__entry->flags, "|", XFS_BUF_FLAGS))
+)
+
+#define DEFINE_XFBTREE_BUF_EVENT(name) \
+DEFINE_EVENT(xfbtree_buf_class, name, \
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), \
+ TP_ARGS(xfbt, bp))
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_create_root_buf);
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_commit_buf);
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_cancel_buf);
+
+DECLARE_EVENT_CLASS(xfbtree_freesp_class,
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur,
+ xfs_fileoff_t fileoff),
+ TP_ARGS(xfbt, cur, fileoff),
+ TP_STRUCT__entry(
+ __field(unsigned long, xfino)
+ __string(btname, cur->bc_ops->name)
+ __field(int, nlevels)
+ __field(xfs_fileoff_t, fileoff)
+ ),
+ TP_fast_assign(
+ __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+ __assign_str(btname, cur->bc_ops->name);
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->fileoff = fileoff;
+ ),
+ TP_printk("xfino 0x%lx %sbt nlevels %d fileoff 0x%llx",
+ __entry->xfino,
+ __get_str(btname),
+ __entry->nlevels,
+ (unsigned long long)__entry->fileoff)
+)
+
+#define DEFINE_XFBTREE_FREESP_EVENT(name) \
+DEFINE_EVENT(xfbtree_freesp_class, name, \
+ TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, \
+ xfs_fileoff_t fileoff), \
+ TP_ARGS(xfbt, cur, fileoff))
+DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block);
+DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block);
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8c0bfc9a33b1..7350640059cc 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -24,6 +24,7 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_icache.h"
+#include "xfs_rtbitmap.h"
struct kmem_cache *xfs_trans_cache;
@@ -655,6 +656,10 @@ xfs_trans_unreserve_and_mod_sb(
mp->m_sb.sb_agcount += tp->t_agcount_delta;
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
mp->m_sb.sb_rextsize += tp->t_rextsize_delta;
+ if (tp->t_rextsize_delta) {
+ mp->m_rtxblklog = log2_if_power2(mp->m_sb.sb_rextsize);
+ mp->m_rtxblkmask = mask64_if_power2(mp->m_sb.sb_rextsize);
+ }
mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta;
mp->m_sb.sb_rblocks += tp->t_rblocks_delta;
mp->m_sb.sb_rextents += tp->t_rextents_delta;
@@ -1196,7 +1201,7 @@ xfs_trans_alloc_inode(
retry:
error = xfs_trans_alloc(mp, resv, dblocks,
- rblocks / mp->m_sb.sb_rextsize,
+ xfs_extlen_to_rtxlen(mp, rblocks),
force ? XFS_TRANS_RESERVE : 0, &tp);
if (error)
return error;
@@ -1232,6 +1237,68 @@ out_cancel:
}
/*
+ * Try to reserve more blocks for a transaction.
+ *
+ * This is for callers that need to attach resources to a transaction, scan
+ * those resources to determine the space reservation requirements, and then
+ * modify the attached resources. In other words, online repair. This can
+ * fail due to ENOSPC, so the caller must be able to cancel the transaction
+ * without shutting down the fs.
+ */
+int
+xfs_trans_reserve_more(
+ struct xfs_trans *tp,
+ unsigned int blocks,
+ unsigned int rtextents)
+{
+ struct xfs_trans_res resv = { };
+
+ return xfs_trans_reserve(tp, &resv, blocks, rtextents);
+}
+
+/*
+ * Try to reserve more blocks and file quota for a transaction. Same
+ * conditions of usage as xfs_trans_reserve_more.
+ */
+int
+xfs_trans_reserve_more_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ unsigned int dblocks,
+ unsigned int rblocks,
+ bool force_quota)
+{
+ struct xfs_trans_res resv = { };
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks);
+ int error;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+
+ error = xfs_trans_reserve(tp, &resv, dblocks, rtx);
+ if (error)
+ return error;
+
+ if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
+ return 0;
+
+ if (tp->t_flags & XFS_TRANS_RESERVE)
+ force_quota = true;
+
+ error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks,
+ force_quota);
+ if (!error)
+ return 0;
+
+ /* Quota failed, give back the new reservation. */
+ xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE);
+ tp->t_blk_res -= dblocks;
+ xfs_mod_frextents(mp, rtx);
+ tp->t_rtx_res -= rtx;
+ return error;
+}
+
+/*
* Allocate an transaction in preparation for inode creation by reserving quota
* against the given dquots. Callers are not required to hold any inode locks.
*/
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 6e3646d524ce..1636663707dc 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -78,11 +78,7 @@ struct xfs_item_ops {
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
void (*iop_release)(struct xfs_log_item *);
- int (*iop_recover)(struct xfs_log_item *lip,
- struct list_head *capture_list);
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
- struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
- struct xfs_trans *tp);
struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
};
@@ -168,6 +164,8 @@ typedef struct xfs_trans {
int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
uint blocks, uint rtextents, uint flags,
struct xfs_trans **tpp);
+int xfs_trans_reserve_more(struct xfs_trans *tp,
+ unsigned int blocks, unsigned int rtextents);
int xfs_trans_alloc_empty(struct xfs_mount *mp,
struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -217,6 +215,7 @@ struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
@@ -247,19 +246,13 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
extern struct kmem_cache *xfs_trans_cache;
-static inline struct xfs_log_item *
-xfs_trans_item_relog(
- struct xfs_log_item *lip,
- struct xfs_trans *tp)
-{
- return lip->li_ops->iop_relog(lip, tp);
-}
-
struct xfs_dquot;
int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv,
unsigned int dblocks, unsigned int rblocks, bool force,
struct xfs_trans **tpp);
+int xfs_trans_reserve_more_inode(struct xfs_trans *tp, struct xfs_inode *ip,
+ unsigned int dblocks, unsigned int rblocks, bool force_quota);
int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
struct xfs_dquot *pdqp, unsigned int dblocks,
@@ -275,19 +268,14 @@ static inline void
xfs_trans_set_context(
struct xfs_trans *tp)
{
- ASSERT(current->journal_info == NULL);
tp->t_pflags = memalloc_nofs_save();
- current->journal_info = tp;
}
static inline void
xfs_trans_clear_context(
struct xfs_trans *tp)
{
- if (current->journal_info == tp) {
- memalloc_nofs_restore(tp->t_pflags);
- current->journal_info = NULL;
- }
+ memalloc_nofs_restore(tp->t_pflags);
}
static inline void
@@ -295,10 +283,8 @@ xfs_trans_switch_context(
struct xfs_trans *old_tp,
struct xfs_trans *new_tp)
{
- ASSERT(current->journal_info == old_tp);
new_tp->t_pflags = old_tp->t_pflags;
old_tp->t_pflags = 0;
- current->journal_info = new_tp;
}
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1098452e7f95..e4c343096f95 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -901,7 +901,8 @@ xfs_trans_ail_init(
{
struct xfs_ail *ailp;
- ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
+ ailp = kzalloc(sizeof(struct xfs_ail),
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!ailp)
return -ENOMEM;
@@ -921,7 +922,7 @@ xfs_trans_ail_init(
return 0;
out_free_ailp:
- kmem_free(ailp);
+ kfree(ailp);
return -ENOMEM;
}
@@ -932,5 +933,5 @@ xfs_trans_ail_destroy(
struct xfs_ail *ailp = mp->m_ail;
kthread_stop(ailp->ail_task);
- kmem_free(ailp);
+ kfree(ailp);
}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6549e50d852c..e28ab74af4f0 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -393,6 +393,48 @@ xfs_trans_brelse(
}
/*
+ * Forcibly detach a buffer previously joined to the transaction. The caller
+ * will retain its locked reference to the buffer after this function returns.
+ * The buffer must be completely clean and must not be held to the transaction.
+ */
+void
+xfs_trans_bdetach(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ ASSERT(tp != NULL);
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ trace_xfs_trans_bdetach(bip);
+
+ /*
+ * Erase all recursion count, since we're removing this buffer from the
+ * transaction.
+ */
+ bip->bli_recur = 0;
+
+ /*
+ * The buffer must be completely clean. Specifically, it had better
+ * not be dirty, stale, logged, ordered, or held to the transaction.
+ */
+ ASSERT(!test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags));
+ ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY));
+ ASSERT(!(bip->bli_flags & XFS_BLI_HOLD));
+ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+
+ /* Unlink the log item from the transaction and drop the log item. */
+ xfs_trans_del_item(&bip->bli_item);
+ xfs_buf_item_put(bip);
+ bp->b_transp = NULL;
+}
+
+/*
* Mark the buffer as not needing to be unlocked when the buf item's
* iop_committing() routine is called. The buffer must already be locked
* and associated with the given transaction.
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index aa00cf67ad72..577b535a595c 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,6 +17,7 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_health.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -120,6 +121,116 @@ xfs_trans_dup_dqinfo(
}
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of quota live updates. If the
+ * compiler supports jump labels, the static branch will be replaced by a nop
+ * sled when there are no hook users. Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dqtrx_hooks_switch);
+
+void
+xfs_dqtrx_hook_disable(void)
+{
+ xfs_hooks_switch_off(&xfs_dqtrx_hooks_switch);
+}
+
+void
+xfs_dqtrx_hook_enable(void)
+{
+ xfs_hooks_switch_on(&xfs_dqtrx_hooks_switch);
+}
+
+/* Schedule a transactional dquot update on behalf of an inode. */
+void
+xfs_trans_mod_ino_dquot(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot *dqp,
+ unsigned int field,
+ int64_t delta)
+{
+ xfs_trans_mod_dquot(tp, dqp, field, delta);
+
+ if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+ struct xfs_mod_ino_dqtrx_params p = {
+ .tx_id = (uintptr_t)tp,
+ .ino = ip->i_ino,
+ .q_type = xfs_dquot_type(dqp),
+ .q_id = dqp->q_id,
+ .delta = delta
+ };
+ struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo;
+
+ xfs_hooks_call(&qi->qi_mod_ino_dqtrx_hooks, field, &p);
+ }
+}
+
+/* Call the specified functions during a dquot counter update. */
+int
+xfs_dqtrx_hook_add(
+ struct xfs_quotainfo *qi,
+ struct xfs_dqtrx_hook *hook)
+{
+ int error;
+
+ /*
+ * Transactional dquot updates first call the mod hook when changes
+ * are attached to the transaction and then call the apply hook when
+ * those changes are committed (or canceled).
+ *
+ * The apply hook must be installed before the mod hook so that we
+ * never fail to catch the end of a quota update sequence.
+ */
+ error = xfs_hooks_add(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+ if (error)
+ goto out;
+
+ error = xfs_hooks_add(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook);
+ if (error)
+ goto out_apply;
+
+ return 0;
+
+out_apply:
+ xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+out:
+ return error;
+}
+
+/* Stop calling the specified function during a dquot counter update. */
+void
+xfs_dqtrx_hook_del(
+ struct xfs_quotainfo *qi,
+ struct xfs_dqtrx_hook *hook)
+{
+ /*
+ * The mod hook must be removed before apply hook to avoid giving the
+ * hook consumer with an incomplete update. No hooks should be running
+ * after these functions return.
+ */
+ xfs_hooks_del(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook);
+ xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+}
+
+/* Configure dquot update hook functions. */
+void
+xfs_dqtrx_hook_setup(
+ struct xfs_dqtrx_hook *hook,
+ notifier_fn_t mod_fn,
+ notifier_fn_t apply_fn)
+{
+ xfs_hook_setup(&hook->mod_hook, mod_fn);
+ xfs_hook_setup(&hook->apply_hook, apply_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
/*
* Wrap around mod_dquot to account for both user and group quotas.
*/
@@ -137,11 +248,11 @@ xfs_trans_mod_dquot_byino(
return;
if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
- (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+ xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta);
if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
- (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+ xfs_trans_mod_ino_dquot(tp, ip, ip->i_gdquot, field, delta);
if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
- (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
+ xfs_trans_mod_ino_dquot(tp, ip, ip->i_pdquot, field, delta);
}
STATIC struct xfs_dqtrx *
@@ -321,6 +432,29 @@ xfs_apply_quota_reservation_deltas(
}
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/* Call downstream hooks now that it's time to apply dquot deltas. */
+static inline void
+xfs_trans_apply_dquot_deltas_hook(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
+{
+ if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+ struct xfs_apply_dqtrx_params p = {
+ .tx_id = (uintptr_t)tp,
+ .q_type = xfs_dquot_type(dqp),
+ .q_id = dqp->q_id,
+ };
+ struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo;
+
+ xfs_hooks_call(&qi->qi_apply_dqtrx_hooks,
+ XFS_APPLY_DQTRX_COMMIT, &p);
+ }
+}
+#else
+# define xfs_trans_apply_dquot_deltas_hook(tp, dqp) ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
/*
* Called by xfs_trans_commit() and similar in spirit to
* xfs_trans_apply_sb_deltas().
@@ -366,6 +500,8 @@ xfs_trans_apply_dquot_deltas(
ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ xfs_trans_apply_dquot_deltas_hook(tp, dqp);
+
/*
* adjust the actual number of blocks used
*/
@@ -465,6 +601,29 @@ xfs_trans_apply_dquot_deltas(
}
}
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/* Call downstream hooks now that it's time to cancel dquot deltas. */
+static inline void
+xfs_trans_unreserve_and_mod_dquots_hook(
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
+{
+ if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+ struct xfs_apply_dqtrx_params p = {
+ .tx_id = (uintptr_t)tp,
+ .q_type = xfs_dquot_type(dqp),
+ .q_id = dqp->q_id,
+ };
+ struct xfs_quotainfo *qi = tp->t_mountp->m_quotainfo;
+
+ xfs_hooks_call(&qi->qi_apply_dqtrx_hooks,
+ XFS_APPLY_DQTRX_UNRESERVE, &p);
+ }
+}
+#else
+# define xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp) ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
/*
* Release the reservations, and adjust the dquots accordingly.
* This is called only when the transaction is being aborted. If by
@@ -495,6 +654,9 @@ xfs_trans_unreserve_and_mod_dquots(
*/
if ((dqp = qtrx->qt_dquot) == NULL)
break;
+
+ xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp);
+
/*
* Unreserve the original reservation. We don't care
* about the number of blocks used field, or deltas.
@@ -706,6 +868,7 @@ error_return:
error_corrupt:
xfs_dqunlock(dqp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK);
return -EFSCORRUPTED;
}
@@ -796,7 +959,7 @@ xfs_trans_reserve_quota_nblks(
return 0;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (force)
qflags |= XFS_QMOPT_FORCE_RES;
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 987843f84d03..364104e1b38a 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -136,6 +136,9 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
};
int error;
+ if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK))
+ return -EIO;
+
error = xfs_attr_get(&args);
if (error)
return error;
@@ -294,6 +297,9 @@ xfs_vn_listxattr(
struct inode *inode = d_inode(dentry);
int error;
+ if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK))
+ return -EIO;
+
/*
* First read the regular on-disk attributes.
*/