 fs/xfs/Kconfig                |    5
 fs/xfs/Makefile               |    2
 fs/xfs/libxfs/xfs_da_format.h |   11
 fs/xfs/libxfs/xfs_dir2.c      |    6
 fs/xfs/libxfs/xfs_dir2.h      |   13
 fs/xfs/scrub/dir.c            |    4
 fs/xfs/scrub/inode_repair.c   |  236
 fs/xfs/scrub/iscan.c          |  767
 fs/xfs/scrub/iscan.h          |   84
 fs/xfs/scrub/trace.c          |    2
 fs/xfs/scrub/trace.h          |  194
 fs/xfs/xfs_hooks.c            |   52
 fs/xfs/xfs_hooks.h            |   65
 fs/xfs/xfs_iwalk.c            |   13
 fs/xfs/xfs_linux.h            |    1
 15 files changed, 1436 insertions(+), 19 deletions(-)
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 567fb37274d3..fa7eb3e2a248 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -124,11 +124,16 @@ config XFS_DRAIN_INTENTS
bool
select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+config XFS_LIVE_HOOKS
+ bool
+ select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
depends on XFS_FS
depends on TMPFS && SHMEM
+ select XFS_LIVE_HOOKS
select XFS_DRAIN_INTENTS
help
If you say Y here you will be able to check metadata on a
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 35a23427055b..4597c0f19e8e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -136,6 +136,7 @@ xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o
endif
xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o
+xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o
# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
@@ -158,6 +159,7 @@ xfs-y += $(addprefix scrub/, \
health.o \
ialloc.o \
inode.o \
+ iscan.o \
parent.o \
readdir.o \
refcount.o \
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 24f9d1461f9a..060e5c96b70f 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -159,6 +159,17 @@ struct xfs_da3_intnode {
#define XFS_DIR3_FT_MAX 9
+#define XFS_DIR3_FTYPE_STR \
+ { XFS_DIR3_FT_UNKNOWN, "unknown" }, \
+ { XFS_DIR3_FT_REG_FILE, "file" }, \
+ { XFS_DIR3_FT_DIR, "directory" }, \
+ { XFS_DIR3_FT_CHRDEV, "char" }, \
+ { XFS_DIR3_FT_BLKDEV, "block" }, \
+ { XFS_DIR3_FT_FIFO, "fifo" }, \
+ { XFS_DIR3_FT_SOCK, "sock" }, \
+ { XFS_DIR3_FT_SYMLINK, "symlink" }, \
+ { XFS_DIR3_FT_WHT, "whiteout" }
+
/*
* Byte offset in data block and shortform entry.
*/
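A note on how this table is consumed: the trace.h hunks below hand it to ftrace's __print_symbolic(), which expects exactly this list of { value, "string" } pairs. As a hedged illustration only (the struct and helper below are hypothetical, not part of this patch), the macro also expands cleanly into an ordinary array of mappings:

	struct ftype_name {
		unsigned int	val;
		const char	*str;
	};

	/* XFS_DIR3_FTYPE_STR expands to { XFS_DIR3_FT_UNKNOWN, "unknown" }, ... */
	static const struct ftype_name ftype_names[] = { XFS_DIR3_FTYPE_STR };

	static const char *
	ftype_to_str(unsigned int ftype)
	{
		unsigned int	i;

		for (i = 0; i < ARRAY_SIZE(ftype_names); i++)
			if (ftype_names[i].val == ftype)
				return ftype_names[i].str;
		return "unknown";
	}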
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 8c9403b33191..9018abb38d7f 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -25,6 +25,12 @@ const struct xfs_name xfs_name_dotdot = {
.type = XFS_DIR3_FT_DIR,
};
+const struct xfs_name xfs_name_dot = {
+ .name = (const unsigned char *)".",
+ .len = 1,
+ .type = XFS_DIR3_FT_DIR,
+};
+
/*
* Convert inode mode to directory entry filetype
*/
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 19af22a16c41..8497d041f316 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -22,6 +22,19 @@ struct xfs_dir3_icfree_hdr;
struct xfs_dir3_icleaf_hdr;
extern const struct xfs_name xfs_name_dotdot;
+extern const struct xfs_name xfs_name_dot;
+
+static inline bool
+xfs_dir2_samename(
+ const struct xfs_name *n1,
+ const struct xfs_name *n2)
+{
+ if (n1 == n2)
+ return true;
+ if (n1->len != n2->len)
+ return false;
+ return !memcmp(n1->name, n2->name, n1->len);
+}
/*
* Convert inode mode to directory entry filetype
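xfs_dir2_samename() compares the pointers, then the lengths, then the bytes, which matters because directory entry names are not NUL-terminated. A hedged sketch of the behavioral difference from the strncmp() pattern it replaces in scrub/dir.c below (this helper is hypothetical):

	static bool
	example_name_is_dot_or_dotdot(const struct xfs_name *name)
	{
		/*
		 * Length is compared first, so a zero-length (corrupt)
		 * name can never match "." or "..".  The old
		 * strncmp(".", name->name, name->len) call compared only
		 * name->len bytes, so a zero-length name matched both
		 * patterns.
		 */
		return xfs_dir2_samename(name, &xfs_name_dot) ||
		       xfs_dir2_samename(name, &xfs_name_dotdot);
	}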
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index d86ab51af928..076a310b8eb0 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -93,11 +93,11 @@ xchk_dir_actor(
return -ECANCELED;
}
- if (!strncmp(".", name->name, name->len)) {
+ if (xfs_dir2_samename(name, &xfs_name_dot)) {
/* If this is "." then check that the inum matches the dir. */
if (ino != dp->i_ino)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- } else if (!strncmp("..", name->name, name->len)) {
+ } else if (xfs_dir2_samename(name, &xfs_name_dotdot)) {
/*
* If this is ".." in the root inode, check that the inum
* matches this dir.
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 0ca62d59f84a..7e859c412a5b 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -43,6 +43,8 @@
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/readdir.h"
/*
* Inode Record Repair
@@ -126,6 +128,10 @@ struct xrep_inode {
/* Must we remove all access from this file? */
bool zap_acls;
+
+ /* Inode scanner to see if we can find the ftype from dirents */
+ struct xchk_iscan ftype_iscan;
+ uint8_t alleged_ftype;
};
/*
@@ -227,26 +233,233 @@ xrep_dinode_header(
dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
}
-/* Turn di_mode into /something/ recognizable. */
-STATIC void
+/*
+ * If this directory entry points to the scrub target inode, then the directory
+ * we're scanning is the parent of the scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_dirent(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
+{
+ struct xrep_inode *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ if (ino != sc->sm->sm_ino)
+ return 0;
+
+ /* Ignore garbage directory entry names. */
+ if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+ return -EFSCORRUPTED;
+
+ /* Don't pick up dot or dotdot entries; we only want child dirents. */
+ if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+ xfs_dir2_samename(name, &xfs_name_dot))
+ return 0;
+
+ /*
+ * Uhoh, more than one parent for this inode and they don't agree on
+ * the file type?
+ */
+ if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
+ ri->alleged_ftype != name->type) {
+ trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
+ ri->alleged_ftype);
+ return -EFSCORRUPTED;
+ }
+
+ /* We found a potential parent; remember the ftype. */
+ trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
+ ri->alleged_ftype = name->type;
+ return 0;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_walk_directory(
+ struct xrep_inode *ri,
+ struct xfs_inode *dp)
+{
+ struct xfs_scrub *sc = ri->sc;
+ unsigned int lock_mode;
+ int error = 0;
+
+ /*
+ * Scan the directory to see if it contains an entry pointing to the
+ * inode that we are repairing.
+ */
+ lock_mode = xfs_ilock_data_map_shared(dp);
+
+ /*
+ * If this directory is known to be sick, we cannot scan it reliably
+ * and must abort.
+ */
+ if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+ XFS_SICK_INO_BMBTD |
+ XFS_SICK_INO_DIR)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ /*
+ * We cannot complete our parent pointer scan if a directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
+
+out_unlock:
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
+
+/*
+ * Try to find the mode of the inode being repaired by looking for directories
+ * that point down to this file.
+ */
+STATIC int
+xrep_dinode_find_mode(
+ struct xrep_inode *ri,
+ uint16_t *mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_inode *dp;
+ int error;
+
+ /* No ftype means we have no other metadata to consult. */
+ if (!xfs_has_ftype(sc->mp)) {
+ *mode = S_IFREG;
+ return 0;
+ }
+
+ /*
+ * Scan all directories for parents that might point down to this
+ * inode. Skip the inode being repaired during the scan since it
+ * cannot be its own parent. Note that we still hold the AGI locked
+ * so there's a real possibility that _iscan_iter can return EBUSY.
+ */
+ xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
+ ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
+ ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
+ while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
+ if (S_ISDIR(VFS_I(dp)->i_mode))
+ error = xrep_dinode_findmode_walk_directory(ri, dp);
+ xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
+ xchk_irele(sc, dp);
+ if (error < 0)
+ break;
+ if (xchk_should_terminate(sc, &error))
+ break;
+ }
+ xchk_iscan_iter_finish(&ri->ftype_iscan);
+ xchk_iscan_teardown(&ri->ftype_iscan);
+
+ if (error == -EBUSY) {
+ if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
+ /*
+ * If we got an EBUSY after finding at least one
+ * dirent, that means the scan found an inode on the
+ * inactivation list and could not open it. Accept the
+ * alleged ftype and install a new mode below.
+ */
+ error = 0;
+ } else if (!(sc->flags & XCHK_TRY_HARDER)) {
+ /*
+ * Otherwise, retry the operation one time to see if
+ * the reason for the delay is an inode from the same
+ * cluster buffer waiting on the inactivation list.
+ */
+ error = -EDEADLOCK;
+ }
+ }
+ if (error)
+ return error;
+
+ /*
+ * Convert the discovered ftype into the file mode. If all else fails,
+ * return S_IFREG.
+ */
+ switch (ri->alleged_ftype) {
+ case XFS_DIR3_FT_DIR:
+ *mode = S_IFDIR;
+ break;
+ case XFS_DIR3_FT_WHT:
+ case XFS_DIR3_FT_CHRDEV:
+ *mode = S_IFCHR;
+ break;
+ case XFS_DIR3_FT_BLKDEV:
+ *mode = S_IFBLK;
+ break;
+ case XFS_DIR3_FT_FIFO:
+ *mode = S_IFIFO;
+ break;
+ case XFS_DIR3_FT_SOCK:
+ *mode = S_IFSOCK;
+ break;
+ case XFS_DIR3_FT_SYMLINK:
+ *mode = S_IFLNK;
+ break;
+ default:
+ *mode = S_IFREG;
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Turn di_mode into /something/ recognizable. Returns 0 on success or a
+ * negative errno.
+ */
+STATIC int
xrep_dinode_mode(
struct xrep_inode *ri,
struct xfs_dinode *dip)
{
struct xfs_scrub *sc = ri->sc;
uint16_t mode = be16_to_cpu(dip->di_mode);
+ int error;
trace_xrep_dinode_mode(sc, dip);
if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
- return;
+ return 0;
+
+ /* Try to fix the mode. If we cannot, then leave everything alone. */
+ error = xrep_dinode_find_mode(ri, &mode);
+ switch (error) {
+ case -EINTR:
+ case -EBUSY:
+ case -EDEADLOCK:
+ /* temporary failure or fatal signal */
+ return error;
+ case 0:
+ /* found mode */
+ break;
+ default:
+ /* some other error, assume S_IFREG */
+ mode = S_IFREG;
+ break;
+ }
/* bad mode, so we set it to a file that only root can read */
- mode = S_IFREG;
dip->di_mode = cpu_to_be16(mode);
dip->di_uid = 0;
dip->di_gid = 0;
ri->zap_acls = true;
+ return 0;
}
/* Fix any conflicting flags that the verifiers complain about. */
@@ -1107,12 +1320,15 @@ xrep_dinode_core(
/* Fix everything the verifier will complain about. */
dip = xfs_buf_offset(bp, ri->imap.im_boffset);
xrep_dinode_header(sc, dip);
- xrep_dinode_mode(ri, dip);
+ iget_error = xrep_dinode_mode(ri, dip);
+ if (iget_error)
+ goto write;
xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
xrep_dinode_size(ri, dip);
xrep_dinode_extsize_hints(sc, dip);
xrep_dinode_zap_forks(ri, dip);
+write:
/* Write out the inode. */
trace_xrep_dinode_fixed(sc, dip);
xfs_dinode_calc_crc(sc->mp, dip);
@@ -1128,7 +1344,8 @@ xrep_dinode_core(
* accessing the inode. If iget fails, we still need to commit the
* changes.
*/
- iget_error = xchk_iget(sc, ino, &sc->ip);
+ if (!iget_error)
+ iget_error = xchk_iget(sc, ino, &sc->ip);
if (!iget_error)
xchk_ilock(sc, XFS_IOLOCK_EXCL);
@@ -1496,6 +1713,13 @@ xrep_inode(
ASSERT(ri != NULL);
error = xrep_dinode_problems(ri);
+ if (error == -EBUSY) {
+ /*
+ * Directory scan to recover inode mode encountered a
+ * busy inode, so we did not continue repairing things.
+ */
+ return 0;
+ }
if (error)
return error;
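One note on the error plumbing added above: xrep_dinode_find_mode() converts a persistent -EBUSY into -EDEADLOCK when XCHK_TRY_HARDER is clear, and -EDEADLOCK is scrub's conventional "rerun the whole operation" signal. A hedged sketch of the caller-side convention this patch assumes (illustrative only, not the actual scrub dispatcher):

	while ((error = sc->ops->repair(sc)) == -EDEADLOCK) {
		if (sc->flags & XCHK_TRY_HARDER)
			break;		/* second failure is final */
		sc->flags |= XCHK_TRY_HARDER;
	}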
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
new file mode 100644
index 000000000000..17af89b519b3
--- /dev/null
+++ b/fs/xfs/scrub/iscan.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <[email protected]>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_bit.h"
+#include "xfs_icache.h"
+#include "scrub/scrub.h"
+#include "scrub/iscan.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Live File Scan
+ * ==============
+ *
+ * Live file scans walk every inode in a live filesystem. This is more or
+ * less like a regular iwalk, except that when we're advancing the scan cursor,
+ * we must ensure that inodes cannot be added or deleted anywhere between the
+ * old cursor value and the new cursor value. If we're advancing the cursor
+ * by one inode, the caller must hold that inode; if we're finding the next
+ * inode to scan, we must grab the AGI and hold it until we've updated the
+ * scan cursor.
+ *
+ * Callers are expected to use this code to scan all files in the filesystem to
+ * construct a new metadata index of some kind. The scan races against other
+ * live updates, which means there must be a provision to update the new index
+ * when updates are made to inodes that have already been scanned. The
+ * iscan lock can be used in live update hook code to stop the scan and
+ * protect this data structure.
+ *
+ * To keep the new index up to date with other metadata updates being made to
+ * the live filesystem, it is assumed that the caller will add hooks as needed
+ * to be notified when a metadata update occurs. The inode scanner must tell
+ * the hook code when an inode has been visited with
+ * xchk_iscan_mark_visited. Hook functions can use
+ * xchk_iscan_want_live_update to decide if the scanner's observations
+ * must be updated.
+ */
+
+/*
+ * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so
+ * that the scan ignores that inode.
+ */
+STATIC void
+xchk_iscan_mask_skipino(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_inobt_rec_incore *rec,
+ xfs_agino_t lastrecino)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino);
+ xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino);
+
+ if (pag->pag_agno != skip_agno)
+ return;
+ if (skip_agino < rec->ir_startino)
+ return;
+ if (skip_agino > lastrecino)
+ return;
+
+ rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1);
+}
+
+/*
+ * Set *cursor to the next allocated inode after whatever it's set to now.
+ * If there are no more inodes in this AG, cursor is set to NULLAGINO.
+ */
+STATIC int
+xchk_iscan_find_next(
+ struct xchk_iscan *iscan,
+ struct xfs_buf *agi_bp,
+ struct xfs_perag *pag,
+ xfs_inofree_t *allocmaskp,
+ xfs_agino_t *cursor,
+ uint8_t *nr_inodesp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ xfs_agnumber_t agno = pag->pag_agno;
+ xfs_agino_t lastino = NULLAGINO;
+ xfs_agino_t first, last;
+ xfs_agino_t agino = *cursor;
+ int has_rec;
+ int error;
+
+ /* If the cursor is beyond the end of this AG, move to the next one. */
+ xfs_agino_range(mp, agno, &first, &last);
+ if (agino > last) {
+ *cursor = NULLAGINO;
+ return 0;
+ }
+
+ /*
+ * Look up the inode chunk for the current cursor position. If there
+ * is no chunk here, we want the next one.
+ */
+ cur = xfs_inobt_init_cursor(pag, tp, agi_bp, XFS_BTNUM_INO);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec);
+ if (!error && !has_rec)
+ error = xfs_btree_increment(cur, 0, &has_rec);
+ for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) {
+ xfs_inofree_t allocmask;
+
+ /*
+ * If we've run out of inobt records in this AG, move the
+ * cursor on to the next AG and exit. The caller can try
+ * again with the next AG.
+ */
+ if (!has_rec) {
+ *cursor = NULLAGINO;
+ break;
+ }
+
+ error = xfs_inobt_get_rec(cur, &rec, &has_rec);
+ if (error)
+ break;
+ if (!has_rec) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ /* Make sure that we always move forward. */
+ if (lastino != NULLAGINO &&
+ XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) {
+ error = -EFSCORRUPTED;
+ break;
+ }
+ lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+ /*
+ * If this record only covers inodes that come before the
+ * cursor, advance to the next record.
+ */
+ if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ continue;
+
+ if (iscan->skip_ino)
+ xchk_iscan_mask_skipino(iscan, pag, &rec, lastino);
+
+ /*
+ * If the incoming lookup put us in the middle of an inobt
+ * record, mark it and the previous inodes "free" so that the
+ * search for allocated inodes will start at the cursor.
+ * We don't care about ir_freecount here.
+ */
+ if (agino >= rec.ir_startino)
+ rec.ir_free |= xfs_inobt_maskn(0,
+ agino + 1 - rec.ir_startino);
+
+ /*
+ * If there are allocated inodes in this chunk, find them
+ * and update the scan cursor.
+ */
+ allocmask = ~rec.ir_free;
+ if (hweight64(allocmask) > 0) {
+ int next = xfs_lowbit64(allocmask);
+
+ ASSERT(next >= 0);
+ *cursor = rec.ir_startino + next;
+ *allocmaskp = allocmask >> next;
+ *nr_inodesp = XFS_INODES_PER_CHUNK - next;
+ break;
+ }
+ }
+
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Advance both the scan and the visited cursors.
+ *
+ * The inumber address space for a given filesystem is sparse, which means that
+ * the scan cursor can jump a long ways in a single iter() call. There are no
+ * inodes in these sparse areas, so we must move the visited cursor forward at
+ * the same time so that the scan user can receive live updates for inodes that
+ * may get created once we release the AGI buffer.
+ */
+static inline void
+xchk_iscan_move_cursor(
+ struct xchk_iscan *iscan,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t cursor, visited;
+
+ BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO);
+
+ /*
+ * Special-case ino == 0 here so that we never set visited_ino to
+ * NULLFSINO when wrapping around EOFS, for that will let through all
+ * live updates.
+ */
+ cursor = XFS_AGINO_TO_INO(mp, agno, agino);
+ if (cursor == 0)
+ visited = XFS_MAXINUMBER;
+ else
+ visited = cursor - 1;
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = cursor;
+ iscan->__visited_ino = visited;
+ trace_xchk_iscan_move_cursor(iscan);
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Mark the scan finished: move both cursors to NULLFSINO. Once the visited
+ * cursor is NULLFSINO, the live update predicate will select every update
+ * for the caller.
+ */
+static inline void
+xchk_iscan_finish(
+ struct xchk_iscan *iscan)
+{
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = NULLFSINO;
+
+ /* All live updates will be applied from now on */
+ iscan->__visited_ino = NULLFSINO;
+
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance ino to the next inode that the inobt thinks is allocated, being
+ * careful to jump to the next AG if we've reached the right end of this AG's
+ * inode btree. Advancing ino effectively means that we've pushed the inode
+ * scan forward, so set the iscan cursor to (ino - 1) so that our live update
+ * predicates will track inode allocations in that part of the inode number
+ * key space once we release the AGI buffer.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, or the usual negative errno.
+ */
+STATIC int
+xchk_iscan_advance(
+ struct xchk_iscan *iscan,
+ struct xfs_perag **pagp,
+ struct xfs_buf **agi_bpp,
+ xfs_inofree_t *allocmaskp,
+ uint8_t *nr_inodesp)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agi_bp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ xfs_agino_t agino;
+ int ret;
+
+ ASSERT(iscan->cursor_ino >= iscan->__visited_ino);
+
+ do {
+ if (xchk_iscan_aborted(iscan))
+ return -ECANCELED;
+
+ agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino);
+ pag = xfs_perag_get(mp, agno);
+ if (!pag)
+ return -ECANCELED;
+
+ ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+ if (ret)
+ goto out_pag;
+
+ agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
+ ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
+ &agino, nr_inodesp);
+ if (ret)
+ goto out_buf;
+
+ if (agino != NULLAGINO) {
+ /*
+ * Found the next inode in this AG, so return it along
+ * with the AGI buffer and the perag structure to
+ * ensure it cannot go away.
+ */
+ xchk_iscan_move_cursor(iscan, agno, agino);
+ *agi_bpp = agi_bp;
+ *pagp = pag;
+ return 1;
+ }
+
+ /*
+ * Did not find any more inodes in this AG, move on to the next
+ * AG.
+ */
+ agno = (agno + 1) % mp->m_sb.sb_agcount;
+ xchk_iscan_move_cursor(iscan, agno, 0);
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ trace_xchk_iscan_advance_ag(iscan);
+ } while (iscan->cursor_ino != iscan->scan_start_ino);
+
+ xchk_iscan_finish(iscan);
+ return 0;
+
+out_buf:
+ xfs_trans_brelse(sc->tp, agi_bp);
+out_pag:
+ xfs_perag_put(pag);
+ return ret;
+}
+
+/*
+ * Grabbing the inode failed, so we need to back up the scan and ask the caller
+ * to try to _advance the scan again. Returns -EBUSY if we've run out of retry
+ * opportunities, -ECANCELED if the process has a fatal signal pending, or
+ * -EAGAIN if we should try again.
+ */
+STATIC int
+xchk_iscan_iget_retry(
+ struct xchk_iscan *iscan,
+ bool wait)
+{
+ ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1);
+
+ if (!iscan->iget_timeout ||
+ time_is_before_jiffies(iscan->__iget_deadline))
+ return -EBUSY;
+
+ if (wait) {
+ unsigned long relax;
+
+ /*
+ * Sleep for a period of time to let the rest of the system
+ * catch up. If we return early, someone sent a kill signal to
+ * the calling process.
+ */
+ relax = msecs_to_jiffies(iscan->iget_retry_delay);
+ trace_xchk_iscan_iget_retry_wait(iscan);
+
+ if (schedule_timeout_killable(relax) ||
+ xchk_iscan_aborted(iscan))
+ return -ECANCELED;
+ }
+
+ iscan->cursor_ino--;
+ return -EAGAIN;
+}
+
+/*
+ * Grab an inode as part of an inode scan. While scanning this inode, the
+ * caller must ensure that no other threads can modify the inode until a call
+ * to xchk_iscan_mark_visited returns.
+ *
+ * Returns the number of incore inodes grabbed; -EAGAIN if the caller should
+ * call xchk_iscan_advance again; -EBUSY if we couldn't grab an inode;
+ * -ECANCELED if there's a fatal signal pending; or some other negative errno.
+ */
+STATIC int
+xchk_iscan_iget(
+ struct xchk_iscan *iscan,
+ struct xfs_perag *pag,
+ struct xfs_buf *agi_bp,
+ xfs_inofree_t allocmask,
+ uint8_t nr_inodes)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t ino = iscan->cursor_ino;
+ unsigned int idx = 0;
+ unsigned int i;
+ int error;
+
+ ASSERT(iscan->__inodes[0] == NULL);
+
+ /* Fill the first slot in the inode array. */
+ error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ &iscan->__inodes[idx]);
+
+ trace_xchk_iscan_iget(iscan, error);
+
+ if (error == -ENOENT || error == -EAGAIN) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ /*
+ * It's possible that this inode has lost all of its links but
+ * hasn't yet been inactivated. If we don't have a transaction
+ * or it's not writable, flush the inodegc workers and wait.
+ */
+ xfs_inodegc_flush(mp);
+ return xchk_iscan_iget_retry(iscan, true);
+ }
+
+ if (error == -EINVAL) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+
+ /*
+ * We thought the inode was allocated, but the inode btree
+ * lookup failed, which means that it was freed since the last
+ * time we advanced the cursor. Back up and try again. This
+ * should never happen since we still hold the AGI buffer from the
+ * inobt check, but we need to be careful about infinite loops.
+ */
+ return xchk_iscan_iget_retry(iscan, false);
+ }
+
+ if (error) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ return error;
+ }
+ idx++;
+ ino++;
+ allocmask >>= 1;
+
+ /*
+ * Now that we've filled the first slot in __inodes, try to fill the
+ * rest of the batch with consecutively ordered inodes to reduce the
+ * number of _iter calls. Make a bitmap of unallocated inodes from the
+ * zeroes in the inuse bitmap; these inodes will not be scanned, but
+ * the _want_live_update predicate will pass through all live updates.
+ *
+ * If we can't iget an allocated inode, stop and return what we have.
+ */
+ mutex_lock(&iscan->lock);
+ iscan->__batch_ino = ino - 1;
+ iscan->__skipped_inomask = 0;
+ mutex_unlock(&iscan->lock);
+
+ for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) {
+ if (!(allocmask & 1)) {
+ ASSERT(!(iscan->__skipped_inomask & (1ULL << i)));
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = ino;
+ iscan->__skipped_inomask |= (1ULL << i);
+ mutex_unlock(&iscan->lock);
+ continue;
+ }
+
+ ASSERT(iscan->__inodes[idx] == NULL);
+
+ error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+ &iscan->__inodes[idx]);
+ if (error)
+ break;
+
+ mutex_lock(&iscan->lock);
+ iscan->cursor_ino = ino;
+ mutex_unlock(&iscan->lock);
+ idx++;
+ }
+
+ trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx);
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ return idx;
+}
+
+/*
+ * Advance the visit cursor to reflect skipped inodes beyond whatever we
+ * scanned.
+ */
+STATIC void
+xchk_iscan_finish_batch(
+ struct xchk_iscan *iscan)
+{
+ xfs_ino_t highest_skipped;
+
+ mutex_lock(&iscan->lock);
+
+ if (iscan->__batch_ino != NULLFSINO) {
+ highest_skipped = iscan->__batch_ino +
+ xfs_highbit64(iscan->__skipped_inomask);
+ iscan->__visited_ino = max(iscan->__visited_ino,
+ highest_skipped);
+
+ trace_xchk_iscan_skip(iscan);
+ }
+
+ iscan->__batch_ino = NULLFSINO;
+ iscan->__skipped_inomask = 0;
+
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return up to
+ * 64 consecutive allocated inodes starting with the cursor position.
+ */
+STATIC int
+xchk_iscan_iter_batch(
+ struct xchk_iscan *iscan)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ int ret;
+
+ xchk_iscan_finish_batch(iscan);
+
+ if (iscan->iget_timeout)
+ iscan->__iget_deadline = jiffies +
+ msecs_to_jiffies(iscan->iget_timeout);
+
+ do {
+ struct xfs_buf *agi_bp = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_inofree_t allocmask = 0;
+ uint8_t nr_inodes = 0;
+
+ ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask,
+ &nr_inodes);
+ if (ret != 1)
+ return ret;
+
+ if (xchk_iscan_aborted(iscan)) {
+ xfs_trans_brelse(sc->tp, agi_bp);
+ xfs_perag_put(pag);
+ ret = -ECANCELED;
+ break;
+ }
+
+ ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return the
+ * incore inode structure associated with it.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be
+ * grabbed, or the usual negative errno.
+ *
+ * If the function returns -EBUSY and the caller can handle skipping an inode,
+ * it may call this function again to continue the scan with the next allocated
+ * inode.
+ */
+int
+xchk_iscan_iter(
+ struct xchk_iscan *iscan,
+ struct xfs_inode **ipp)
+{
+ unsigned int i;
+ int error;
+
+ /* Find a cached inode, or go get another batch. */
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (iscan->__inodes[i])
+ goto foundit;
+ }
+
+ error = xchk_iscan_iter_batch(iscan);
+ if (error <= 0)
+ return error;
+
+ ASSERT(iscan->__inodes[0] != NULL);
+ i = 0;
+
+foundit:
+ /* Give the caller our reference. */
+ *ipp = iscan->__inodes[i];
+ iscan->__inodes[i] = NULL;
+ return 1;
+}
+
+/* Clean up an xchk_iscan_iter call by dropping any inodes that we still hold. */
+void
+xchk_iscan_iter_finish(
+ struct xchk_iscan *iscan)
+{
+ struct xfs_scrub *sc = iscan->sc;
+ unsigned int i;
+
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (iscan->__inodes[i]) {
+ xchk_irele(sc, iscan->__inodes[i]);
+ iscan->__inodes[i] = NULL;
+ }
+ }
+}
+
+/* Mark this inode scan finished and release resources. */
+void
+xchk_iscan_teardown(
+ struct xchk_iscan *iscan)
+{
+ xchk_iscan_iter_finish(iscan);
+ xchk_iscan_finish(iscan);
+ mutex_destroy(&iscan->lock);
+}
+
+/* Pick an AG from which to start a scan. */
+static inline xfs_ino_t
+xchk_iscan_rotor(
+ struct xfs_mount *mp)
+{
+ static atomic_t agi_rotor;
+ unsigned int r = atomic_inc_return(&agi_rotor) - 1;
+
+ /*
+ * Rotoring *backwards* through the AGs, so we add one here before
+ * subtracting from the agcount to arrive at an AG number.
+ */
+ r = (r % mp->m_sb.sb_agcount) + 1;
+
+ return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0);
+}
+
+/*
+ * Set ourselves up to start an inode scan. If the @iget_timeout and
+ * @iget_retry_delay parameters are set, the scan will try to iget each inode
+ * for @iget_timeout milliseconds. If an iget call indicates that the inode is
+ * waiting to be inactivated, the CPU will relax for @iget_retry_delay
+ * milliseconds after pushing the inactivation workers.
+ */
+void
+xchk_iscan_start(
+ struct xfs_scrub *sc,
+ unsigned int iget_timeout,
+ unsigned int iget_retry_delay,
+ struct xchk_iscan *iscan)
+{
+ xfs_ino_t start_ino;
+
+ start_ino = xchk_iscan_rotor(sc->mp);
+
+ iscan->__batch_ino = NULLFSINO;
+ iscan->__skipped_inomask = 0;
+
+ iscan->sc = sc;
+ clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+ iscan->iget_timeout = iget_timeout;
+ iscan->iget_retry_delay = iget_retry_delay;
+ iscan->__visited_ino = start_ino;
+ iscan->cursor_ino = start_ino;
+ iscan->scan_start_ino = start_ino;
+ mutex_init(&iscan->lock);
+ memset(iscan->__inodes, 0, sizeof(iscan->__inodes));
+
+ trace_xchk_iscan_start(iscan, start_ino);
+}
+
+/*
+ * Mark this inode as having been visited. Callers must hold a sufficiently
+ * exclusive lock on the inode to prevent concurrent modifications.
+ */
+void
+xchk_iscan_mark_visited(
+ struct xchk_iscan *iscan,
+ struct xfs_inode *ip)
+{
+ mutex_lock(&iscan->lock);
+ iscan->__visited_ino = ip->i_ino;
+ trace_xchk_iscan_visit(iscan);
+ mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Did we skip this inode because it wasn't allocated when we loaded the batch?
+ * If so, it is newly allocated and will not be scanned. All live updates to
+ * this inode must be passed to the caller to maintain scan correctness.
+ */
+static inline bool
+xchk_iscan_skipped(
+ const struct xchk_iscan *iscan,
+ xfs_ino_t ino)
+{
+ if (iscan->__batch_ino == NULLFSINO)
+ return false;
+ if (ino < iscan->__batch_ino)
+ return false;
+ if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK)
+ return false;
+
+ return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino));
+}
+
+/*
+ * Do we need a live update for this inode? This is true if the scanner thread
+ * has visited this inode and the scan hasn't been aborted due to errors.
+ * Callers must hold a sufficiently exclusive lock on the inode to prevent
+ * scanners from reading any inode metadata.
+ */
+bool
+xchk_iscan_want_live_update(
+ struct xchk_iscan *iscan,
+ xfs_ino_t ino)
+{
+ bool ret = false;
+
+ if (xchk_iscan_aborted(iscan))
+ return false;
+
+ mutex_lock(&iscan->lock);
+
+ trace_xchk_iscan_want_live_update(iscan, ino);
+
+ /* Scan is finished, caller should receive all updates. */
+ if (iscan->__visited_ino == NULLFSINO) {
+ ret = true;
+ goto unlock;
+ }
+
+ /*
+ * No inodes have been visited yet, so the visited cursor points at the
+ * start of the scan range. The caller should not receive any updates.
+ */
+ if (iscan->scan_start_ino == iscan->__visited_ino) {
+ ret = false;
+ goto unlock;
+ }
+
+ /*
+ * This inode was not allocated at the time of the iscan batch.
+ * The caller should receive all updates.
+ */
+ if (xchk_iscan_skipped(iscan, ino)) {
+ ret = true;
+ goto unlock;
+ }
+
+ /*
+ * The visited cursor hasn't yet wrapped around the end of the FS. If
+ * @ino is inside the starred range, the caller should receive updates:
+ *
+ * 0 ------------ S ************ V ------------ EOFS
+ */
+ if (iscan->scan_start_ino <= iscan->__visited_ino) {
+ if (ino >= iscan->scan_start_ino &&
+ ino <= iscan->__visited_ino)
+ ret = true;
+
+ goto unlock;
+ }
+
+ /*
+ * The visited cursor wrapped around the end of the FS. If @ino is
+ * inside the starred range, the caller should receive updates:
+ *
+ * 0 ************ V ------------ S ************ EOFS
+ */
+ if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino)
+ ret = true;
+
+unlock:
+ mutex_unlock(&iscan->lock);
+ return ret;
+}
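Taken together, the API is driven with a start/iter/mark_visited/finish/teardown sequence. The loop below mirrors xrep_dinode_find_mode() earlier in this patch; the 5000ms iget timeout and 100ms retry delay are that caller's choice, not requirements:

	struct xchk_iscan	iscan;
	struct xfs_inode	*ip;
	int			error;

	xchk_iscan_start(sc, 5000, 100, &iscan);
	while ((error = xchk_iscan_iter(&iscan, &ip)) == 1) {
		/* ...examine ip; _iter handed us the only reference... */
		xchk_iscan_mark_visited(&iscan, ip);
		xchk_irele(sc, ip);
		if (xchk_should_terminate(sc, &error))
			break;
	}
	xchk_iscan_iter_finish(&iscan);
	xchk_iscan_teardown(&iscan);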
diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h
new file mode 100644
index 000000000000..71f657552dfa
--- /dev/null
+++ b/fs/xfs/scrub/iscan.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <[email protected]>
+ */
+#ifndef __XFS_SCRUB_ISCAN_H__
+#define __XFS_SCRUB_ISCAN_H__
+
+struct xchk_iscan {
+ struct xfs_scrub *sc;
+
+ /* Lock to protect the scan cursor. */
+ struct mutex lock;
+
+ /*
+ * This is the first inode in the inumber address space that we
+ * examined. When the scan wraps around back to here, the scan is
+ * finished.
+ */
+ xfs_ino_t scan_start_ino;
+
+ /* This is the inode that will be examined next. */
+ xfs_ino_t cursor_ino;
+
+	/* If nonzero, skip this inode when scanning. */
+ xfs_ino_t skip_ino;
+
+ /*
+ * This is the last inode that we've successfully scanned, either
+ * because the caller scanned it, or we moved the cursor past an empty
+ * part of the inode address space. Scan callers should only use the
+	 * xchk_iscan_mark_visited function to modify this.
+ */
+ xfs_ino_t __visited_ino;
+
+ /* Operational state of the livescan. */
+ unsigned long __opstate;
+
+ /* Give up on iterating @cursor_ino if we can't iget it by this time. */
+ unsigned long __iget_deadline;
+
+ /* Amount of time (in ms) that we will try to iget an inode. */
+ unsigned int iget_timeout;
+
+ /* Wait this many ms to retry an iget. */
+ unsigned int iget_retry_delay;
+
+ /*
+ * The scan grabs batches of inodes and stashes them here before
+ * handing them out with _iter. Unallocated inodes are set in the
+	 * mask so that all updates to those inodes are selected for live
+ * update propagation.
+ */
+ xfs_ino_t __batch_ino;
+ xfs_inofree_t __skipped_inomask;
+ struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK];
+};
+
+/* Set if the scan has been aborted due to some event in the fs. */
+#define XCHK_ISCAN_OPSTATE_ABORTED (1)
+
+static inline bool
+xchk_iscan_aborted(const struct xchk_iscan *iscan)
+{
+ return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+}
+
+static inline void
+xchk_iscan_abort(struct xchk_iscan *iscan)
+{
+ set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+}
+
+void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout,
+ unsigned int iget_retry_delay, struct xchk_iscan *iscan);
+void xchk_iscan_teardown(struct xchk_iscan *iscan);
+
+int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp);
+void xchk_iscan_iter_finish(struct xchk_iscan *iscan);
+
+void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip);
+bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino);
+
+#endif /* __XFS_SCRUB_ISCAN_H__ */
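On the hook side, the two predicates are meant to be combined: apply a live update to the in-progress index only if the scanner has already visited that inode, and abort the scan if the update cannot be applied. A hedged sketch; the hook function and the index-update helper are hypothetical:

	static int
	example_live_update(struct xchk_iscan *iscan, xfs_ino_t ino,
			void *update)
	{
		int	error = 0;

		if (xchk_iscan_want_live_update(iscan, ino))
			error = example_apply_to_index(update); /* hypothetical */
		if (error)
			xchk_iscan_abort(iscan); /* iter will return -ECANCELED */
		return NOTIFY_DONE;
	}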
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index d0e24ffaf754..5ed75cc33b92 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -16,10 +16,12 @@
#include "xfs_rtbitmap.h"
#include "xfs_quota.h"
#include "xfs_quota_defs.h"
+#include "xfs_da_format.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/quota.h"
+#include "scrub/iscan.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index ae6b2385a8cb..9aba60c61880 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -16,10 +16,12 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+struct xfs_scrub;
struct xfile;
struct xfarray;
struct xfarray_sortinfo;
struct xchk_dqiter;
+struct xchk_iscan;
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -1146,6 +1148,149 @@ TRACE_EVENT(xchk_rtsum_record_free,
);
#endif /* CONFIG_XFS_RT */
+DECLARE_EVENT_CLASS(xchk_iscan_class,
+ TP_PROTO(struct xchk_iscan *iscan),
+ TP_ARGS(iscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited)
+)
+#define DEFINE_ISCAN_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan), \
+ TP_ARGS(iscan))
+DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor);
+DEFINE_ISCAN_EVENT(xchk_iscan_visit);
+DEFINE_ISCAN_EVENT(xchk_iscan_skip);
+DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag);
+
+DECLARE_EVENT_CLASS(xchk_iscan_ino_class,
+ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino),
+ TP_ARGS(iscan, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, startino)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->startino = iscan->scan_start_ino;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->ino = ino;
+ ),
+ TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->startino,
+ __entry->cursor,
+ __entry->visited,
+ __entry->ino)
+)
+#define DEFINE_ISCAN_INO_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_ino_class, name, \
+ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \
+ TP_ARGS(iscan, ino))
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update);
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_start);
+
+TRACE_EVENT(xchk_iscan_iget,
+ TP_PROTO(struct xchk_iscan *iscan, int error),
+ TP_ARGS(iscan, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->error)
+);
+
+TRACE_EVENT(xchk_iscan_iget_batch,
+ TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan,
+ unsigned int nr, unsigned int avail),
+ TP_ARGS(mp, iscan, nr, avail),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(unsigned int, nr)
+ __field(unsigned int, avail)
+ __field(unsigned int, unavail)
+ __field(xfs_ino_t, batch_ino)
+ __field(unsigned long long, skipmask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->nr = nr;
+ __entry->avail = avail;
+ __entry->unavail = hweight64(iscan->__skipped_inomask);
+ __entry->batch_ino = iscan->__batch_ino;
+ __entry->skipmask = iscan->__skipped_inomask;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->batch_ino,
+ __entry->skipmask,
+ __entry->nr,
+ __entry->avail,
+ __entry->unavail)
+);
+
+TRACE_EVENT(xchk_iscan_iget_retry_wait,
+ TP_PROTO(struct xchk_iscan *iscan),
+ TP_ARGS(iscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, cursor)
+ __field(xfs_ino_t, visited)
+ __field(unsigned int, retry_delay)
+ __field(unsigned long, remaining)
+ __field(unsigned int, iget_timeout)
+ ),
+ TP_fast_assign(
+ __entry->dev = iscan->sc->mp->m_super->s_dev;
+ __entry->cursor = iscan->cursor_ino;
+ __entry->visited = iscan->__visited_ino;
+ __entry->retry_delay = iscan->iget_retry_delay;
+ __entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies);
+ __entry->iget_timeout = iscan->iget_timeout;
+ ),
+ TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->cursor,
+ __entry->visited,
+ __entry->remaining,
+ __entry->iget_timeout,
+ __entry->retry_delay)
+);
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
@@ -1672,6 +1817,55 @@ TRACE_EVENT(xrep_dinode_count_rmaps,
__entry->attr_extents)
);
+TRACE_EVENT(xrep_dinode_findmode_dirent,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+ unsigned int ftype),
+ TP_ARGS(sc, dp, ftype),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->ftype = ftype;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR))
+);
+
+TRACE_EVENT(xrep_dinode_findmode_dirent_inval,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+ unsigned int ftype, unsigned int found_ftype),
+ TP_ARGS(sc, dp, ftype, found_ftype),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(unsigned int, ftype)
+ __field(unsigned int, found_ftype)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->parent_ino = dp->i_ino;
+ __entry->ftype = ftype;
+ __entry->found_ftype = found_ftype;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->parent_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR))
+);
+
TRACE_EVENT(xrep_cow_mark_file_range,
TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock,
xfs_fileoff_t startoff, xfs_filblks_t blockcount),
diff --git a/fs/xfs/xfs_hooks.c b/fs/xfs/xfs_hooks.c
new file mode 100644
index 000000000000..a58d1de2d37d
--- /dev/null
+++ b/fs/xfs/xfs_hooks.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <[email protected]>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_trace.h"
+
+/* Initialize a notifier chain. */
+void
+xfs_hooks_init(
+ struct xfs_hooks *chain)
+{
+ BLOCKING_INIT_NOTIFIER_HEAD(&chain->head);
+}
+
+/* Make it so a function gets called whenever we hit a certain hook point. */
+int
+xfs_hooks_add(
+ struct xfs_hooks *chain,
+ struct xfs_hook *hook)
+{
+ ASSERT(hook->nb.notifier_call != NULL);
+ BUILD_BUG_ON(offsetof(struct xfs_hook, nb) != 0);
+
+ return blocking_notifier_chain_register(&chain->head, &hook->nb);
+}
+
+/* Remove a previously installed hook. */
+void
+xfs_hooks_del(
+ struct xfs_hooks *chain,
+ struct xfs_hook *hook)
+{
+ blocking_notifier_chain_unregister(&chain->head, &hook->nb);
+}
+
+/* Call a hook. Returns the NOTIFY_* value returned by the last hook. */
+int
+xfs_hooks_call(
+ struct xfs_hooks *chain,
+ unsigned long val,
+ void *priv)
+{
+ return blocking_notifier_call_chain(&chain->head, val, priv);
+}
diff --git a/fs/xfs/xfs_hooks.h b/fs/xfs/xfs_hooks.h
new file mode 100644
index 000000000000..60b8a5831536
--- /dev/null
+++ b/fs/xfs/xfs_hooks.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <[email protected]>
+ */
+#ifndef XFS_HOOKS_H_
+#define XFS_HOOKS_H_
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+struct xfs_hooks {
+ struct blocking_notifier_head head;
+};
+
+/*
+ * If jump labels are enabled in Kconfig, the static key uses nop sleds and
+ * code patching to eliminate the overhead of taking the rwsem in
+ * blocking_notifier_call_chain when there are no hooks configured. If not,
+ * the static key per-call overhead is an atomic read. Most arches that can
+ * handle XFS also support jump labels.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) \
+ static DEFINE_STATIC_KEY_FALSE(name)
+# define xfs_hooks_switch_on(name) static_branch_inc(name)
+# define xfs_hooks_switch_off(name) static_branch_dec(name)
+# define xfs_hooks_switched_on(name) static_branch_unlikely(name)
+
+struct xfs_hook {
+ /* This must come at the start of the structure. */
+ struct notifier_block nb;
+};
+
+typedef int (*xfs_hook_fn_t)(struct xfs_hook *hook, unsigned long action,
+ void *data);
+
+void xfs_hooks_init(struct xfs_hooks *chain);
+int xfs_hooks_add(struct xfs_hooks *chain, struct xfs_hook *hook);
+void xfs_hooks_del(struct xfs_hooks *chain, struct xfs_hook *hook);
+int xfs_hooks_call(struct xfs_hooks *chain, unsigned long action,
+ void *priv);
+
+static inline void xfs_hook_setup(struct xfs_hook *hook, notifier_fn_t fn)
+{
+ hook->nb.notifier_call = fn;
+ hook->nb.priority = 0;
+}
+
+#else
+
+struct xfs_hooks { /* empty */ };
+
+# define DEFINE_STATIC_XFS_HOOK_SWITCH(name)
+# define xfs_hooks_switch_on(name) ((void)0)
+# define xfs_hooks_switch_off(name) ((void)0)
+# define xfs_hooks_switched_on(name) (false)
+
+# define xfs_hooks_init(chain) ((void)0)
+# define xfs_hooks_call(chain, val, priv) (NOTIFY_DONE)
+#endif
+
+#endif /* XFS_HOOKS_H_ */
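Since no caller appears in this patch, here is a hedged sketch of the intended calling convention; every name below is hypothetical. A subsystem embeds a struct xfs_hooks, scrub registers a notifier for the duration of a scan, and hot paths only enter the blocking notifier chain while the static key is switched on:

	DEFINE_STATIC_XFS_HOOK_SWITCH(example_hook_switch);

	static int
	example_hook_fn(struct notifier_block *nb, unsigned long action,
			void *data)
	{
		/* runs via blocking_notifier_call_chain(); may sleep */
		return NOTIFY_DONE;
	}

	static int
	example_hook_register(struct xfs_hooks *chain, struct xfs_hook *hook)
	{
		xfs_hook_setup(hook, example_hook_fn);
		xfs_hooks_switch_on(&example_hook_switch);
		return xfs_hooks_add(chain, hook);
	}

	static void
	example_hot_path(struct xfs_hooks *chain, void *priv)
	{
		if (xfs_hooks_switched_on(&example_hook_switch))
			xfs_hooks_call(chain, 0, priv);
	}

Per the comment in the header above, xfs_hooks_switch_on() patches kernel text under the cpu hotplug lock, so registration must run in a context that holds no locks used by memory reclaim or writeback.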
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 6d2eb6364867..c1e9c7bcb6a9 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -22,6 +22,7 @@
#include "xfs_trans.h"
#include "xfs_pwork.h"
#include "xfs_ag.h"
+#include "xfs_bit.h"
/*
* Walking Inodes in the Filesystem
@@ -131,21 +132,11 @@ xfs_iwalk_adjust_start(
struct xfs_inobt_rec_incore *irec) /* btree record */
{
int idx; /* index into inode chunk */
- int i;
idx = agino - irec->ir_startino;
- /*
- * We got a right chunk with some left inodes allocated at it. Grab
- * the chunk record. Mark all the uninteresting inodes free because
- * they're before our start point.
- */
- for (i = 0; i < idx; i++) {
- if (XFS_INOBT_MASK(i) & ~irec->ir_free)
- irec->ir_freecount++;
- }
-
irec->ir_free |= xfs_inobt_maskn(0, idx);
+ irec->ir_freecount = hweight64(irec->ir_free);
}
/* Allocate memory for a walk. */
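The xfs_iwalk.c hunk replaces the incremental freecount bookkeeping with a recount: once the first @idx inodes are masked in as "free", hweight64() of the updated mask equals the old loop's running total. A worked check with illustrative values:

	/*
	 * Suppose inodes 1 and 2 of a chunk are free and the walk starts
	 * at offset 3:
	 *
	 *	ir_free               = 0b0110	(ir_freecount = 2)
	 *	xfs_inobt_maskn(0, 3) = 0b0111
	 *	ir_free |= 0b0111    -> 0b0111
	 *	hweight64(0b0111)    -> 3
	 *
	 * The removed loop computed the same thing: of inodes 0..2, only
	 * inode 0 was allocated, so ir_freecount went from 2 to 3.
	 */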
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 75245932e0ad..8f07c9f6157f 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -81,6 +81,7 @@ typedef __u32 xfs_nlink_t;
#include "xfs_buf.h"
#include "xfs_message.h"
#include "xfs_drain.h"
+#include "xfs_hooks.h"
#ifdef __BIG_ENDIAN
#define XFS_NATIVE_HOST 1