aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/xfs/Makefile3
-rw-r--r--fs/xfs/scrub/dir.c9
-rw-r--r--fs/xfs/scrub/dir_repair.c1402
-rw-r--r--fs/xfs/scrub/findparent.c448
-rw-r--r--fs/xfs/scrub/findparent.h50
-rw-r--r--fs/xfs/scrub/inode_repair.c5
-rw-r--r--fs/xfs/scrub/iscan.c18
-rw-r--r--fs/xfs/scrub/iscan.h1
-rw-r--r--fs/xfs/scrub/nlinks.c23
-rw-r--r--fs/xfs/scrub/nlinks_repair.c9
-rw-r--r--fs/xfs/scrub/parent.c14
-rw-r--r--fs/xfs/scrub/parent_repair.c234
-rw-r--r--fs/xfs/scrub/readdir.c7
-rw-r--r--fs/xfs/scrub/repair.c1
-rw-r--r--fs/xfs/scrub/repair.h8
-rw-r--r--fs/xfs/scrub/scrub.c4
-rw-r--r--fs/xfs/scrub/tempfile.c13
-rw-r--r--fs/xfs/scrub/tempfile.h2
-rw-r--r--fs/xfs/scrub/trace.h115
-rw-r--r--fs/xfs/scrub/xfblob.h24
-rw-r--r--fs/xfs/xfs_inode.c51
21 files changed, 2437 insertions, 4 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7dbe6b3befb3..d48646f86563 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -198,11 +198,14 @@ xfs-y += $(addprefix scrub/, \
attr_repair.o \
bmap_repair.o \
cow_repair.o \
+ dir_repair.o \
+ findparent.o \
fscounters_repair.o \
ialloc_repair.o \
inode_repair.o \
newbt.o \
nlinks_repair.o \
+ parent_repair.o \
rcbag_btree.o \
rcbag.o \
reap.o \
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 7bac74621af7..3fe6ffcf9c06 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -21,12 +21,21 @@
#include "scrub/dabtree.h"
#include "scrub/readdir.h"
#include "scrub/health.h"
+#include "scrub/repair.h"
/* Set us up to scrub directories. */
int
xchk_setup_directory(
struct xfs_scrub *sc)
{
+ int error;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_directory(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_inode_contents(sc, 0);
}
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
new file mode 100644
index 000000000000..34fe720fde0e
--- /dev/null
+++ b/fs/xfs/scrub/dir_repair.c
@@ -0,0 +1,1402 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <[email protected]>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_bmap_util.h"
+#include "xfs_exchmaps.h"
+#include "xfs_exchrange.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/tempfile.h"
+#include "scrub/tempexch.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/xfblob.h"
+#include "scrub/iscan.h"
+#include "scrub/readdir.h"
+#include "scrub/reap.h"
+#include "scrub/findparent.h"
+
+/*
+ * Directory Repair
+ * ================
+ *
+ * We repair directories by reading the directory data blocks looking for
+ * directory entries that look salvageable (name passes verifiers, entry points
+ * to a valid allocated inode, etc). Each entry worth salvaging is stashed in
+ * memory, and the stashed entries are periodically replayed into a temporary
+ * directory to constrain memory use. Batching the construction of the
+ * temporary directory in this fashion reduces lock cycling of the directory
+ * being repaired and the temporary directory, and will later become important
+ * for parent pointer scanning.
+ *
+ * Directory entries added to the temporary directory do not elevate the link
+ * counts of the inodes found. When salvaging completes, the remaining stashed
+ * entries are replayed to the temporary directory. An atomic mapping exchange
+ * is used to commit the new directory blocks to the directory being repaired.
+ * This will disrupt readdir cursors.
+ *
+ * Locking Issues
+ * --------------
+ *
+ * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
+ * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects
+ * b's dotdot update. This is in contrast to every other dotdot update (link,
+ * remove, mkdir). If the repair code drops the ILOCK, it must either
+ * revalidate the dotdot entry or use dirent hooks to capture updates from
+ * other threads.
+ */
+
+/*
+ * Directory entry to be restored in the new directory. Records of this type
+ * are appended to the dir_entries xfarray; the name bytes are stored in the
+ * dir_names xfblob and are found again through name_cookie.
+ */
+struct xrep_dirent {
+ /* Cookie for retrieval of the dirent name. */
+ xfblob_cookie name_cookie;
+
+ /* Target inode number. */
+ xfs_ino_t ino;
+
+ /* Length of the dirent name. */
+ uint8_t namelen;
+
+ /* File type of the dirent. */
+ uint8_t ftype;
+};
+
+/*
+ * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names
+ * before we write them to the temp dir. xrep_dir_want_flush_stashed compares
+ * the combined size of both structures against this limit.
+ */
+#define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8)
+
+/*
+ * In-memory state for one directory repair. Allocated by
+ * xrep_setup_directory and attached to the scrub context as sc->buf.
+ */
+struct xrep_dir {
+ /* Scrub context that this repair state belongs to. */
+ struct xfs_scrub *sc;
+
+ /* Fixed-size array of xrep_dirent structures. */
+ struct xfarray *dir_entries;
+
+ /* Blobs containing directory entry names. */
+ struct xfblob *dir_names;
+
+ /* Information for exchanging data forks at the end. */
+ struct xrep_tempexch tx;
+
+ /* Preallocated args struct for performing dir operations */
+ struct xfs_da_args args;
+
+ /*
+ * Information used to scan the filesystem to find the inumber of the
+ * dotdot entry for this directory.
+ */
+ struct xrep_parent_scan_info pscan;
+
+ /*
+ * How many subdirectories did we find? Incremented in
+ * xrep_dir_replay_update for each replayed dirent of type
+ * XFS_DIR3_FT_DIR.
+ */
+ uint64_t subdirs;
+
+ /*
+ * How many dirents did we find? Incremented in
+ * xrep_dir_replay_update for each dirent added to the tempdir.
+ */
+ unsigned int dirents;
+
+ /*
+ * Directory entry name, plus the trailing null. xname.name is pointed
+ * at namebuf by xrep_setup_directory.
+ */
+ struct xfs_name xname;
+ unsigned char namebuf[MAXNAMELEN];
+};
+
+/*
+ * Release the incore repair state hanging off sc->buf: the parent scan, the
+ * stashed dirent names, and the stashed dirent records, in that order.
+ */
+static void
+xrep_dir_teardown(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_dir		*repair = sc->buf;
+
+	xrep_findparent_scan_teardown(&repair->pscan);
+	xfblob_destroy(repair->dir_names);
+	xfarray_destroy(repair->dir_entries);
+}
+
+/*
+ * Set up for a directory repair: enable the dirent fs gates, create the
+ * temporary directory, and allocate the repair context that is attached to
+ * sc->buf.  Returns 0 or a negative errno.
+ */
+int
+xrep_setup_directory(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_dir		*rd;
+	int			ret;
+
+	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+	ret = xrep_tempfile_create(sc, S_IFDIR);
+	if (ret != 0)
+		return ret;
+
+	rd = kvzalloc(sizeof(*rd), XCHK_GFP_FLAGS);
+	if (rd == NULL)
+		return -ENOMEM;
+
+	/* The name buffer lives inside the context itself. */
+	rd->xname.name = rd->namebuf;
+	rd->sc = sc;
+	sc->buf = rd;
+	return 0;
+}
+
+/*
+ * A directory tree root is its own parent, and an unlinked directory has no
+ * parent that links to it.  Both cases use the root directory as the parent.
+ * Any other directory yields NULLFSINO, meaning we don't know what to do.
+ */
+static inline xfs_ino_t
+xrep_dir_self_parent(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	const xfs_ino_t		rootino = sc->mp->m_sb.sb_rootino;
+
+	if (sc->ip->i_ino == rootino || VFS_I(sc->ip)->i_nlink == 0)
+		return rootino;
+
+	return NULLFSINO;
+}
+
+/*
+ * Read the directory's own dotdot entry and have the findparent code confirm
+ * it.  Any lookup failure, implausible inode number, or confirmation error
+ * yields NULLFSINO.
+ */
+static inline xfs_ino_t
+xrep_dir_lookup_parent(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	xfs_ino_t		parent;
+
+	if (xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &parent, NULL))
+		return NULLFSINO;
+
+	if (!xfs_verify_dir_ino(sc->mp, parent))
+		return NULLFSINO;
+
+	if (xrep_findparent_confirm(sc, &parent))
+		return NULLFSINO;
+
+	return parent;
+}
+
+/*
+ * Ask the dentry cache for '..' and have the findparent code confirm the
+ * answer.  A dcache miss or a failed confirmation yields NULLFSINO.
+ */
+static inline xfs_ino_t
+xrep_dir_dcache_parent(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	xfs_ino_t		candidate = xrep_findparent_from_dcache(sc);
+
+	if (candidate == NULLFSINO)
+		return NULLFSINO;
+
+	if (xrep_findparent_confirm(sc, &candidate))
+		return NULLFSINO;
+
+	return candidate;
+}
+
+/*
+ * Try to find the parent of the directory being repaired.  The cheap sources
+ * are tried in order (self reference, dcache, the directory's own dotdot
+ * entry); the first one that produces an inode number ends the parent scan
+ * early.  Returns 0 or a negative errno from the full scan.
+ */
+STATIC int
+xrep_dir_find_parent(
+	struct xrep_dir		*rd)
+{
+	xfs_ino_t		parent;
+
+	parent = xrep_findparent_self_reference(rd->sc);
+	if (parent == NULLFSINO)
+		parent = xrep_dir_dcache_parent(rd);
+	if (parent == NULLFSINO)
+		parent = xrep_dir_lookup_parent(rd);
+
+	/*
+	 * A full filesystem scan is the last resort.  On a busy filesystem,
+	 * the scan can fail with -EBUSY if we cannot grab IOLOCKs.  That means
+	 * that we don't know who the parent is, so we should return to
+	 * userspace.
+	 */
+	if (parent == NULLFSINO)
+		return xrep_findparent_scan(&rd->pscan);
+
+	xrep_findparent_scan_finish_early(&rd->pscan, parent);
+	return 0;
+}
+
+/*
+ * Decide if we want to salvage this entry.  We don't bother with oversized
+ * names or the dot entry.
+ *
+ * @rd:		repair context; supplies the mount and the inode under repair
+ * @name:	candidate entry name (not necessarily null terminated)
+ * @namelen:	length of @name as recorded in the directory entry
+ * @ino:	inode number that the entry points to
+ *
+ * Returns true if the entry looks worth stashing, false if it should be
+ * skipped.  This is a pure predicate, so it returns bool rather than int.
+ */
+STATIC bool
+xrep_dir_want_salvage(
+	struct xrep_dir		*rd,
+	const char		*name,
+	int			namelen,
+	xfs_ino_t		ino)
+{
+	struct xfs_mount	*mp = rd->sc->mp;
+
+	/* No pointers to ourselves or to garbage. */
+	if (ino == rd->sc->ip->i_ino)
+		return false;
+	if (!xfs_verify_dir_ino(mp, ino))
+		return false;
+
+	/* No weird looking names or dot entries. */
+	if (namelen >= MAXNAMELEN || namelen <= 0)
+		return false;
+	if (namelen == 1 && name[0] == '.')
+		return false;
+	if (!xfs_dir2_namecheck(name, namelen))
+		return false;
+
+	return true;
+}
+
+/*
+ * Queue a dirent for later creation in the tempdir.  The name goes into the
+ * dir_names blob store and a fixed-size record referencing it is appended to
+ * dir_entries; the stashed actions are replayed later.
+ */
+STATIC int
+xrep_dir_stash_createname(
+	struct xrep_dir		*rd,
+	const struct xfs_name	*name,
+	xfs_ino_t		ino)
+{
+	struct xrep_dirent	rec = {
+		.ino		= ino,
+		.namelen	= name->len,
+		.ftype		= name->type,
+	};
+	int			ret;
+
+	trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
+
+	/* Store the name first so that the record carries a valid cookie. */
+	ret = xfblob_storename(rd->dir_names, &rec.name_cookie, name);
+	if (ret)
+		return ret;
+
+	return xfarray_append(rd->dir_entries, &rec);
+}
+
+/*
+ * Stash one candidate dirent for the rebuilt directory.  The name is cut at
+ * the first byte that namecheck would reject, '..' is dropped, and entries
+ * whose target inode cannot be grabbed are silently skipped.
+ */
+STATIC int
+xrep_dir_salvage_entry(
+	struct xrep_dir		*rd,
+	unsigned char		*name,
+	unsigned int		namelen,
+	xfs_ino_t		ino)
+{
+	struct xfs_name		xname = {
+		.name		= name,
+	};
+	struct xfs_scrub	*sc = rd->sc;
+	struct xfs_inode	*target;
+	unsigned int		len;
+	int			error = 0;
+
+	if (xchk_should_terminate(sc, &error))
+		return error;
+
+	/*
+	 * Truncate the name to the first character that would trip namecheck.
+	 * If we no longer have a name after that, ignore this entry.
+	 */
+	for (len = 0; len < namelen; len++) {
+		if (name[len] == 0 || name[len] == '/')
+			break;
+	}
+	if (len == 0)
+		return 0;
+	xname.len = len;
+
+	/* Ignore '..' entries; we already picked the new parent. */
+	if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
+		trace_xrep_dir_salvaged_parent(sc->ip, ino);
+		return 0;
+	}
+
+	trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
+
+	/*
+	 * Compute the ftype or dump the entry if we can't.  We don't lock the
+	 * inode because inodes can't change type while we have a reference.
+	 */
+	if (xchk_iget(sc, ino, &target))
+		return 0;
+
+	xname.type = xfs_mode_to_ftype(VFS_I(target)->i_mode);
+	xchk_irele(sc, target);
+
+	return xrep_dir_stash_createname(rd, &xname, ino);
+}
+
+/*
+ * Consider one shortform directory entry for reinsertion.  Entries that fail
+ * the salvage filter are skipped without error.
+ */
+STATIC int
+xrep_dir_salvage_sf_entry(
+	struct xrep_dir			*rd,
+	struct xfs_dir2_sf_hdr		*sfp,
+	struct xfs_dir2_sf_entry	*sfep)
+{
+	xfs_ino_t			ino = xfs_dir2_sf_get_ino(rd->sc->mp,
+								  sfp, sfep);
+
+	if (xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
+		return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen,
+				ino);
+
+	return 0;
+}
+
+/*
+ * Consider one block/data format directory entry for reinsertion.  Entries
+ * that fail the salvage filter are skipped without error.
+ */
+STATIC int
+xrep_dir_salvage_data_entry(
+	struct xrep_dir			*rd,
+	struct xfs_dir2_data_entry	*dep)
+{
+	xfs_ino_t			ino = be64_to_cpu(dep->inumber);
+
+	if (xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
+		return xrep_dir_salvage_entry(rd, dep->name, dep->namelen,
+				ino);
+
+	return 0;
+}
+
+/*
+ * Try to recover block/data format directory entries.
+ *
+ * Walks the data portion of @bp from the geometry's data entry offset up to
+ * the smaller of the buffer length and the computed data end offset, and
+ * hands every in-use entry to the salvage filter.  Free-tagged regions are
+ * skipped using their recorded length.
+ *
+ * Returns 0 when the walk finishes or has to stop early because the block
+ * contents cannot be parsed any further, or a negative errno if salvaging
+ * fails or the scrub is being terminated.
+ */
+STATIC int
+xrep_dir_recover_data(
+	struct xrep_dir		*rd,
+	struct xfs_buf		*bp)
+{
+	struct xfs_da_geometry	*geo = rd->sc->mp->m_dir_geo;
+	unsigned int		offset;
+	unsigned int		end;
+	int			error = 0;
+
+	/*
+	 * Loop over the data portion of the block.
+	 * Each object is a real entry (dep) or an unused one (dup).
+	 */
+	offset = geo->data_entry_offset;
+	end = min_t(unsigned int, BBTOB(bp->b_length),
+			xfs_dir3_data_end_offset(geo, bp->b_addr));
+
+	while (offset < end) {
+		struct xfs_dir2_data_unused	*dup = bp->b_addr + offset;
+		struct xfs_dir2_data_entry	*dep = bp->b_addr + offset;
+
+		if (xchk_should_terminate(rd->sc, &error))
+			return error;
+
+		/* Skip unused entries. */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			unsigned int	freelen = be16_to_cpu(dup->length);
+
+			/*
+			 * The block may not have been through a verifier, so
+			 * the length can be garbage.  A zero length would
+			 * never advance the cursor and we would spin here
+			 * forever; give up on the rest of this block instead.
+			 */
+			if (freelen == 0)
+				break;
+
+			offset += freelen;
+			continue;
+		}
+
+		/* Don't walk off the end of the block. */
+		offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
+		if (offset > end)
+			break;
+
+		/* Ok, let's save this entry. */
+		error = xrep_dir_salvage_data_entry(rd, dep);
+		if (error)
+			return error;
+
+	}
+
+	return 0;
+}
+
+/*
+ * Try to recover shortform directory entries by walking the inline data fork
+ * of the directory being repaired.  The walk stops at the first entry that
+ * would extend past the end of the fork data.
+ */
+STATIC int
+xrep_dir_recover_sf(
+	struct xrep_dir			*rd)
+{
+	struct xfs_scrub		*sc = rd->sc;
+	struct xfs_ifork		*ifp;
+	struct xfs_dir2_sf_hdr		*hdr;
+	struct xfs_dir2_sf_entry	*sfep;
+	struct xfs_dir2_sf_entry	*next;
+	unsigned char			*end;
+	xfs_ino_t			ino;
+	int				error = 0;
+
+	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+	hdr = ifp->if_data;
+	end = (unsigned char *)ifp->if_data + ifp->if_bytes;
+
+	/* The shortform parent pointer is only traced, never stashed. */
+	ino = xfs_dir2_sf_get_parent_ino(hdr);
+	trace_xrep_dir_salvaged_parent(sc->ip, ino);
+
+	for (sfep = xfs_dir2_sf_firstentry(hdr);
+	     (unsigned char *)sfep < end;
+	     sfep = next) {
+		if (xchk_should_terminate(sc, &error))
+			return error;
+
+		next = xfs_dir2_sf_nextentry(sc->mp, hdr, sfep);
+		if ((unsigned char *)next > end)
+			break;
+
+		/* Ok, let's save this entry. */
+		error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Guess the directory format from the data fork mappings and the directory
+ * size.  A confident guess lets the caller salvage more aggressively.
+ *
+ * @magic_guess is set to DIR3_BLOCK_MAGIC for a "block format" directory,
+ * DIR3_DATA_MAGIC for a "data format" directory, and 0 if we can't tell.
+ * Errors from the bmap helpers simply leave the corresponding guess unset.
+ */
+STATIC void
+xrep_dir_guess_format(
+	struct xrep_dir		*rd,
+	__be32			*magic_guess)
+{
+	struct xfs_mount	*mp = rd->sc->mp;
+	struct xfs_inode	*dp = rd->sc->ip;
+	struct xfs_da_geometry	*geo = mp->m_dir_geo;
+	xfs_fileoff_t		last;
+	int			ret;
+
+	ASSERT(xfs_has_crc(mp));
+
+	*magic_guess = 0;
+
+	/*
+	 * One directory block mapped and a directory size of exactly one
+	 * block: this has to be a single block format directory.
+	 */
+	ret = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
+	if (ret == 0 &&
+	    XFS_FSB_TO_B(mp, last) == geo->blksize &&
+	    dp->i_disk_size == geo->blksize) {
+		*magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+		return;
+	}
+
+	/*
+	 * The last extent before the leaf offset ends at the directory size
+	 * and the directory is larger than one block: data format.
+	 */
+	last = geo->leafblk;
+	ret = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
+	if (ret == 0 &&
+	    XFS_FSB_TO_B(mp, last) > geo->blksize &&
+	    XFS_FSB_TO_B(mp, last) == dp->i_disk_size)
+		*magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+}
+
+/*
+ * Recover directory entries from a specific directory block.
+ *
+ * @magic_guess is the caller's guess at the directory format, or 0 if it
+ * could not tell; @dabno is the directory block to read from the data fork
+ * of the directory being repaired.
+ *
+ * Returns 0 if the block was salvaged, skipped, or turned out to be a hole
+ * (no buffer returned); otherwise the error from the read or the salvage.
+ */
+STATIC int
+xrep_dir_recover_dirblock(
+ struct xrep_dir *rd,
+ __be32 magic_guess,
+ xfs_dablk_t dabno)
+{
+ struct xfs_dir2_data_hdr *hdr;
+ struct xfs_buf *bp;
+ __be32 oldmagic;
+ int error;
+
+ /*
+ * Try to read buffer. We invalidate them in the next step so we don't
+ * bother to set a buffer type or ops.
+ */
+ error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
+ XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
+ if (error || !bp)
+ return error;
+
+ /* Remember the magic found in the buffer so it can be put back at out. */
+ hdr = bp->b_addr;
+ oldmagic = hdr->magic;
+
+ trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
+ be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
+
+ /*
+ * If we're sure of the block's format, proceed with the salvage
+ * operation using the specified magic number. The guessed magic is
+ * written into the buffer header; no verifier is run on this path.
+ */
+ if (magic_guess) {
+ hdr->magic = magic_guess;
+ goto recover;
+ }
+
+ /*
+ * If we couldn't guess what type of directory this is, then we will
+ * only salvage entries from directory blocks that match the magic
+ * number and pass verifiers.
+ */
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
+ goto out;
+ if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
+ goto out;
+ break;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
+ goto out;
+ if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
+ goto out;
+ break;
+ default:
+ /* Unrecognized magic; skip this block (error is still 0 here). */
+ goto out;
+ }
+
+recover:
+ error = xrep_dir_recover_data(rd, bp);
+
+out:
+ /* Undo any magic override before the buffer is released. */
+ hdr->magic = oldmagic;
+ xfs_trans_brelse(rd->sc->tp, bp);
+ return error;
+}
+
+/*
+ * Reset the preallocated da args in @rd for an operation on directory @dp.
+ * The owner is always the directory being repaired.  If @name is NULL, the
+ * name-related fields are left zeroed.
+ */
+static inline void
+xrep_dir_init_args(
+	struct xrep_dir		*rd,
+	struct xfs_inode	*dp,
+	const struct xfs_name	*name)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	struct xfs_da_args	*args = &rd->args;
+
+	memset(args, 0, sizeof(*args));
+	args->geo = sc->mp->m_dir_geo;
+	args->whichfork = XFS_DATA_FORK;
+	args->owner = sc->ip->i_ino;
+	args->trans = sc->tp;
+	args->dp = dp;
+
+	if (name) {
+		args->name = name->name;
+		args->namelen = name->len;
+		args->filetype = name->type;
+		args->hashval = xfs_dir2_hashname(sc->mp, name);
+	}
+}
+
+/*
+ * Replay a stashed createname into the temporary directory, dispatching to
+ * the addname routine that matches the tempdir's current format (shortform,
+ * block, leaf, or node).
+ */
+STATIC int
+xrep_dir_replay_createname(
+	struct xrep_dir		*rd,
+	const struct xfs_name	*name,
+	xfs_ino_t		inum,
+	xfs_extlen_t		total)
+{
+	struct xfs_inode	*dp = rd->sc->tempip;
+	struct xfs_da_args	*args = &rd->args;
+	bool			block_fmt;
+	bool			leaf_fmt;
+	int			ret;
+
+	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+	ret = xfs_dir_ino_validate(rd->sc->mp, inum);
+	if (ret)
+		return ret;
+
+	trace_xrep_dir_replay_createname(dp, name, inum);
+
+	xrep_dir_init_args(rd, dp, name);
+	args->inumber = inum;
+	args->total = total;
+	args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+
+	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+		return xfs_dir2_sf_addname(args);
+
+	ret = xfs_dir2_isblock(args, &block_fmt);
+	if (ret)
+		return ret;
+	if (block_fmt)
+		return xfs_dir2_block_addname(args);
+
+	ret = xfs_dir2_isleaf(args, &leaf_fmt);
+	if (ret)
+		return ret;
+	if (leaf_fmt)
+		return xfs_dir2_leaf_addname(args);
+
+	return xfs_dir2_node_addname(args);
+}
+
+/*
+ * Add this stashed incore directory entry to the temporary directory.
+ * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
+ * must not be in transaction context.
+ *
+ * Each call allocates its own transaction sized by XFS_LINK_SPACE_RES for
+ * the name, takes the tempfile ILOCK, adds the entry, bumps the subdirs and
+ * dirents counters, and commits. Returns 0 or a negative errno.
+ */
+STATIC int
+xrep_dir_replay_update(
+ struct xrep_dir *rd,
+ const struct xfs_name *xname,
+ const struct xrep_dirent *dirent)
+{
+ struct xfs_mount *mp = rd->sc->mp;
+#ifdef DEBUG
+ xfs_ino_t ino;
+#endif
+ uint resblks;
+ int error;
+
+ resblks = XFS_LINK_SPACE_RES(mp, xname->len);
+ error = xchk_trans_alloc(rd->sc, resblks);
+ if (error)
+ return error;
+
+ /* Lock the temporary directory and join it to the transaction */
+ xrep_tempfile_ilock(rd->sc);
+ xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
+
+ /*
+ * Create a replacement dirent in the temporary directory. Note that
+ * _createname doesn't check for existing entries. There shouldn't be
+ * any in the temporary dir, but we'll verify this in debug mode.
+ *
+ * NOTE(review): the ASSERT below sits inside a branch that has already
+ * established error != -ENOENT, so it can never fire. Also, if the
+ * lookup succeeds (error == 0) this path cancels the transaction and
+ * returns 0 without creating the entry. Confirm whether either is
+ * intended.
+ */
+#ifdef DEBUG
+ error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
+ if (error != -ENOENT) {
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+#endif
+
+ error = xrep_dir_replay_createname(rd, xname, dirent->ino, resblks);
+ if (error)
+ goto out_cancel;
+
+ /* Track what was added; subdirs is later used to compute nlink. */
+ if (xname->type == XFS_DIR3_FT_DIR)
+ rd->subdirs++;
+ rd->dirents++;
+
+ /*
+ * Commit and unlock.
+ *
+ * NOTE(review): a commit failure returns without calling
+ * xrep_tempfile_iunlock here; presumably scrub teardown releases the
+ * tempfile ILOCK -- confirm.
+ */
+ error = xrep_trans_commit(rd->sc);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(rd->sc);
+ return 0;
+out_cancel:
+ xchk_trans_cancel(rd->sc);
+ xrep_tempfile_iunlock(rd->sc);
+ return error;
+}
+
+/*
+ * Replay every stashed dirent into the temporary directory and then empty
+ * the staging structures.  This bounds the memory needed for the rebuild,
+ * since directories can contain up to 32GB of directory data.
+ *
+ * Caller must not hold transactions or ILOCKs.  Caller must hold the tempdir
+ * IOLOCK.
+ */
+STATIC int
+xrep_dir_replay_updates(
+	struct xrep_dir		*rd)
+{
+	xfarray_idx_t		cursor;
+	int			ret;
+
+	/* Add all the salvaged dirents to the temporary directory. */
+	foreach_xfarray_idx(rd->dir_entries, cursor) {
+		struct xrep_dirent	rec;
+
+		ret = xfarray_load(rd->dir_entries, cursor, &rec);
+		if (ret)
+			return ret;
+
+		/* Pull the name back out of the blob store into rd->xname. */
+		ret = xfblob_loadname(rd->dir_names, rec.name_cookie,
+				&rd->xname, rec.namelen);
+		if (ret)
+			return ret;
+		rd->xname.type = rec.ftype;
+
+		ret = xrep_dir_replay_update(rd, &rd->xname, &rec);
+		if (ret)
+			return ret;
+	}
+
+	/* Empty out both arrays now that we've added the entries. */
+	xfarray_truncate(rd->dir_entries);
+	xfblob_truncate(rd->dir_names);
+	return 0;
+}
+
+/*
+ * Periodically flush stashed directory entries to the temporary dir. This
+ * is done to reduce the memory requirements of the directory rebuild, since
+ * directories can contain up to 32GB of directory data.
+ *
+ * On success this returns 0 with a freshly allocated scrub transaction and
+ * ILOCK_EXCL retaken on the directory being repaired. On failure it returns
+ * a negative errno; depending on where the failure happened, the
+ * transaction and/or the ILOCK may not have been reacquired.
+ */
+STATIC int
+xrep_dir_flush_stashed(
+ struct xrep_dir *rd)
+{
+ int error;
+
+ /*
+ * Entering this function, the scrub context has a reference to the
+ * inode being repaired, the temporary file, and a scrub transaction
+ * that we use during dirent salvaging to avoid livelocking if there
+ * are cycles in the directory structures. We hold ILOCK_EXCL on both
+ * the inode being repaired and the temporary file, though they are
+ * not ijoined to the scrub transaction.
+ *
+ * To constrain kernel memory use, we occasionally write salvaged
+ * dirents from the xfarray and xfblob structures into the temporary
+ * directory in preparation for exchanging the directory structures at
+ * the end. Updating the temporary file requires a transaction, so we
+ * commit the scrub transaction and drop the two ILOCKs so that
+ * we can allocate whatever transaction we want.
+ *
+ * We still hold IOLOCK_EXCL on the inode being repaired, which
+ * prevents anyone from accessing the damaged directory data while we
+ * repair it.
+ *
+ * NOTE(review): the text above speaks of two ILOCKs, but the only
+ * unlock visible here is xchk_iunlock on the inode being repaired;
+ * the tempfile ILOCK is cycled inside xrep_dir_replay_update. Confirm
+ * which locks are really held on entry.
+ */
+ error = xrep_trans_commit(rd->sc);
+ if (error)
+ return error;
+ xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Take the IOLOCK of the temporary file while we modify dirents. This
+ * isn't strictly required because the temporary file is never revealed
+ * to userspace, but we follow the same locking rules. We still hold
+ * sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rd->sc);
+ if (error)
+ return error;
+
+ /*
+ * Write to the tempdir all the updates that we've stashed. The
+ * tempfile IOLOCK is dropped before the result is checked so that it
+ * is released on both the success and the failure path.
+ */
+ error = xrep_dir_replay_updates(rd);
+ xrep_tempfile_iounlock(rd->sc);
+ if (error)
+ return error;
+
+ /*
+ * Recreate the salvage transaction and relock the dir we're salvaging.
+ */
+ error = xchk_trans_alloc(rd->sc, 0);
+ if (error)
+ return error;
+ xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Report whether the stashed dirent records plus their names have grown past
+ * XREP_DIR_MAX_STASH_BYTES.
+ */
+static inline bool
+xrep_dir_want_flush_stashed(
+	struct xrep_dir		*rd)
+{
+	const unsigned long long stashed =
+		xfblob_bytes(rd->dir_names) + xfarray_bytes(rd->dir_entries);
+
+	return stashed > XREP_DIR_MAX_STASH_BYTES;
+}
+
+/*
+ * Extract as many directory entries as we can from a non-shortform
+ * directory.  Every written extent below the leaf offset is walked one
+ * directory block at a time, and the stash is flushed to the tempdir
+ * whenever it grows too large.
+ */
+STATIC int
+xrep_dir_recover(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	struct xfs_da_geometry	*geo = sc->mp->m_dir_geo;
+	struct xfs_bmbt_irec	got;
+	xfs_fileoff_t		offset = 0;
+	xfs_fileoff_t		extent_end;
+	xfs_dablk_t		dabno;
+	__be32			magic_guess;
+	int			nmap;
+	int			error;
+
+	xrep_dir_guess_format(rd, &magic_guess);
+
+	/* Iterate each directory data block in the data fork. */
+	while (offset < geo->leafblk) {
+		nmap = 1;
+		error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
+				&got, &nmap, 0);
+		if (error)
+			return error;
+		if (nmap != 1)
+			return -EFSCORRUPTED;
+
+		extent_end = got.br_startoff + got.br_blockcount;
+
+		/* Only written extents can hold directory blocks. */
+		if (xfs_bmap_is_written_extent(&got)) {
+			for (dabno = round_up(got.br_startoff, geo->fsbcount);
+			     dabno < extent_end;
+			     dabno += geo->fsbcount) {
+				if (xchk_should_terminate(rd->sc, &error))
+					return error;
+
+				error = xrep_dir_recover_dirblock(rd,
+						magic_guess, dabno);
+				if (error)
+					return error;
+
+				/* Flush dirents to constrain memory usage. */
+				if (!xrep_dir_want_flush_stashed(rd))
+					continue;
+
+				error = xrep_dir_flush_stashed(rd);
+				if (error)
+					return error;
+			}
+		}
+
+		offset = extent_end;
+	}
+
+	return 0;
+}
+
+/*
+ * Salvage every directory entry we can from the directory being repaired,
+ * picking the shortform or the block-based walker by data fork format, and
+ * finish by flushing whatever is still stashed into the temp dir.
+ */
+STATIC int
+xrep_dir_find_entries(
+	struct xrep_dir		*rd)
+{
+	struct xfs_inode	*dp = rd->sc->ip;
+	int			ret;
+
+	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+		ret = xrep_dir_recover_sf(rd);
+	} else {
+		/* The extent map must be in memory before walking it. */
+		ret = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
+		if (!ret)
+			ret = xrep_dir_recover(rd);
+	}
+	if (ret)
+		return ret;
+
+	return xrep_dir_flush_stashed(rd);
+}
+
+/*
+ * Find the parent of the directory being repaired, then salvage its dirents
+ * into the temporary directory.
+ *
+ * On success this returns 0 with the scrub transaction committed and the
+ * ILOCK on the directory being repaired dropped. On failure it returns a
+ * negative errno from the parent search, the salvage, or the commit.
+ */
+STATIC int
+xrep_dir_salvage_entries(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ /*
+ * Drop the ILOCK on this directory so that we can scan for this
+ * directory's parent. Figure out who is going to be the parent of
+ * this directory, then retake the ILOCK so that we can salvage
+ * directory entries. The ILOCK is retaken before the error check, so
+ * it is held again even when the parent search fails.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ error = xrep_dir_find_parent(rd);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ /*
+ * Collect directory entries by parsing raw leaf blocks to salvage
+ * whatever we can. When we're done, free the staging memory before
+ * exchanging the directories to reduce memory usage.
+ */
+ error = xrep_dir_find_entries(rd);
+ if (error)
+ return error;
+
+ /*
+ * Cancel the repair transaction and drop the ILOCK so that we can
+ * (later) use the atomic mapping exchange functions to compute the
+ * correct block reservations and re-lock the inodes.
+ *
+ * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
+ * modifications, but there's nothing to prevent userspace from reading
+ * the directory until we're ready for the exchange operation. Reads
+ * will return -EIO without shutting down the fs, so we're ok with
+ * that.
+ *
+ * The VFS can change dotdot on us, but the findparent scan will keep
+ * our incore parent inode up to date. See the note on locking issues
+ * for more details.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+
+/*
+ * Throw away the temporary directory's data fork contents and rebuild it as
+ * an empty shortform directory whose parent is @parent_ino.  The caller must
+ * join the inode to the transaction.  This function returns with the inode
+ * joined to a clean scrub transaction.
+ */
+STATIC int
+xrep_dir_reset_fork(
+	struct xrep_dir		*rd,
+	xfs_ino_t		parent_ino)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	struct xfs_inode	*tempdir = sc->tempip;
+	struct xfs_ifork	*dfork = xfs_ifork_ptr(tempdir, XFS_DATA_FORK);
+	int			ret;
+
+	/* Unmap all the directory buffers. */
+	if (xfs_ifork_has_extents(dfork)) {
+		ret = xrep_reap_ifork(sc, tempdir, XFS_DATA_FORK);
+		if (ret)
+			return ret;
+	}
+
+	trace_xrep_dir_reset_fork(tempdir, parent_ino);
+
+	/* Reset the data fork to an empty data fork. */
+	xfs_idestroy_fork(dfork);
+	dfork->if_bytes = 0;
+	tempdir->i_disk_size = 0;
+
+	/* Reinitialize the short form directory. */
+	xrep_dir_init_args(rd, tempdir, NULL);
+	return xfs_dir2_sf_create(&rd->args, parent_ino);
+}
+
+/*
+ * Prepare both inodes' directory forks for exchanging mappings. Promote the
+ * tempfile from short format to leaf format, and if the file being repaired
+ * has a short format data fork, turn it into an empty extent list.
+ *
+ * @temp_local and @ip_local tell this function whether the tempfile and the
+ * file being repaired, respectively, currently have local format data forks.
+ * Returns 0 or a negative errno from the shortform conversion or the
+ * deferred-op finish.
+ */
+STATIC int
+xrep_dir_swap_prep(
+ struct xfs_scrub *sc,
+ bool temp_local,
+ bool ip_local)
+{
+ int error;
+
+ /*
+ * If the tempfile's directory is in shortform format, convert that to
+ * a single leaf extent so that we can use the atomic mapping exchange.
+ */
+ if (temp_local) {
+ /*
+ * The args operate on the tempfile, but the owner is set to the
+ * inode number of the file being repaired.
+ */
+ struct xfs_da_args args = {
+ .dp = sc->tempip,
+ .geo = sc->mp->m_dir_geo,
+ .whichfork = XFS_DATA_FORK,
+ .trans = sc->tp,
+ .total = 1,
+ .owner = sc->ip->i_ino,
+ };
+
+ error = xfs_dir2_sf_to_block(&args);
+ if (error)
+ return error;
+
+ /*
+ * Roll the deferred log items to get us back to a clean
+ * transaction.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If the file being repaired had a shortform data fork, convert that
+ * to an empty extent list in preparation for the atomic mapping
+ * exchange.
+ */
+ if (ip_local) {
+ struct xfs_ifork *ifp;
+
+ /* Discard the inline data, then reset the incore fork by hand. */
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(ifp);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ ifp->if_bytes = 0;
+ ifp->if_data = NULL;
+ ifp->if_height = 0;
+
+ xfs_trans_log_inode(sc->tp, sc->ip,
+ XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ }
+
+ return 0;
+}
+
+/*
+ * Replace the inode number of the directory entry @name in directory @dp
+ * with @inum, dispatching to the replace routine that matches the
+ * directory's current format.
+ */
+static int
+xrep_dir_replace(
+	struct xrep_dir		*rd,
+	struct xfs_inode	*dp,
+	const struct xfs_name	*name,
+	xfs_ino_t		inum,
+	xfs_extlen_t		total)
+{
+	struct xfs_da_args	*args = &rd->args;
+	bool			block_fmt;
+	bool			leaf_fmt;
+	int			ret;
+
+	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+
+	ret = xfs_dir_ino_validate(rd->sc->mp, inum);
+	if (ret)
+		return ret;
+
+	xrep_dir_init_args(rd, dp, name);
+	args->inumber = inum;
+	args->total = total;
+
+	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+		return xfs_dir2_sf_replace(args);
+
+	ret = xfs_dir2_isblock(args, &block_fmt);
+	if (ret)
+		return ret;
+	if (block_fmt)
+		return xfs_dir2_block_replace(args);
+
+	ret = xfs_dir2_isleaf(args, &leaf_fmt);
+	if (ret)
+		return ret;
+	if (leaf_fmt)
+		return xfs_dir2_leaf_replace(args);
+
+	return xfs_dir2_node_replace(args);
+}
+
+/*
+ * Reset the link count of this directory and adjust the unlinked list pointers
+ * as needed.
+ *
+ * The new link count is the number of subdirectories counted during replay
+ * plus 2, except for a directory that is on the unlinked list and has no
+ * dirents, which gets a link count of zero. Returns 0, -EFSCORRUPTED if
+ * the perag cannot be found, or the error from xfs_iunlink_remove.
+ */
+STATIC int
+xrep_dir_set_nlink(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ struct xfs_inode *dp = sc->ip;
+ struct xfs_perag *pag;
+ unsigned int new_nlink = rd->subdirs + 2;
+ int error;
+
+ /*
+ * The directory is not on the incore unlinked list, which means that
+ * it needs to be reachable via the directory tree. Update the nlink
+ * with our observed link count.
+ *
+ * XXX: A subsequent patch will handle parentless directories by moving
+ * them to the lost and found instead of aborting the repair.
+ */
+ if (!xfs_inode_on_unlinked_list(dp))
+ goto reset_nlink;
+
+ /*
+ * The directory is on the unlinked list and we did not find any
+ * dirents. Set the link count to zero and let the directory
+ * inactivate when the last reference drops.
+ */
+ if (rd->dirents == 0) {
+ new_nlink = 0;
+ goto reset_nlink;
+ }
+
+ /*
+ * The directory is on the unlinked list and we found dirents. This
+ * directory needs to be reachable via the directory tree. Remove the
+ * dir from the unlinked list and update nlink with the observed link
+ * count.
+ */
+ pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ /* Drop the perag reference before looking at the removal result. */
+ error = xfs_iunlink_remove(sc->tp, pag, dp);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+reset_nlink:
+ /* Only touch the VFS inode if the link count actually changes. */
+ if (VFS_I(dp)->i_nlink != new_nlink)
+ set_nlink(VFS_I(dp), new_nlink);
+ return 0;
+}
+
+/*
+ * Exchange the temporary directory's data fork with the one being repaired.
+ *
+ * Returns -EFSCORRUPTED before modifying anything if the observed
+ * subdirectory count would overflow the link count or if no parent was found
+ * for this directory.  Otherwise returns 0 or the negative errno of whichever
+ * step failed.
+ */
+STATIC int
+xrep_dir_swap(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ bool ip_local, temp_local;
+ int error = 0;
+
+ /*
+ * If we found enough subdirs to overflow this directory's link count,
+ * bail out to userspace before we modify anything.
+ */
+ if (rd->subdirs + 2 > XFS_MAXLINK)
+ return -EFSCORRUPTED;
+
+ /*
+ * If we never found the parent for this directory, we can't fix this
+ * directory. The parent scan results may only be read while holding
+ * sc->ip's ILOCK, hence the assertion.
+ */
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+ if (rd->pscan.parent_ino == NULLFSINO)
+ return -EFSCORRUPTED;
+
+ /*
+ * Reset the temporary directory's '..' entry to point to the parent
+ * that we found. The temporary directory was created with the root
+ * directory as the parent, so we can skip this if repairing a
+ * subdirectory of the root.
+ *
+ * It's also possible that this replacement could expand a sf tempdir
+ * into block format.
+ */
+ if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) {
+ error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
+ rd->pscan.parent_ino, rd->tx.req.resblks);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Changing the dot and dotdot entries could have changed the shape of
+ * the directory, so we recompute these.
+ */
+ ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+ temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
+
+ /*
+ * If both files have a local format data fork and the rebuilt
+ * directory data would fit in the repaired file's data fork, copy
+ * the contents from the tempfile and update the directory link count.
+ * We're done now.
+ */
+ if (ip_local && temp_local &&
+ sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
+ xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
+ return xrep_dir_set_nlink(rd);
+ }
+
+ /*
+ * Clean the transaction before we start working on exchanging
+ * directory contents.
+ */
+ error = xrep_tempfile_roll_trans(rd->sc);
+ if (error)
+ return error;
+
+ /* Otherwise, make sure both data forks are in block-mapping mode. */
+ error = xrep_dir_swap_prep(sc, temp_local, ip_local);
+ if (error)
+ return error;
+
+ /*
+ * Set nlink of the directory in the same transaction sequence that
+ * (atomically) commits the new directory data.
+ */
+ error = xrep_dir_set_nlink(rd);
+ if (error)
+ return error;
+
+ return xrep_tempexch_contents(sc, &rd->tx);
+}
+
+/*
+ * Exchange the new directory contents (which we created in the tempfile) with
+ * the directory being repaired.
+ *
+ * On success the tempfile's ILOCK and IOLOCK have been dropped again. On
+ * failure this function returns the negative errno immediately, without
+ * releasing any tempfile lock that it already took.
+ * NOTE(review): presumably the scrub teardown path drops those locks --
+ * confirm against the tempfile teardown code.
+ */
+STATIC int
+xrep_dir_rebuild_tree(
+ struct xrep_dir *rd)
+{
+ struct xfs_scrub *sc = rd->sc;
+ int error;
+
+ trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
+
+ /*
+ * Take the IOLOCK on the temporary file so that we can run dir
+ * operations with the same locks held as we would for a normal file.
+ * We still hold sc->ip's IOLOCK.
+ */
+ error = xrep_tempfile_iolock_polled(rd->sc);
+ if (error)
+ return error;
+
+ /* Allocate transaction and ILOCK the scrub file and the temp file. */
+ error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
+ if (error)
+ return error;
+
+ /*
+ * Exchange the tempdir's data fork with the file being repaired. This
+ * recreates the transaction and re-takes the ILOCK in the scrub
+ * context.
+ */
+ error = xrep_dir_swap(rd);
+ if (error)
+ return error;
+
+ /*
+ * Release the old directory blocks and reset the data fork of the temp
+ * directory to an empty shortform directory because inactivation does
+ * nothing for directories. The root directory's inode number is
+ * passed along for the reset.
+ */
+ error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
+ if (error)
+ return error;
+
+ /*
+ * Roll to get a transaction without any inodes joined to it. Then we
+ * can drop the tempfile's ILOCK and IOLOCK before doing more work on
+ * the scrub target directory.
+ */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+
+ xrep_tempfile_iunlock(sc);
+ xrep_tempfile_iounlock(sc);
+ return 0;
+}
+
+/*
+ * Create the staging structures used to regenerate directory entries and
+ * start the parent scan.  If a later step fails, whatever was created here is
+ * destroyed again and its pointer is reset to NULL.
+ */
+STATIC int
+xrep_dir_setup_scan(
+	struct xrep_dir		*rd)
+{
+	struct xfs_scrub	*sc = rd->sc;
+	char			*descr;
+	int			error;
+
+	/* Staging array for the salvaged directory entry records. */
+	descr = xchk_xfile_ino_descr(sc, "directory entries");
+	error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
+			&rd->dir_entries);
+	kfree(descr);
+	if (error)
+		return error;
+
+	/* Blob storage for the names of those entries. */
+	descr = xchk_xfile_ino_descr(sc, "directory entry names");
+	error = xfblob_create(descr, &rd->dir_names);
+	kfree(descr);
+	if (error)
+		goto out_xfarray;
+
+	error = xrep_findparent_scan_start(sc, &rd->pscan);
+	if (!error)
+		return 0;
+
+	/* Starting the parent scan failed; unwind in reverse order. */
+	xfblob_destroy(rd->dir_names);
+	rd->dir_names = NULL;
+out_xfarray:
+	xfarray_destroy(rd->dir_entries);
+	rd->dir_entries = NULL;
+	return error;
+}
+
+/*
+ * Repair the directory metadata: set up the scan, salvage whatever entries
+ * can be found, then rebuild the directory.  The repair context is always
+ * torn down once setup has succeeded, whether or not the later steps work.
+ *
+ * XXX: Directory entry buffers can be multiple fsblocks in size.  The buffer
+ * cache in XFS can't handle aliased multiblock buffers, so this might
+ * misbehave if the directory blocks are crosslinked with other filesystem
+ * metadata.
+ *
+ * XXX: Is it necessary to check the dcache for this directory to make sure
+ * that we always recreate every cached entry?
+ */
+int
+xrep_directory(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_dir		*rd = sc->buf;
+	int			error;
+
+	/* The rmapbt is required to reap the old data fork. */
+	if (!xfs_has_rmapbt(sc->mp))
+		return -EOPNOTSUPP;
+
+	error = xrep_dir_setup_scan(rd);
+	if (error)
+		return error;
+
+	error = xrep_dir_salvage_entries(rd);
+
+	/*
+	 * The termination check is the last chance to abort before we start
+	 * committing fixes; it is only consulted if salvaging succeeded.
+	 */
+	if (!error && !xchk_should_terminate(sc, &error))
+		error = xrep_dir_rebuild_tree(rd);
+
+	xrep_dir_teardown(sc);
+	return error;
+}
diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c
new file mode 100644
index 000000000000..712dd73e4789
--- /dev/null
+++ b/fs/xfs/scrub/findparent.c
@@ -0,0 +1,448 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <[email protected]>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/findparent.h"
+#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+
+/*
+ * Finding the Parent of a Directory
+ * =================================
+ *
+ * Directories have parent pointers, in the sense that each directory contains
+ * a dotdot entry that points to the single allowed parent. The brute force
+ * way to find the parent of a given directory is to scan every directory in
+ * the filesystem looking for a child dirent that references this directory.
+ *
+ * This module wraps the process of scanning the directory tree. It requires
+ * that @sc->ip is the directory whose parent we want to find, and that the
+ * caller hold only the IOLOCK on that directory. The scan itself needs to
+ * take the ILOCK of each directory visited.
+ *
+ * Because we cannot hold @sc->ip's ILOCK during a scan of the whole fs, it is
+ * necessary to use a dirent hook to update the parent scan results. Callers
+ * must not read the scan results without re-taking @sc->ip's ILOCK.
+ *
+ * There are a few shortcuts that we can take to avoid scanning the entire
+ * filesystem, such as noticing directory tree roots and querying the dentry
+ * cache for parent information.
+ */
+
+struct xrep_findparent_info {
+ /*
+ * The directory currently being scanned. xrep_findparent_dirent
+ * records this inode's number as the parent when one of its entries
+ * points at the scrub target.
+ */
+ struct xfs_inode *dp;
+
+ /*
+ * Scrub context. We're looking for a @dp containing a directory
+ * entry pointing to sc->ip->i_ino.
+ */
+ struct xfs_scrub *sc;
+
+ /*
+ * Optional scan information for a xrep_findparent_scan call. When
+ * set, each parent that is found is also published through
+ * xrep_findparent_scan_found; xrep_findparent_confirm leaves it NULL.
+ */
+ struct xrep_parent_scan_info *parent_scan;
+
+ /*
+ * Parent that we've found for sc->ip, or NULLFSINO if none has been
+ * found yet. If we're scanning the entire directory tree, we need
+ * this to ensure that we only find /one/ parent directory.
+ */
+ xfs_ino_t found_parent;
+
+ /*
+ * This is set to true if @found_parent was not observed directly from
+ * the directory scan but by noticing a change in dotdot entries after
+ * cycling the sc->ip IOLOCK. xrep_findparent_dirent clears it once
+ * the parent has been observed directly.
+ */
+ bool parent_tentative;
+};
+
+/*
+ * Directory walk callback.  When an entry of the directory being walked
+ * (fpi->dp) points to the scrub target inode, remember that directory as the
+ * target's parent.
+ *
+ * Returns 0 to keep walking.  Returns -EFSCORRUPTED if the matching entry has
+ * a garbage name, or if a parent had already been recorded (unless that
+ * record was only tentative and names this same directory).  May also return
+ * the error set by xchk_should_terminate.
+ */
+STATIC int
+xrep_findparent_dirent(
+	struct xfs_scrub		*sc,
+	struct xfs_inode		*dp,
+	xfs_dir2_dataptr_t		dapos,
+	const struct xfs_name		*name,
+	xfs_ino_t			ino,
+	void				*priv)
+{
+	struct xrep_findparent_info	*fpi = priv;
+	bool				is_dot, is_dotdot;
+	int				error = 0;
+
+	if (xchk_should_terminate(fpi->sc, &error))
+		return error;
+
+	/* Entries that point elsewhere are of no interest. */
+	if (ino != fpi->sc->ip->i_ino)
+		return 0;
+
+	/* A matching entry with a garbage name aborts the walk. */
+	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+		return -EFSCORRUPTED;
+
+	/*
+	 * Skip dot and dotdot -- only parent -> child links count.  The name
+	 * is known to be at least one byte long at this point.
+	 */
+	is_dot = name->len == 1 && name->name[0] == '.';
+	is_dotdot = name->len == 2 && name->name[0] == '.' &&
+		    name->name[1] == '.';
+	if (is_dot || is_dotdot)
+		return 0;
+
+	/* Uhoh, more than one parent for a dir? */
+	if (fpi->found_parent != NULLFSINO &&
+	    (!fpi->parent_tentative ||
+	     fpi->found_parent != fpi->dp->i_ino)) {
+		trace_xrep_findparent_dirent(fpi->sc->ip, 0);
+		return -EFSCORRUPTED;
+	}
+
+	/* Record the directory being walked as the (confirmed) parent. */
+	trace_xrep_findparent_dirent(fpi->sc->ip, fpi->dp->i_ino);
+	fpi->found_parent = fpi->dp->i_ino;
+	fpi->parent_tentative = false;
+
+	/* Publish the result to the live scan, if there is one. */
+	if (fpi->parent_scan)
+		xrep_findparent_scan_found(fpi->parent_scan, fpi->dp->i_ino);
+
+	return 0;
+}
+
+/*
+ * Walk the entries of the directory fpi->dp looking for any that point to the
+ * scrub target inode.
+ *
+ * Returns 0 without walking if fpi->dp is the scrub target itself or a
+ * temporary repair file.  Returns -EFSCORRUPTED if the directory is known to
+ * be sick, -EBUSY if it looks zapped, or the result of the directory walk.
+ */
+STATIC int
+xrep_findparent_walk_directory(
+	struct xrep_findparent_info	*fpi)
+{
+	struct xfs_scrub		*sc = fpi->sc;
+	struct xfs_inode		*dp = fpi->dp;
+	unsigned int			lock_mode;
+	int				error;
+
+	/*
+	 * The inode being scanned cannot be its own parent, nor can the
+	 * temporary directory staging this repair, nor can any other
+	 * temporary file created to stage a repair.
+	 */
+	if (dp == sc->ip || dp == sc->tempip || xrep_is_tempfile(dp))
+		return 0;
+
+	/* Hold the data map lock for the duration of the checks and walk. */
+	lock_mode = xfs_ilock_data_map_shared(dp);
+
+	if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+				       XFS_SICK_INO_BMBTD |
+				       XFS_SICK_INO_DIR)) {
+		/* A directory known to be sick cannot be scanned reliably. */
+		error = -EFSCORRUPTED;
+	} else if (xchk_dir_looks_zapped(dp)) {
+		/*
+		 * The scan cannot complete if a directory looks as though it
+		 * has been zapped by the inode record repair code.
+		 */
+		error = -EBUSY;
+	} else {
+		/* See whether it contains an entry pointing to our target. */
+		error = xchk_dir_walk(sc, dp, xrep_findparent_dirent, fpi);
+	}
+
+	xfs_iunlock(dp, lock_mode);
+	return error;
+}
+
+/*
+ * Dirent hook: keep the parent scan result current while directory updates
+ * happen elsewhere in the filesystem.  Always returns NOTIFY_DONE.
+ */
+STATIC int
+xrep_findparent_live_update(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_dir_update_params	*p = data;
+	struct xrep_parent_scan_info	*pscan;
+	struct xfs_scrub		*sc;
+
+	pscan = container_of(nb, struct xrep_parent_scan_info,
+			dhook.dirent_hook.nb);
+	sc = pscan->sc;
+
+	/* Only updates whose child is the directory being repaired matter. */
+	if (p->ip->i_ino != sc->ip->i_ino)
+		return NOTIFY_DONE;
+
+	/* Ignore parent directories the scan does not want updates for. */
+	if (!xchk_iscan_want_live_update(&pscan->iscan, p->dp->i_ino))
+		return NOTIFY_DONE;
+
+	/*
+	 * A positive delta makes @p->dp the new dotdot target; anything else
+	 * clears the recorded parent.
+	 */
+	xrep_findparent_scan_found(pscan,
+			p->delta > 0 ? p->dp->i_ino : NULLFSINO);
+	return NOTIFY_DONE;
+}
+
+/*
+ * Prepare @pscan for finding the parent of the directory sc->ip: reset the
+ * result to NULLFSINO, start the inode scan, and install the dirent hook that
+ * tracks dotdot updates for the inode being repaired.
+ *
+ * Returns -EINVAL if the dirent hooks were not enabled beforehand, or the
+ * error from installing the hook, in which case the scan state set up here is
+ * torn down again.
+ */
+int
+xrep_findparent_scan_start(
+	struct xfs_scrub		*sc,
+	struct xrep_parent_scan_info	*pscan)
+{
+	int				error;
+
+	/* The caller's setup must already have enabled the dirent hooks. */
+	if (!(sc->flags & XCHK_FSGATES_DIRENTS)) {
+		ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+		return -EINVAL;
+	}
+
+	pscan->sc = sc;
+	pscan->parent_ino = NULLFSINO;
+	mutex_init(&pscan->lock);
+
+	/* iget timeout of 30000 and iget retry delay of 100. */
+	xchk_iscan_start(sc, 30000, 100, &pscan->iscan);
+
+	/*
+	 * Hook into the dirent update code.  The hook only operates on inodes
+	 * that were already scanned, and the scanner thread takes each inode's
+	 * ILOCK, which means that any in-progress inode updates will finish
+	 * before we can scan the inode.
+	 */
+	xfs_dir_hook_setup(&pscan->dhook, xrep_findparent_live_update);
+	error = xfs_dir_hook_add(sc->mp, &pscan->dhook);
+	if (!error)
+		return 0;
+
+	/* The hook could not be installed; undo the setup above. */
+	xchk_iscan_teardown(&pscan->iscan);
+	mutex_destroy(&pscan->lock);
+	return error;
+}
+
+/*
+ * Scan the entire filesystem looking for a parent inode for the inode being
+ * scrubbed. @sc->ip must not be the root of a directory tree. Callers must
+ * not hold a dirty transaction or any lock that would interfere with taking
+ * an ILOCK.
+ *
+ * Every inode handed out by the iterator is marked visited and released, even
+ * when walking it failed; only directories are actually walked.
+ *
+ * Returns 0 with @pscan->parent_ino set to the parent that we found.
+ * Returns 0 with @pscan->parent_ino set to NULLFSINO if we found no parents.
+ * Returns the usual negative errno if something else happened.
+ */
+int
+xrep_findparent_scan(
+ struct xrep_parent_scan_info *pscan)
+{
+ struct xrep_findparent_info fpi = {
+ .sc = pscan->sc,
+ .found_parent = NULLFSINO,
+ .parent_scan = pscan,
+ };
+ struct xfs_scrub *sc = pscan->sc;
+ int ret;
+
+ ASSERT(S_ISDIR(VFS_IC(sc->ip)->i_mode));
+
+ while ((ret = xchk_iscan_iter(&pscan->iscan, &fpi.dp)) == 1) { /* 1: fpi.dp holds the next inode */
+ if (S_ISDIR(VFS_I(fpi.dp)->i_mode))
+ ret = xrep_findparent_walk_directory(&fpi);
+ else
+ ret = 0; /* non-directories cannot be parents */
+ xchk_iscan_mark_visited(&pscan->iscan, fpi.dp);
+ xchk_irele(sc, fpi.dp);
+ if (ret)
+ break; /* the walk failed; ret carries its error */
+
+ if (xchk_should_terminate(sc, &ret))
+ break;
+ }
+ xchk_iscan_iter_finish(&pscan->iscan);
+
+ return ret;
+}
+
+/*
+ * Tear down a parent scan: remove the dirent hook, tear down the inode scan,
+ * and destroy the lock, undoing xrep_findparent_scan_start.
+ */
+void
+xrep_findparent_scan_teardown(
+ struct xrep_parent_scan_info *pscan)
+{
+ xfs_dir_hook_del(pscan->sc->mp, &pscan->dhook);
+ xchk_iscan_teardown(&pscan->iscan);
+ mutex_destroy(&pscan->lock);
+}
+
+/*
+ * Finish a parent scan early: record @ino as the parent that was found and
+ * mark the inode scan finished without scanning anything.
+ */
+void
+xrep_findparent_scan_finish_early(
+ struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino)
+{
+ xrep_findparent_scan_found(pscan, ino);
+ xchk_iscan_finish_early(&pscan->iscan);
+}
+
+/*
+ * Confirm that the directory @parent_ino actually contains a directory entry
+ * pointing to the child @sc->ip->i_ino.  There are three possible outcomes:
+ *
+ * Returns 0 with @parent_ino unchanged if the parent was confirmed.
+ * Returns 0 with @parent_ino set to NULLFSINO if the parent was not valid.
+ * Returns the usual negative errno if something else happened.
+ *
+ * For the root directory and for unlinked directories, @parent_ino is set to
+ * the root directory's inode number without looking at any directory.
+ */
+int
+xrep_findparent_confirm(
+	struct xfs_scrub	*sc,
+	xfs_ino_t		*parent_ino)
+{
+	struct xrep_findparent_info fpi = {
+		.sc		= sc,
+		.found_parent	= NULLFSINO,
+	};
+	struct xfs_mount	*mp = sc->mp;
+	int			error;
+
+	/*
+	 * The root directory always points to itself.  Unlinked dirs can point
+	 * anywhere, so we point them at the root dir too.
+	 */
+	if (sc->ip == mp->m_rootip || VFS_I(sc->ip)->i_nlink == 0) {
+		*parent_ino = mp->m_sb.sb_rootino;
+		return 0;
+	}
+
+	/* Nothing to confirm if the caller has no candidate. */
+	if (*parent_ino == NULLFSINO)
+		return 0;
+
+	/* Reject garbage parent inode numbers... */
+	if (!xfs_verify_dir_ino(mp, *parent_ino)) {
+		*parent_ino = NULLFSINO;
+		return 0;
+	}
+
+	/* ...and self-referential parents. */
+	if (*parent_ino == sc->ip->i_ino) {
+		*parent_ino = NULLFSINO;
+		return 0;
+	}
+
+	error = xchk_iget(sc, *parent_ino, &fpi.dp);
+	if (error)
+		return error;
+
+	if (!S_ISDIR(VFS_I(fpi.dp)->i_mode)) {
+		/* Only a directory can be a parent. */
+		*parent_ino = NULLFSINO;
+	} else {
+		/*
+		 * The walk leaves found_parent at NULLFSINO unless the
+		 * candidate really has an entry pointing at us.
+		 */
+		error = xrep_findparent_walk_directory(&fpi);
+		if (!error)
+			*parent_ino = fpi.found_parent;
+	}
+
+	xchk_irele(sc, fpi.dp);
+	return error;
+}
+
+/*
+ * Handle the two cases in which no other directory can be expected to link
+ * to us: the root of a directory tree is its own parent, and an unlinked
+ * directory has no parent entry pointing to it.  Both cases yield the root
+ * directory's inode number; any other directory yields NULLFSINO, meaning
+ * that the caller has to find the parent some other way.
+ */
+xfs_ino_t
+xrep_findparent_self_reference(
+	struct xfs_scrub	*sc)
+{
+	const xfs_ino_t		rootino = sc->mp->m_sb.sb_rootino;
+
+	if (sc->ip->i_ino == rootino || VFS_I(sc->ip)->i_nlink == 0)
+		return rootino;
+
+	return NULLFSINO;
+}
+
+/*
+ * Check the dentry cache to see if it knows of a parent for the scrub target.
+ *
+ * Returns the inode number of the cached parent if that parent is a
+ * directory, or NULLFSINO if the dcache has no usable answer.  The result is
+ * unverified; xrep_parent_find_dotdot passes it through
+ * xrep_findparent_confirm before relying on it.
+ */
+xfs_ino_t
+xrep_findparent_from_dcache(
+	struct xfs_scrub	*sc)
+{
+	struct inode		*pip;
+	struct dentry		*dentry, *parent;
+	xfs_ino_t		ret = NULLFSINO;
+
+	dentry = d_find_alias(VFS_I(sc->ip));
+	if (!dentry)
+		goto out;
+
+	parent = dget_parent(dentry);
+	if (!parent)
+		goto out_dput;
+
+	ASSERT(parent->d_sb == sc->ip->i_mount->m_super);
+
+	/*
+	 * igrab returns NULL if the inode is already being freed.  Treat that
+	 * as "no answer from the dcache" instead of dereferencing the result
+	 * and handing a bogus pointer to xchk_irele.
+	 */
+	pip = igrab(d_inode(parent));
+	dput(parent);
+	if (!pip)
+		goto out_dput;
+
+	if (S_ISDIR(pip->i_mode)) {
+		trace_xrep_findparent_from_dcache(sc->ip, XFS_I(pip)->i_ino);
+		ret = XFS_I(pip)->i_ino;
+	}
+
+	/* Drop the reference obtained by igrab. */
+	xchk_irele(sc, XFS_I(pip));
+
+out_dput:
+	dput(dentry);
+out:
+	return ret;
+}
diff --git a/fs/xfs/scrub/findparent.h b/fs/xfs/scrub/findparent.h
new file mode 100644
index 000000000000..501f99d3164e
--- /dev/null
+++ b/fs/xfs/scrub/findparent.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <[email protected]>
+ */
+#ifndef __XFS_SCRUB_FINDPARENT_H__
+#define __XFS_SCRUB_FINDPARENT_H__
+
+struct xrep_parent_scan_info {
+ struct xfs_scrub *sc; /* scrub context that owns this scan */
+
+ /* Inode scan cursor. */
+ struct xchk_iscan iscan;
+
+ /* Hook to capture directory entry updates. */
+ struct xfs_dir_hook dhook;
+
+ /* Lock protecting parent_ino. */
+ struct mutex lock;
+
+ /*
+ * Parent inode that we've found, or NULLFSINO if no parent is known.
+ * Updated via xrep_findparent_scan_found.
+ */
+ xfs_ino_t parent_ino;
+
+ bool lookup_parent; /* NOTE(review): no visible user in findparent.c -- confirm purpose */
+};
+
+int xrep_findparent_scan_start(struct xfs_scrub *sc,
+ struct xrep_parent_scan_info *pscan);
+int xrep_findparent_scan(struct xrep_parent_scan_info *pscan);
+void xrep_findparent_scan_teardown(struct xrep_parent_scan_info *pscan);
+
+static inline void
+xrep_findparent_scan_found(
+ struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino)
+{
+ mutex_lock(&pscan->lock); /* pscan->lock protects parent_ino */
+ pscan->parent_ino = ino; /* callers pass NULLFSINO to record "no parent" */
+ mutex_unlock(&pscan->lock);
+}
+
+void xrep_findparent_scan_finish_early(struct xrep_parent_scan_info *pscan,
+ xfs_ino_t ino);
+
+int xrep_findparent_confirm(struct xfs_scrub *sc, xfs_ino_t *parent_ino);
+
+xfs_ino_t xrep_findparent_self_reference(struct xfs_scrub *sc);
+xfs_ino_t xrep_findparent_from_dcache(struct xfs_scrub *sc);
+
+#endif /* __XFS_SCRUB_FINDPARENT_H__ */
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index c743772a523e..0dde5df2f8d3 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -46,6 +46,7 @@
#include "scrub/repair.h"
#include "scrub/iscan.h"
#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
/*
* Inode Record Repair
@@ -340,6 +341,10 @@ xrep_dinode_findmode_walk_directory(
unsigned int lock_mode;
int error = 0;
+ /* Ignore temporary repair directories. */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
/*
* Scan the directory to see if there it contains an entry pointing to
* the directory that we are repairing.
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
index c643b7d79b60..c380207702e2 100644
--- a/fs/xfs/scrub/iscan.c
+++ b/fs/xfs/scrub/iscan.c
@@ -243,6 +243,17 @@ xchk_iscan_finish(
mutex_unlock(&iscan->lock);
}
+/*
+ * Mark an inode scan finished before we actually scan anything. The
+ * assertions check that both the cursor and the visited marker still sit at
+ * the scan's starting inode.
+ */
+void
+xchk_iscan_finish_early(
+ struct xchk_iscan *iscan)
+{
+ ASSERT(iscan->cursor_ino == iscan->scan_start_ino);
+ ASSERT(iscan->__visited_ino == iscan->scan_start_ino);
+
+ xchk_iscan_finish(iscan);
+}
+
/*
* Grab the AGI to advance the inode scan. Returns 0 if *agi_bpp is now set,
* -ECANCELED if the live scan aborted, -EBUSY if the AGI could not be grabbed,
@@ -436,8 +447,13 @@ xchk_iscan_iget(
* It's possible that this inode has lost all of its links but
* hasn't yet been inactivated. If we don't have a transaction
* or it's not writable, flush the inodegc workers and wait.
+ * If we have a non-empty transaction, we must not block on
+ * inodegc, which allocates its own transactions.
*/
- xfs_inodegc_flush(mp);
+ if (sc->tp && !(sc->tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
+ xfs_inodegc_push(mp);
+ else
+ xfs_inodegc_flush(mp);
return xchk_iscan_iget_retry(iscan, true);
}
diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h
index 5e0e4ed9dea6..f9f47fa01a9e 100644
--- a/fs/xfs/scrub/iscan.h
+++ b/fs/xfs/scrub/iscan.h
@@ -88,6 +88,7 @@ xchk_iscan_set_agi_trylock(struct xchk_iscan *iscan)
void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout,
unsigned int iget_retry_delay, struct xchk_iscan *iscan);
+void xchk_iscan_finish_early(struct xchk_iscan *iscan);
void xchk_iscan_teardown(struct xchk_iscan *iscan);
int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp);
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 8a7d9557897c..8b9aa73093d6 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -27,6 +27,7 @@
#include "scrub/nlinks.h"
#include "scrub/trace.h"
#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
/*
* Live Inode Link Count Checking
@@ -152,6 +153,13 @@ xchk_nlinks_live_update(
xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
+ /*
+ * Ignore temporary directories being used to stage dir repairs, since
+ * we don't bump the link counts of the children.
+ */
+ if (xrep_is_tempfile(p->dp))
+ return NOTIFY_DONE;
+
trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
p->delta, p->name->name, p->name->len);
@@ -303,6 +311,13 @@ xchk_nlinks_collect_dir(
unsigned int lock_mode;
int error = 0;
+ /*
+ * Ignore temporary directories being used to stage dir repairs, since
+ * we don't bump the link counts of the children.
+ */
+ if (xrep_is_tempfile(dp))
+ return 0;
+
/* Prevent anyone from changing this directory while we walk it. */
xfs_ilock(dp, XFS_IOLOCK_SHARED);
lock_mode = xfs_ilock_data_map_shared(dp);
@@ -537,6 +552,14 @@ xchk_nlinks_compare_inode(
unsigned int actual_nlink;
int error;
+ /*
+ * Ignore temporary files being used to stage repairs, since we assume
+ * they're correct for non-directories, and the directory repair code
+ * doesn't bump the link counts for the children.
+ */
+ if (xrep_is_tempfile(ip))
+ return 0;
+
xfs_ilock(ip, XFS_ILOCK_SHARED);
mutex_lock(&xnc->lock);
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
index 58cacb8e94c1..23eb08c4b5ad 100644
--- a/fs/xfs/scrub/nlinks_repair.c
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -26,6 +26,7 @@
#include "scrub/iscan.h"
#include "scrub/nlinks.h"
#include "scrub/trace.h"
+#include "scrub/tempfile.h"
/*
* Live Inode Link Count Repair
@@ -68,6 +69,14 @@ xrep_nlinks_repair_inode(
bool dirty = false;
int error;
+ /*
+ * Ignore temporary files being used to stage repairs, since we assume
+ * they're correct for non-directories, and the directory repair code
+ * doesn't bump the link counts for the children.
+ */
+ if (xrep_is_tempfile(ip))
+ return 0;
+
xchk_ilock(sc, XFS_IOLOCK_EXCL);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp);
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 5da10ed1fe8c..acb6282c3d14 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -10,6 +10,7 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_dir2.h"
@@ -17,12 +18,22 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/readdir.h"
+#include "scrub/tempfile.h"
+#include "scrub/repair.h"
/* Set us up to scrub parents. */
int
xchk_setup_parent(
struct xfs_scrub *sc)
{
+ int error;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_parent(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_inode_contents(sc, 0);
}
@@ -143,7 +154,8 @@ xchk_parent_validate(
}
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
- if (dp == sc->ip || dp == sc->tempip || !S_ISDIR(VFS_I(dp)->i_mode)) {
+ if (dp == sc->ip || xrep_is_tempfile(dp) ||
+ !S_ISDIR(VFS_I(dp)->i_mode)) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
goto out_rele;
}
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
new file mode 100644
index 000000000000..826926c2bb0d
--- /dev/null
+++ b/fs/xfs/scrub/parent_repair.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <[email protected]>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_health.h"
+#include "xfs_exchmaps.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/findparent.h"
+#include "scrub/readdir.h"
+
+/*
+ * Repairing The Directory Parent Pointer
+ * ======================================
+ *
+ * Currently, only directories support parent pointers (in the form of '..'
+ * entries), so we simply scan the filesystem and update the '..' entry.
+ *
+ * Note that because the only parent pointer is the dotdot entry, we won't
+ * touch an unhealthy directory, since the directory repair code is perfectly
+ * capable of rebuilding a directory with the proper parent inode.
+ *
+ * See the section on locking issues in dir_repair.c for more information about
+ * conflicts with the VFS. The findparent code will keep our incore parent
+ * inode up to date.
+ */
+
+struct xrep_parent {
+ struct xfs_scrub *sc; /* scrub context; set by xrep_setup_parent */
+
+ /*
+ * Information used to scan the filesystem to find the inumber of the
+ * dotdot entry for this directory. The result of the scan ends up in
+ * pscan.parent_ino.
+ */
+ struct xrep_parent_scan_info pscan;
+};
+
+/*
+ * Tear down all the incore stuff we created, which currently is only the
+ * parent scan.
+ */
+static void
+xrep_parent_teardown(
+ struct xrep_parent *rp)
+{
+ xrep_findparent_scan_teardown(&rp->pscan);
+}
+
+/*
+ * Set up for a parent repair: enable the dirent hooks and allocate the
+ * zeroed repair context, which is attached to the scrub context as sc->buf.
+ * Returns -ENOMEM if the allocation fails.
+ */
+int
+xrep_setup_parent(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_parent	*rp;
+
+	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+	rp = kvzalloc(sizeof(*rp), XCHK_GFP_FLAGS);
+	if (!rp)
+		return -ENOMEM;
+
+	sc->buf = rp;
+	rp->sc = sc;
+	return 0;
+}
+
+/*
+ * Scan all files in the filesystem for a child dirent that we can turn into
+ * the dotdot entry for this directory.
+ *
+ * Three sources are tried in order: the self-reference shortcut, the dentry
+ * cache (after confirmation), and finally a scan of the whole filesystem.
+ * The result is left in rp->pscan.parent_ino. Returns -EFSCORRUPTED if the
+ * directory is marked sick, otherwise 0 or the error from the full scan.
+ */
+STATIC int
+xrep_parent_find_dotdot(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t ino;
+ unsigned int sick, checked;
+ int error;
+
+ /*
+ * Avoid sick directories. There shouldn't be anyone else clearing the
+ * directory's sick status.
+ */
+ xfs_inode_measure_sickness(sc->ip, &sick, &checked);
+ if (sick & XFS_SICK_INO_DIR)
+ return -EFSCORRUPTED;
+
+ ino = xrep_findparent_self_reference(sc);
+ if (ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rp->pscan, ino);
+ return 0;
+ }
+
+ /*
+ * Drop the ILOCK on this directory so that we can scan for the dotdot
+ * entry. Figure out who is going to be the parent of this directory,
+ * then retake the ILOCK so that the caller can read the scan result
+ * and reset the dotdot entry.
+ */
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ /*
+ * Does the VFS dcache have an answer for us? If the candidate cannot
+ * be confirmed, or confirming it fails, fall through to the full scan,
+ * whose return value then replaces @error.
+ */
+ ino = xrep_findparent_from_dcache(sc);
+ if (ino != NULLFSINO) {
+ error = xrep_findparent_confirm(sc, &ino);
+ if (!error && ino != NULLFSINO) {
+ xrep_findparent_scan_finish_early(&rp->pscan, ino);
+ goto out_relock;
+ }
+ }
+
+ /* Scan the entire filesystem for a parent. */
+ error = xrep_findparent_scan(&rp->pscan);
+out_relock:
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+
+ return error;
+}
+
+/*
+ * Reset a directory's dotdot entry, if needed. Nothing is modified if the
+ * lookup of the current dotdot entry fails or if it already matches the
+ * parent that the scan found. The caller must hold ILOCK_EXCL on sc->ip.
+ *
+ * Returns 0 or a negative errno.
+ */
+STATIC int
+xrep_parent_reset_dotdot(
+ struct xrep_parent *rp)
+{
+ struct xfs_scrub *sc = rp->sc;
+ xfs_ino_t ino;
+ unsigned int spaceres;
+ int error = 0;
+
+ ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
+
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &ino);
+ if (error || ino == rp->pscan.parent_ino)
+ return error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ trace_xrep_parent_reset_dotdot(sc->ip, rp->pscan.parent_ino);
+
+ /*
+ * Reserve more space just in case we have to expand the dir. We're
+ * allowed to exceed quota to repair inconsistent metadata.
+ */
+ spaceres = XFS_RENAME_SPACE_RES(sc->mp, xfs_name_dotdot.len);
+ error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, spaceres, 0,
+ true);
+ if (error)
+ return error;
+
+ error = xfs_dir_replace(sc->tp, sc->ip, &xfs_name_dotdot,
+ rp->pscan.parent_ino, spaceres);
+ if (error)
+ return error;
+
+ /*
+ * Roll transaction to detach the inode from the transaction but retain
+ * ILOCK_EXCL.
+ */
+ return xfs_trans_roll(&sc->tp);
+}
+
+/*
+ * Write the new parent pointer information (for now that is only the dotdot
+ * entry) into the file being repaired.  Returns -EFSCORRUPTED if the scan
+ * did not find a parent, because orphaned directories cannot be fixed yet.
+ */
+STATIC int
+xrep_parent_rebuild_tree(
+	struct xrep_parent	*rp)
+{
+	if (rp->pscan.parent_ino != NULLFSINO)
+		return xrep_parent_reset_dotdot(rp);
+
+	/* Cannot fix orphaned directories yet. */
+	return -EFSCORRUPTED;
+}
+
+/*
+ * Start the filesystem scan that will be used to look for parents of the
+ * directory being repaired.
+ */
+STATIC int
+xrep_parent_setup_scan(
+	struct xrep_parent	*rp)
+{
+	return xrep_findparent_scan_start(rp->sc, &rp->pscan);
+}
+
+/*
+ * Repair the parent pointer (dotdot entry) of a directory: set up the scan,
+ * find the parent, then reset the dotdot entry.  The repair context is always
+ * torn down once setup has succeeded.
+ */
+int
+xrep_parent(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_parent	*rp = sc->buf;
+	int			error;
+
+	error = xrep_parent_setup_scan(rp);
+	if (error)
+		return error;
+
+	error = xrep_parent_find_dotdot(rp);
+
+	/*
+	 * The termination check is the last chance to abort before we start
+	 * committing fixes; it is only consulted if the parent search
+	 * succeeded.
+	 */
+	if (!error && !xchk_should_terminate(sc, &error))
+		error = xrep_parent_rebuild_tree(rp);
+
+	xrep_parent_teardown(rp);
+	return error;
+}
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
index e94080469315..028690761c62 100644
--- a/fs/xfs/scrub/readdir.c
+++ b/fs/xfs/scrub/readdir.c
@@ -333,6 +333,13 @@ xchk_dir_lookup(
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
+ /*
+ * A temporary directory's block headers are written with the owner
+ * set to sc->ip, so we must switch the owner here for the lookup.
+ */
+ if (dp == sc->tempip)
+ args.owner = sc->ip->i_ino;
+
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 04aec0e9e4c3..369f0430e4ba 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -35,6 +35,7 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
+#include "xfs_dir2.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 9cbfd8da5620..e53374fa5430 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -91,6 +91,8 @@ int xrep_metadata_inode_forks(struct xfs_scrub *sc);
int xrep_setup_ag_rmapbt(struct xfs_scrub *sc);
int xrep_setup_ag_refcountbt(struct xfs_scrub *sc);
int xrep_setup_xattr(struct xfs_scrub *sc);
+int xrep_setup_directory(struct xfs_scrub *sc);
+int xrep_setup_parent(struct xfs_scrub *sc);
/* Repair setup functions */
int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -125,6 +127,8 @@ int xrep_bmap_cow(struct xfs_scrub *sc);
int xrep_nlinks(struct xfs_scrub *sc);
int xrep_fscounters(struct xfs_scrub *sc);
int xrep_xattr(struct xfs_scrub *sc);
+int xrep_directory(struct xfs_scrub *sc);
+int xrep_parent(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xrep_rtbitmap(struct xfs_scrub *sc);
@@ -195,6 +199,8 @@ xrep_setup_nothing(
#define xrep_setup_ag_rmapbt xrep_setup_nothing
#define xrep_setup_ag_refcountbt xrep_setup_nothing
#define xrep_setup_xattr xrep_setup_nothing
+#define xrep_setup_directory xrep_setup_nothing
+#define xrep_setup_parent xrep_setup_nothing
#define xrep_setup_inode(sc, imap) ((void)0)
@@ -221,6 +227,8 @@ xrep_setup_nothing(
#define xrep_fscounters xrep_notsupported
#define xrep_rtsummary xrep_notsupported
#define xrep_xattr xrep_notsupported
+#define xrep_directory xrep_notsupported
+#define xrep_parent xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 547189a14b6b..520d83db193c 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -325,7 +325,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_INODE,
.setup = xchk_setup_directory,
.scrub = xchk_directory,
- .repair = xrep_notsupported,
+ .repair = xrep_directory,
},
[XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */
.type = ST_INODE,
@@ -343,7 +343,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_INODE,
.setup = xchk_setup_parent,
.scrub = xchk_parent,
- .repair = xrep_notsupported,
+ .repair = xrep_parent,
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_FS,
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index 0b3060be938f..4ca86a6a5be1 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -841,3 +841,16 @@ xrep_tempfile_copyout_local(
ilog_flags |= xfs_ilog_fdata(whichfork);
xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags);
}
+
+/*
+ * Report whether a given XFS inode is a temporary file used by a repair.
+ * The test is made on the embedded VFS inode: it must be marked private and
+ * must not have IOP_XATTR set in i_opflags.
+ */
+bool
+xrep_is_tempfile(
+	const struct xfs_inode	*ip)
+{
+	const struct inode	*vfs_inode = &ip->i_vnode;
+
+	return IS_PRIVATE(vfs_inode) && !(vfs_inode->i_opflags & IOP_XATTR);
+}
diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h
index d57e4f145a7c..e51399f595fe 100644
--- a/fs/xfs/scrub/tempfile.h
+++ b/fs/xfs/scrub/tempfile.h
@@ -35,11 +35,13 @@ int xrep_tempfile_set_isize(struct xfs_scrub *sc, unsigned long long isize);
int xrep_tempfile_roll_trans(struct xfs_scrub *sc);
void xrep_tempfile_copyout_local(struct xfs_scrub *sc, int whichfork);
+bool xrep_is_tempfile(const struct xfs_inode *ip);
#else
static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc)
{
xchk_ilock(sc, XFS_IOLOCK_EXCL);
}
+# define xrep_is_tempfile(ip) (false)
# define xrep_tempfile_rele(sc)
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index ffaff7722bf2..d68ec8e2781e 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -2500,6 +2500,121 @@ DEFINE_EVENT(xrep_xattr_class, name, \
DEFINE_XREP_XATTR_EVENT(xrep_xattr_rebuild_tree);
DEFINE_XREP_XATTR_EVENT(xrep_xattr_reset_fork);
+/*
+ * Tracepoint that records a directory inode number, a directory block
+ * offset, and the two magic numbers (actual and guessed) supplied by the
+ * caller.
+ */
+TRACE_EVENT(xrep_dir_recover_dirblock,
+	TP_PROTO(struct xfs_inode *dp, xfs_dablk_t dabno, uint32_t magic,
+		 uint32_t magic_guess),
+	TP_ARGS(dp, dabno, magic, magic_guess),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir_ino)
+		__field(xfs_dablk_t, dabno)
+		__field(uint32_t, magic)
+		__field(uint32_t, magic_guess)
+	),
+	TP_fast_assign(
+		__entry->dev = dp->i_mount->m_super->s_dev;
+		__entry->dir_ino = dp->i_ino;
+		__entry->dabno = dabno;
+		__entry->magic = magic;
+		__entry->magic_guess = magic_guess;
+	),
+	TP_printk("dev %d:%d dir 0x%llx dablk 0x%x magic 0x%x magic_guess 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dir_ino, __entry->dabno,
+		  __entry->magic, __entry->magic_guess)
+);
+
+/*
+ * Event class that records a directory inode number together with a parent
+ * inode number.  DEFINE_XREP_DIR_EVENT() instantiates one named event of
+ * this class.
+ */
+DECLARE_EVENT_CLASS(xrep_dir_class,
+	TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino),
+	TP_ARGS(dp, parent_ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir_ino)
+		__field(xfs_ino_t, parent_ino)
+	),
+	TP_fast_assign(
+		__entry->dev = dp->i_mount->m_super->s_dev;
+		__entry->dir_ino = dp->i_ino;
+		__entry->parent_ino = parent_ino;
+	),
+	TP_printk("dev %d:%d dir 0x%llx parent 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dir_ino, __entry->parent_ino)
+)
+#define DEFINE_XREP_DIR_EVENT(name) \
+DEFINE_EVENT(xrep_dir_class, name, \
+	TP_PROTO(struct xfs_inode *dp, xfs_ino_t parent_ino), \
+	TP_ARGS(dp, parent_ino))
+DEFINE_XREP_DIR_EVENT(xrep_dir_rebuild_tree);
+DEFINE_XREP_DIR_EVENT(xrep_dir_reset_fork);
+DEFINE_XREP_DIR_EVENT(xrep_parent_reset_dotdot);
+
+/*
+ * Event class that records one directory entry: the directory inode number,
+ * the entry's name bytes and length, its file type, and the inode number
+ * passed in by the caller.  The name is copied without a terminator, so the
+ * printk format bounds it with the recorded length.
+ */
+DECLARE_EVENT_CLASS(xrep_dirent_class,
+	TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name,
+		 xfs_ino_t ino),
+	TP_ARGS(dp, name, ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir_ino)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, name->len)
+		__field(xfs_ino_t, ino)
+		__field(uint8_t, ftype)
+	),
+	TP_fast_assign(
+		__entry->dev = dp->i_mount->m_super->s_dev;
+		__entry->dir_ino = dp->i_ino;
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+		__entry->ino = ino;
+		__entry->ftype = name->type;
+	),
+	TP_printk("dev %d:%d dir 0x%llx ftype %s name '%.*s' ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dir_ino,
+		  __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+		  __entry->namelen, __get_str(name),
+		  __entry->ino)
+)
+#define DEFINE_XREP_DIRENT_EVENT(name) \
+DEFINE_EVENT(xrep_dirent_class, name, \
+	TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name, \
+		 xfs_ino_t ino), \
+	TP_ARGS(dp, name, ino))
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_salvage_entry);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_stash_createname);
+DEFINE_XREP_DIRENT_EVENT(xrep_dir_replay_createname);
+
+/*
+ * Event class that records a directory inode number and one other inode
+ * number, which the format string labels "parent".
+ * DEFINE_XREP_PARENT_SALVAGE_EVENT() instantiates one named event of this
+ * class.
+ */
+DECLARE_EVENT_CLASS(xrep_parent_salvage_class,
+	TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino),
+	TP_ARGS(dp, ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir_ino)
+		__field(xfs_ino_t, ino)
+	),
+	TP_fast_assign(
+		__entry->dev = dp->i_mount->m_super->s_dev;
+		__entry->dir_ino = dp->i_ino;
+		__entry->ino = ino;
+	),
+	TP_printk("dev %d:%d dir 0x%llx parent 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dir_ino, __entry->ino)
+)
+#define DEFINE_XREP_PARENT_SALVAGE_EVENT(name) \
+DEFINE_EVENT(xrep_parent_salvage_class, name, \
+	TP_PROTO(struct xfs_inode *dp, xfs_ino_t ino), \
+	TP_ARGS(dp, ino))
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_dir_salvaged_parent);
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_dirent);
+DEFINE_XREP_PARENT_SALVAGE_EVENT(xrep_findparent_from_dcache);
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
diff --git a/fs/xfs/scrub/xfblob.h b/fs/xfs/scrub/xfblob.h
index 78a67a06408f..ae78322613ca 100644
--- a/fs/xfs/scrub/xfblob.h
+++ b/fs/xfs/scrub/xfblob.h
@@ -23,4 +23,28 @@ int xfblob_free(struct xfblob *blob, xfblob_cookie cookie);
unsigned long long xfblob_bytes(struct xfblob *blob);
void xfblob_truncate(struct xfblob *blob);
+/*
+ * Store the bytes of a name in a blob.  Exactly xname->len bytes starting at
+ * xname->name are handed to xfblob_store(); @cookie is passed through to
+ * that function unchanged.  Returns whatever xfblob_store() returns.
+ */
+static inline int
+xfblob_storename(
+	struct xfblob		*blob,
+	xfblob_cookie		*cookie,
+	const struct xfs_name	*xname)
+{
+	return xfblob_store(blob, cookie, xname->name, xname->len);
+}
+
+/*
+ * Load @size bytes from a blob into the buffer that xname->name points to.
+ * On success xname->len is set to @size and 0 is returned; on failure the
+ * error from xfblob_load() is returned and xname->len is left untouched.
+ */
+static inline int
+xfblob_loadname(
+	struct xfblob		*blob,
+	xfblob_cookie		cookie,
+	struct xfs_name		*xname,
+	uint32_t		size)
+{
+	int			error;
+
+	/* The name pointer is const-qualified, so cast it for the load. */
+	error = xfblob_load(blob, cookie, (void *)xname->name, size);
+	if (!error)
+		xname->len = size;
+
+	return error;
+}
+
#endif /* __XFS_SCRUB_XFBLOB_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b24c0e23d37d..09d643a9e997 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -16,6 +16,7 @@
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
+#include "xfs_bit.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
@@ -1552,6 +1553,51 @@ out_unlock:
}
/*
+ * Stale every buffer attached to this directory.  A directory being freed
+ * should, in theory, never have any blocks left, but this handles the case
+ * where log recovery has replayed a directory swap with a "temporary"
+ * directory created by online repair and that directory must now be dumped.
+ */
+STATIC void
+xfs_inactive_dir(
+	struct xfs_inode	*dp)
+{
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	got;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_da_geometry	*geo = mp->m_dir_geo;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
+	xfs_fileoff_t		off;
+
+	/*
+	 * Directory blocks all have fsbcount length and alignment, so within
+	 * each data fork mapping we only need to visit offsets that are
+	 * multiples of fsbcount.  We hold the only reference to this inode,
+	 * so we must wait for the buffer locks.
+	 */
+	for_each_xfs_iext(ifp, &icur, &got) {
+		xfs_fileoff_t	end = got.br_startoff + got.br_blockcount;
+
+		off = round_up(got.br_startoff, geo->fsbcount);
+		while (off < end) {
+			struct xfs_buf	*bp = NULL;
+			xfs_fsblock_t	fsbno;
+
+			/* Translate the file offset into a disk block. */
+			fsbno = got.br_startblock + (off - got.br_startoff);
+
+			/* Offsets where the lookup fails are skipped. */
+			if (!xfs_buf_incore(mp->m_ddev_targp,
+					XFS_FSB_TO_DADDR(mp, fsbno),
+					XFS_FSB_TO_BB(mp, geo->fsbcount),
+					XBF_LIVESCAN, &bp)) {
+				xfs_buf_stale(bp);
+				xfs_buf_relse(bp);
+			}
+
+			off += geo->fsbcount;
+		}
+	}
+}
+
+/*
* xfs_inactive_truncate
*
* Called to perform a truncate when an inode becomes unlinked.
@@ -1861,6 +1907,11 @@ xfs_inactive(
goto out;
}
+ if (S_ISDIR(VFS_I(ip)->i_mode) && ip->i_df.if_nextents > 0) {
+ xfs_inactive_dir(ip);
+ truncate = 1;
+ }
+
if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
else if (truncate)