 fs/xfs/Kconfig                |   5
 fs/xfs/Makefile               |   2
 fs/xfs/libxfs/xfs_da_format.h |  11
 fs/xfs/libxfs/xfs_dir2.c      |   6
 fs/xfs/libxfs/xfs_dir2.h      |  13
 fs/xfs/scrub/dir.c            |   4
 fs/xfs/scrub/inode_repair.c   | 236
 fs/xfs/scrub/iscan.c          | 767
 fs/xfs/scrub/iscan.h          |  84
 fs/xfs/scrub/trace.c          |   2
 fs/xfs/scrub/trace.h          | 194
 fs/xfs/xfs_hooks.c            |  52
 fs/xfs/xfs_hooks.h            |  65
 fs/xfs/xfs_iwalk.c            |  13
 fs/xfs/xfs_linux.h            |   1
 15 files changed, 1436 insertions, 19 deletions
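The new inode scan API added below (fs/xfs/scrub/iscan.c and iscan.h) is meant to be driven in a retry-aware loop; xrep_dinode_find_mode in the inode_repair.c hunk below is its first user. A minimal sketch of that pattern, assuming invented xchk_example_* names:

/*
 * Sketch only: drive a live inode scan with the iscan API added below.
 * xchk_example_scan and xchk_example_visit are invented names; the loop
 * structure mirrors xrep_dinode_find_mode.
 */
STATIC int
xchk_example_visit(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	/* A real scanner would record observations about @ip here. */
	return 0;
}

STATIC int
xchk_example_scan(
	struct xfs_scrub	*sc)
{
	struct xchk_iscan	iscan;
	struct xfs_inode	*ip;
	int			error;

	/* Retry each iget for up to 5000ms, napping 100ms per retry. */
	xchk_iscan_start(sc, 5000, 100, &iscan);

	while ((error = xchk_iscan_iter(&iscan, &ip)) == 1) {
		error = xchk_example_visit(sc, ip);
		xchk_iscan_mark_visited(&iscan, ip);
		xchk_irele(sc, ip);
		if (error)
			break;
		if (xchk_should_terminate(sc, &error))
			break;
	}

	xchk_iscan_iter_finish(&iscan);
	xchk_iscan_teardown(&iscan);
	return error;
}

Hook-side code would pair this with xchk_iscan_want_live_update to decide whether a concurrent metadata update must also be applied to the scan's observations.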
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 567fb37274d3..fa7eb3e2a248 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -124,11 +124,16 @@ config XFS_DRAIN_INTENTS bool select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL +config XFS_LIVE_HOOKS + bool + select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL + config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n depends on XFS_FS depends on TMPFS && SHMEM + select XFS_LIVE_HOOKS select XFS_DRAIN_INTENTS help If you say Y here you will be able to check metadata on a diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 35a23427055b..4597c0f19e8e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -136,6 +136,7 @@ xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o endif xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o +xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) @@ -158,6 +159,7 @@ xfs-y += $(addprefix scrub/, \ health.o \ ialloc.o \ inode.o \ + iscan.o \ parent.o \ readdir.o \ refcount.o \ diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 24f9d1461f9a..060e5c96b70f 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -159,6 +159,17 @@ struct xfs_da3_intnode { #define XFS_DIR3_FT_MAX 9 +#define XFS_DIR3_FTYPE_STR \ + { XFS_DIR3_FT_UNKNOWN, "unknown" }, \ + { XFS_DIR3_FT_REG_FILE, "file" }, \ + { XFS_DIR3_FT_DIR, "directory" }, \ + { XFS_DIR3_FT_CHRDEV, "char" }, \ + { XFS_DIR3_FT_BLKDEV, "block" }, \ + { XFS_DIR3_FT_FIFO, "fifo" }, \ + { XFS_DIR3_FT_SOCK, "sock" }, \ + { XFS_DIR3_FT_SYMLINK, "symlink" }, \ + { XFS_DIR3_FT_WHT, "whiteout" } + /* * Byte offset in data block and shortform entry. */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 8c9403b33191..9018abb38d7f 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -25,6 +25,12 @@ const struct xfs_name xfs_name_dotdot = { .type = XFS_DIR3_FT_DIR, }; +const struct xfs_name xfs_name_dot = { + .name = (const unsigned char *)".", + .len = 1, + .type = XFS_DIR3_FT_DIR, +}; + /* * Convert inode mode to directory entry filetype */ diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 19af22a16c41..8497d041f316 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -22,6 +22,19 @@ struct xfs_dir3_icfree_hdr; struct xfs_dir3_icleaf_hdr; extern const struct xfs_name xfs_name_dotdot; +extern const struct xfs_name xfs_name_dot; + +static inline bool +xfs_dir2_samename( + const struct xfs_name *n1, + const struct xfs_name *n2) +{ + if (n1 == n2) + return true; + if (n1->len != n2->len) + return false; + return !memcmp(n1->name, n2->name, n1->len); +} /* * Convert inode mode to directory entry filetype diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index d86ab51af928..076a310b8eb0 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -93,11 +93,11 @@ xchk_dir_actor( return -ECANCELED; } - if (!strncmp(".", name->name, name->len)) { + if (xfs_dir2_samename(name, &xfs_name_dot)) { /* If this is "." then check that the inum matches the dir. */ if (ino != dp->i_ino) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - } else if (!strncmp("..", name->name, name->len)) { + } else if (xfs_dir2_samename(name, &xfs_name_dotdot)) { /* * If this is ".." in the root inode, check that the inum * matches this dir. 
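The scrub/dir.c hunk above swaps the open-coded strncmp checks for the new xfs_dir2_samename helper, which compares lengths before bytes and so matches non-NUL-terminated directory entry names exactly. A minimal sketch of the helper's semantics, with the ex_entry_is_dot name invented for illustration:

/*
 * Sketch only: because xfs_dir2_samename checks the length first, a
 * ".." entry can never match ".", and no NUL terminator is required.
 */
static bool
ex_entry_is_dot(
	const unsigned char	*name,
	int			len)
{
	struct xfs_name		n = {
		.name	= name,
		.len	= len,
	};

	return xfs_dir2_samename(&n, &xfs_name_dot);
}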
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 0ca62d59f84a..7e859c412a5b 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -43,6 +43,8 @@ #include "scrub/btree.h" #include "scrub/trace.h" #include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/readdir.h" /* * Inode Record Repair @@ -126,6 +128,10 @@ struct xrep_inode { /* Must we remove all access from this file? */ bool zap_acls; + + /* Inode scanner to see if we can find the ftype from dirents */ + struct xchk_iscan ftype_iscan; + uint8_t alleged_ftype; }; /* @@ -227,26 +233,233 @@ xrep_dinode_header( dip->di_gen = cpu_to_be32(sc->sm->sm_gen); } -/* Turn di_mode into /something/ recognizable. */ -STATIC void +/* + * If this directory entry points to the scrub target inode, then the directory + * we're scanning is the parent of the scrub target inode. + */ +STATIC int +xrep_dinode_findmode_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + if (ino != sc->sm->sm_ino) + return 0; + + /* Ignore garbage directory entry names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + /* + * Uh-oh, more than one parent for this inode and they don't agree on + * the file type? + */ + if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN && + ri->alleged_ftype != name->type) { + trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type, + ri->alleged_ftype); + return -EFSCORRUPTED; + } + + /* We found a potential parent; remember the ftype. */ + trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type); + ri->alleged_ftype = name->type; + return 0; +} + +/* + * If this is a directory, walk the dirents looking for any that point to the + * scrub target inode. + */ +STATIC int +xrep_dinode_findmode_walk_directory( + struct xrep_inode *ri, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = ri->sc; + unsigned int lock_mode; + int error = 0; + + /* + * Scan the directory to see if it contains an entry pointing to the + * inode that we are repairing. + */ + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * If this directory is known to be sick, we cannot scan it reliably + * and must abort. + */ + if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | + XFS_SICK_INO_BMBTD | + XFS_SICK_INO_DIR)) { + error = -EFSCORRUPTED; + goto out_unlock; + } + + /* + * We cannot complete our parent pointer scan if a directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); + +out_unlock: + xfs_iunlock(dp, lock_mode); + return error; +} + +/* + * Try to find the mode of the inode being repaired by looking for directories + * that point down to this file. + */ +STATIC int +xrep_dinode_find_mode( + struct xrep_inode *ri, + uint16_t *mode) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_inode *dp; + int error; + + /* No ftype means we have no other metadata to consult. 
*/ + if (!xfs_has_ftype(sc->mp)) { + *mode = S_IFREG; + return 0; + } + + /* + * Scan all directories for parents that might point down to this + * inode. Skip the inode being repaired during the scan since it + * cannot be its own parent. Note that we still hold the AGI locked + * so there's a real possibility that _iscan_iter can return EBUSY. + */ + xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); + ri->ftype_iscan.skip_ino = sc->sm->sm_ino; + ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; + while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { + if (S_ISDIR(VFS_I(dp)->i_mode)) + error = xrep_dinode_findmode_walk_directory(ri, dp); + xchk_iscan_mark_visited(&ri->ftype_iscan, dp); + xchk_irele(sc, dp); + if (error < 0) + break; + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&ri->ftype_iscan); + xchk_iscan_teardown(&ri->ftype_iscan); + + if (error == -EBUSY) { + if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { + /* + * If we got an EBUSY after finding at least one + * dirent, that means the scan found an inode on the + * inactivation list and could not open it. Accept the + * alleged ftype and install a new mode below. + */ + error = 0; + } else if (!(sc->flags & XCHK_TRY_HARDER)) { + /* + * Otherwise, retry the operation one time to see if + * the reason for the delay is an inode from the same + * cluster buffer waiting on the inactivation list. + */ + error = -EDEADLOCK; + } + } + if (error) + return error; + + /* + * Convert the discovered ftype into the file mode. If all else fails, + * return S_IFREG. + */ + switch (ri->alleged_ftype) { + case XFS_DIR3_FT_DIR: + *mode = S_IFDIR; + break; + case XFS_DIR3_FT_WHT: + case XFS_DIR3_FT_CHRDEV: + *mode = S_IFCHR; + break; + case XFS_DIR3_FT_BLKDEV: + *mode = S_IFBLK; + break; + case XFS_DIR3_FT_FIFO: + *mode = S_IFIFO; + break; + case XFS_DIR3_FT_SOCK: + *mode = S_IFSOCK; + break; + case XFS_DIR3_FT_SYMLINK: + *mode = S_IFLNK; + break; + default: + *mode = S_IFREG; + break; + } + return 0; +} + +/* Turn di_mode into /something/ recognizable. Returns 0 if we succeed. */ +STATIC int xrep_dinode_mode( struct xrep_inode *ri, struct xfs_dinode *dip) { struct xfs_scrub *sc = ri->sc; uint16_t mode = be16_to_cpu(dip->di_mode); + int error; trace_xrep_dinode_mode(sc, dip); if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) - return; + return 0; + + /* Try to fix the mode. If we cannot, then leave everything alone. */ + error = xrep_dinode_find_mode(ri, &mode); + switch (error) { + case -EINTR: + case -EBUSY: + case -EDEADLOCK: + /* temporary failure or fatal signal */ + return error; + case 0: + /* found mode */ + break; + default: + /* some other error, assume S_IFREG */ + mode = S_IFREG; + break; + } /* bad mode, so we set it to a file that only root can read */ - mode = S_IFREG; dip->di_mode = cpu_to_be16(mode); dip->di_uid = 0; dip->di_gid = 0; ri->zap_acls = true; + return 0; } /* Fix any conflicting flags that the verifiers complain about. */ @@ -1107,12 +1320,15 @@ xrep_dinode_core( /* Fix everything the verifier will complain about. */ dip = xfs_buf_offset(bp, ri->imap.im_boffset); xrep_dinode_header(sc, dip); - xrep_dinode_mode(ri, dip); + iget_error = xrep_dinode_mode(ri, dip); + if (iget_error) + goto write; xrep_dinode_flags(sc, dip, ri->rt_extents > 0); xrep_dinode_size(ri, dip); xrep_dinode_extsize_hints(sc, dip); xrep_dinode_zap_forks(ri, dip); +write: /* Write out the inode. 
*/ trace_xrep_dinode_fixed(sc, dip); xfs_dinode_calc_crc(sc->mp, dip); @@ -1128,7 +1344,8 @@ xrep_dinode_core( * accessing the inode. If iget fails, we still need to commit the * changes. */ - iget_error = xchk_iget(sc, ino, &sc->ip); + if (!iget_error) + iget_error = xchk_iget(sc, ino, &sc->ip); if (!iget_error) xchk_ilock(sc, XFS_IOLOCK_EXCL); @@ -1496,6 +1713,13 @@ xrep_inode( ASSERT(ri != NULL); error = xrep_dinode_problems(ri); + if (error == -EBUSY) { + /* + * Directory scan to recover inode mode encountered a + * busy inode, so we did not continue repairing things. + */ + return 0; + } if (error) return error; diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c new file mode 100644 index 000000000000..17af89b519b3 --- /dev/null +++ b/fs/xfs/scrub/iscan.c @@ -0,0 +1,767 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <[email protected]> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_bit.h" +#include "xfs_icache.h" +#include "scrub/scrub.h" +#include "scrub/iscan.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * Live File Scan + * ============== + * + * Live file scans walk every inode in a live filesystem. This is more or + * less like a regular iwalk, except that when we're advancing the scan cursor, + * we must ensure that inodes cannot be added or deleted anywhere between the + * old cursor value and the new cursor value. If we're advancing the cursor + * by one inode, the caller must hold that inode; if we're finding the next + * inode to scan, we must grab the AGI and hold it until we've updated the + * scan cursor. + * + * Callers are expected to use this code to scan all files in the filesystem to + * construct a new metadata index of some kind. The scan races against other + * live updates, which means there must be a provision to update the new index + * when updates are made to inodes that have already been scanned. The iscan + * lock can be used in live update hook code to stop the scan and protect this + * data structure. + * + * To keep the new index up to date with other metadata updates being made to + * the live filesystem, it is assumed that the caller will add hooks as needed + * to be notified when a metadata update occurs. The inode scanner must tell + * the hook code when an inode has been visited with xchk_iscan_mark_visited. + * Hook functions can use xchk_iscan_want_live_update to decide if the + * scanner's observations must be updated. + */ + +/* + * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so + * that the scan ignores that inode. 
+ */ +STATIC void +xchk_iscan_mask_skipino( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_inobt_rec_incore *rec, + xfs_agino_t lastrecino) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino); + xfs_agino_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino); + + if (pag->pag_agno != skip_agno) + return; + if (skip_agino < rec->ir_startino) + return; + if (skip_agino > lastrecino) + return; + + rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1); +} + +/* + * Set *cursor to the next allocated inode after whatever it's set to now. + * If there are no more inodes in this AG, cursor is set to NULLAGINO. + */ +STATIC int +xchk_iscan_find_next( + struct xchk_iscan *iscan, + struct xfs_buf *agi_bp, + struct xfs_perag *pag, + xfs_inofree_t *allocmaskp, + xfs_agino_t *cursor, + uint8_t *nr_inodesp) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + xfs_agnumber_t agno = pag->pag_agno; + xfs_agino_t lastino = NULLAGINO; + xfs_agino_t first, last; + xfs_agino_t agino = *cursor; + int has_rec; + int error; + + /* If the cursor is beyond the end of this AG, move to the next one. */ + xfs_agino_range(mp, agno, &first, &last); + if (agino > last) { + *cursor = NULLAGINO; + return 0; + } + + /* + * Look up the inode chunk for the current cursor position. If there + * is no chunk here, we want the next one. + */ + cur = xfs_inobt_init_cursor(pag, tp, agi_bp, XFS_BTNUM_INO); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec); + if (!error && !has_rec) + error = xfs_btree_increment(cur, 0, &has_rec); + for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) { + xfs_inofree_t allocmask; + + /* + * If we've run out of inobt records in this AG, move the + * cursor on to the next AG and exit. The caller can try + * again with the next AG. + */ + if (!has_rec) { + *cursor = NULLAGINO; + break; + } + + error = xfs_inobt_get_rec(cur, &rec, &has_rec); + if (error) + break; + if (!has_rec) { + error = -EFSCORRUPTED; + break; + } + + /* Make sure that we always move forward. */ + if (lastino != NULLAGINO && + XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) { + error = -EFSCORRUPTED; + break; + } + lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1; + + /* + * If this record only covers inodes that come before the + * cursor, advance to the next record. + */ + if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + continue; + + if (iscan->skip_ino) + xchk_iscan_mask_skipino(iscan, pag, &rec, lastino); + + /* + * If the incoming lookup put us in the middle of an inobt + * record, mark it and the previous inodes "free" so that the + * search for allocated inodes will start at the cursor. + * We don't care about ir_freecount here. + */ + if (agino >= rec.ir_startino) + rec.ir_free |= xfs_inobt_maskn(0, + agino + 1 - rec.ir_startino); + + /* + * If there are allocated inodes in this chunk, find them + * and update the scan cursor. + */ + allocmask = ~rec.ir_free; + if (hweight64(allocmask) > 0) { + int next = xfs_lowbit64(allocmask); + + ASSERT(next >= 0); + *cursor = rec.ir_startino + next; + *allocmaskp = allocmask >> next; + *nr_inodesp = XFS_INODES_PER_CHUNK - next; + break; + } + } + + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Advance both the scan and the visited cursors. 
+ * + * The inumber address space for a given filesystem is sparse, which means that + * the scan cursor can jump a long ways in a single iter() call. There are no + * inodes in these sparse areas, so we must move the visited cursor forward at + * the same time so that the scan user can receive live updates for inodes that + * may get created once we release the AGI buffer. + */ +static inline void +xchk_iscan_move_cursor( + struct xchk_iscan *iscan, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t cursor, visited; + + BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO); + + /* + * Special-case ino == 0 here so that we never set visited_ino to + * NULLFSINO when wrapping around EOFS, for that will let through all + * live updates. + */ + cursor = XFS_AGINO_TO_INO(mp, agno, agino); + if (cursor == 0) + visited = XFS_MAXINUMBER; + else + visited = cursor - 1; + + mutex_lock(&iscan->lock); + iscan->cursor_ino = cursor; + iscan->__visited_ino = visited; + trace_xchk_iscan_move_cursor(iscan); + mutex_unlock(&iscan->lock); +} + +/* + * Prepare to return agno/agino to the iscan caller by moving the lastino + * cursor to the previous inode. Do this while we still hold the AGI so that + * no other threads can create or delete inodes in this AG. + */ +static inline void +xchk_iscan_finish( + struct xchk_iscan *iscan) +{ + mutex_lock(&iscan->lock); + iscan->cursor_ino = NULLFSINO; + + /* All live updates will be applied from now on */ + iscan->__visited_ino = NULLFSINO; + + mutex_unlock(&iscan->lock); +} + +/* + * Advance ino to the next inode that the inobt thinks is allocated, being + * careful to jump to the next AG if we've reached the right end of this AG's + * inode btree. Advancing ino effectively means that we've pushed the inode + * scan forward, so set the iscan cursor to (ino - 1) so that our live update + * predicates will track inode allocations in that part of the inode number + * key space once we release the AGI buffer. + * + * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes, + * -ECANCELED if the live scan aborted, or the usual negative errno. + */ +STATIC int +xchk_iscan_advance( + struct xchk_iscan *iscan, + struct xfs_perag **pagp, + struct xfs_buf **agi_bpp, + xfs_inofree_t *allocmaskp, + uint8_t *nr_inodesp) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agi_bp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + xfs_agino_t agino; + int ret; + + ASSERT(iscan->cursor_ino >= iscan->__visited_ino); + + do { + if (xchk_iscan_aborted(iscan)) + return -ECANCELED; + + agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino); + pag = xfs_perag_get(mp, agno); + if (!pag) + return -ECANCELED; + + ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + if (ret) + goto out_pag; + + agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino); + ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp, + &agino, nr_inodesp); + if (ret) + goto out_buf; + + if (agino != NULLAGINO) { + /* + * Found the next inode in this AG, so return it along + * with the AGI buffer and the perag structure to + * ensure it cannot go away. + */ + xchk_iscan_move_cursor(iscan, agno, agino); + *agi_bpp = agi_bp; + *pagp = pag; + return 1; + } + + /* + * Did not find any more inodes in this AG, move on to the next + * AG. 
+ */ + agno = (agno + 1) % mp->m_sb.sb_agcount; + xchk_iscan_move_cursor(iscan, agno, 0); + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + trace_xchk_iscan_advance_ag(iscan); + } while (iscan->cursor_ino != iscan->scan_start_ino); + + xchk_iscan_finish(iscan); + return 0; + +out_buf: + xfs_trans_brelse(sc->tp, agi_bp); +out_pag: + xfs_perag_put(pag); + return ret; +} + +/* + * Grabbing the inode failed, so we need to back up the scan and ask the caller + * to try to _advance the scan again. Returns -EBUSY if we've run out of retry + * opportunities, -ECANCELED if the process has a fatal signal pending, or + * -EAGAIN if we should try again. + */ +STATIC int +xchk_iscan_iget_retry( + struct xchk_iscan *iscan, + bool wait) +{ + ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1); + + if (!iscan->iget_timeout || + time_is_before_jiffies(iscan->__iget_deadline)) + return -EBUSY; + + if (wait) { + unsigned long relax; + + /* + * Sleep for a period of time to let the rest of the system + * catch up. If we return early, someone sent a kill signal to + * the calling process. + */ + relax = msecs_to_jiffies(iscan->iget_retry_delay); + trace_xchk_iscan_iget_retry_wait(iscan); + + if (schedule_timeout_killable(relax) || + xchk_iscan_aborted(iscan)) + return -ECANCELED; + } + + iscan->cursor_ino--; + return -EAGAIN; +} + +/* + * Grab an inode as part of an inode scan. While scanning this inode, the + * caller must ensure that no other threads can modify the inode until + * xchk_iscan_mark_visited has been called. + * + * Returns the number of incore inodes grabbed; -EAGAIN if the caller should + * call xchk_iscan_advance again; -EBUSY if we couldn't grab an inode; + * -ECANCELED if there's a fatal signal pending; or some other negative errno. + */ +STATIC int +xchk_iscan_iget( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_buf *agi_bp, + xfs_inofree_t allocmask, + uint8_t nr_inodes) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t ino = iscan->cursor_ino; + unsigned int idx = 0; + unsigned int i; + int error; + + ASSERT(iscan->__inodes[0] == NULL); + + /* Fill the first slot in the inode array. */ + error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + &iscan->__inodes[idx]); + + trace_xchk_iscan_iget(iscan, error); + + if (error == -ENOENT || error == -EAGAIN) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + /* + * It's possible that this inode has lost all of its links but + * hasn't yet been inactivated. If we don't have a transaction + * or it's not writable, flush the inodegc workers and wait. + */ + xfs_inodegc_flush(mp); + return xchk_iscan_iget_retry(iscan, true); + } + + if (error == -EINVAL) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + /* + * We thought the inode was allocated, but the inode btree + * lookup failed, which means that it was freed since the last + * time we advanced the cursor. Back up and try again. This + * should never happen since we still hold the AGI buffer from + * the inobt check, but we need to be careful about infinite + * loops. + */ + return xchk_iscan_iget_retry(iscan, false); + } + + if (error) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + return error; + } + idx++; + ino++; + allocmask >>= 1; + + /* + * Now that we've filled the first slot in __inodes, try to fill the + * rest of the batch with consecutively ordered inodes to reduce the + * number of _iter calls. 
Make a bitmap of unallocated inodes from the + * zeroes in the inuse bitmap; these inodes will not be scanned, but + * the _want_live_update predicate will pass through all live updates. + * + * If we can't iget an allocated inode, stop and return what we have. + */ + mutex_lock(&iscan->lock); + iscan->__batch_ino = ino - 1; + iscan->__skipped_inomask = 0; + mutex_unlock(&iscan->lock); + + for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) { + if (!(allocmask & 1)) { + ASSERT(!(iscan->__skipped_inomask & (1ULL << i))); + + mutex_lock(&iscan->lock); + iscan->cursor_ino = ino; + iscan->__skipped_inomask |= (1ULL << i); + mutex_unlock(&iscan->lock); + continue; + } + + ASSERT(iscan->__inodes[idx] == NULL); + + error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + &iscan->__inodes[idx]); + if (error) + break; + + mutex_lock(&iscan->lock); + iscan->cursor_ino = ino; + mutex_unlock(&iscan->lock); + idx++; + } + + trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx); + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + return idx; +} + +/* + * Advance the visit cursor to reflect skipped inodes beyond whatever we + * scanned. + */ +STATIC void +xchk_iscan_finish_batch( + struct xchk_iscan *iscan) +{ + xfs_ino_t highest_skipped; + + mutex_lock(&iscan->lock); + + if (iscan->__batch_ino != NULLFSINO) { + highest_skipped = iscan->__batch_ino + + xfs_highbit64(iscan->__skipped_inomask); + iscan->__visited_ino = max(iscan->__visited_ino, + highest_skipped); + + trace_xchk_iscan_skip(iscan); + } + + iscan->__batch_ino = NULLFSINO; + iscan->__skipped_inomask = 0; + + mutex_unlock(&iscan->lock); +} + +/* + * Advance the inode scan cursor to the next allocated inode and return up to + * 64 consecutive allocated inodes starting with the cursor position. + */ +STATIC int +xchk_iscan_iter_batch( + struct xchk_iscan *iscan) +{ + struct xfs_scrub *sc = iscan->sc; + int ret; + + xchk_iscan_finish_batch(iscan); + + if (iscan->iget_timeout) + iscan->__iget_deadline = jiffies + + msecs_to_jiffies(iscan->iget_timeout); + + do { + struct xfs_buf *agi_bp = NULL; + struct xfs_perag *pag = NULL; + xfs_inofree_t allocmask = 0; + uint8_t nr_inodes = 0; + + ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask, + &nr_inodes); + if (ret != 1) + return ret; + + if (xchk_iscan_aborted(iscan)) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + ret = -ECANCELED; + break; + } + + ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * Advance the inode scan cursor to the next allocated inode and return the + * incore inode structure associated with it. + * + * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes, + * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be + * grabbed, or the usual negative errno. + * + * If the function returns -EBUSY and the caller can handle skipping an inode, + * it may call this function again to continue the scan with the next allocated + * inode. + */ +int +xchk_iscan_iter( + struct xchk_iscan *iscan, + struct xfs_inode **ipp) +{ + unsigned int i; + int error; + + /* Find a cached inode, or go get another batch. */ + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (iscan->__inodes[i]) + goto foundit; + } + + error = xchk_iscan_iter_batch(iscan); + if (error <= 0) + return error; + + ASSERT(iscan->__inodes[0] != NULL); + i = 0; + +foundit: + /* Give the caller our reference. 
*/ + *ipp = iscan->__inodes[i]; + iscan->__inodes[i] = NULL; + return 1; +} + +/* Clean up an xchk_iscan_iter call by dropping any inodes that we still hold. */ +void +xchk_iscan_iter_finish( + struct xchk_iscan *iscan) +{ + struct xfs_scrub *sc = iscan->sc; + unsigned int i; + + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (iscan->__inodes[i]) { + xchk_irele(sc, iscan->__inodes[i]); + iscan->__inodes[i] = NULL; + } + } +} + +/* Mark this inode scan finished and release resources. */ +void +xchk_iscan_teardown( + struct xchk_iscan *iscan) +{ + xchk_iscan_iter_finish(iscan); + xchk_iscan_finish(iscan); + mutex_destroy(&iscan->lock); +} + +/* Pick an AG from which to start a scan. */ +static inline xfs_ino_t +xchk_iscan_rotor( + struct xfs_mount *mp) +{ + static atomic_t agi_rotor; + unsigned int r = atomic_inc_return(&agi_rotor) - 1; + + /* + * Rotoring *backwards* through the AGs, so we add one here before + * subtracting from the agcount to arrive at an AG number. + */ + r = (r % mp->m_sb.sb_agcount) + 1; + + return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0); +} + +/* + * Set ourselves up to start an inode scan. If the @iget_timeout and + * @iget_retry_delay parameters are set, the scan will try to iget each inode + * for @iget_timeout milliseconds. If an iget call indicates that the inode is + * waiting to be inactivated, the CPU will relax for @iget_retry_delay + * milliseconds after pushing the inactivation workers. + */ +void +xchk_iscan_start( + struct xfs_scrub *sc, + unsigned int iget_timeout, + unsigned int iget_retry_delay, + struct xchk_iscan *iscan) +{ + xfs_ino_t start_ino; + + start_ino = xchk_iscan_rotor(sc->mp); + + iscan->__batch_ino = NULLFSINO; + iscan->__skipped_inomask = 0; + + iscan->sc = sc; + clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); + iscan->iget_timeout = iget_timeout; + iscan->iget_retry_delay = iget_retry_delay; + iscan->__visited_ino = start_ino; + iscan->cursor_ino = start_ino; + iscan->scan_start_ino = start_ino; + mutex_init(&iscan->lock); + memset(iscan->__inodes, 0, sizeof(iscan->__inodes)); + + trace_xchk_iscan_start(iscan, start_ino); +} + +/* + * Mark this inode as having been visited. Callers must hold a sufficiently + * exclusive lock on the inode to prevent concurrent modifications. + */ +void +xchk_iscan_mark_visited( + struct xchk_iscan *iscan, + struct xfs_inode *ip) +{ + mutex_lock(&iscan->lock); + iscan->__visited_ino = ip->i_ino; + trace_xchk_iscan_visit(iscan); + mutex_unlock(&iscan->lock); +} + +/* + * Did we skip this inode because it wasn't allocated when we loaded the batch? + * If so, it is newly allocated and will not be scanned. All live updates to + * this inode must be passed to the caller to maintain scan correctness. + */ +static inline bool +xchk_iscan_skipped( + const struct xchk_iscan *iscan, + xfs_ino_t ino) +{ + if (iscan->__batch_ino == NULLFSINO) + return false; + if (ino < iscan->__batch_ino) + return false; + if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK) + return false; + + return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino)); +} + +/* + * Do we need a live update for this inode? This is true if the scanner thread + * has visited this inode and the scan hasn't been aborted due to errors. + * Callers must hold a sufficiently exclusive lock on the inode to prevent + * scanners from reading any inode metadata. 
+ */ +bool +xchk_iscan_want_live_update( + struct xchk_iscan *iscan, + xfs_ino_t ino) +{ + bool ret = false; + + if (xchk_iscan_aborted(iscan)) + return false; + + mutex_lock(&iscan->lock); + + trace_xchk_iscan_want_live_update(iscan, ino); + + /* Scan is finished, caller should receive all updates. */ + if (iscan->__visited_ino == NULLFSINO) { + ret = true; + goto unlock; + } + + /* + * No inodes have been visited yet, so the visited cursor points at the + * start of the scan range. The caller should not receive any updates. + */ + if (iscan->scan_start_ino == iscan->__visited_ino) { + ret = false; + goto unlock; + } + + /* + * This inode was not allocated at the time of the iscan batch. + * The caller should receive all updates. + */ + if (xchk_iscan_skipped(iscan, ino)) { + ret = true; + goto unlock; + } + + /* + * The visited cursor hasn't yet wrapped around the end of the FS. If + * @ino is inside the starred range, the caller should receive updates: + * + * 0 ------------ S ************ V ------------ EOFS + */ + if (iscan->scan_start_ino <= iscan->__visited_ino) { + if (ino >= iscan->scan_start_ino && + ino <= iscan->__visited_ino) + ret = true; + + goto unlock; + } + + /* + * The visited cursor wrapped around the end of the FS. If @ino is + * inside the starred range, the caller should receive updates: + * + * 0 ************ V ------------ S ************ EOFS + */ + if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino) + ret = true; + +unlock: + mutex_unlock(&iscan->lock); + return ret; +} diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h new file mode 100644 index 000000000000..71f657552dfa --- /dev/null +++ b/fs/xfs/scrub/iscan.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <[email protected]> + */ +#ifndef __XFS_SCRUB_ISCAN_H__ +#define __XFS_SCRUB_ISCAN_H__ + +struct xchk_iscan { + struct xfs_scrub *sc; + + /* Lock to protect the scan cursor. */ + struct mutex lock; + + /* + * This is the first inode in the inumber address space that we + * examined. When the scan wraps around back to here, the scan is + * finished. + */ + xfs_ino_t scan_start_ino; + + /* This is the inode that will be examined next. */ + xfs_ino_t cursor_ino; + + /* If nonzero, skip this inode when scanning. */ + xfs_ino_t skip_ino; + + /* + * This is the last inode that we've successfully scanned, either + * because the caller scanned it, or we moved the cursor past an empty + * part of the inode address space. Scan callers should only use the + * xchk_iscan_mark_visited function to modify this. + */ + xfs_ino_t __visited_ino; + + /* Operational state of the livescan. */ + unsigned long __opstate; + + /* Give up on iterating @cursor_ino if we can't iget it by this time. */ + unsigned long __iget_deadline; + + /* Amount of time (in ms) that we will try to iget an inode. */ + unsigned int iget_timeout; + + /* Wait this many ms to retry an iget. */ + unsigned int iget_retry_delay; + + /* + * The scan grabs batches of inodes and stashes them here before + * handing them out with _iter. Unallocated inodes are set in the + * mask so that all updates to that inode are selected for live + * update propagation. + */ + xfs_ino_t __batch_ino; + xfs_inofree_t __skipped_inomask; + struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK]; +}; + +/* Set if the scan has been aborted due to some event in the fs. 
*/ +#define XCHK_ISCAN_OPSTATE_ABORTED (1) + +static inline bool +xchk_iscan_aborted(const struct xchk_iscan *iscan) +{ + return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); +} + +static inline void +xchk_iscan_abort(struct xchk_iscan *iscan) +{ + set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); +} + +void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout, + unsigned int iget_retry_delay, struct xchk_iscan *iscan); +void xchk_iscan_teardown(struct xchk_iscan *iscan); + +int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp); +void xchk_iscan_iter_finish(struct xchk_iscan *iscan); + +void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip); +bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino); + +#endif /* __XFS_SCRUB_ISCAN_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index d0e24ffaf754..5ed75cc33b92 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -16,10 +16,12 @@ #include "xfs_rtbitmap.h" #include "xfs_quota.h" #include "xfs_quota_defs.h" +#include "xfs_da_format.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/quota.h" +#include "scrub/iscan.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index ae6b2385a8cb..9aba60c61880 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -16,10 +16,12 @@ #include <linux/tracepoint.h> #include "xfs_bit.h" +struct xfs_scrub; struct xfile; struct xfarray; struct xfarray_sortinfo; struct xchk_dqiter; +struct xchk_iscan; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -1146,6 +1148,149 @@ TRACE_EVENT(xchk_rtsum_record_free, ); #endif /* CONFIG_XFS_RT */ +DECLARE_EVENT_CLASS(xchk_iscan_class, + TP_PROTO(struct xchk_iscan *iscan), + TP_ARGS(iscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited) +) +#define DEFINE_ISCAN_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_class, name, \ + TP_PROTO(struct xchk_iscan *iscan), \ + TP_ARGS(iscan)) +DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor); +DEFINE_ISCAN_EVENT(xchk_iscan_visit); +DEFINE_ISCAN_EVENT(xchk_iscan_skip); +DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag); + +DECLARE_EVENT_CLASS(xchk_iscan_ino_class, + TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), + TP_ARGS(iscan, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, startino) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->startino = iscan->scan_start_ino; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->ino = ino; + ), + TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->startino, + __entry->cursor, + __entry->visited, + __entry->ino) +) +#define DEFINE_ISCAN_INO_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_ino_class, name, \ + TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \ + TP_ARGS(iscan, ino)) 
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update); +DEFINE_ISCAN_INO_EVENT(xchk_iscan_start); + +TRACE_EVENT(xchk_iscan_iget, + TP_PROTO(struct xchk_iscan *iscan, int error), + TP_ARGS(iscan, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->error = error; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->error) +); + +TRACE_EVENT(xchk_iscan_iget_batch, + TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan, + unsigned int nr, unsigned int avail), + TP_ARGS(mp, iscan, nr, avail), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(unsigned int, nr) + __field(unsigned int, avail) + __field(unsigned int, unavail) + __field(xfs_ino_t, batch_ino) + __field(unsigned long long, skipmask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->nr = nr; + __entry->avail = avail; + __entry->unavail = hweight64(iscan->__skipped_inomask); + __entry->batch_ino = iscan->__batch_ino; + __entry->skipmask = iscan->__skipped_inomask; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->batch_ino, + __entry->skipmask, + __entry->nr, + __entry->avail, + __entry->unavail) +); + +TRACE_EVENT(xchk_iscan_iget_retry_wait, + TP_PROTO(struct xchk_iscan *iscan), + TP_ARGS(iscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(unsigned int, retry_delay) + __field(unsigned long, remaining) + __field(unsigned int, iget_timeout) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->retry_delay = iscan->iget_retry_delay; + __entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies); + __entry->iget_timeout = iscan->iget_timeout; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->remaining, + __entry->iget_timeout, + __entry->retry_delay) +); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) @@ -1672,6 +1817,55 @@ TRACE_EVENT(xrep_dinode_count_rmaps, __entry->attr_extents) ); +TRACE_EVENT(xrep_dinode_findmode_dirent, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp, + unsigned int ftype), + TP_ARGS(sc, dp, ftype), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, ftype) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->parent_ino = dp->i_ino; + __entry->ftype = ftype; + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR)) +); + +TRACE_EVENT(xrep_dinode_findmode_dirent_inval, + TP_PROTO(struct xfs_scrub *sc, struct 
xfs_inode *dp, + unsigned int ftype, unsigned int found_ftype), + TP_ARGS(sc, dp, ftype, found_ftype), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, ftype) + __field(unsigned int, found_ftype) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->parent_ino = dp->i_ino; + __entry->ftype = ftype; + __entry->found_ftype = found_ftype; + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR)) +); + TRACE_EVENT(xrep_cow_mark_file_range, TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock, xfs_fileoff_t startoff, xfs_filblks_t blockcount), diff --git a/fs/xfs/xfs_hooks.c b/fs/xfs/xfs_hooks.c new file mode 100644 index 000000000000..a58d1de2d37d --- /dev/null +++ b/fs/xfs/xfs_hooks.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <[email protected]> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_trace.h" + +/* Initialize a notifier chain. */ +void +xfs_hooks_init( + struct xfs_hooks *chain) +{ + BLOCKING_INIT_NOTIFIER_HEAD(&chain->head); +} + +/* Make it so a function gets called whenever we hit a certain hook point. */ +int +xfs_hooks_add( + struct xfs_hooks *chain, + struct xfs_hook *hook) +{ + ASSERT(hook->nb.notifier_call != NULL); + BUILD_BUG_ON(offsetof(struct xfs_hook, nb) != 0); + + return blocking_notifier_chain_register(&chain->head, &hook->nb); +} + +/* Remove a previously installed hook. */ +void +xfs_hooks_del( + struct xfs_hooks *chain, + struct xfs_hook *hook) +{ + blocking_notifier_chain_unregister(&chain->head, &hook->nb); +} + +/* Call a hook. Returns the NOTIFY_* value returned by the last hook. */ +int +xfs_hooks_call( + struct xfs_hooks *chain, + unsigned long val, + void *priv) +{ + return blocking_notifier_call_chain(&chain->head, val, priv); +} diff --git a/fs/xfs/xfs_hooks.h b/fs/xfs/xfs_hooks.h new file mode 100644 index 000000000000..60b8a5831536 --- /dev/null +++ b/fs/xfs/xfs_hooks.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <[email protected]> + */ +#ifndef XFS_HOOKS_H_ +#define XFS_HOOKS_H_ + +#ifdef CONFIG_XFS_LIVE_HOOKS +struct xfs_hooks { + struct blocking_notifier_head head; +}; + +/* + * If jump labels are enabled in Kconfig, the static key uses nop sleds and + * code patching to eliminate the overhead of taking the rwsem in + * blocking_notifier_call_chain when there are no hooks configured. If not, + * the static key per-call overhead is an atomic read. Most arches that can + * handle XFS also support jump labels. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. 
+ */ +# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) \ + static DEFINE_STATIC_KEY_FALSE(name) +# define xfs_hooks_switch_on(name) static_branch_inc(name) +# define xfs_hooks_switch_off(name) static_branch_dec(name) +# define xfs_hooks_switched_on(name) static_branch_unlikely(name) + +struct xfs_hook { + /* This must come at the start of the structure. */ + struct notifier_block nb; +}; + +typedef int (*xfs_hook_fn_t)(struct xfs_hook *hook, unsigned long action, + void *data); + +void xfs_hooks_init(struct xfs_hooks *chain); +int xfs_hooks_add(struct xfs_hooks *chain, struct xfs_hook *hook); +void xfs_hooks_del(struct xfs_hooks *chain, struct xfs_hook *hook); +int xfs_hooks_call(struct xfs_hooks *chain, unsigned long action, + void *priv); + +static inline void xfs_hook_setup(struct xfs_hook *hook, notifier_fn_t fn) +{ + hook->nb.notifier_call = fn; + hook->nb.priority = 0; +} + +#else + +struct xfs_hooks { /* empty */ }; + +# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) +# define xfs_hooks_switch_on(name) ((void)0) +# define xfs_hooks_switch_off(name) ((void)0) +# define xfs_hooks_switched_on(name) (false) + +# define xfs_hooks_init(chain) ((void)0) +# define xfs_hooks_call(chain, val, priv) (NOTIFY_DONE) +#endif + +#endif /* XFS_HOOKS_H_ */ diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 6d2eb6364867..c1e9c7bcb6a9 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -22,6 +22,7 @@ #include "xfs_trans.h" #include "xfs_pwork.h" #include "xfs_ag.h" +#include "xfs_bit.h" /* * Walking Inodes in the Filesystem @@ -131,21 +132,11 @@ xfs_iwalk_adjust_start( struct xfs_inobt_rec_incore *irec) /* btree record */ { int idx; /* index into inode chunk */ - int i; idx = agino - irec->ir_startino; - /* - * We got a right chunk with some left inodes allocated at it. Grab - * the chunk record. Mark all the uninteresting inodes free because - * they're before our start point. - */ - for (i = 0; i < idx; i++) { - if (XFS_INOBT_MASK(i) & ~irec->ir_free) - irec->ir_freecount++; - } - irec->ir_free |= xfs_inobt_maskn(0, idx); + irec->ir_freecount = hweight64(irec->ir_free); } /* Allocate memory for a walk. */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 75245932e0ad..8f07c9f6157f 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -81,6 +81,7 @@ typedef __u32 xfs_nlink_t; #include "xfs_buf.h" #include "xfs_message.h" #include "xfs_drain.h" +#include "xfs_hooks.h" #ifdef __BIG_ENDIAN #define XFS_NATIVE_HOST 1
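The xfs_hooks facility gains no consumer in this patch; later patches are expected to attach live-update hooks to it. A minimal sketch of the intended pattern, pairing the static key with the notifier chain so that writer paths stay cheap while no scan is running; every xfs_example_* name is invented for illustration:

/*
 * Sketch only: a hypothetical xfs_hooks consumer.
 */
DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_example_hooks_switch);

struct xfs_example_update {
	xfs_ino_t		ino;
};

/* Hook function; it receives the notifier_block embedded in xfs_hook. */
static int
xfs_example_hook_fn(
	struct notifier_block	*nb,
	unsigned long		action,
	void			*data)
{
	struct xfs_example_update *p = data;

	/* A real hook would update the scanner's records for p->ino. */
	trace_printk("example hook: ino 0x%llx action %lu\n",
			(unsigned long long)p->ino, action);
	return NOTIFY_DONE;
}

/* Scan setup: register the hook, then enable the static key. */
static int
xfs_example_hooks_attach(
	struct xfs_hooks	*chain,
	struct xfs_hook		*hook)
{
	int			error;

	xfs_hook_setup(hook, xfs_example_hook_fn);
	error = xfs_hooks_add(chain, hook);
	if (error)
		return error;
	xfs_hooks_switch_on(&xfs_example_hooks_switch);
	return 0;
}

/* Writer path: skip the notifier chain entirely when no scan is running. */
static void
xfs_example_notify_update(
	struct xfs_hooks	*chain,
	xfs_ino_t		ino)
{
	struct xfs_example_update p = { .ino = ino };

	if (xfs_hooks_switched_on(&xfs_example_hooks_switch))
		xfs_hooks_call(chain, 0, &p);
}

Teardown reverses the order: xfs_hooks_switch_off on the static key, then xfs_hooks_del to unregister from the chain.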