Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h     75
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h             4
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c      5
-rw-r--r--  fs/xfs/libxfs/xfs_trans_inode.c    6
-rw-r--r--  fs/xfs/scrub/fscounters.c        188
-rw-r--r--  fs/xfs/scrub/scrub.c               6
-rw-r--r--  fs/xfs/scrub/scrub.h               1
-rw-r--r--  fs/xfs/scrub/trace.h              26
-rw-r--r--  fs/xfs/xfs_acl.c                   2
-rw-r--r--  fs/xfs/xfs_aops.c                  2
-rw-r--r--  fs/xfs/xfs_bmap_util.c             6
-rw-r--r--  fs/xfs/xfs_buf.c                   7
-rw-r--r--  fs/xfs/xfs_inode.c                 3
-rw-r--r--  fs/xfs/xfs_inode_item.c            2
-rw-r--r--  fs/xfs/xfs_iops.c                 25
-rw-r--r--  fs/xfs/xfs_itable.c                4
-rw-r--r--  fs/xfs/xfs_ondisk.h                5
-rw-r--r--  fs/xfs/xfs_super.c               138
18 files changed, 371 insertions, 134 deletions
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 25e2841084e1..f9015f88eca7 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -591,7 +591,7 @@ struct xfs_attr_shortform { uint8_t valuelen; /* actual length of value (no NULL) */ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ uint8_t nameval[]; /* name & value bytes concatenated */ - } list[1]; /* variable sized array */ + } list[]; /* variable sized array */ }; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ @@ -620,19 +620,29 @@ typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ typedef struct xfs_attr_leaf_name_local { __be16 valuelen; /* number of bytes in value */ __u8 namelen; /* length of name bytes */ - __u8 nameval[1]; /* name/value bytes */ + /* + * In Linux 6.5 this flex array was converted from nameval[1] to + * nameval[]. Be very careful here about extra padding at the end; + * see xfs_attr_leaf_entsize_local() for details. + */ + __u8 nameval[]; /* name/value bytes */ } xfs_attr_leaf_name_local_t; typedef struct xfs_attr_leaf_name_remote { __be32 valueblk; /* block number of value bytes */ __be32 valuelen; /* number of bytes in value */ __u8 namelen; /* length of name bytes */ - __u8 name[1]; /* name bytes */ + /* + * In Linux 6.5 this flex array was converted from name[1] to name[]. + * Be very careful here about extra padding at the end; see + * xfs_attr_leaf_entsize_remote() for details. + */ + __u8 name[]; /* name bytes */ } xfs_attr_leaf_name_remote_t; typedef struct xfs_attr_leafblock { xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ - xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_entry_t entries[]; /* sorted on key, not name */ /* * The rest of the block contains the following structures after the * leaf entries, growing from the bottom up. The variables are never @@ -664,7 +674,7 @@ struct xfs_attr3_leaf_hdr { struct xfs_attr3_leafblock { struct xfs_attr3_leaf_hdr hdr; - struct xfs_attr_leaf_entry entries[1]; + struct xfs_attr_leaf_entry entries[]; /* * The rest of the block contains the following structures after the @@ -747,14 +757,61 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) */ static inline int xfs_attr_leaf_entsize_remote(int nlen) { - return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 + - nlen, XFS_ATTR_LEAF_NAME_ALIGN); + /* + * Prior to Linux 6.5, struct xfs_attr_leaf_name_remote ended with + * name[1], which was used as a flexarray. The layout of this struct + * is 9 bytes of fixed-length fields followed by a __u8 flex array at + * offset 9. + * + * On most architectures, struct xfs_attr_leaf_name_remote had two + * bytes of implicit padding at the end of the struct to make the + * struct length 12. After converting name[1] to name[], there are + * three implicit padding bytes and the struct size remains 12. + * However, there are compiler configurations that do not add implicit + * padding at all (m68k) and have been broken for years. + * + * This entsize computation historically added (the xattr name length) + * to (the padded struct length - 1) and rounded that sum up to the + * nearest multiple of 4 (NAME_ALIGN). IOWs, round_up(11 + nlen, 4). + * This is encoded in the ondisk format, so we cannot change this. + * + * Compute the entsize from offsetof of the flexarray and manually + * adding bytes for the implicit padding. 
+ */ + const size_t remotesize = + offsetof(struct xfs_attr_leaf_name_remote, name) + 2; + + return round_up(remotesize + nlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) { - return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 + - nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); + /* + * Prior to Linux 6.5, struct xfs_attr_leaf_name_local ended with + * nameval[1], which was used as a flexarray. The layout of this + * struct is 3 bytes of fixed-length fields followed by a __u8 flex + * array at offset 3. + * + * struct xfs_attr_leaf_name_local had zero bytes of implicit padding + * at the end of the struct to make the struct length 4. On most + * architectures, after converting nameval[1] to nameval[], there is + * one implicit padding byte and the struct size remains 4. However, + * there are compiler configurations that do not add implicit padding + * at all (m68k) and would break. + * + * This entsize computation historically added (the xattr name and + * value length) to (the padded struct length - 1) and rounded that sum + * up to the nearest multiple of 4 (NAME_ALIGN). IOWs, the formula is + * round_up(3 + nlen + vlen, 4). This is encoded in the ondisk format, + * so we cannot change this. + * + * Compute the entsize from offsetof of the flexarray and manually + * adding bytes for the implicit padding. + */ + const size_t localsize = + offsetof(struct xfs_attr_leaf_name_local, nameval); + + return round_up(localsize + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local_max(int bsize) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 9c60ebb328b4..2cbf9ea39b8c 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -592,12 +592,12 @@ typedef struct xfs_attrlist_cursor { struct xfs_attrlist { __s32 al_count; /* number of entries in attrlist */ __s32 al_more; /* T/F: more attrs (do call again) */ - __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ + __s32 al_offset[]; /* byte offsets of attrs [var-sized] */ }; struct xfs_attrlist_ent { /* data from attr_list() */ __u32 a_valuelen; /* number bytes in value of attr */ - char a_name[1]; /* attr name (NULL terminated) */ + char a_name[]; /* attr name (NULL terminated) */ }; typedef struct xfs_fsop_attrlist_handlereq { diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 758aacd8166b..a35781577cad 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -222,7 +222,8 @@ xfs_inode_from_disk( */ inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); - inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime); + inode_set_ctime_to_ts(inode, + xfs_inode_from_disk_ts(from, from->di_ctime)); ip->i_disk_size = be64_to_cpu(from->di_size); ip->i_nblocks = be64_to_cpu(from->di_nblocks); @@ -316,7 +317,7 @@ xfs_inode_to_disk( to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); - to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); + to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode)); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index cb4796b6e693..ad22656376d3 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,12 @@ 
xfs_trans_ichgtime( ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - tv = current_time(inode); + /* If the mtime changes, then ctime must also change */ + ASSERT(flags & XFS_ICHGTIME_CHG); + tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode->i_mtime = tv; - if (flags & XFS_ICHGTIME_CHG) - inode->i_ctime = tv; if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index e382a35e98d8..05be757668bb 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2019-2023 Oracle. All Rights Reserved. * Author: Darrick J. Wong <djwong@kernel.org> @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" @@ -16,6 +18,7 @@ #include "xfs_ag.h" #include "xfs_rtalloc.h" #include "xfs_inode.h" +#include "xfs_icache.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -53,6 +56,7 @@ struct xchk_fscounters { uint64_t frextents; unsigned long long icount_min; unsigned long long icount_max; + bool frozen; }; /* @@ -123,6 +127,82 @@ xchk_fscount_warmup( return error; } +static inline int +xchk_fsfreeze( + struct xfs_scrub *sc) +{ + int error; + + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsfreeze(sc, error); + return error; +} + +static inline int +xchk_fsthaw( + struct xfs_scrub *sc) +{ + int error; + + /* This should always succeed, we have a kernel freeze */ + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsthaw(sc, error); + return error; +} + +/* + * We couldn't stabilize the filesystem long enough to sample all the variables + * that comprise the summary counters and compare them to the percpu counters. + * We need to disable all writer threads, which means taking the first two + * freeze levels to put userspace to sleep, and the third freeze level to + * prevent background threads from starting new transactions. Take one level + * more to prevent other callers from unfreezing the filesystem while we run. + */ +STATIC int +xchk_fscounters_freeze( + struct xfs_scrub *sc) +{ + struct xchk_fscounters *fsc = sc->buf; + int error = 0; + + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; + mnt_drop_write_file(sc->file); + } + + /* Try to grab a kernel freeze. */ + while ((error = xchk_fsfreeze(sc)) == -EBUSY) { + if (xchk_should_terminate(sc, &error)) + return error; + + delay(HZ / 10); + } + if (error) + return error; + + fsc->frozen = true; + return 0; +} + +/* Thaw the filesystem after checking or repairing fscounters. 
*/ +STATIC void +xchk_fscounters_cleanup( + void *buf) +{ + struct xchk_fscounters *fsc = buf; + struct xfs_scrub *sc = fsc->sc; + int error; + + if (!fsc->frozen) + return; + + error = xchk_fsthaw(sc); + if (error) + xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); + else + fsc->frozen = false; +} + int xchk_setup_fscounters( struct xfs_scrub *sc) @@ -140,6 +220,7 @@ xchk_setup_fscounters( sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; + sc->buf_cleanup = xchk_fscounters_cleanup; fsc = sc->buf; fsc->sc = sc; @@ -150,7 +231,18 @@ xchk_setup_fscounters( if (error) return error; - return xchk_trans_alloc(sc, 0); + /* + * Pause all writer activity in the filesystem while we're scrubbing to + * reduce the likelihood of background perturbations to the counters + * throwing off our calculations. + */ + if (sc->flags & XCHK_TRY_HARDER) { + error = xchk_fscounters_freeze(sc); + if (error) + return error; + } + + return xfs_trans_alloc_empty(sc->mp, &sc->tp); } /* @@ -290,8 +382,7 @@ retry: if (fsc->ifree > fsc->icount) { if (tries--) goto retry; - xchk_set_incomplete(sc); - return 0; + return -EDEADLOCK; } return 0; @@ -367,6 +458,8 @@ xchk_fscount_count_frextents( * Otherwise, we /might/ have a problem. If the change in the summations is * more than we want to tolerate, the filesystem is probably busy and we should * just send back INCOMPLETE and see if userspace will try again. + * + * If we're repairing then we require an exact match. */ static inline bool xchk_fscount_within_range( @@ -396,21 +489,7 @@ xchk_fscount_within_range( if (expected >= min_value && expected <= max_value) return true; - /* - * If the difference between the two summations is too large, the fs - * might just be busy and so we'll mark the scrub incomplete. Return - * true here so that we don't mark the counter corrupt. - * - * XXX: In the future when userspace can grant scrub permission to - * quiesce the filesystem to solve the outsized variance problem, this - * check should be moved up and the return code changed to signal to - * userspace that we need quiesce permission. - */ - if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { - xchk_set_incomplete(sc); - return true; - } - + /* Everything else is bad. */ return false; } @@ -422,6 +501,7 @@ xchk_fscounters( struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; int64_t icount, ifree, fdblocks, frextents; + bool try_again = false; int error; /* Snapshot the percpu counters. */ @@ -431,9 +511,26 @@ xchk_fscounters( frextents = percpu_counter_sum(&mp->m_frextents); /* No negative values, please! */ - if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) + if (icount < 0 || ifree < 0) xchk_set_corrupt(sc); + /* + * If the filesystem is not frozen, the counter summation calls above + * can race with xfs_mod_freecounter, which subtracts a requested space + * reservation from the counter and undoes the subtraction if that made + * the counter go negative. Therefore, it's possible to see negative + * values here, and we should only flag that as a corruption if we + * froze the fs. This is much more likely to happen with frextents + * since there are no reserved pools. + */ + if (fdblocks < 0 || frextents < 0) { + if (!fsc->frozen) + return -EDEADLOCK; + + xchk_set_corrupt(sc); + return 0; + } + /* See if icount is obviously wrong. 
*/ if (icount < fsc->icount_min || icount > fsc->icount_max) xchk_set_corrupt(sc); @@ -447,12 +544,6 @@ xchk_fscounters( xchk_set_corrupt(sc); /* - * XXX: We can't quiesce percpu counter updates, so exit early. - * This can be re-enabled when we gain exclusive freeze functionality. - */ - return 0; - - /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. */ @@ -463,8 +554,6 @@ xchk_fscounters( error = xchk_fscount_aggregate_agcounts(sc, fsc); if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) - return 0; /* Count the free extents counter for rt volumes. */ error = xchk_fscount_count_frextents(sc, fsc); @@ -473,20 +562,45 @@ xchk_fscounters( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; - /* Compare the in-core counters with whatever we counted. */ - if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) - xchk_set_corrupt(sc); + /* + * Compare the in-core counters with whatever we counted. If the fs is + * frozen, we treat the discrepancy as a corruption because the freeze + * should have stabilized the counter values. Otherwise, we need + * userspace to call us back having granted us freeze permission. + */ + if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, + fsc->icount)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } - if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) - xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) - xchk_set_corrupt(sc); + fsc->fdblocks)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, - fsc->frextents)) - xchk_set_corrupt(sc); + fsc->frextents)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (try_again) + return -EDEADLOCK; return 0; } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 3d98f604765e..a0fffbcd022b 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -184,8 +184,10 @@ xchk_teardown( xchk_irele(sc, sc->ip); sc->ip = NULL; } - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); + } if (sc->buf) { if (sc->buf_cleanup) sc->buf_cleanup(sc->buf); @@ -505,6 +507,8 @@ retry_op: error = mnt_want_write_file(sc->file); if (error) goto out_sc; + + sc->flags |= XCHK_HAVE_FREEZE_PROT; } /* Set up for the operation. */ diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index e113f2f5c254..f8ba00e51ca9 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -106,6 +106,7 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ #define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */ +#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? 
*/ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index b3894daeb86a..0b54f1a1cf0c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -98,6 +98,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); #define XFS_SCRUB_STATE_STRINGS \ { XCHK_TRY_HARDER, "try_harder" }, \ + { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ { XREP_ALREADY_FIXED, "already_fixed" } @@ -693,6 +694,31 @@ TRACE_EVENT(xchk_fscounters_within_range, __entry->old_value) ) +DECLARE_EVENT_CLASS(xchk_fsfreeze_class, + TP_PROTO(struct xfs_scrub *sc, int error), + TP_ARGS(sc, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->error = error; + ), + TP_printk("dev %d:%d type %s error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->error) +); +#define DEFINE_XCHK_FSFREEZE_EVENT(name) \ +DEFINE_EVENT(xchk_fsfreeze_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int error), \ + TP_ARGS(sc, error)) +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze); +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw); + TRACE_EVENT(xchk_refcount_incorrect, TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, xfs_nlink_t seen), diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 791db7d9c849..6b840301817a 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -233,7 +233,7 @@ xfs_acl_set_mode( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (xfs_has_wsync(mp)) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 451942fb38ec..2fca4b4e7fd8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = { .read_folio = xfs_vm_read_folio, .readahead = xfs_vm_readahead, .writepages = xfs_vm_writepages, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fbb675563208..fcefab687285 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1644,6 +1644,7 @@ xfs_swap_extents( uint64_t f; int resblks = 0; unsigned int flags = 0; + struct timespec64 ctime; /* * Lock the inodes against other IO, page faults and truncate to @@ -1756,8 +1757,9 @@ xfs_swap_extents( * process that the file was not changed out from * under it. 
*/ - if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || - (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + ctime = inode_get_ctime(VFS_I(ip)); + if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) || (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = -EBUSY; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 15d1e5a7c2d3..3b903f6bce98 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1938,14 +1938,17 @@ void xfs_free_buftarg( struct xfs_buftarg *btp) { + struct block_device *bdev = btp->bt_bdev; + unregister_shrinker(&btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - blkdev_issue_flush(btp->bt_bdev); - invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); + /* the main block device is closed by kill_block_super */ + if (bdev != btp->bt_mount->m_super->s_bdev) + blkdev_put(bdev, btp->bt_mount->m_super); kmem_free(btp); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9e62cc500140..360fe83a334f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -843,10 +843,9 @@ xfs_init_new_inode( ip->i_df.if_nextents = 0; ASSERT(ip->i_nblocks == 0); - tv = current_time(inode); + tv = inode_set_ctime_current(inode); inode->i_mtime = tv; inode->i_atime = tv; - inode->i_ctime = tv; ip->i_extsize = 0; ip->i_diflags = 0; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 91c847a84e10..127b2410eb20 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -528,7 +528,7 @@ xfs_inode_to_log_dinode( memset(to->di_pad3, 0, sizeof(to->di_pad3)); to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); - to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime); + to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode)); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; to->di_mode = inode->i_mode; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 24718adb3c16..2ededd3f6b8c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -573,10 +573,10 @@ xfs_vn_getattr( stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); + fill_mg_cmtime(stat, request_mask, inode); + if (xfs_has_v3inodes(mp)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; @@ -917,7 +917,7 @@ xfs_setattr_size( if (newsize != oldsize && !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = - current_time(inode); + current_mgtime(inode); iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } @@ -1029,7 +1029,6 @@ xfs_vn_setattr( STATIC int xfs_vn_update_time( struct inode *inode, - struct timespec64 *now, int flags) { struct xfs_inode *ip = XFS_I(inode); @@ -1037,13 +1036,16 @@ xfs_vn_update_time( int log_flags = XFS_ILOG_TIMESTAMP; struct xfs_trans *tp; int error; + struct timespec64 now; trace_xfs_update_time(ip); if (inode->i_sb->s_flags & SB_LAZYTIME) { if (!((flags & S_VERSION) && - inode_maybe_inc_iversion(inode, false))) - return generic_update_time(inode, now, flags); + inode_maybe_inc_iversion(inode, false))) { + generic_update_time(inode, flags); + return 0; + } /* Capture the iversion update that just occurred */ log_flags |= 
XFS_ILOG_CORE; @@ -1054,12 +1056,15 @@ xfs_vn_update_time( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); - if (flags & S_CTIME) - inode->i_ctime = *now; + if (flags & (S_CTIME|S_MTIME)) + now = inode_set_ctime_current(inode); + else + now = current_time(inode); + if (flags & S_MTIME) - inode->i_mtime = *now; + inode->i_mtime = now; if (flags & S_ATIME) - inode->i_atime = *now; + inode->i_atime = now; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, log_flags); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f225413a993c..c2093cb56092 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -100,8 +100,8 @@ xfs_bulkstat_one_int( buf->bs_atime_nsec = inode->i_atime.tv_nsec; buf->bs_mtime = inode->i_mtime.tv_sec; buf->bs_mtime_nsec = inode->i_mtime.tv_nsec; - buf->bs_ctime = inode->i_ctime.tv_sec; - buf->bs_ctime_nsec = inode->i_ctime.tv_nsec; + buf->bs_ctime = inode_get_ctime(inode).tv_sec; + buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec; buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 9737b5a9f405..c4cc99b70dd3 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -56,7 +56,7 @@ xfs_check_ondisk_structs(void) /* dir/attr trees */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); - XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 80); XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64); @@ -88,7 +88,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4); XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 818510243130..c79eac048456 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -377,17 +377,6 @@ disable_dax: return 0; } -static void -xfs_bdev_mark_dead( - struct block_device *bdev) -{ - xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED); -} - -static const struct blk_holder_ops xfs_holder_ops = { - .mark_dead = xfs_bdev_mark_dead, -}; - STATIC int xfs_blkdev_get( xfs_mount_t *mp, @@ -396,8 +385,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp, - &xfs_holder_ops); + *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, + mp->m_super, &fs_holder_ops); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); xfs_warn(mp, "Invalid device [%s], error=%d", name, error); @@ -407,31 +396,45 @@ xfs_blkdev_get( } STATIC void -xfs_blkdev_put( - struct xfs_mount *mp, - struct block_device *bdev) -{ - if (bdev) - blkdev_put(bdev, mp); -} - -STATIC void -xfs_close_devices( +xfs_shutdown_devices( struct xfs_mount *mp) { + /* + * Udev is triggered whenever anyone closes a block device or unmounts + * a file systemm on a block device. + * The default udev rules invoke blkid to read the fs super and create + * symlinks to the bdev under /dev/disk. For this, it uses buffered + * reads through the page cache. 
+ * + * xfs_db also uses buffered reads to examine metadata. There is no + * coordination between xfs_db and udev, which means that they can run + * concurrently. Note there is no coordination between the kernel and + * blkid either. + * + * On a system with 64k pages, the page cache can cache the superblock + * and the root inode (and hence the root directory) with the same 64k + * page. If udev spawns blkid after the mkfs and the system is busy + * enough that it is still running when xfs_db starts up, they'll both + * read from the same page in the pagecache. + * + * The unmount writes updated inode metadata to disk directly. The XFS + * buffer cache does not use the bdev pagecache, so it needs to + * invalidate that pagecache on unmount. If the above scenario occurs, + * the pagecache no longer reflects what's on disk, xfs_db reads the + * stale metadata, and fails to find /a. Most of the time this succeeds + * because closing a bdev invalidates the page cache, but when processes + * race, everyone loses. + */ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { - struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - - xfs_free_buftarg(mp->m_logdev_targp); - xfs_blkdev_put(mp, logdev); + blkdev_issue_flush(mp->m_logdev_targp->bt_bdev); + invalidate_bdev(mp->m_logdev_targp->bt_bdev); } if (mp->m_rtdev_targp) { - struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - - xfs_free_buftarg(mp->m_rtdev_targp); - xfs_blkdev_put(mp, rtdev); + blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); + invalidate_bdev(mp->m_rtdev_targp->bt_bdev); } - xfs_free_buftarg(mp->m_ddev_targp); + blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + invalidate_bdev(mp->m_ddev_targp->bt_bdev); } /* @@ -448,17 +451,24 @@ STATIC int xfs_open_devices( struct xfs_mount *mp) { - struct block_device *ddev = mp->m_super->s_bdev; + struct super_block *sb = mp->m_super; + struct block_device *ddev = sb->s_bdev; struct block_device *logdev = NULL, *rtdev = NULL; int error; /* + * blkdev_put() can't be called under s_umount, see the comment + * in get_tree_bdev() for more details + */ + up_write(&sb->s_umount); + + /* * Open real time and log devices - order is important. */ if (mp->m_logname) { error = xfs_blkdev_get(mp, mp->m_logname, &logdev); if (error) - return error; + goto out_relock; } if (mp->m_rtname) { @@ -496,7 +506,10 @@ xfs_open_devices( mp->m_logdev_targp = mp->m_ddev_targp; } - return 0; + error = 0; +out_relock: + down_write(&sb->s_umount); + return error; out_free_rtdev_targ: if (mp->m_rtdev_targp) @@ -504,11 +517,12 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - xfs_blkdev_put(mp, rtdev); + if (rtdev) + blkdev_put(rtdev, sb); out_close_logdev: if (logdev && logdev != ddev) - xfs_blkdev_put(mp, logdev); - return error; + blkdev_put(logdev, sb); + goto out_relock; } /* @@ -758,6 +772,17 @@ static void xfs_mount_free( struct xfs_mount *mp) { + /* + * Free the buftargs here because blkdev_put needs to be called outside + * of sb->s_umount, which is held around the call to ->put_super. 
+ */ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_free_buftarg(mp->m_logdev_targp); + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp->m_rtdev_targp); + if (mp->m_ddev_targp) + xfs_free_buftarg(mp->m_ddev_targp); + kfree(mp->m_rtname); kfree(mp->m_logname); kmem_free(mp); @@ -1133,10 +1158,6 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - /* if ->fill_super failed, we have no mount to tear down */ - if (!sb->s_fs_info) - return; - xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); @@ -1147,10 +1168,7 @@ xfs_fs_put_super( xfs_inodegc_free_percpu(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); - - sb->s_fs_info = NULL; - xfs_mount_free(mp); + xfs_shutdown_devices(mp); } static long @@ -1492,7 +1510,7 @@ xfs_fs_fill_super( error = xfs_fs_validate_params(mp); if (error) - goto out_free_names; + return error; sb_min_blocksize(sb, BBSIZE); sb->s_xattr = xfs_xattr_handlers; @@ -1519,11 +1537,11 @@ xfs_fs_fill_super( error = xfs_open_devices(mp); if (error) - goto out_free_names; + return error; error = xfs_init_mount_workqueues(mp); if (error) - goto out_close_devices; + goto out_shutdown_devices; error = xfs_init_percpu_counters(mp); if (error) @@ -1737,11 +1755,8 @@ xfs_fs_fill_super( xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); - out_close_devices: - xfs_close_devices(mp); - out_free_names: - sb->s_fs_info = NULL; - xfs_mount_free(mp); + out_shutdown_devices: + xfs_shutdown_devices(mp); return error; out_unmount: @@ -1934,7 +1949,8 @@ xfs_fs_reconfigure( return 0; } -static void xfs_fs_free( +static void +xfs_fs_free( struct fs_context *fc) { struct xfs_mount *mp = fc->s_fs_info; @@ -2003,13 +2019,21 @@ static int xfs_init_fs_context( return 0; } +static void +xfs_kill_sb( + struct super_block *sb) +{ + kill_block_super(sb); + xfs_mount_free(XFS_M(sb)); +} + static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .kill_sb = xfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("xfs"); |