aboutsummaryrefslogtreecommitdiff
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h75
-rw-r--r--fs/xfs/libxfs/xfs_fs.h4
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c5
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c6
-rw-r--r--fs/xfs/scrub/fscounters.c188
-rw-r--r--fs/xfs/scrub/scrub.c6
-rw-r--r--fs/xfs/scrub/scrub.h1
-rw-r--r--fs/xfs/scrub/trace.h26
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_bmap_util.c6
-rw-r--r--fs/xfs/xfs_buf.c7
-rw-r--r--fs/xfs/xfs_inode.c3
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_iops.c25
-rw-r--r--fs/xfs/xfs_itable.c4
-rw-r--r--fs/xfs/xfs_ondisk.h5
-rw-r--r--fs/xfs/xfs_super.c138
18 files changed, 371 insertions, 134 deletions
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 25e2841084e1..f9015f88eca7 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -591,7 +591,7 @@ struct xfs_attr_shortform {
uint8_t valuelen; /* actual length of value (no NULL) */
uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
uint8_t nameval[]; /* name & value bytes concatenated */
- } list[1]; /* variable sized array */
+ } list[]; /* variable sized array */
};
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
@@ -620,19 +620,29 @@ typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
typedef struct xfs_attr_leaf_name_local {
__be16 valuelen; /* number of bytes in value */
__u8 namelen; /* length of name bytes */
- __u8 nameval[1]; /* name/value bytes */
+ /*
+ * In Linux 6.5 this flex array was converted from nameval[1] to
+ * nameval[]. Be very careful here about extra padding at the end;
+ * see xfs_attr_leaf_entsize_local() for details.
+ */
+ __u8 nameval[]; /* name/value bytes */
} xfs_attr_leaf_name_local_t;
typedef struct xfs_attr_leaf_name_remote {
__be32 valueblk; /* block number of value bytes */
__be32 valuelen; /* number of bytes in value */
__u8 namelen; /* length of name bytes */
- __u8 name[1]; /* name bytes */
+ /*
+ * In Linux 6.5 this flex array was converted from name[1] to name[].
+ * Be very careful here about extra padding at the end; see
+ * xfs_attr_leaf_entsize_remote() for details.
+ */
+ __u8 name[]; /* name bytes */
} xfs_attr_leaf_name_remote_t;
typedef struct xfs_attr_leafblock {
xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
- xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
+ xfs_attr_leaf_entry_t entries[]; /* sorted on key, not name */
/*
* The rest of the block contains the following structures after the
* leaf entries, growing from the bottom up. The variables are never
@@ -664,7 +674,7 @@ struct xfs_attr3_leaf_hdr {
struct xfs_attr3_leafblock {
struct xfs_attr3_leaf_hdr hdr;
- struct xfs_attr_leaf_entry entries[1];
+ struct xfs_attr_leaf_entry entries[];
/*
* The rest of the block contains the following structures after the
@@ -747,14 +757,61 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
*/
static inline int xfs_attr_leaf_entsize_remote(int nlen)
{
- return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 +
- nlen, XFS_ATTR_LEAF_NAME_ALIGN);
+ /*
+ * Prior to Linux 6.5, struct xfs_attr_leaf_name_remote ended with
+ * name[1], which was used as a flexarray. The layout of this struct
+ * is 9 bytes of fixed-length fields followed by a __u8 flex array at
+ * offset 9.
+ *
+ * On most architectures, struct xfs_attr_leaf_name_remote had two
+ * bytes of implicit padding at the end of the struct to make the
+ * struct length 12. After converting name[1] to name[], there are
+ * three implicit padding bytes and the struct size remains 12.
+ * However, there are compiler configurations that do not add implicit
+ * padding at all (m68k) and have been broken for years.
+ *
+ * This entsize computation historically added (the xattr name length)
+ * to (the padded struct length - 1) and rounded that sum up to the
+ * nearest multiple of 4 (NAME_ALIGN). IOWs, round_up(11 + nlen, 4).
+ * This is encoded in the ondisk format, so we cannot change this.
+ *
+ * Compute the entsize from offsetof of the flexarray and manually
+ * adding bytes for the implicit padding.
+ */
+ const size_t remotesize =
+ offsetof(struct xfs_attr_leaf_name_remote, name) + 2;
+
+ return round_up(remotesize + nlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
{
- return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 +
- nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
+ /*
+ * Prior to Linux 6.5, struct xfs_attr_leaf_name_local ended with
+ * nameval[1], which was used as a flexarray. The layout of this
+ * struct is 3 bytes of fixed-length fields followed by a __u8 flex
+ * array at offset 3.
+ *
+ * struct xfs_attr_leaf_name_local had zero bytes of implicit padding
+ * at the end of the struct to make the struct length 4. On most
+ * architectures, after converting nameval[1] to nameval[], there is
+ * one implicit padding byte and the struct size remains 4. However,
+ * there are compiler configurations that do not add implicit padding
+ * at all (m68k) and would break.
+ *
+ * This entsize computation historically added (the xattr name and
+ * value length) to (the padded struct length - 1) and rounded that sum
+ * up to the nearest multiple of 4 (NAME_ALIGN). IOWs, the formula is
+ * round_up(3 + nlen + vlen, 4). This is encoded in the ondisk format,
+ * so we cannot change this.
+ *
+ * Compute the entsize from offsetof of the flexarray and manually
+ * adding bytes for the implicit padding.
+ */
+ const size_t localsize =
+ offsetof(struct xfs_attr_leaf_name_local, nameval);
+
+ return round_up(localsize + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local_max(int bsize)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 9c60ebb328b4..2cbf9ea39b8c 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -592,12 +592,12 @@ typedef struct xfs_attrlist_cursor {
struct xfs_attrlist {
__s32 al_count; /* number of entries in attrlist */
__s32 al_more; /* T/F: more attrs (do call again) */
- __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
+ __s32 al_offset[]; /* byte offsets of attrs [var-sized] */
};
struct xfs_attrlist_ent { /* data from attr_list() */
__u32 a_valuelen; /* number bytes in value of attr */
- char a_name[1]; /* attr name (NULL terminated) */
+ char a_name[]; /* attr name (NULL terminated) */
};
typedef struct xfs_fsop_attrlist_handlereq {
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 758aacd8166b..a35781577cad 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -222,7 +222,8 @@ xfs_inode_from_disk(
*/
inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
- inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime);
+ inode_set_ctime_to_ts(inode,
+ xfs_inode_from_disk_ts(from, from->di_ctime));
ip->i_disk_size = be64_to_cpu(from->di_size);
ip->i_nblocks = be64_to_cpu(from->di_nblocks);
@@ -316,7 +317,7 @@ xfs_inode_to_disk(
to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
- to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
+ to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode));
to->di_nlink = cpu_to_be32(inode->i_nlink);
to->di_gen = cpu_to_be32(inode->i_generation);
to->di_mode = cpu_to_be16(inode->i_mode);
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index cb4796b6e693..ad22656376d3 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -62,12 +62,12 @@ xfs_trans_ichgtime(
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- tv = current_time(inode);
+ /* If the mtime changes, then ctime must also change */
+ ASSERT(flags & XFS_ICHGTIME_CHG);
+ tv = inode_set_ctime_current(inode);
if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
- if (flags & XFS_ICHGTIME_CHG)
- inode->i_ctime = tv;
if (flags & XFS_ICHGTIME_CREATE)
ip->i_crtime = tv;
}
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index e382a35e98d8..05be757668bb 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2019-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
@@ -8,6 +8,8 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
@@ -16,6 +18,7 @@
#include "xfs_ag.h"
#include "xfs_rtalloc.h"
#include "xfs_inode.h"
+#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -53,6 +56,7 @@ struct xchk_fscounters {
uint64_t frextents;
unsigned long long icount_min;
unsigned long long icount_max;
+ bool frozen;
};
/*
@@ -123,6 +127,82 @@ xchk_fscount_warmup(
return error;
}
+static inline int
+xchk_fsfreeze(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ trace_xchk_fsfreeze(sc, error);
+ return error;
+}
+
+static inline int
+xchk_fsthaw(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /* This should always succeed, we have a kernel freeze */
+ error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ trace_xchk_fsthaw(sc, error);
+ return error;
+}
+
+/*
+ * We couldn't stabilize the filesystem long enough to sample all the variables
+ * that comprise the summary counters and compare them to the percpu counters.
+ * We need to disable all writer threads, which means taking the first two
+ * freeze levels to put userspace to sleep, and the third freeze level to
+ * prevent background threads from starting new transactions. Take one level
+ * more to prevent other callers from unfreezing the filesystem while we run.
+ */
+STATIC int
+xchk_fscounters_freeze(
+ struct xfs_scrub *sc)
+{
+ struct xchk_fscounters *fsc = sc->buf;
+ int error = 0;
+
+ if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
+ sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
+ mnt_drop_write_file(sc->file);
+ }
+
+ /* Try to grab a kernel freeze. */
+ while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(HZ / 10);
+ }
+ if (error)
+ return error;
+
+ fsc->frozen = true;
+ return 0;
+}
+
+/* Thaw the filesystem after checking or repairing fscounters. */
+STATIC void
+xchk_fscounters_cleanup(
+ void *buf)
+{
+ struct xchk_fscounters *fsc = buf;
+ struct xfs_scrub *sc = fsc->sc;
+ int error;
+
+ if (!fsc->frozen)
+ return;
+
+ error = xchk_fsthaw(sc);
+ if (error)
+ xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
+ else
+ fsc->frozen = false;
+}
+
int
xchk_setup_fscounters(
struct xfs_scrub *sc)
@@ -140,6 +220,7 @@ xchk_setup_fscounters(
sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
+ sc->buf_cleanup = xchk_fscounters_cleanup;
fsc = sc->buf;
fsc->sc = sc;
@@ -150,7 +231,18 @@ xchk_setup_fscounters(
if (error)
return error;
- return xchk_trans_alloc(sc, 0);
+ /*
+ * Pause all writer activity in the filesystem while we're scrubbing to
+ * reduce the likelihood of background perturbations to the counters
+ * throwing off our calculations.
+ */
+ if (sc->flags & XCHK_TRY_HARDER) {
+ error = xchk_fscounters_freeze(sc);
+ if (error)
+ return error;
+ }
+
+ return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}
/*
@@ -290,8 +382,7 @@ retry:
if (fsc->ifree > fsc->icount) {
if (tries--)
goto retry;
- xchk_set_incomplete(sc);
- return 0;
+ return -EDEADLOCK;
}
return 0;
@@ -367,6 +458,8 @@ xchk_fscount_count_frextents(
* Otherwise, we /might/ have a problem. If the change in the summations is
* more than we want to tolerate, the filesystem is probably busy and we should
* just send back INCOMPLETE and see if userspace will try again.
+ *
+ * If we're repairing then we require an exact match.
*/
static inline bool
xchk_fscount_within_range(
@@ -396,21 +489,7 @@ xchk_fscount_within_range(
if (expected >= min_value && expected <= max_value)
return true;
- /*
- * If the difference between the two summations is too large, the fs
- * might just be busy and so we'll mark the scrub incomplete. Return
- * true here so that we don't mark the counter corrupt.
- *
- * XXX: In the future when userspace can grant scrub permission to
- * quiesce the filesystem to solve the outsized variance problem, this
- * check should be moved up and the return code changed to signal to
- * userspace that we need quiesce permission.
- */
- if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
- xchk_set_incomplete(sc);
- return true;
- }
-
+ /* Everything else is bad. */
return false;
}
@@ -422,6 +501,7 @@ xchk_fscounters(
struct xfs_mount *mp = sc->mp;
struct xchk_fscounters *fsc = sc->buf;
int64_t icount, ifree, fdblocks, frextents;
+ bool try_again = false;
int error;
/* Snapshot the percpu counters. */
@@ -431,9 +511,26 @@ xchk_fscounters(
frextents = percpu_counter_sum(&mp->m_frextents);
/* No negative values, please! */
- if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0)
+ if (icount < 0 || ifree < 0)
xchk_set_corrupt(sc);
+ /*
+ * If the filesystem is not frozen, the counter summation calls above
+ * can race with xfs_mod_freecounter, which subtracts a requested space
+ * reservation from the counter and undoes the subtraction if that made
+ * the counter go negative. Therefore, it's possible to see negative
+ * values here, and we should only flag that as a corruption if we
+ * froze the fs. This is much more likely to happen with frextents
+ * since there are no reserved pools.
+ */
+ if (fdblocks < 0 || frextents < 0) {
+ if (!fsc->frozen)
+ return -EDEADLOCK;
+
+ xchk_set_corrupt(sc);
+ return 0;
+ }
+
/* See if icount is obviously wrong. */
if (icount < fsc->icount_min || icount > fsc->icount_max)
xchk_set_corrupt(sc);
@@ -447,12 +544,6 @@ xchk_fscounters(
xchk_set_corrupt(sc);
/*
- * XXX: We can't quiesce percpu counter updates, so exit early.
- * This can be re-enabled when we gain exclusive freeze functionality.
- */
- return 0;
-
- /*
* If ifree exceeds icount by more than the minimum variance then
* something's probably wrong with the counters.
*/
@@ -463,8 +554,6 @@ xchk_fscounters(
error = xchk_fscount_aggregate_agcounts(sc, fsc);
if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
- return 0;
/* Count the free extents counter for rt volumes. */
error = xchk_fscount_count_frextents(sc, fsc);
@@ -473,20 +562,45 @@ xchk_fscounters(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
return 0;
- /* Compare the in-core counters with whatever we counted. */
- if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
- xchk_set_corrupt(sc);
+ /*
+ * Compare the in-core counters with whatever we counted. If the fs is
+ * frozen, we treat the discrepancy as a corruption because the freeze
+ * should have stabilized the counter values. Otherwise, we need
+ * userspace to call us back having granted us freeze permission.
+ */
+ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
+ fsc->icount)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
- if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
- xchk_set_corrupt(sc);
+ if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
- fsc->fdblocks))
- xchk_set_corrupt(sc);
+ fsc->fdblocks)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
- fsc->frextents))
- xchk_set_corrupt(sc);
+ fsc->frextents)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
+
+ if (try_again)
+ return -EDEADLOCK;
return 0;
}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 3d98f604765e..a0fffbcd022b 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -184,8 +184,10 @@ xchk_teardown(
xchk_irele(sc, sc->ip);
sc->ip = NULL;
}
- if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
+ sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
mnt_drop_write_file(sc->file);
+ }
if (sc->buf) {
if (sc->buf_cleanup)
sc->buf_cleanup(sc->buf);
@@ -505,6 +507,8 @@ retry_op:
error = mnt_want_write_file(sc->file);
if (error)
goto out_sc;
+
+ sc->flags |= XCHK_HAVE_FREEZE_PROT;
}
/* Set up for the operation. */
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index e113f2f5c254..f8ba00e51ca9 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -106,6 +106,7 @@ struct xfs_scrub {
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */
+#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index b3894daeb86a..0b54f1a1cf0c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -98,6 +98,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
#define XFS_SCRUB_STATE_STRINGS \
{ XCHK_TRY_HARDER, "try_harder" }, \
+ { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
{ XCHK_NEED_DRAIN, "need_drain" }, \
{ XREP_ALREADY_FIXED, "already_fixed" }
@@ -693,6 +694,31 @@ TRACE_EVENT(xchk_fscounters_within_range,
__entry->old_value)
)
+DECLARE_EVENT_CLASS(xchk_fsfreeze_class,
+ TP_PROTO(struct xfs_scrub *sc, int error),
+ TP_ARGS(sc, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d type %s error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->error)
+);
+#define DEFINE_XCHK_FSFREEZE_EVENT(name) \
+DEFINE_EVENT(xchk_fsfreeze_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, int error), \
+ TP_ARGS(sc, error))
+DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze);
+DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw);
+
TRACE_EVENT(xchk_refcount_incorrect,
TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec,
xfs_nlink_t seen),
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 791db7d9c849..6b840301817a 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -233,7 +233,7 @@ xfs_acl_set_mode(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (xfs_has_wsync(mp))
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 451942fb38ec..2fca4b4e7fd8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
.read_folio = xfs_vm_read_folio,
.readahead = xfs_vm_readahead,
.writepages = xfs_vm_writepages,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
.release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fbb675563208..fcefab687285 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1644,6 +1644,7 @@ xfs_swap_extents(
uint64_t f;
int resblks = 0;
unsigned int flags = 0;
+ struct timespec64 ctime;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1756,8 +1757,9 @@ xfs_swap_extents(
* process that the file was not changed out from
* under it.
*/
- if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
- (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ ctime = inode_get_ctime(VFS_I(ip));
+ if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) ||
(sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
(sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
error = -EBUSY;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 15d1e5a7c2d3..3b903f6bce98 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1938,14 +1938,17 @@ void
xfs_free_buftarg(
struct xfs_buftarg *btp)
{
+ struct block_device *bdev = btp->bt_bdev;
+
unregister_shrinker(&btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
- blkdev_issue_flush(btp->bt_bdev);
- invalidate_bdev(btp->bt_bdev);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
+ /* the main block device is closed by kill_block_super */
+ if (bdev != btp->bt_mount->m_super->s_bdev)
+ blkdev_put(bdev, btp->bt_mount->m_super);
kmem_free(btp);
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9e62cc500140..360fe83a334f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -843,10 +843,9 @@ xfs_init_new_inode(
ip->i_df.if_nextents = 0;
ASSERT(ip->i_nblocks == 0);
- tv = current_time(inode);
+ tv = inode_set_ctime_current(inode);
inode->i_mtime = tv;
inode->i_atime = tv;
- inode->i_ctime = tv;
ip->i_extsize = 0;
ip->i_diflags = 0;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 91c847a84e10..127b2410eb20 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -528,7 +528,7 @@ xfs_inode_to_log_dinode(
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
- to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
+ to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
to->di_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 24718adb3c16..2ededd3f6b8c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -573,10 +573,10 @@ xfs_vn_getattr(
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
- stat->mtime = inode->i_mtime;
- stat->ctime = inode->i_ctime;
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
+ fill_mg_cmtime(stat, request_mask, inode);
+
if (xfs_has_v3inodes(mp)) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
@@ -917,7 +917,7 @@ xfs_setattr_size(
if (newsize != oldsize &&
!(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
iattr->ia_ctime = iattr->ia_mtime =
- current_time(inode);
+ current_mgtime(inode);
iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
}
@@ -1029,7 +1029,6 @@ xfs_vn_setattr(
STATIC int
xfs_vn_update_time(
struct inode *inode,
- struct timespec64 *now,
int flags)
{
struct xfs_inode *ip = XFS_I(inode);
@@ -1037,13 +1036,16 @@ xfs_vn_update_time(
int log_flags = XFS_ILOG_TIMESTAMP;
struct xfs_trans *tp;
int error;
+ struct timespec64 now;
trace_xfs_update_time(ip);
if (inode->i_sb->s_flags & SB_LAZYTIME) {
if (!((flags & S_VERSION) &&
- inode_maybe_inc_iversion(inode, false)))
- return generic_update_time(inode, now, flags);
+ inode_maybe_inc_iversion(inode, false))) {
+ generic_update_time(inode, flags);
+ return 0;
+ }
/* Capture the iversion update that just occurred */
log_flags |= XFS_ILOG_CORE;
@@ -1054,12 +1056,15 @@ xfs_vn_update_time(
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (flags & S_CTIME)
- inode->i_ctime = *now;
+ if (flags & (S_CTIME|S_MTIME))
+ now = inode_set_ctime_current(inode);
+ else
+ now = current_time(inode);
+
if (flags & S_MTIME)
- inode->i_mtime = *now;
+ inode->i_mtime = now;
if (flags & S_ATIME)
- inode->i_atime = *now;
+ inode->i_atime = now;
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, log_flags);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f225413a993c..c2093cb56092 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -100,8 +100,8 @@ xfs_bulkstat_one_int(
buf->bs_atime_nsec = inode->i_atime.tv_nsec;
buf->bs_mtime = inode->i_mtime.tv_sec;
buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
- buf->bs_ctime = inode->i_ctime.tv_sec;
- buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
+ buf->bs_ctime = inode_get_ctime(inode).tv_sec;
+ buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec;
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 9737b5a9f405..c4cc99b70dd3 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -56,7 +56,7 @@ xfs_check_ondisk_structs(void)
/* dir/attr trees */
XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
- XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 80);
XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64);
@@ -88,7 +88,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4);
XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 818510243130..c79eac048456 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -377,17 +377,6 @@ disable_dax:
return 0;
}
-static void
-xfs_bdev_mark_dead(
- struct block_device *bdev)
-{
- xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED);
-}
-
-static const struct blk_holder_ops xfs_holder_ops = {
- .mark_dead = xfs_bdev_mark_dead,
-};
-
STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
@@ -396,8 +385,8 @@ xfs_blkdev_get(
{
int error = 0;
- *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp,
- &xfs_holder_ops);
+ *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ mp->m_super, &fs_holder_ops);
if (IS_ERR(*bdevp)) {
error = PTR_ERR(*bdevp);
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
@@ -407,31 +396,45 @@ xfs_blkdev_get(
}
STATIC void
-xfs_blkdev_put(
- struct xfs_mount *mp,
- struct block_device *bdev)
-{
- if (bdev)
- blkdev_put(bdev, mp);
-}
-
-STATIC void
-xfs_close_devices(
+xfs_shutdown_devices(
struct xfs_mount *mp)
{
+ /*
+ * Udev is triggered whenever anyone closes a block device or unmounts
+ * a file systemm on a block device.
+ * The default udev rules invoke blkid to read the fs super and create
+ * symlinks to the bdev under /dev/disk. For this, it uses buffered
+ * reads through the page cache.
+ *
+ * xfs_db also uses buffered reads to examine metadata. There is no
+ * coordination between xfs_db and udev, which means that they can run
+ * concurrently. Note there is no coordination between the kernel and
+ * blkid either.
+ *
+ * On a system with 64k pages, the page cache can cache the superblock
+ * and the root inode (and hence the root directory) with the same 64k
+ * page. If udev spawns blkid after the mkfs and the system is busy
+ * enough that it is still running when xfs_db starts up, they'll both
+ * read from the same page in the pagecache.
+ *
+ * The unmount writes updated inode metadata to disk directly. The XFS
+ * buffer cache does not use the bdev pagecache, so it needs to
+ * invalidate that pagecache on unmount. If the above scenario occurs,
+ * the pagecache no longer reflects what's on disk, xfs_db reads the
+ * stale metadata, and fails to find /a. Most of the time this succeeds
+ * because closing a bdev invalidates the page cache, but when processes
+ * race, everyone loses.
+ */
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
- struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
-
- xfs_free_buftarg(mp->m_logdev_targp);
- xfs_blkdev_put(mp, logdev);
+ blkdev_issue_flush(mp->m_logdev_targp->bt_bdev);
+ invalidate_bdev(mp->m_logdev_targp->bt_bdev);
}
if (mp->m_rtdev_targp) {
- struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
-
- xfs_free_buftarg(mp->m_rtdev_targp);
- xfs_blkdev_put(mp, rtdev);
+ blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
+ invalidate_bdev(mp->m_rtdev_targp->bt_bdev);
}
- xfs_free_buftarg(mp->m_ddev_targp);
+ blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ invalidate_bdev(mp->m_ddev_targp->bt_bdev);
}
/*
@@ -448,17 +451,24 @@ STATIC int
xfs_open_devices(
struct xfs_mount *mp)
{
- struct block_device *ddev = mp->m_super->s_bdev;
+ struct super_block *sb = mp->m_super;
+ struct block_device *ddev = sb->s_bdev;
struct block_device *logdev = NULL, *rtdev = NULL;
int error;
/*
+ * blkdev_put() can't be called under s_umount, see the comment
+ * in get_tree_bdev() for more details
+ */
+ up_write(&sb->s_umount);
+
+ /*
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
if (error)
- return error;
+ goto out_relock;
}
if (mp->m_rtname) {
@@ -496,7 +506,10 @@ xfs_open_devices(
mp->m_logdev_targp = mp->m_ddev_targp;
}
- return 0;
+ error = 0;
+out_relock:
+ down_write(&sb->s_umount);
+ return error;
out_free_rtdev_targ:
if (mp->m_rtdev_targp)
@@ -504,11 +517,12 @@ xfs_open_devices(
out_free_ddev_targ:
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
- xfs_blkdev_put(mp, rtdev);
+ if (rtdev)
+ blkdev_put(rtdev, sb);
out_close_logdev:
if (logdev && logdev != ddev)
- xfs_blkdev_put(mp, logdev);
- return error;
+ blkdev_put(logdev, sb);
+ goto out_relock;
}
/*
@@ -758,6 +772,17 @@ static void
xfs_mount_free(
struct xfs_mount *mp)
{
+ /*
+ * Free the buftargs here because blkdev_put needs to be called outside
+ * of sb->s_umount, which is held around the call to ->put_super.
+ */
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_free_buftarg(mp->m_logdev_targp);
+ if (mp->m_rtdev_targp)
+ xfs_free_buftarg(mp->m_rtdev_targp);
+ if (mp->m_ddev_targp)
+ xfs_free_buftarg(mp->m_ddev_targp);
+
kfree(mp->m_rtname);
kfree(mp->m_logname);
kmem_free(mp);
@@ -1133,10 +1158,6 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);
- /* if ->fill_super failed, we have no mount to tear down */
- if (!sb->s_fs_info)
- return;
-
xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid);
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
@@ -1147,10 +1168,7 @@ xfs_fs_put_super(
xfs_inodegc_free_percpu(mp);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
- xfs_close_devices(mp);
-
- sb->s_fs_info = NULL;
- xfs_mount_free(mp);
+ xfs_shutdown_devices(mp);
}
static long
@@ -1492,7 +1510,7 @@ xfs_fs_fill_super(
error = xfs_fs_validate_params(mp);
if (error)
- goto out_free_names;
+ return error;
sb_min_blocksize(sb, BBSIZE);
sb->s_xattr = xfs_xattr_handlers;
@@ -1519,11 +1537,11 @@ xfs_fs_fill_super(
error = xfs_open_devices(mp);
if (error)
- goto out_free_names;
+ return error;
error = xfs_init_mount_workqueues(mp);
if (error)
- goto out_close_devices;
+ goto out_shutdown_devices;
error = xfs_init_percpu_counters(mp);
if (error)
@@ -1737,11 +1755,8 @@ xfs_fs_fill_super(
xfs_destroy_percpu_counters(mp);
out_destroy_workqueues:
xfs_destroy_mount_workqueues(mp);
- out_close_devices:
- xfs_close_devices(mp);
- out_free_names:
- sb->s_fs_info = NULL;
- xfs_mount_free(mp);
+ out_shutdown_devices:
+ xfs_shutdown_devices(mp);
return error;
out_unmount:
@@ -1934,7 +1949,8 @@ xfs_fs_reconfigure(
return 0;
}
-static void xfs_fs_free(
+static void
+xfs_fs_free(
struct fs_context *fc)
{
struct xfs_mount *mp = fc->s_fs_info;
@@ -2003,13 +2019,21 @@ static int xfs_init_fs_context(
return 0;
}
+static void
+xfs_kill_sb(
+ struct super_block *sb)
+{
+ kill_block_super(sb);
+ xfs_mount_free(XFS_M(sb));
+}
+
static struct file_system_type xfs_fs_type = {
.owner = THIS_MODULE,
.name = "xfs",
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .kill_sb = xfs_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("xfs");