aboutsummaryrefslogtreecommitdiff
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c50
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_attr.c22
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c315
-rw-r--r--fs/xfs/libxfs/xfs_btree.c45
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c2
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h6
-rw-r--r--fs/xfs/libxfs/xfs_fs.h1
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c27
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h63
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/scrub/common.c4
-rw-r--r--fs/xfs/xfs_acl.c5
-rw-r--r--fs/xfs/xfs_acl.h3
-rw-r--r--fs/xfs/xfs_aops.c17
-rw-r--r--fs/xfs/xfs_bio_io.c2
-rw-r--r--fs/xfs/xfs_bmap_item.c10
-rw-r--r--fs/xfs/xfs_bmap_util.c81
-rw-r--r--fs/xfs/xfs_buf.c34
-rw-r--r--fs/xfs/xfs_buf.h11
-rw-r--r--fs/xfs/xfs_dquot.c47
-rw-r--r--fs/xfs/xfs_error.c6
-rw-r--r--fs/xfs/xfs_extent_busy.c14
-rw-r--r--fs/xfs/xfs_file.c443
-rw-r--r--fs/xfs/xfs_fsops.c32
-rw-r--r--fs/xfs/xfs_fsops.h4
-rw-r--r--fs/xfs/xfs_globals.c7
-rw-r--r--fs/xfs/xfs_icache.c438
-rw-r--r--fs/xfs/xfs_icache.h24
-rw-r--r--fs/xfs/xfs_inode.c156
-rw-r--r--fs/xfs/xfs_inode.h16
-rw-r--r--fs/xfs/xfs_ioctl.c104
-rw-r--r--fs/xfs/xfs_ioctl32.c13
-rw-r--r--fs/xfs/xfs_iomap.c82
-rw-r--r--fs/xfs/xfs_iops.c129
-rw-r--r--fs/xfs/xfs_iops.h3
-rw-r--r--fs/xfs/xfs_itable.c17
-rw-r--r--fs/xfs/xfs_itable.h1
-rw-r--r--fs/xfs/xfs_iwalk.c5
-rw-r--r--fs/xfs/xfs_linux.h3
-rw-r--r--fs/xfs/xfs_log.c142
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_mount.c43
-rw-r--r--fs/xfs/xfs_mount.h10
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_pwork.c25
-rw-r--r--fs/xfs/xfs_pwork.h4
-rw-r--r--fs/xfs/xfs_qm.c119
-rw-r--r--fs/xfs/xfs_quota.h49
-rw-r--r--fs/xfs/xfs_reflink.c103
-rw-r--r--fs/xfs/xfs_rtalloc.c5
-rw-r--r--fs/xfs/xfs_super.c86
-rw-r--r--fs/xfs/xfs_super.h6
-rw-r--r--fs/xfs/xfs_symlink.c20
-rw-r--r--fs/xfs/xfs_symlink.h5
-rw-r--r--fs/xfs/xfs_sysctl.c40
-rw-r--r--fs/xfs/xfs_sysctl.h3
-rw-r--r--fs/xfs/xfs_trace.c1
-rw-r--r--fs/xfs/xfs_trace.h72
-rw-r--r--fs/xfs/xfs_trans.c222
-rw-r--r--fs/xfs/xfs_trans.h43
-rw-r--r--fs/xfs/xfs_trans_dquot.c71
-rw-r--r--fs/xfs/xfs_xattr.c7
64 files changed, 2050 insertions, 1281 deletions
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 7cb9f064ac64..0c623d3c1036 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2474,6 +2474,47 @@ xfs_defer_agfl_block(
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
}
+#ifdef DEBUG
+/*
+ * Check if an AGF has a free extent record whose length is equal to
+ * args->minlen.
+ */
+STATIC int
+xfs_exact_minlen_extent_available(
+ struct xfs_alloc_arg *args,
+ struct xfs_buf *agbp,
+ int *stat)
+{
+ struct xfs_btree_cur *cnt_cur;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error = 0;
+
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
+ args->agno, XFS_BTNUM_CNT);
+ error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
+ if (error)
+ goto out;
+
+ if (*stat == 0) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
+ if (error)
+ goto out;
+
+ if (*stat == 1 && flen != args->minlen)
+ *stat = 0;
+
+out:
+ xfs_btree_del_cursor(cnt_cur, error);
+
+ return error;
+}
+#endif
+
/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
@@ -2545,6 +2586,15 @@ xfs_alloc_fix_freelist(
if (!xfs_alloc_space_available(args, need, flags))
goto out_agbp_relse;
+#ifdef DEBUG
+ if (args->alloc_minlen_only) {
+ int stat;
+
+ error = xfs_exact_minlen_extent_available(args, agbp, &stat);
+ if (error || !stat)
+ goto out_agbp_relse;
+ }
+#endif
/*
* Make the freelist shorter if it's too long.
*
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6c22b12176b8..a4427c5775c2 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -75,6 +75,9 @@ typedef struct xfs_alloc_arg {
char wasfromfl; /* set if allocation is from freelist */
struct xfs_owner_info oinfo; /* owner of blocks being allocated */
enum xfs_ag_resv_type resv; /* block reservation to use */
+#ifdef DEBUG
+ bool alloc_minlen_only; /* allocate exact minlen extent */
+#endif
} xfs_alloc_arg_t;
/*
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index fd8e6418a0d3..472b3039eabb 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -396,6 +396,7 @@ xfs_attr_set(
struct xfs_trans_res tres;
bool rsvd = (args->attr_filter & XFS_ATTR_ROOT);
int error, local;
+ int rmt_blks = 0;
unsigned int total;
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
@@ -442,34 +443,33 @@ xfs_attr_set(
tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
total = args->total;
+
+ if (!local)
+ rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
} else {
XFS_STATS_INC(mp, xs_attr_remove);
tres = M_RES(mp)->tr_attrrm;
total = XFS_ATTRRM_SPACE_RES(mp);
+ rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
}
/*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
- error = xfs_trans_alloc(mp, &tres, total, 0,
- rsvd ? XFS_TRANS_RESERVE : 0, &args->trans);
+ error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
if (error)
return error;
- xfs_ilock(dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(args->trans, dp, 0);
- if (args->value) {
- unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS;
-
- if (rsvd)
- quota_flags |= XFS_QMOPT_FORCE_RES;
- error = xfs_trans_reserve_quota_nblks(args->trans, dp,
- args->total, 0, quota_flags);
+ if (args->value || xfs_inode_hasattr(dp)) {
+ error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error)
goto out_trans_cancel;
+ }
+ if (args->value) {
error = xfs_has_attr(args);
if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
goto out_trans_cancel;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index bc446418e227..e0905ad171f0 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1079,21 +1079,13 @@ xfs_bmap_add_attrfork(
blks = XFS_ADDAFORK_SPACE_RES(mp);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0,
- rsvd ? XFS_TRANS_RESERVE : 0, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
+ rsvd, &tp);
if (error)
return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
- XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto trans_cancel;
if (XFS_IFORK_Q(ip))
goto trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_bmap_set_attrforkoff(ip, size, &version);
if (error)
@@ -3463,34 +3455,16 @@ xfs_bmap_btalloc_accounting(
args->len);
}
-STATIC int
-xfs_bmap_btalloc(
- struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+static int
+xfs_bmap_compute_alignments(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args)
{
- xfs_mount_t *mp; /* mount point structure */
- xfs_alloctype_t atype = 0; /* type for allocation routines */
- xfs_extlen_t align = 0; /* minimum allocation alignment */
- xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
- xfs_agnumber_t ag;
- xfs_alloc_arg_t args;
- xfs_fileoff_t orig_offset;
- xfs_extlen_t orig_length;
- xfs_extlen_t blen;
- xfs_extlen_t nextminlen = 0;
- int nullfb; /* true if ap->firstblock isn't set */
- int isaligned;
- int tryagain;
- int error;
- int stripe_align;
-
- ASSERT(ap->length);
- orig_offset = ap->offset;
- orig_length = ap->length;
-
- mp = ap->ip->i_mount;
+ struct xfs_mount *mp = args->mp;
+ xfs_extlen_t align = 0; /* minimum allocation alignment */
+ int stripe_align = 0;
/* stripe alignment for allocation is determined by mount parameters */
- stripe_align = 0;
if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
stripe_align = mp->m_swidth;
else if (mp->m_dalign)
@@ -3501,13 +3475,171 @@ xfs_bmap_btalloc(
else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
if (align) {
- error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
- align, 0, ap->eof, 0, ap->conv,
- &ap->offset, &ap->length);
- ASSERT(!error);
+ if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
+ ap->eof, 0, ap->conv, &ap->offset,
+ &ap->length))
+ ASSERT(0);
ASSERT(ap->length);
}
+ /* apply extent size hints if obtained earlier */
+ if (align) {
+ args->prod = align;
+ div_u64_rem(ap->offset, args->prod, &args->mod);
+ if (args->mod)
+ args->mod = args->prod - args->mod;
+ } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
+ args->prod = 1;
+ args->mod = 0;
+ } else {
+ args->prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
+ div_u64_rem(ap->offset, args->prod, &args->mod);
+ if (args->mod)
+ args->mod = args->prod - args->mod;
+ }
+
+ return stripe_align;
+}
+
+static void
+xfs_bmap_process_allocated_extent(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_fileoff_t orig_offset,
+ xfs_extlen_t orig_length)
+{
+ int nullfb;
+
+ nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
+
+ /*
+ * check the allocation happened at the same or higher AG than
+ * the first block that was allocated.
+ */
+ ASSERT(nullfb ||
+ XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <=
+ XFS_FSB_TO_AGNO(args->mp, args->fsbno));
+
+ ap->blkno = args->fsbno;
+ if (nullfb)
+ ap->tp->t_firstblock = args->fsbno;
+ ap->length = args->len;
+ /*
+ * If the extent size hint is active, we tried to round the
+ * caller's allocation request offset down to extsz and the
+ * length up to another extsz boundary. If we found a free
+ * extent we mapped it in starting at this new offset. If the
+ * newly mapped space isn't long enough to cover any of the
+ * range of offsets that was originally requested, move the
+ * mapping up so that we can fill as much of the caller's
+ * original request as possible. Free space is apparently
+ * very fragmented so we're unlikely to be able to satisfy the
+ * hints anyway.
+ */
+ if (ap->length <= orig_length)
+ ap->offset = orig_offset;
+ else if (ap->offset + ap->length < orig_offset + orig_length)
+ ap->offset = orig_offset + orig_length - ap->length;
+ xfs_bmap_btalloc_accounting(ap, args);
+}
+
+#ifdef DEBUG
+static int
+xfs_bmap_exact_minlen_extent_alloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ int error;
+
+ ASSERT(ap->length);
+
+ if (ap->minlen != 1) {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ return 0;
+ }
+
+ orig_offset = ap->offset;
+ orig_length = ap->length;
+
+ args.alloc_minlen_only = 1;
+
+ xfs_bmap_compute_alignments(ap, &args);
+
+ if (ap->tp->t_firstblock == NULLFSBLOCK) {
+ /*
+ * Unlike the longest extent available in an AG, we don't track
+ * the length of an AG's shortest extent.
+ * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and
+ * hence we can afford to start traversing from the 0th AG since
+ * we need not be concerned about a drop in performance in
+ * "debug only" code paths.
+ */
+ ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0);
+ } else {
+ ap->blkno = ap->tp->t_firstblock;
+ }
+
+ args.fsbno = ap->blkno;
+ args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.total = args.minlen = args.maxlen = ap->minlen;
+
+ args.alignment = 1;
+ args.minalignslop = 0;
+
+ args.minleft = ap->minleft;
+ args.wasdel = ap->wasdel;
+ args.resv = XFS_AG_RESV_NONE;
+ args.datatype = ap->datatype;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ if (args.fsbno != NULLFSBLOCK) {
+ xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
+ orig_length);
+ } else {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ }
+
+ return 0;
+}
+#else
+
+#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED)
+
+#endif
+
+STATIC int
+xfs_bmap_btalloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
+ xfs_alloctype_t atype = 0;
+ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
+ xfs_agnumber_t ag;
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ xfs_extlen_t blen;
+ xfs_extlen_t nextminlen = 0;
+ int nullfb; /* true if ap->firstblock isn't set */
+ int isaligned;
+ int tryagain;
+ int error;
+ int stripe_align;
+
+ ASSERT(ap->length);
+ orig_offset = ap->offset;
+ orig_length = ap->length;
+
+ stripe_align = xfs_bmap_compute_alignments(ap, &args);
nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
@@ -3538,9 +3670,6 @@ xfs_bmap_btalloc(
* Normal allocation, done through xfs_alloc_vextent.
*/
tryagain = isaligned = 0;
- memset(&args, 0, sizeof(args));
- args.tp = ap->tp;
- args.mp = mp;
args.fsbno = ap->blkno;
args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
@@ -3571,21 +3700,7 @@ xfs_bmap_btalloc(
args.total = ap->total;
args.minlen = ap->minlen;
}
- /* apply extent size hints if obtained earlier */
- if (align) {
- args.prod = align;
- div_u64_rem(ap->offset, args.prod, &args.mod);
- if (args.mod)
- args.mod = args.prod - args.mod;
- } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
- args.prod = 1;
- args.mod = 0;
- } else {
- args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
- div_u64_rem(ap->offset, args.prod, &args.mod);
- if (args.mod)
- args.mod = args.prod - args.mod;
- }
+
/*
* If we are not low on available data blocks, and the underlying
* logical volume manager is a stripe, and the file offset is zero then
@@ -3687,37 +3802,10 @@ xfs_bmap_btalloc(
return error;
ap->tp->t_flags |= XFS_TRANS_LOWMODE;
}
+
if (args.fsbno != NULLFSBLOCK) {
- /*
- * check the allocation happened at the same or higher AG than
- * the first block that was allocated.
- */
- ASSERT(ap->tp->t_firstblock == NULLFSBLOCK ||
- XFS_FSB_TO_AGNO(mp, ap->tp->t_firstblock) <=
- XFS_FSB_TO_AGNO(mp, args.fsbno));
-
- ap->blkno = args.fsbno;
- if (ap->tp->t_firstblock == NULLFSBLOCK)
- ap->tp->t_firstblock = args.fsbno;
- ASSERT(nullfb || fb_agno <= args.agno);
- ap->length = args.len;
- /*
- * If the extent size hint is active, we tried to round the
- * caller's allocation request offset down to extsz and the
- * length up to another extsz boundary. If we found a free
- * extent we mapped it in starting at this new offset. If the
- * newly mapped space isn't long enough to cover any of the
- * range of offsets that was originally requested, move the
- * mapping up so that we can fill as much of the caller's
- * original request as possible. Free space is apparently
- * very fragmented so we're unlikely to be able to satisfy the
- * hints anyway.
- */
- if (ap->length <= orig_length)
- ap->offset = orig_offset;
- else if (ap->offset + ap->length < orig_offset + orig_length)
- ap->offset = orig_offset + orig_length - ap->length;
- xfs_bmap_btalloc_accounting(ap, &args);
+ xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
+ orig_length);
} else {
ap->blkno = NULLFSBLOCK;
ap->length = 0;
@@ -4001,8 +4089,7 @@ xfs_bmapi_reserve_delalloc(
* blocks. This number gets adjusted later. We return if we haven't
* allocated blocks already inside this loop.
*/
- error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
- XFS_QMOPT_RES_REGBLKS);
+ error = xfs_quota_reserve_blkres(ip, alen);
if (error)
return error;
@@ -4048,8 +4135,7 @@ out_unreserve_blocks:
xfs_mod_fdblocks(mp, alen, false);
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
- xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0,
- XFS_QMOPT_RES_REGBLKS);
+ xfs_quota_unreserve_blkres(ip, alen);
return error;
}
@@ -4083,6 +4169,10 @@ xfs_bmap_alloc_userdata(
return xfs_bmap_rtalloc(bma);
}
+ if (unlikely(XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ return xfs_bmap_exact_minlen_extent_alloc(bma);
+
return xfs_bmap_btalloc(bma);
}
@@ -4119,10 +4209,15 @@ xfs_bmapi_allocate(
else
bma->minlen = 1;
- if (bma->flags & XFS_BMAPI_METADATA)
- error = xfs_bmap_btalloc(bma);
- else
+ if (bma->flags & XFS_BMAPI_METADATA) {
+ if (unlikely(XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ error = xfs_bmap_exact_minlen_extent_alloc(bma);
+ else
+ error = xfs_bmap_btalloc(bma);
+ } else {
error = xfs_bmap_alloc_userdata(bma);
+ }
if (error || bma->blkno == NULLFSBLOCK)
return error;
@@ -4527,6 +4622,12 @@ xfs_bmapi_convert_delalloc(
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ error = xfs_iext_count_may_overflow(ip, whichfork,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto out_trans_cancel;
+
xfs_trans_ijoin(tp, ip, 0);
if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
@@ -4826,9 +4927,8 @@ xfs_bmap_del_extent_delay(
* sb counters as we might have to borrow some blocks for the
* indirect block accounting.
*/
- error = xfs_trans_reserve_quota_nblks(NULL, ip,
- -((long)del->br_blockcount), 0,
- isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ ASSERT(!isrt);
+ error = xfs_quota_unreserve_blkres(ip, del->br_blockcount);
if (error)
return error;
ip->i_delayed_blks -= del->br_blockcount;
@@ -5145,6 +5245,27 @@ xfs_bmap_del_extent_real(
/*
* Deleting the middle of the extent.
*/
+
+ /*
+ * For directories, -ENOSPC is returned since a directory entry
+ * remove operation must not fail due to low extent count
+ * availability. -ENOSPC will be handled by higher layers of XFS
+ * by letting the corresponding empty Data/Free blocks to linger
+ * until a future remove operation. Dabtree blocks would be
+ * swapped with the last block in the leaf space and then the
+ * new last block will be unmapped.
+ *
+ * The above logic also applies to the source directory entry of
+ * a rename operation.
+ */
+ error = xfs_iext_count_may_overflow(ip, whichfork, 1);
+ if (error) {
+ ASSERT(S_ISDIR(VFS_I(ip)->i_mode) &&
+ whichfork == XFS_DATA_FORK);
+ error = -ENOSPC;
+ goto done;
+ }
+
old = got;
got.br_blockcount = del->br_startoff - got.br_startoff;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c4d7a9241dc3..5b6fcb9b44e2 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -353,20 +353,17 @@ xfs_btree_free_block(
*/
void
xfs_btree_del_cursor(
- xfs_btree_cur_t *cur, /* btree cursor */
- int error) /* del because of error */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int error) /* del because of error */
{
- int i; /* btree level */
+ int i; /* btree level */
/*
- * Clear the buffer pointers, and release the buffers.
- * If we're doing this in the face of an error, we
- * need to make sure to inspect all of the entries
- * in the bc_bufs array for buffers to be unlocked.
- * This is because some of the btree code works from
- * level n down to 0, and if we get an error along
- * the way we won't have initialized all the entries
- * down to 0.
+ * Clear the buffer pointers and release the buffers. If we're doing
+ * this because of an error, inspect all of the entries in the bc_bufs
+ * array for buffers to be unlocked. This is because some of the btree
+ * code works from level n down to 0, and if we get an error along the
+ * way we won't have initialized all the entries down to 0.
*/
for (i = 0; i < cur->bc_nlevels; i++) {
if (cur->bc_bufs[i])
@@ -374,17 +371,11 @@ xfs_btree_del_cursor(
else if (!error)
break;
}
- /*
- * Can't free a bmap cursor without having dealt with the
- * allocated indirect blocks' accounting.
- */
- ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
- cur->bc_ino.allocated == 0);
- /*
- * Free the cursor.
- */
+
+ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
+ XFS_FORCED_SHUTDOWN(cur->bc_mp));
if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
- kmem_free((void *)cur->bc_ops);
+ kmem_free(cur->bc_ops);
kmem_cache_free(xfs_btree_cur_zone, cur);
}
@@ -2814,7 +2805,7 @@ xfs_btree_split_worker(
struct xfs_btree_split_args *args = container_of(work,
struct xfs_btree_split_args, work);
unsigned long pflags;
- unsigned long new_pflags = PF_MEMALLOC_NOFS;
+ unsigned long new_pflags = 0;
/*
* we are in a transaction context here, but may also be doing work
@@ -2826,12 +2817,20 @@ xfs_btree_split_worker(
new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
current_set_flags_nested(&pflags, new_pflags);
+ xfs_trans_set_context(args->cur->bc_tp);
args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
args->key, args->curp, args->stat);
- complete(args->done);
+ xfs_trans_clear_context(args->cur->bc_tp);
current_restore_flags_nested(&pflags, new_pflags);
+
+ /*
+ * Do not access args after complete() has run here. We don't own args
+ * and the owner may run and free args before we return here.
+ */
+ complete(args->done);
+
}
/*
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index e55378640b05..d03e6098ded9 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -47,8 +47,6 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
-extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp,
- xfs_ino_t inum);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 2463b5d73447..8c4f76bba88b 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -1018,7 +1018,7 @@ xfs_dir2_sf_removename(
/*
* Check whether the sf dir replace operation need more blocks.
*/
-bool
+static bool
xfs_dir2_sf_replace_needblock(
struct xfs_inode *dp,
xfs_ino_t inum)
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 53b305dea381..6ca9084b6934 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -56,7 +56,9 @@
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
#define XFS_ERRTAG_IUNLINK_FALLBACK 34
#define XFS_ERRTAG_BUF_IOERROR 35
-#define XFS_ERRTAG_MAX 36
+#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36
+#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37
+#define XFS_ERRTAG_MAX 38
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -97,5 +99,7 @@
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
+#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 2a2e3cfd94f0..6fad140d4c8e 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -250,6 +250,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
+#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
/*
* Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 7575de5cecb1..e080d7e07643 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -23,6 +23,8 @@
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr_leaf.h"
+#include "xfs_types.h"
+#include "xfs_errortag.h"
kmem_zone_t *xfs_ifork_zone;
@@ -728,3 +730,28 @@ xfs_ifork_verify_local_attr(
return 0;
}
+
+int
+xfs_iext_count_may_overflow(
+ struct xfs_inode *ip,
+ int whichfork,
+ int nr_to_add)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ uint64_t max_exts;
+ uint64_t nr_exts;
+
+ if (whichfork == XFS_COW_FORK)
+ return 0;
+
+ max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM;
+
+ if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ max_exts = 10;
+
+ nr_exts = ifp->if_nextents + nr_to_add;
+ if (nr_exts < ifp->if_nextents || nr_exts > max_exts)
+ return -EFBIG;
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index a4953e95c4f3..9e2137cd7372 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -35,6 +35,67 @@ struct xfs_ifork {
#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
/*
+ * Worst-case increase in the fork extent count when we're adding a single
+ * extent to a fork and there's no possibility of splitting an existing mapping.
+ */
+#define XFS_IEXT_ADD_NOSPLIT_CNT (1)
+
+/*
+ * Punching out an extent from the middle of an existing extent can cause the
+ * extent count to increase by 1.
+ * i.e. | Old extent | Hole | Old extent |
+ */
+#define XFS_IEXT_PUNCH_HOLE_CNT (1)
+
+/*
+ * Directory entry addition can cause the following,
+ * 1. Data block can be added/removed.
+ * A new extent can cause extent count to increase by 1.
+ * 2. Free disk block can be added/removed.
+ * Same behaviour as described above for Data block.
+ * 3. Dabtree blocks.
+ * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new
+ * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH.
+ */
+#define XFS_IEXT_DIR_MANIP_CNT(mp) \
+ ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount)
+
+/*
+ * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
+ * be added. One extra extent for dabtree in case a local attr is
+ * large enough to cause a double split. It can also cause extent
+ * count to increase proportional to the size of a remote xattr's
+ * value.
+ */
+#define XFS_IEXT_ATTR_MANIP_CNT(rmt_blks) \
+ (XFS_DA_NODE_MAXDEPTH + max(1, rmt_blks))
+
+/*
+ * A write to a sub-interval of an existing unwritten extent causes the original
+ * extent to be split into 3 extents
+ * i.e. | Unwritten | Real | Unwritten |
+ * Hence extent count can increase by 2.
+ */
+#define XFS_IEXT_WRITE_UNWRITTEN_CNT (2)
+
+
+/*
+ * Moving an extent to data fork can cause a sub-interval of an existing extent
+ * to be unmapped. This will increase extent count by 1. Mapping in the new
+ * extent can increase the extent count by 1 again i.e.
+ * | Old extent | New extent | Old extent |
+ * Hence number of extents increases by 2.
+ */
+#define XFS_IEXT_REFLINK_END_COW_CNT (2)
+
+/*
+ * Removing an initial range of source/donor file's extent and adding a new
+ * extent (from donor/source file) in its place will cause extent count to
+ * increase by 1.
+ */
+#define XFS_IEXT_SWAP_RMAP_CNT (1)
+
+/*
* Fork handling.
*/
@@ -172,5 +233,7 @@ extern void xfs_ifork_init_cow(struct xfs_inode *ip);
int xfs_ifork_verify_local_data(struct xfs_inode *ip);
int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
+int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
+ int nr_to_add);
#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index bbda117e5d85..60e6d255e5e2 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1138,6 +1138,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK;
if (xfs_sb_version_hasbigtime(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
+ if (xfs_sb_version_hasinobtcounts(sbp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT;
if (xfs_sb_version_hassector(sbp))
geo->logsectsize = sbp->sb_logsectsize;
else
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 8ea6d4aa3f55..53456f3de881 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -888,7 +888,7 @@ xchk_stop_reaping(
struct xfs_scrub *sc)
{
sc->flags |= XCHK_REAPING_DISABLED;
- xfs_stop_block_reaping(sc->mp);
+ xfs_blockgc_stop(sc->mp);
}
/* Restart background reaping of resources. */
@@ -896,6 +896,6 @@ void
xchk_start_reaping(
struct xfs_scrub *sc)
{
- xfs_start_block_reaping(sc->mp);
+ xfs_blockgc_start(sc->mp);
sc->flags &= ~XCHK_REAPING_DISABLED;
}
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 779cb73b3d00..d02bef24b32b 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -238,7 +238,8 @@ xfs_acl_set_mode(
}
int
-xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
{
umode_t mode;
bool set_mode = false;
@@ -252,7 +253,7 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return error;
if (type == ACL_TYPE_ACCESS) {
- error = posix_acl_update_mode(inode, &mode, &acl);
+ error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl);
if (error)
return error;
set_mode = true;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index c042c0868016..7bdb3a4ed798 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -11,7 +11,8 @@ struct posix_acl;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
-extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type);
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
void xfs_forget_acl(struct inode *inode, const char *name);
#else
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4304c6416fbb..b4186d666157 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -62,7 +62,7 @@ xfs_setfilesize_trans_alloc(
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ xfs_trans_clear_context(tp);
return 0;
}
@@ -125,7 +125,7 @@ xfs_setfilesize_ioend(
* thus we need to mark ourselves as being in a transaction manually.
* Similarly for freeze protection.
*/
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ xfs_trans_set_context(tp);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
/* we abort the update if there was an IO error */
@@ -568,6 +568,12 @@ xfs_vm_writepage(
{
struct xfs_writepage_ctx wpc = { };
+ if (WARN_ON_ONCE(current->journal_info)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+
return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
}
@@ -578,6 +584,13 @@ xfs_vm_writepages(
{
struct xfs_writepage_ctx wpc = { };
+ /*
+ * Writing back data in a transaction context can result in recursive
+ * transactions. This is bad, so issue a warning and get out of here.
+ */
+ if (WARN_ON_ONCE(current->journal_info))
+ return 0;
+
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index e2148f2d5d6b..17f36db2f792 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -6,7 +6,7 @@
static inline unsigned int bio_max_vecs(unsigned int count)
{
- return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES);
+ return bio_max_segs(howmany(count, PAGE_SIZE));
}
int
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 93e4d8ae6e92..2344757ede63 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -471,6 +471,7 @@ xfs_bui_item_recover(
xfs_exntst_t state;
unsigned int bui_type;
int whichfork;
+ int iext_delta;
int error = 0;
if (!xfs_bui_validate(mp, buip)) {
@@ -508,6 +509,15 @@ xfs_bui_item_recover(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ if (bui_type == XFS_BMAP_MAP)
+ iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
+ else
+ iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
+
+ error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+ if (error)
+ goto err_cancel;
+
count = bmap->me_len;
error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
whichfork, bmap->me_startoff, bmap->me_startblock,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 7371a7f7c652..e7d68318e6a5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -727,11 +727,9 @@ xfs_alloc_file_space(
xfs_fileoff_t startoffset_fsb;
xfs_fileoff_t endoffset_fsb;
int nimaps;
- int quota_flag;
int rt;
xfs_trans_t *tp;
xfs_bmbt_irec_t imaps[1], *imapp;
- uint qblocks, resblks, resrtextents;
int error;
trace_xfs_alloc_file_space(ip);
@@ -761,6 +759,7 @@ xfs_alloc_file_space(
*/
while (allocatesize_fsb && !error) {
xfs_fileoff_t s, e;
+ unsigned int dblocks, rblocks, resblks;
/*
* Determine space reservations for data/realtime.
@@ -790,45 +789,31 @@ xfs_alloc_file_space(
*/
resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
if (unlikely(rt)) {
- resrtextents = qblocks = resblks;
- resrtextents /= mp->m_sb.sb_rextsize;
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- quota_flag = XFS_QMOPT_RES_RTBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resblks;
} else {
- resrtextents = 0;
- resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
- quota_flag = XFS_QMOPT_RES_REGBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+ rblocks = 0;
}
/*
* Allocate and setup the transaction.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
- resrtextents, 0, &tp);
-
- /*
- * Check for running out of space
- */
- if (error) {
- /*
- * Free the transaction structure.
- */
- ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- break;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
- 0, quota_flag);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ dblocks, rblocks, false, &tp);
if (error)
- goto error1;
+ break;
- xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto error;
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
allocatesize_fsb, alloc_type, 0, imapp,
&nimaps);
if (error)
- goto error0;
+ goto error;
/*
* Complete the transaction
@@ -851,10 +836,7 @@ xfs_alloc_file_space(
return error;
-error0: /* unlock inode, unreserve quota blocks, cancel trans */
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
-
-error1: /* Just cancel transaction */
+error:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -872,20 +854,16 @@ xfs_unmap_extent(
uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
int error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
- if (error) {
- ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
+ false, &tp);
+ if (error)
return error;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
- ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
-
error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done);
if (error)
goto out_trans_cancel;
@@ -1163,6 +1141,11 @@ xfs_insert_file_space(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error)
+ goto out_trans_cancel;
+
/*
* The extent shifting code works on extent granularity. So, if stop_fsb
* is not the starting block of extent, we need to split the extent at
@@ -1384,6 +1367,22 @@ xfs_swap_extent_rmap(
irec.br_blockcount);
trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
+ if (xfs_bmap_is_real_extent(&uirec)) {
+ error = xfs_iext_count_may_overflow(ip,
+ XFS_DATA_FORK,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error)
+ goto out;
+ }
+
+ if (xfs_bmap_is_real_extent(&irec)) {
+ error = xfs_iext_count_may_overflow(tip,
+ XFS_DATA_FORK,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error)
+ goto out;
+ }
+
/* Remove the mapping from the donor file. */
xfs_bmap_unmap_extent(tp, tip, &uirec);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f8400bbd6473..37a1d12762d8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -43,7 +43,7 @@ static kmem_zone_t *xfs_buf_zone;
* pag_buf_lock
* lru_lock
*
- * xfs_buftarg_wait_rele
+ * xfs_buftarg_drain_rele
* lru_lock
* b_lock (trylock due to inversion)
*
@@ -88,7 +88,7 @@ xfs_buf_vmap_len(
* because the corresponding decrement is deferred to buffer release. Buffers
* can undergo I/O multiple times in a hold-release cycle and per buffer I/O
* tracking adds unnecessary overhead. This is used for sychronization purposes
- * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
+ * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
* in-flight buffers.
*
* Buffers that are never released (e.g., superblock, iclog buffers) must set
@@ -1480,7 +1480,7 @@ xfs_buf_ioapply_map(
int op)
{
int page_index;
- int total_nr_pages = bp->b_page_count;
+ unsigned int total_nr_pages = bp->b_page_count;
int nr_pages;
struct bio *bio;
sector_t sector = bp->b_maps[map].bm_bn;
@@ -1505,7 +1505,7 @@ xfs_buf_ioapply_map(
next_chunk:
atomic_inc(&bp->b_io_remaining);
- nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
+ nr_pages = bio_max_segs(total_nr_pages);
bio = bio_alloc(GFP_NOIO, nr_pages);
bio_set_dev(bio, bp->b_target->bt_bdev);
@@ -1786,7 +1786,7 @@ __xfs_buf_mark_corrupt(
* while freeing all the buffers only held by the LRU.
*/
static enum lru_status
-xfs_buftarg_wait_rele(
+xfs_buftarg_drain_rele(
struct list_head *item,
struct list_lru_one *lru,
spinlock_t *lru_lock,
@@ -1798,7 +1798,7 @@ xfs_buftarg_wait_rele(
if (atomic_read(&bp->b_hold) > 1) {
/* need to wait, so skip it this pass */
- trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+ trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
if (!spin_trylock(&bp->b_lock))
@@ -1815,14 +1815,13 @@ xfs_buftarg_wait_rele(
return LRU_REMOVED;
}
+/*
+ * Wait for outstanding I/O on the buftarg to complete.
+ */
void
-xfs_wait_buftarg(
+xfs_buftarg_wait(
struct xfs_buftarg *btp)
{
- LIST_HEAD(dispose);
- int loop = 0;
- bool write_fail = false;
-
/*
* First wait on the buftarg I/O count for all in-flight buffers to be
* released. This is critical as new buffers do not make the LRU until
@@ -1838,10 +1837,21 @@ xfs_wait_buftarg(
while (percpu_counter_sum(&btp->bt_io_count))
delay(100);
flush_workqueue(btp->bt_mount->m_buf_workqueue);
+}
+
+void
+xfs_buftarg_drain(
+ struct xfs_buftarg *btp)
+{
+ LIST_HEAD(dispose);
+ int loop = 0;
+ bool write_fail = false;
+
+ xfs_buftarg_wait(btp);
/* loop until there is nothing left on the lru list. */
while (list_lru_count(&btp->bt_lru)) {
- list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+ list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
&dispose, LONG_MAX);
while (!list_empty(&dispose)) {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 5d91a31298a4..459ca34f26f5 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -152,7 +152,7 @@ struct xfs_buf {
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
struct xfs_mount *b_mount;
- xfs_buftarg_t *b_target; /* buffer target (device) */
+ struct xfs_buftarg *b_target; /* buffer target (device) */
void *b_addr; /* virtual address of buffer */
struct work_struct b_ioend_work;
struct completion b_iowait; /* queue for I/O waiters */
@@ -344,11 +344,12 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
/*
* Handling of buftargs.
*/
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
- struct block_device *, struct dax_device *);
+extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *,
+ struct block_device *, struct dax_device *);
extern void xfs_free_buftarg(struct xfs_buftarg *);
-extern void xfs_wait_buftarg(xfs_buftarg_t *);
-extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
+extern void xfs_buftarg_wait(struct xfs_buftarg *);
+extern void xfs_buftarg_drain(struct xfs_buftarg *);
+extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1d95ed387d66..bd8379b98374 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -314,8 +314,14 @@ xfs_dquot_disk_alloc(
return -ESRCH;
}
- /* Create the block mapping. */
xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
+
+ error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ return error;
+
+ /* Create the block mapping. */
error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
&nmaps);
@@ -500,6 +506,42 @@ xfs_dquot_alloc(
return dqp;
}
+/* Check the ondisk dquot's id and type match what the incore dquot expects. */
+static bool
+xfs_dquot_check_type(
+ struct xfs_dquot *dqp,
+ struct xfs_disk_dquot *ddqp)
+{
+ uint8_t ddqp_type;
+ uint8_t dqp_type;
+
+ ddqp_type = ddqp->d_type & XFS_DQTYPE_REC_MASK;
+ dqp_type = xfs_dquot_type(dqp);
+
+ if (be32_to_cpu(ddqp->d_id) != dqp->q_id)
+ return false;
+
+ /*
+ * V5 filesystems always expect an exact type match. V4 filesystems
+ * expect an exact match for user dquots and for non-root group and
+ * project dquots.
+ */
+ if (xfs_sb_version_hascrc(&dqp->q_mount->m_sb) ||
+ dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0)
+ return ddqp_type == dqp_type;
+
+ /*
+ * V4 filesystems support either group or project quotas, but not both
+ * at the same time. The non-user quota file can be switched between
+ * group and project quota uses depending on the mount options, which
+ * means that we can encounter the other type when we try to load quota
+ * defaults. Quotacheck will soon reset the the entire quota file
+ * (including the root dquot) anyway, but don't log scary corruption
+ * reports to dmesg.
+ */
+ return ddqp_type == XFS_DQTYPE_GROUP || ddqp_type == XFS_DQTYPE_PROJ;
+}
+
/* Copy the in-core quota fields in from the on-disk buffer. */
STATIC int
xfs_dquot_from_disk(
@@ -512,8 +554,7 @@ xfs_dquot_from_disk(
* Ensure that we got the type and ID we were looking for.
* Everything else was checked by the dquot buffer verifier.
*/
- if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) ||
- be32_to_cpu(ddqp->d_id) != dqp->q_id) {
+ if (!xfs_dquot_check_type(dqp, ddqp)) {
xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
"Metadata corruption detected at %pS, quota %u",
__this_address, dqp->q_id);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 7f6e20899473..185b4915b7bf 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -54,6 +54,8 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_FORCE_SUMMARY_RECALC,
XFS_RANDOM_IUNLINK_FALLBACK,
XFS_RANDOM_BUF_IOERROR,
+ XFS_RANDOM_REDUCE_MAX_IEXTENTS,
+ XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
};
struct xfs_errortag_attr {
@@ -164,6 +166,8 @@ XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
+XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
+XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -202,6 +206,8 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(bad_summary),
XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
+ XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
+ XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
NULL,
};
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 3991e59cfd18..ef17c1f6db32 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -344,7 +344,6 @@ xfs_extent_busy_trim(
ASSERT(*len > 0);
spin_lock(&args->pag->pagb_lock);
-restart:
fbno = *bno;
flen = *len;
rbp = args->pag->pagb_tree.rb_node;
@@ -363,19 +362,6 @@ restart:
continue;
}
- /*
- * If this is a metadata allocation, try to reuse the busy
- * extent instead of trimming the allocation.
- */
- if (!(args->datatype & XFS_ALLOC_USERDATA) &&
- !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
- if (!xfs_extent_busy_update_extent(args->mp, args->pag,
- busyp, fbno, flen,
- false))
- goto restart;
- continue;
- }
-
if (bbno <= fbno) {
/* start overlap */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5b0f93f73837..a007ca0711d9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -29,6 +29,7 @@
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
+#include <linux/mount.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -118,6 +119,54 @@ xfs_dir_fsync(
return xfs_log_force_inode(ip);
}
+static xfs_lsn_t
+xfs_fsync_lsn(
+ struct xfs_inode *ip,
+ bool datasync)
+{
+ if (!xfs_ipincount(ip))
+ return 0;
+ if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ return 0;
+ return ip->i_itemp->ili_last_lsn;
+}
+
+/*
+ * All metadata updates are logged, which means that we just have to flush the
+ * log up to the latest LSN that touched the inode.
+ *
+ * If we have concurrent fsync/fdatasync() calls, we need them to all block on
+ * the log force before we clear the ili_fsync_fields field. This ensures that
+ * we don't get a racing sync operation that does not wait for the metadata to
+ * hit the journal before returning. If we race with clearing ili_fsync_fields,
+ * then all that will happen is the log force will do nothing as the lsn will
+ * already be on disk. We can't race with setting ili_fsync_fields because that
+ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
+ * shared until after the ili_fsync_fields is cleared.
+ */
+static int
+xfs_fsync_flush_log(
+ struct xfs_inode *ip,
+ bool datasync,
+ int *log_flushed)
+{
+ int error = 0;
+ xfs_lsn_t lsn;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ lsn = xfs_fsync_lsn(ip, datasync);
+ if (lsn) {
+ error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC,
+ log_flushed);
+
+ spin_lock(&ip->i_itemp->ili_lock);
+ ip->i_itemp->ili_fsync_fields = 0;
+ spin_unlock(&ip->i_itemp->ili_lock);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
STATIC int
xfs_file_fsync(
struct file *file,
@@ -125,13 +174,10 @@ xfs_file_fsync(
loff_t end,
int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
int error = 0;
int log_flushed = 0;
- xfs_lsn_t lsn = 0;
trace_xfs_file_fsync(ip);
@@ -156,32 +202,13 @@ xfs_file_fsync(
xfs_blkdev_issue_flush(mp->m_ddev_targp);
/*
- * All metadata updates are logged, which means that we just have to
- * flush the log up to the latest LSN that touched the inode. If we have
- * concurrent fsync/fdatasync() calls, we need them to all block on the
- * log force before we clear the ili_fsync_fields field. This ensures
- * that we don't get a racing sync operation that does not wait for the
- * metadata to hit the journal before returning. If we race with
- * clearing the ili_fsync_fields, then all that will happen is the log
- * force will do nothing as the lsn will already be on disk. We can't
- * race with setting ili_fsync_fields because that is done under
- * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
- * until after the ili_fsync_fields is cleared.
+ * Any inode that has dirty modifications in the log is pinned. The
+ * racy check here for a pinned inode while not catch modifications
+ * that happen concurrently to the fsync call, but fsync semantics
+ * only require to sync previously completed I/O.
*/
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip)) {
- if (!datasync ||
- (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- lsn = iip->ili_last_lsn;
- }
-
- if (lsn) {
- error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
- spin_lock(&iip->ili_lock);
- iip->ili_fsync_fields = 0;
- spin_unlock(&iip->ili_lock);
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
/*
* If we only have a single device, and the log force about was
@@ -197,30 +224,42 @@ xfs_file_fsync(
return error;
}
+static int
+xfs_ilock_iocb(
+ struct kiocb *iocb,
+ unsigned int lock_mode)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, lock_mode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, lock_mode);
+ }
+
+ return 0;
+}
+
STATIC ssize_t
-xfs_file_dio_aio_read(
+xfs_file_dio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
- size_t count = iov_iter_count(to);
ssize_t ret;
- trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
+ trace_xfs_file_direct_read(iocb, to);
- if (!count)
+ if (!iov_iter_count(to))
return 0; /* skip atime */
file_accessed(iocb->ki_filp);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -232,21 +271,16 @@ xfs_file_dax_read(
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
- size_t count = iov_iter_count(to);
ssize_t ret = 0;
- trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+ trace_xfs_file_dax_read(iocb, to);
- if (!count)
+ if (!iov_iter_count(to))
return 0; /* skip atime */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
-
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -255,21 +289,18 @@ xfs_file_dax_read(
}
STATIC ssize_t
-xfs_file_buffered_aio_read(
+xfs_file_buffered_read(
struct kiocb *iocb,
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
ssize_t ret;
- trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+ trace_xfs_file_buffered_read(iocb, to);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
ret = generic_file_read_iter(iocb, to);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -293,9 +324,9 @@ xfs_file_read_iter(
if (IS_DAX(inode))
ret = xfs_file_dax_read(iocb, to);
else if (iocb->ki_flags & IOCB_DIRECT)
- ret = xfs_file_dio_aio_read(iocb, to);
+ ret = xfs_file_dio_read(iocb, to);
else
- ret = xfs_file_buffered_aio_read(iocb, to);
+ ret = xfs_file_buffered_read(iocb, to);
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
@@ -310,7 +341,7 @@ xfs_file_read_iter(
* if called for a direct write beyond i_size.
*/
STATIC ssize_t
-xfs_file_aio_write_checks(
+xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
int *iolock)
@@ -328,7 +359,14 @@ restart:
if (error <= 0)
return error;
- error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ error = break_layout(inode, false);
+ if (error == -EWOULDBLOCK)
+ error = -EAGAIN;
+ } else {
+ error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ }
+
if (error)
return error;
@@ -339,7 +377,11 @@ restart:
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, *iolock);
+ error = xfs_ilock_iocb(iocb, *iolock);
+ if (error) {
+ *iolock = 0;
+ return error;
+ }
goto restart;
}
/*
@@ -361,6 +403,10 @@ restart:
isize = i_size_read(inode);
if (iocb->ki_pos > isize) {
spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_iunlock(ip, *iolock);
@@ -389,12 +435,6 @@ restart:
} else
spin_unlock(&ip->i_flags_lock);
- /*
- * Updating the timestamps will grab the ilock again from
- * xfs_fs_dirty_inode, so we have to call it after dropping the
- * lock above. Eventually we should look into a way to avoid
- * the pointless lock roundtrip.
- */
return file_modified(file);
}
@@ -480,122 +520,149 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
};
/*
- * xfs_file_dio_aio_write - handle direct IO writes
- *
- * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By separating it from the buffered write path we remove all the tricky to
- * follow locking changes and looping.
- *
- * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
- * until we're sure the bytes at the new EOF have been zeroed and/or the cached
- * pages are flushed out.
- *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- * allowing them to be done in parallel with reads and other direct IO writes.
- * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
- * direct IOs to the same block. In this case we need to serialise the
- * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer. To avoid the problem with aio, we also need to wait for
- * outstanding IOs to complete so that unwritten extent conversion is completed
- * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. inode_dio_wait()).
- *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * Handle block aligned direct I/O writes
*/
-STATIC ssize_t
-xfs_file_dio_aio_write(
+static noinline ssize_t
+xfs_file_dio_write_aligned(
+ struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0;
- int unaligned_io = 0;
- int iolock;
- size_t count = iov_iter_count(from);
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ int iolock = XFS_IOLOCK_SHARED;
+ ssize_t ret;
- /* DIO must be aligned to device logical sector size */
- if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
- return -EINVAL;
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
/*
- * Don't take the exclusive iolock here unless the I/O is unaligned to
- * the file system block size. We don't need to consider the EOF
- * extension case here because xfs_file_aio_write_checks() will relock
- * the inode as necessary for EOF zeroing cases and fill out the new
- * inode size as appropriate.
+ * We don't need to hold the IOLOCK exclusively across the IO, so demote
+ * the iolock back to shared if we had to take the exclusive lock in
+ * xfs_file_write_checks() for other reasons.
*/
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + count) & mp->m_blockmask)) {
- unaligned_io = 1;
-
- /*
- * We can't properly handle unaligned direct I/O to reflink
- * files yet, as we can't unshare a partial block.
- */
- if (xfs_is_cow_inode(ip)) {
- trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
- return -ENOTBLK;
- }
- iolock = XFS_IOLOCK_EXCL;
- } else {
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ &xfs_dio_write_ops, 0);
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
- if (iocb->ki_flags & IOCB_NOWAIT) {
- /* unaligned dio always waits, bail */
- if (unaligned_io)
- return -EAGAIN;
- if (!xfs_ilock_nowait(ip, iolock))
+/*
+ * Handle block unaligned direct I/O writes
+ *
+ * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
+ * them to be done in parallel with reads and other direct I/O writes. However,
+ * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
+ * to do sub-block zeroing and that requires serialisation against other direct
+ * I/O to the same block. In this case we need to serialise the submission of
+ * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
+ * In the case where sub-block zeroing is not required, we can do concurrent
+ * sub-block dios to the same block successfully.
+ *
+ * Optimistically submit the I/O using the shared lock first, but use the
+ * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
+ * if block allocation or partial block zeroing would be required. In that case
+ * we try again with the exclusive lock.
+ */
+static noinline ssize_t
+xfs_file_dio_write_unaligned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ size_t isize = i_size_read(VFS_I(ip));
+ size_t count = iov_iter_count(from);
+ int iolock = XFS_IOLOCK_SHARED;
+ unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
+ ssize_t ret;
+
+ /*
+ * Extending writes need exclusivity because of the sub-block zeroing
+ * that the DIO code always does for partial tail blocks beyond EOF, so
+ * don't even bother trying the fast path in this case.
+ */
+ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+retry_exclusive:
+ if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
+ iolock = XFS_IOLOCK_EXCL;
+ flags = IOMAP_DIO_FORCE_WAIT;
}
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
- goto out;
- count = iov_iter_count(from);
+ return ret;
/*
- * If we are doing unaligned IO, we can't allow any other overlapping IO
- * in-flight at the same time or we risk data corruption. Wait for all
- * other IO to drain before we submit. If the IO is aligned, demote the
- * iolock if we had to take the exclusive lock in
- * xfs_file_aio_write_checks() for other reasons.
+ * We can't properly handle unaligned direct I/O to reflink files yet,
+ * as we can't unshare a partial block.
*/
- if (unaligned_io) {
- inode_dio_wait(inode);
- } else if (iolock == XFS_IOLOCK_EXCL) {
- xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
- iolock = XFS_IOLOCK_SHARED;
+ if (xfs_is_cow_inode(ip)) {
+ trace_xfs_reflink_bounce_dio_write(iocb, from);
+ ret = -ENOTBLK;
+ goto out_unlock;
}
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
+
/*
- * If unaligned, this is the only IO in-flight. Wait on it before we
- * release the iolock to prevent subsequent overlapping IO.
+ * If we are doing exclusive unaligned I/O, this must be the only I/O
+ * in-flight. Otherwise we risk data corruption due to unwritten extent
+ * conversions from the AIO end_io handler. Wait for all other I/O to
+ * drain first.
*/
+ if (flags & IOMAP_DIO_FORCE_WAIT)
+ inode_dio_wait(VFS_I(ip));
+
+ trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_io);
-out:
- xfs_iunlock(ip, iolock);
+ &xfs_dio_write_ops, flags);
/*
- * No fallback to buffered IO after short writes for XFS, direct I/O
- * will either complete fully or return an error.
+ * Retry unaligned I/O with exclusive blocking semantics if the DIO
+ * layer rejected it for mapping or locking reasons. If we are doing
+ * nonblocking user I/O, propagate the error.
*/
- ASSERT(ret < 0 || ret == count);
+ if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+ ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
+ xfs_iunlock(ip, iolock);
+ goto retry_exclusive;
+ }
+
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
return ret;
}
+static ssize_t
+xfs_file_dio_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ size_t count = iov_iter_count(from);
+
+ /* direct I/O must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+ return -EINVAL;
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+ return xfs_file_dio_write_unaligned(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from);
+}
+
static noinline ssize_t
xfs_file_dax_write(
struct kiocb *iocb,
@@ -605,31 +672,26 @@ xfs_file_dax_write(
struct xfs_inode *ip = XFS_I(inode);
int iolock = XFS_IOLOCK_EXCL;
ssize_t ret, error = 0;
- size_t count;
loff_t pos;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, iolock))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
- }
-
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
goto out;
pos = iocb->ki_pos;
- count = iov_iter_count(from);
- trace_xfs_file_dax_write(ip, count, pos);
+ trace_xfs_file_dax_write(iocb, from);
ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
}
out:
- xfs_iunlock(ip, iolock);
+ if (iolock)
+ xfs_iunlock(ip, iolock);
if (error)
return error;
@@ -643,7 +705,7 @@ out:
}
STATIC ssize_t
-xfs_file_buffered_aio_write(
+xfs_file_buffered_write(
struct kiocb *iocb,
struct iov_iter *from)
{
@@ -652,7 +714,7 @@ xfs_file_buffered_aio_write(
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int enospc = 0;
+ bool cleared_space = false;
int iolock;
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -662,14 +724,14 @@ write_retry:
iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
- trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
&xfs_buffered_write_iomap_ops);
if (likely(ret >= 0))
@@ -682,27 +744,23 @@ write_retry:
* metadata space. This reduces the chances that the eofblocks scan
* waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
* also behaves as a filter to prevent too many eofblocks scans from
- * running at the same time.
+ * running at the same time. Use a synchronous scan to increase the
+ * effectiveness of the scan.
*/
- if (ret == -EDQUOT && !enospc) {
+ if (ret == -EDQUOT && !cleared_space) {
xfs_iunlock(ip, iolock);
- enospc = xfs_inode_free_quota_eofblocks(ip);
- if (enospc)
- goto write_retry;
- enospc = xfs_inode_free_quota_cowblocks(ip);
- if (enospc)
- goto write_retry;
- iolock = 0;
- } else if (ret == -ENOSPC && !enospc) {
+ xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC);
+ cleared_space = true;
+ goto write_retry;
+ } else if (ret == -ENOSPC && !cleared_space) {
struct xfs_eofblocks eofb = {0};
- enospc = 1;
+ cleared_space = true;
xfs_flush_inodes(ip->i_mount);
xfs_iunlock(ip, iolock);
eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
- xfs_icache_free_eofblocks(ip->i_mount, &eofb);
- xfs_icache_free_cowblocks(ip->i_mount, &eofb);
+ xfs_blockgc_free_space(ip->i_mount, &eofb);
goto write_retry;
}
@@ -749,12 +807,12 @@ xfs_file_write_iter(
* CoW. In all other directio scenarios we do not
* allow an operation to fall back to buffered mode.
*/
- ret = xfs_file_dio_aio_write(iocb, from);
+ ret = xfs_file_dio_write(iocb, from);
if (ret != -ENOTBLK)
return ret;
}
- return xfs_file_buffered_aio_write(iocb, from);
+ return xfs_file_buffered_write(iocb, from);
}
static void
@@ -994,7 +1052,8 @@ xfs_file_fallocate(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = xfs_vn_setattr_size(file_dentry(file), &iattr);
+ error = xfs_vn_setattr_size(file_mnt_user_ns(file),
+ file_dentry(file), &iattr);
if (error)
goto out_unlock;
}
@@ -1319,17 +1378,19 @@ xfs_filemap_pfn_mkwrite(
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
-static void
+static vm_fault_t
xfs_filemap_map_pages(
struct vm_fault *vmf,
pgoff_t start_pgoff,
pgoff_t end_pgoff)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
+ vm_fault_t ret;
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- filemap_map_pages(vmf, start_pgoff, end_pgoff);
+ ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ return ret;
}
static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 959ce91a3755..a2a407039227 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -25,17 +25,17 @@
*/
static int
xfs_growfs_data_private(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_growfs_data_t *in) /* growfs data input struct */
+ struct xfs_mount *mp, /* mount point for filesystem */
+ struct xfs_growfs_data *in) /* growfs data input struct */
{
struct xfs_buf *bp;
int error;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
- xfs_rfsblock_t nb, nb_mod;
- xfs_rfsblock_t new;
+ xfs_rfsblock_t nb, nb_div, nb_mod;
+ xfs_rfsblock_t delta;
xfs_agnumber_t oagcount;
- xfs_trans_t *tp;
+ struct xfs_trans *tp;
struct aghdr_init_data id = {};
nb = in->newblocks;
@@ -50,16 +50,16 @@ xfs_growfs_data_private(
return error;
xfs_buf_relse(bp);
- new = nb; /* use new as a temporary here */
- nb_mod = do_div(new, mp->m_sb.sb_agblocks);
- nagcount = new + (nb_mod != 0);
+ nb_div = nb;
+ nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
+ nagcount = nb_div + (nb_mod != 0);
if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
nagcount--;
nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
if (nb < mp->m_sb.sb_dblocks)
return -EINVAL;
}
- new = nb - mp->m_sb.sb_dblocks;
+ delta = nb - mp->m_sb.sb_dblocks;
oagcount = mp->m_sb.sb_agcount;
/* allocate the new per-ag structures */
@@ -89,7 +89,7 @@ xfs_growfs_data_private(
INIT_LIST_HEAD(&id.buffer_list);
for (id.agno = nagcount - 1;
id.agno >= oagcount;
- id.agno--, new -= id.agsize) {
+ id.agno--, delta -= id.agsize) {
if (id.agno == nagcount - 1)
id.agsize = nb -
@@ -110,8 +110,8 @@ xfs_growfs_data_private(
xfs_trans_agblocks_delta(tp, id.nfree);
/* If there are new blocks in the old last AG, extend it. */
- if (new) {
- error = xfs_ag_extend_space(mp, tp, &id, new);
+ if (delta) {
+ error = xfs_ag_extend_space(mp, tp, &id, delta);
if (error)
goto out_trans_cancel;
}
@@ -143,7 +143,7 @@ xfs_growfs_data_private(
* If we expanded the last AG, free the per-AG reservation
* so we can reinitialize it with the new size.
*/
- if (new) {
+ if (delta) {
struct xfs_perag *pag;
pag = xfs_perag_get(mp, id.agno);
@@ -170,8 +170,8 @@ out_trans_cancel:
static int
xfs_growfs_log_private(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_growfs_log_t *in) /* growfs log input struct */
+ struct xfs_mount *mp, /* mount point for filesystem */
+ struct xfs_growfs_log *in) /* growfs log input struct */
{
xfs_extlen_t nb;
@@ -268,7 +268,7 @@ out_error:
int
xfs_growfs_log(
xfs_mount_t *mp,
- xfs_growfs_log_t *in)
+ struct xfs_growfs_log *in)
{
int error;
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 92869f6ec8d3..2cffe51a31e8 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -6,8 +6,8 @@
#ifndef __XFS_FSOPS_H__
#define __XFS_FSOPS_H__
-extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in);
-extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in);
+extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
+extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
xfs_fsop_resblks_t *outval);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index fa55ab8b8d80..f62fa652c2fd 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -8,8 +8,8 @@
/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
* other XFS code uses these values. Times are measured in centisecs (i.e.
- * 100ths of a second) with the exception of eofb_timer and cowb_timer, which
- * are measured in seconds.
+ * 100ths of a second) with the exception of blockgc_timer, which is measured
+ * in seconds.
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
@@ -28,8 +28,7 @@ xfs_param_t xfs_params = {
.rotorstep = { 1, 1, 255 },
.inherit_nodfrg = { 0, 1, 1 },
.fstrm_timer = { 1, 30*100, 3600*100},
- .eofb_timer = { 1, 300, 3600*24},
- .cowb_timer = { 1, 1800, 3600*24},
+ .blockgc_timer = { 1, 300, 3600*24},
};
struct xfs_globals xfs_globals = {
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index deb99300d171..1d7720a0c068 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -916,69 +916,6 @@ xfs_inode_walk(
}
/*
- * Background scanning to trim post-EOF preallocated space. This is queued
- * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
- */
-void
-xfs_queue_eofblocks(
- struct xfs_mount *mp)
-{
- rcu_read_lock();
- if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
- queue_delayed_work(mp->m_eofblocks_workqueue,
- &mp->m_eofblocks_work,
- msecs_to_jiffies(xfs_eofb_secs * 1000));
- rcu_read_unlock();
-}
-
-void
-xfs_eofblocks_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_eofblocks_work);
-
- if (!sb_start_write_trylock(mp->m_super))
- return;
- xfs_icache_free_eofblocks(mp, NULL);
- sb_end_write(mp->m_super);
-
- xfs_queue_eofblocks(mp);
-}
-
-/*
- * Background scanning to trim preallocated CoW space. This is queued
- * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
- * (We'll just piggyback on the post-EOF prealloc space workqueue.)
- */
-void
-xfs_queue_cowblocks(
- struct xfs_mount *mp)
-{
- rcu_read_lock();
- if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
- queue_delayed_work(mp->m_eofblocks_workqueue,
- &mp->m_cowblocks_work,
- msecs_to_jiffies(xfs_cowb_secs * 1000));
- rcu_read_unlock();
-}
-
-void
-xfs_cowblocks_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_cowblocks_work);
-
- if (!sb_start_write_trylock(mp->m_super))
- return;
- xfs_icache_free_cowblocks(mp, NULL);
- sb_end_write(mp->m_super);
-
- xfs_queue_cowblocks(mp);
-}
-
-/*
* Grab the inode for reclaim exclusively.
*
* We have found this inode via a lookup under RCU, so the inode may have
@@ -1346,14 +1283,17 @@ xfs_reclaim_worker(
STATIC int
xfs_inode_free_eofblocks(
struct xfs_inode *ip,
- void *args)
+ void *args,
+ unsigned int *lockflags)
{
struct xfs_eofblocks *eofb = args;
bool wait;
- int ret;
wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
+ if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
+ return 0;
+
if (!xfs_can_free_eofblocks(ip, false)) {
/* inode could be preallocated or append-only */
trace_xfs_inode_free_eofblocks_invalid(ip);
@@ -1380,130 +1320,68 @@ xfs_inode_free_eofblocks(
return -EAGAIN;
return 0;
}
+ *lockflags |= XFS_IOLOCK_EXCL;
- ret = xfs_free_eofblocks(ip);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-
- return ret;
-}
-
-int
-xfs_icache_free_eofblocks(
- struct xfs_mount *mp,
- struct xfs_eofblocks *eofb)
-{
- return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb,
- XFS_ICI_EOFBLOCKS_TAG);
+ return xfs_free_eofblocks(ip);
}
/*
- * Run eofblocks scans on the quotas applicable to the inode. For inodes with
- * multiple quotas, we don't know exactly which quota caused an allocation
- * failure. We make a best effort by including each quota under low free space
- * conditions (less than 1% free space) in the scan.
+ * Background scanning to trim preallocated space. This is queued based on the
+ * 'speculative_prealloc_lifetime' tunable (5m by default).
*/
-static int
-__xfs_inode_free_quota_eofblocks(
- struct xfs_inode *ip,
- int (*execute)(struct xfs_mount *mp,
- struct xfs_eofblocks *eofb))
-{
- int scan = 0;
- struct xfs_eofblocks eofb = {0};
- struct xfs_dquot *dq;
-
- /*
- * Run a sync scan to increase effectiveness and use the union filter to
- * cover all applicable quotas in a single scan.
- */
- eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
-
- if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
- dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER);
- if (dq && xfs_dquot_lowsp(dq)) {
- eofb.eof_uid = VFS_I(ip)->i_uid;
- eofb.eof_flags |= XFS_EOF_FLAGS_UID;
- scan = 1;
- }
- }
-
- if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
- dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP);
- if (dq && xfs_dquot_lowsp(dq)) {
- eofb.eof_gid = VFS_I(ip)->i_gid;
- eofb.eof_flags |= XFS_EOF_FLAGS_GID;
- scan = 1;
- }
- }
-
- if (scan)
- execute(ip->i_mount, &eofb);
-
- return scan;
-}
-
-int
-xfs_inode_free_quota_eofblocks(
- struct xfs_inode *ip)
-{
- return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
-}
-
-static inline unsigned long
-xfs_iflag_for_tag(
- int tag)
+static inline void
+xfs_blockgc_queue(
+ struct xfs_perag *pag)
{
- switch (tag) {
- case XFS_ICI_EOFBLOCKS_TAG:
- return XFS_IEOFBLOCKS;
- case XFS_ICI_COWBLOCKS_TAG:
- return XFS_ICOWBLOCKS;
- default:
- ASSERT(0);
- return 0;
- }
+ rcu_read_lock();
+ if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
+ queue_delayed_work(pag->pag_mount->m_blockgc_workqueue,
+ &pag->pag_blockgc_work,
+ msecs_to_jiffies(xfs_blockgc_secs * 1000));
+ rcu_read_unlock();
}
static void
-__xfs_inode_set_blocks_tag(
- xfs_inode_t *ip,
- void (*execute)(struct xfs_mount *mp),
- void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
- int error, unsigned long caller_ip),
- int tag)
+xfs_blockgc_set_iflag(
+ struct xfs_inode *ip,
+ unsigned long iflag)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
- int tagged;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ int tagged;
+
+ ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
/*
* Don't bother locking the AG and looking up in the radix trees
* if we already know that we have the tag set.
*/
- if (ip->i_flags & xfs_iflag_for_tag(tag))
+ if (ip->i_flags & iflag)
return;
spin_lock(&ip->i_flags_lock);
- ip->i_flags |= xfs_iflag_for_tag(tag);
+ ip->i_flags |= iflag;
spin_unlock(&ip->i_flags_lock);
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
- tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
+ tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG);
radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_BLOCKGC_TAG);
if (!tagged) {
- /* propagate the eofblocks tag up into the perag radix tree */
+ /* propagate the blockgc tag up into the perag radix tree */
spin_lock(&ip->i_mount->m_perag_lock);
radix_tree_tag_set(&ip->i_mount->m_perag_tree,
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- tag);
+ XFS_ICI_BLOCKGC_TAG);
spin_unlock(&ip->i_mount->m_perag_lock);
/* kick off background trimming */
- execute(ip->i_mount);
+ xfs_blockgc_queue(pag);
- set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
+ trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1,
+ _RET_IP_);
}
spin_unlock(&pag->pag_ici_lock);
@@ -1515,38 +1393,43 @@ xfs_inode_set_eofblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_set_eofblocks_tag(ip);
- return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
- trace_xfs_perag_set_eofblocks,
- XFS_ICI_EOFBLOCKS_TAG);
+ return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
}
static void
-__xfs_inode_clear_blocks_tag(
- xfs_inode_t *ip,
- void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
- int error, unsigned long caller_ip),
- int tag)
+xfs_blockgc_clear_iflag(
+ struct xfs_inode *ip,
+ unsigned long iflag)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ bool clear_tag;
+
+ ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
spin_lock(&ip->i_flags_lock);
- ip->i_flags &= ~xfs_iflag_for_tag(tag);
+ ip->i_flags &= ~iflag;
+ clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
spin_unlock(&ip->i_flags_lock);
+ if (!clear_tag)
+ return;
+
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
- if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
- /* clear the eofblocks tag from the perag radix tree */
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_BLOCKGC_TAG);
+ if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) {
+ /* clear the blockgc tag from the perag radix tree */
spin_lock(&ip->i_mount->m_perag_lock);
radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- tag);
+ XFS_ICI_BLOCKGC_TAG);
spin_unlock(&ip->i_mount->m_perag_lock);
- clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
+ trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1,
+ _RET_IP_);
}
spin_unlock(&pag->pag_ici_lock);
@@ -1558,8 +1441,7 @@ xfs_inode_clear_eofblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_clear_eofblocks_tag(ip);
- return __xfs_inode_clear_blocks_tag(ip,
- trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
+ return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
}
/*
@@ -1609,20 +1491,42 @@ xfs_prep_free_cowblocks(
STATIC int
xfs_inode_free_cowblocks(
struct xfs_inode *ip,
- void *args)
+ void *args,
+ unsigned int *lockflags)
{
struct xfs_eofblocks *eofb = args;
+ bool wait;
int ret = 0;
+ wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
+
+ if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
+ return 0;
+
if (!xfs_prep_free_cowblocks(ip))
return 0;
if (!xfs_inode_matches_eofb(ip, eofb))
return 0;
- /* Free the CoW blocks */
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ /*
+ * If the caller is waiting, return -EAGAIN to keep the background
+ * scanner moving and revisit the inode in a subsequent pass.
+ */
+ if (!(*lockflags & XFS_IOLOCK_EXCL) &&
+ !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (wait)
+ return -EAGAIN;
+ return 0;
+ }
+ *lockflags |= XFS_IOLOCK_EXCL;
+
+ if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
+ if (wait)
+ return -EAGAIN;
+ return 0;
+ }
+ *lockflags |= XFS_MMAPLOCK_EXCL;
/*
* Check again, nobody else should be able to dirty blocks or change
@@ -1630,37 +1534,15 @@ xfs_inode_free_cowblocks(
*/
if (xfs_prep_free_cowblocks(ip))
ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
-
- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-
return ret;
}
-int
-xfs_icache_free_cowblocks(
- struct xfs_mount *mp,
- struct xfs_eofblocks *eofb)
-{
- return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb,
- XFS_ICI_COWBLOCKS_TAG);
-}
-
-int
-xfs_inode_free_quota_cowblocks(
- struct xfs_inode *ip)
-{
- return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
-}
-
void
xfs_inode_set_cowblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_set_cowblocks_tag(ip);
- return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
- trace_xfs_perag_set_cowblocks,
- XFS_ICI_COWBLOCKS_TAG);
+ return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
}
void
@@ -1668,24 +1550,158 @@ xfs_inode_clear_cowblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_clear_cowblocks_tag(ip);
- return __xfs_inode_clear_blocks_tag(ip,
- trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
+ return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
}
+#define for_each_perag_tag(mp, next_agno, pag, tag) \
+ for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
+ (pag) != NULL; \
+ (next_agno) = (pag)->pag_agno + 1, \
+ xfs_perag_put(pag), \
+ (pag) = xfs_perag_get_tag((mp), (next_agno), (tag)))
+
+
/* Disable post-EOF and CoW block auto-reclamation. */
void
-xfs_stop_block_reaping(
+xfs_blockgc_stop(
struct xfs_mount *mp)
{
- cancel_delayed_work_sync(&mp->m_eofblocks_work);
- cancel_delayed_work_sync(&mp->m_cowblocks_work);
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
}
/* Enable post-EOF and CoW block auto-reclamation. */
void
-xfs_start_block_reaping(
+xfs_blockgc_start(
struct xfs_mount *mp)
{
- xfs_queue_eofblocks(mp);
- xfs_queue_cowblocks(mp);
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ xfs_blockgc_queue(pag);
+}
+
+/* Scan one incore inode for block preallocations that we can remove. */
+static int
+xfs_blockgc_scan_inode(
+ struct xfs_inode *ip,
+ void *args)
+{
+ unsigned int lockflags = 0;
+ int error;
+
+ error = xfs_inode_free_eofblocks(ip, args, &lockflags);
+ if (error)
+ goto unlock;
+
+ error = xfs_inode_free_cowblocks(ip, args, &lockflags);
+unlock:
+ if (lockflags)
+ xfs_iunlock(ip, lockflags);
+ return error;
+}
+
+/* Background worker that trims preallocated space. */
+void
+xfs_blockgc_worker(
+ struct work_struct *work)
+{
+ struct xfs_perag *pag = container_of(to_delayed_work(work),
+ struct xfs_perag, pag_blockgc_work);
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
+
+ if (!sb_start_write_trylock(mp->m_super))
+ return;
+ error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL,
+ XFS_ICI_BLOCKGC_TAG);
+ if (error)
+ xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
+ pag->pag_agno, error);
+ sb_end_write(mp->m_super);
+ xfs_blockgc_queue(pag);
+}
+
+/*
+ * Try to free space in the filesystem by purging eofblocks and cowblocks.
+ */
+int
+xfs_blockgc_free_space(
+ struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb)
+{
+ trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_);
+
+ return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb,
+ XFS_ICI_BLOCKGC_TAG);
+}
+
+/*
+ * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which
+ * quota caused an allocation failure, so we make a best effort by including
+ * each quota under low free space conditions (less than 1% free space) in the
+ * scan.
+ *
+ * Callers must not hold any inode's ILOCK. If requesting a synchronous scan
+ * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or
+ * MMAPLOCK.
+ */
+int
+xfs_blockgc_free_dquots(
+ struct xfs_mount *mp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ unsigned int eof_flags)
+{
+ struct xfs_eofblocks eofb = {0};
+ bool do_work = false;
+
+ if (!udqp && !gdqp && !pdqp)
+ return 0;
+
+ /*
+ * Run a scan to free blocks using the union filter to cover all
+ * applicable quotas in a single scan.
+ */
+ eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags;
+
+ if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
+ eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
+ eofb.eof_flags |= XFS_EOF_FLAGS_UID;
+ do_work = true;
+ }
+
+ if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
+ eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
+ eofb.eof_flags |= XFS_EOF_FLAGS_GID;
+ do_work = true;
+ }
+
+ if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
+ eofb.eof_prid = pdqp->q_id;
+ eofb.eof_flags |= XFS_EOF_FLAGS_PRID;
+ do_work = true;
+ }
+
+ if (!do_work)
+ return 0;
+
+ return xfs_blockgc_free_space(mp, &eofb);
+}
+
+/* Run cow/eofblocks scans on the quotas attached to the inode. */
+int
+xfs_blockgc_free_quota(
+ struct xfs_inode *ip,
+ unsigned int eof_flags)
+{
+ return xfs_blockgc_free_dquots(ip->i_mount,
+ xfs_inode_dquot(ip, XFS_DQTYPE_USER),
+ xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
+ xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags);
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 3a4c8b382cd0..d1fddb152420 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -23,8 +23,8 @@ struct xfs_eofblocks {
#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
in xfs_inode_walk */
#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
-#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
-#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */
+/* Inode has speculative preallocations (posteof or cow) to clean. */
+#define XFS_ICI_BLOCKGC_TAG 1
/*
* Flags for xfs_iget()
@@ -54,19 +54,19 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
+ unsigned int eof_flags);
+int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags);
+int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb);
+
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
-int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
-int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
-void xfs_eofblocks_worker(struct work_struct *);
-void xfs_queue_eofblocks(struct xfs_mount *);
void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
-int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *);
-int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip);
-void xfs_cowblocks_worker(struct work_struct *);
-void xfs_queue_cowblocks(struct xfs_mount *);
+
+void xfs_blockgc_worker(struct work_struct *work);
int xfs_inode_walk(struct xfs_mount *mp, int iter_flags,
int (*execute)(struct xfs_inode *ip, void *args),
@@ -75,7 +75,7 @@ int xfs_inode_walk(struct xfs_mount *mp, int iter_flags,
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, bool *inuse);
-void xfs_stop_block_reaping(struct xfs_mount *mp);
-void xfs_start_block_reaping(struct xfs_mount *mp);
+void xfs_blockgc_stop(struct xfs_mount *mp);
+void xfs_blockgc_start(struct xfs_mount *mp);
#endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b7352bc4c815..46a861d55e48 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -766,6 +766,7 @@ xfs_inode_inherit_flags2(
*/
static int
xfs_init_new_inode(
+ struct user_namespace *mnt_userns,
struct xfs_trans *tp,
struct xfs_inode *pip,
xfs_ino_t ino,
@@ -775,6 +776,7 @@ xfs_init_new_inode(
prid_t prid,
struct xfs_inode **ipp)
{
+ struct inode *dir = pip ? VFS_I(pip) : NULL;
struct xfs_mount *mp = tp->t_mountp;
struct xfs_inode *ip;
unsigned int flags;
@@ -804,18 +806,17 @@ xfs_init_new_inode(
ASSERT(ip != NULL);
inode = VFS_I(ip);
- inode->i_mode = mode;
set_nlink(inode, nlink);
- inode->i_uid = current_fsuid();
inode->i_rdev = rdev;
ip->i_d.di_projid = prid;
- if (pip && XFS_INHERIT_GID(pip)) {
- inode->i_gid = VFS_I(pip)->i_gid;
- if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
+ if (dir && !(dir->i_mode & S_ISGID) &&
+ (mp->m_flags & XFS_MOUNT_GRPID)) {
+ inode->i_uid = fsuid_into_mnt(mnt_userns);
+ inode->i_gid = dir->i_gid;
+ inode->i_mode = mode;
} else {
- inode->i_gid = current_fsgid();
+ inode_init_owner(mnt_userns, inode, dir, mode);
}
/*
@@ -824,7 +825,8 @@ xfs_init_new_inode(
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
if (irix_sgid_inherit &&
- (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid))
+ (inode->i_mode & S_ISGID) &&
+ !in_group_p(i_gid_into_mnt(mnt_userns, inode)))
inode->i_mode &= ~S_ISGID;
ip->i_d.di_size = 0;
@@ -901,6 +903,7 @@ xfs_init_new_inode(
*/
int
xfs_dir_ialloc(
+ struct user_namespace *mnt_userns,
struct xfs_trans **tpp,
struct xfs_inode *dp,
umode_t mode,
@@ -933,7 +936,8 @@ xfs_dir_ialloc(
return error;
ASSERT(ino != NULLFSINO);
- return xfs_init_new_inode(*tpp, dp, ino, mode, nlink, rdev, prid, ipp);
+ return xfs_init_new_inode(mnt_userns, *tpp, dp, ino, mode, nlink, rdev,
+ prid, ipp);
}
/*
@@ -973,6 +977,7 @@ xfs_bumplink(
int
xfs_create(
+ struct user_namespace *mnt_userns,
xfs_inode_t *dp,
struct xfs_name *name,
umode_t mode,
@@ -1022,23 +1027,22 @@ xfs_create(
* the case we'll drop the one we have and get a more
* appropriate transaction later.
*/
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
if (error == -ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(mp);
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
+ resblks, &tp);
}
if (error)
- goto out_release_inode;
+ goto out_release_dquots;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
- /*
- * Reserve disk quota and the inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
+ error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
goto out_trans_cancel;
@@ -1047,7 +1051,8 @@ xfs_create(
* entry pointing to them, but a directory also the "." entry
* pointing to itself.
*/
- error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
+ error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, is_dir ? 2 : 1, rdev,
+ prid, &ip);
if (error)
goto out_trans_cancel;
@@ -1116,7 +1121,7 @@ xfs_create(
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+ out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
@@ -1128,6 +1133,7 @@ xfs_create(
int
xfs_create_tmpfile(
+ struct user_namespace *mnt_userns,
struct xfs_inode *dp,
umode_t mode,
struct xfs_inode **ipp)
@@ -1160,16 +1166,12 @@ xfs_create_tmpfile(
resblks = XFS_IALLOC_SPACE_RES(mp);
tres = &M_RES(mp)->tr_create_tmpfile;
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
- if (error)
- goto out_release_inode;
-
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
if (error)
- goto out_trans_cancel;
+ goto out_release_dquots;
- error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
+ error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, &ip);
if (error)
goto out_trans_cancel;
@@ -1210,7 +1212,7 @@ xfs_create_tmpfile(
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+ out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
@@ -1258,6 +1260,11 @@ xfs_link(
xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+ error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
+ if (error)
+ goto error_return;
+
/*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
@@ -2977,13 +2984,15 @@ out_trans_abort:
*/
static int
xfs_rename_alloc_whiteout(
+ struct user_namespace *mnt_userns,
struct xfs_inode *dp,
struct xfs_inode **wip)
{
struct xfs_inode *tmpfile;
int error;
- error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile);
+ error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE,
+ &tmpfile);
if (error)
return error;
@@ -3005,6 +3014,7 @@ xfs_rename_alloc_whiteout(
*/
int
xfs_rename(
+ struct user_namespace *mnt_userns,
struct xfs_inode *src_dp,
struct xfs_name *src_name,
struct xfs_inode *src_ip,
@@ -3017,7 +3027,7 @@ xfs_rename(
struct xfs_trans *tp;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
- struct xfs_buf *agibp;
+ int i;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
@@ -3036,7 +3046,7 @@ xfs_rename(
*/
if (flags & RENAME_WHITEOUT) {
ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
- error = xfs_rename_alloc_whiteout(target_dp, &wip);
+ error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip);
if (error)
return error;
@@ -3106,6 +3116,35 @@ xfs_rename(
/*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
+ *
+ * Extent count overflow check:
+ *
+ * From the perspective of src_dp, a rename operation is essentially a
+ * directory entry remove operation. Hence the only place where we check
+ * for extent count overflow for src_dp is in
+ * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
+ * -ENOSPC when it detects a possible extent count overflow and in
+ * response, the higher layers of directory handling code do the
+ * following:
+ * 1. Data/Free blocks: XFS lets these blocks linger until a
+ * future remove operation removes them.
+ * 2. Dabtree blocks: XFS swaps the blocks with the last block in the
+ * Leaf space and unmaps the last block.
+ *
+ * For target_dp, there are two cases depending on whether the
+ * destination directory entry exists or not.
+ *
+ * When destination directory entry does not exist (i.e. target_ip ==
+ * NULL), extent count overflow check is performed only when transaction
+ * has a non-zero sized space reservation associated with it. With a
+ * zero-sized space reservation, XFS allows a rename operation to
+ * continue only when the directory has sufficient free space in its
+ * data/leaf/free space blocks to hold the new entry.
+ *
+ * When destination directory entry exists (i.e. target_ip != NULL), all
+ * we need to do is change the inode number associated with the already
+ * existing entry. Hence there is no need to perform an extent count
+ * overflow check.
*/
if (target_ip == NULL) {
/*
@@ -3116,6 +3155,12 @@ xfs_rename(
error = xfs_dir_canenter(tp, target_dp, target_name);
if (error)
goto out_trans_cancel;
+ } else {
+ error = xfs_iext_count_may_overflow(target_dp,
+ XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
+ if (error)
+ goto out_trans_cancel;
}
} else {
/*
@@ -3131,6 +3176,30 @@ xfs_rename(
}
/*
+ * Lock the AGI buffers we need to handle bumping the nlink of the
+ * whiteout inode off the unlinked list and to handle dropping the
+ * nlink of the target inode. Per locking order rules, do this in
+ * increasing AG order and before directory block allocation tries to
+ * grab AGFs because we grab AGIs before AGFs.
+ *
+ * The (vfs) caller must ensure that if src is a directory then
+ * target_ip is either null or an empty directory.
+ */
+ for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
+ if (inodes[i] == wip ||
+ (inodes[i] == target_ip &&
+ (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
+ struct xfs_buf *bp;
+ xfs_agnumber_t agno;
+
+ agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
+ error = xfs_read_agi(mp, tp, agno, &bp);
+ if (error)
+ goto out_trans_cancel;
+ }
+ }
+
+ /*
* Directory entry creation below may acquire the AGF. Remove
* the whiteout from the unlinked list first to preserve correct
* AGI/AGF locking order. This dirties the transaction so failures
@@ -3182,22 +3251,6 @@ xfs_rename(
* In case there is already an entry with the same
* name at the destination directory, remove it first.
*/
-
- /*
- * Check whether the replace operation will need to allocate
- * blocks. This happens when the shortform directory lacks
- * space and we have to convert it to a block format directory.
- * When more blocks are necessary, we must lock the AGI first
- * to preserve locking order (AGI -> AGF).
- */
- if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) {
- error = xfs_read_agi(mp, tp,
- XFS_INO_TO_AGNO(mp, target_ip->i_ino),
- &agibp);
- if (error)
- goto out_trans_cancel;
- }
-
error = xfs_dir_replace(tp, target_dp, target_name,
src_ip->i_ino, spaceres);
if (error)
@@ -3273,9 +3326,16 @@ xfs_rename(
if (wip) {
error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
spaceres);
- } else
+ } else {
+ /*
+ * NOTE: We don't need to check for extent count overflow here
+ * because the dir remove name code will leave the dir block in
+ * place if the extent count would overflow.
+ */
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
spaceres);
+ }
+
if (error)
goto out_trans_cancel;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index eca333f5f715..88ee4c3930ae 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -369,15 +369,18 @@ int xfs_release(struct xfs_inode *ip);
void xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
-int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
+int xfs_create(struct user_namespace *mnt_userns,
+ struct xfs_inode *dp, struct xfs_name *name,
umode_t mode, dev_t rdev, struct xfs_inode **ipp);
-int xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode,
+int xfs_create_tmpfile(struct user_namespace *mnt_userns,
+ struct xfs_inode *dp, umode_t mode,
struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
struct xfs_name *target_name);
-int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
+int xfs_rename(struct user_namespace *mnt_userns,
+ struct xfs_inode *src_dp, struct xfs_name *src_name,
struct xfs_inode *src_ip, struct xfs_inode *target_dp,
struct xfs_name *target_name,
struct xfs_inode *target_ip, unsigned int flags);
@@ -407,9 +410,10 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
-int xfs_dir_ialloc(struct xfs_trans **tpp, struct xfs_inode *dp, umode_t mode,
- xfs_nlink_t nlink, dev_t dev, prid_t prid,
- struct xfs_inode **ipp);
+int xfs_dir_ialloc(struct user_namespace *mnt_userns,
+ struct xfs_trans **tpp, struct xfs_inode *dp,
+ umode_t mode, xfs_nlink_t nlink, dev_t dev,
+ prid_t prid, struct xfs_inode **ipp);
static inline int
xfs_itruncate_extents(
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3fbd98f61ea5..99dfe89a8d08 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -693,7 +693,8 @@ xfs_ioc_space(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = bf->l_start;
- error = xfs_vn_setattr_size(file_dentry(filp), &iattr);
+ error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp),
+ &iattr);
if (error)
goto out_unlock;
@@ -734,13 +735,15 @@ xfs_fsinumbers_fmt(
STATIC int
xfs_ioc_fsbulkstat(
- xfs_mount_t *mp,
+ struct file *file,
unsigned int cmd,
void __user *arg)
{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
struct xfs_fsop_bulkreq bulkreq;
struct xfs_ibulk breq = {
.mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
.ocount = 0,
};
xfs_ino_t lastino;
@@ -908,13 +911,15 @@ xfs_bulk_ireq_teardown(
/* Handle the v5 bulkstat ioctl. */
STATIC int
xfs_ioc_bulkstat(
- struct xfs_mount *mp,
+ struct file *file,
unsigned int cmd,
struct xfs_bulkstat_req __user *arg)
{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
struct xfs_bulk_ireq hdr;
struct xfs_ibulk breq = {
.mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
};
int error;
@@ -1275,24 +1280,24 @@ xfs_ioctl_setattr_prepare_dax(
*/
static struct xfs_trans *
xfs_ioctl_setattr_get_trans(
- struct xfs_inode *ip)
+ struct file *file,
+ struct xfs_dquot *pdqp)
{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int error = -EROFS;
if (mp->m_flags & XFS_MOUNT_RDONLY)
- goto out_unlock;
+ goto out_error;
error = -EIO;
if (XFS_FORCED_SHUTDOWN(mp))
- goto out_unlock;
+ goto out_error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp,
+ capable(CAP_FOWNER), &tp);
if (error)
- goto out_unlock;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ goto out_error;
/*
* CAP_FOWNER overrides the following restrictions:
@@ -1300,7 +1305,7 @@ xfs_ioctl_setattr_get_trans(
* The user ID of the calling process must be equal to the file owner
* ID, except in cases where the CAP_FSETID capability is applicable.
*/
- if (!inode_owner_or_capable(VFS_I(ip))) {
+ if (!inode_owner_or_capable(file_mnt_user_ns(file), VFS_I(ip))) {
error = -EPERM;
goto out_cancel;
}
@@ -1312,7 +1317,7 @@ xfs_ioctl_setattr_get_trans(
out_cancel:
xfs_trans_cancel(tp);
-out_unlock:
+out_error:
return ERR_PTR(error);
}
@@ -1428,21 +1433,23 @@ xfs_ioctl_setattr_check_projid(
STATIC int
xfs_ioctl_setattr(
- xfs_inode_t *ip,
+ struct file *file,
struct fsxattr *fa)
{
+ struct user_namespace *mnt_userns = file_mnt_user_ns(file);
+ struct xfs_inode *ip = XFS_I(file_inode(file));
struct fsxattr old_fa;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
- int code;
+ int error;
trace_xfs_ioctl_setattr(ip);
- code = xfs_ioctl_setattr_check_projid(ip, fa);
- if (code)
- return code;
+ error = xfs_ioctl_setattr_check_projid(ip, fa);
+ if (error)
+ return error;
/*
* If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1453,44 +1460,36 @@ xfs_ioctl_setattr(
* because the i_*dquot fields will get updated anyway.
*/
if (XFS_IS_QUOTA_ON(mp)) {
- code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
+ error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
VFS_I(ip)->i_gid, fa->fsx_projid,
XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp);
- if (code)
- return code;
+ if (error)
+ return error;
}
xfs_ioctl_setattr_prepare_dax(ip, fa);
- tp = xfs_ioctl_setattr_get_trans(ip);
+ tp = xfs_ioctl_setattr_get_trans(file, pdqp);
if (IS_ERR(tp)) {
- code = PTR_ERR(tp);
+ error = PTR_ERR(tp);
goto error_free_dquots;
}
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
- ip->i_d.di_projid != fa->fsx_projid) {
- code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp,
- capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
- if (code) /* out of quota */
- goto error_trans_cancel;
- }
-
xfs_fill_fsxattr(ip, false, &old_fa);
- code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
- if (code)
+ error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_check_extsize(ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_check_extsize(ip, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_check_cowextsize(ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_check_cowextsize(ip, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_xflags(tp, ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_xflags(tp, ip, fa);
+ if (error)
goto error_trans_cancel;
/*
@@ -1502,7 +1501,7 @@ xfs_ioctl_setattr(
*/
if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
- !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
+ !capable_wrt_inode_uidgid(mnt_userns, VFS_I(ip), CAP_FSETID))
VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
/* Change the ownerships and register project quota modifications */
@@ -1530,7 +1529,7 @@ xfs_ioctl_setattr(
else
ip->i_d.di_cowextsize = 0;
- code = xfs_trans_commit(tp);
+ error = xfs_trans_commit(tp);
/*
* Release any dquot(s) the inode had kept before chown.
@@ -1538,18 +1537,17 @@ xfs_ioctl_setattr(
xfs_qm_dqrele(olddquot);
xfs_qm_dqrele(pdqp);
- return code;
+ return error;
error_trans_cancel:
xfs_trans_cancel(tp);
error_free_dquots:
xfs_qm_dqrele(pdqp);
- return code;
+ return error;
}
STATIC int
xfs_ioc_fssetxattr(
- xfs_inode_t *ip,
struct file *filp,
void __user *arg)
{
@@ -1562,7 +1560,7 @@ xfs_ioc_fssetxattr(
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_ioctl_setattr(ip, &fa);
+ error = xfs_ioctl_setattr(filp, &fa);
mnt_drop_write_file(filp);
return error;
}
@@ -1608,7 +1606,7 @@ xfs_ioc_setxflags(
xfs_ioctl_setattr_prepare_dax(ip, &fa);
- tp = xfs_ioctl_setattr_get_trans(ip);
+ tp = xfs_ioctl_setattr_get_trans(filp, NULL);
if (IS_ERR(tp)) {
error = PTR_ERR(tp);
goto out_drop_write;
@@ -2119,10 +2117,10 @@ xfs_file_ioctl(
case XFS_IOC_FSBULKSTAT_SINGLE:
case XFS_IOC_FSBULKSTAT:
case XFS_IOC_FSINUMBERS:
- return xfs_ioc_fsbulkstat(mp, cmd, arg);
+ return xfs_ioc_fsbulkstat(filp, cmd, arg);
case XFS_IOC_BULKSTAT:
- return xfs_ioc_bulkstat(mp, cmd, arg);
+ return xfs_ioc_bulkstat(filp, cmd, arg);
case XFS_IOC_INUMBERS:
return xfs_ioc_inumbers(mp, cmd, arg);
@@ -2144,7 +2142,7 @@ xfs_file_ioctl(
case XFS_IOC_FSGETXATTRA:
return xfs_ioc_fsgetxattr(ip, 1, arg);
case XFS_IOC_FSSETXATTR:
- return xfs_ioc_fssetxattr(ip, filp, arg);
+ return xfs_ioc_fssetxattr(filp, arg);
case XFS_IOC_GETXFLAGS:
return xfs_ioc_getxflags(ip, arg);
case XFS_IOC_SETXFLAGS:
@@ -2260,7 +2258,7 @@ xfs_file_ioctl(
}
case XFS_IOC_FSGROWFSDATA: {
- xfs_growfs_data_t in;
+ struct xfs_growfs_data in;
if (copy_from_user(&in, arg, sizeof(in)))
return -EFAULT;
@@ -2274,7 +2272,7 @@ xfs_file_ioctl(
}
case XFS_IOC_FSGROWFSLOG: {
- xfs_growfs_log_t in;
+ struct xfs_growfs_log in;
if (copy_from_user(&in, arg, sizeof(in)))
return -EFAULT;
@@ -2348,8 +2346,10 @@ xfs_file_ioctl(
if (error)
return error;
+ trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_);
+
sb_start_write(mp->m_super);
- error = xfs_icache_free_eofblocks(mp, &keofb);
+ error = xfs_blockgc_free_space(mp, &keofb);
sb_end_write(mp->m_super);
return error;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index c1771e728117..33c09ec8e6c0 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -209,14 +209,16 @@ xfs_fsbulkstat_one_fmt_compat(
/* copied from xfs_ioctl.c */
STATIC int
xfs_compat_ioc_fsbulkstat(
- xfs_mount_t *mp,
+ struct file *file,
unsigned int cmd,
struct compat_xfs_fsop_bulkreq __user *p32)
{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
u32 addr;
struct xfs_fsop_bulkreq bulkreq;
struct xfs_ibulk breq = {
.mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
.ocount = 0,
};
xfs_ino_t lastino;
@@ -436,7 +438,6 @@ xfs_file_compat_ioctl(
{
struct inode *inode = file_inode(filp);
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
void __user *arg = compat_ptr(p);
int error;
@@ -456,7 +457,7 @@ xfs_file_compat_ioctl(
return xfs_ioc_space(filp, &bf);
}
case XFS_IOC_FSGEOMETRY_V1_32:
- return xfs_compat_ioc_fsgeometry_v1(mp, arg);
+ return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg);
case XFS_IOC_FSGROWFSDATA_32: {
struct xfs_growfs_data in;
@@ -465,7 +466,7 @@ xfs_file_compat_ioctl(
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_growfs_data(mp, &in);
+ error = xfs_growfs_data(ip->i_mount, &in);
mnt_drop_write_file(filp);
return error;
}
@@ -477,7 +478,7 @@ xfs_file_compat_ioctl(
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_growfs_rt(mp, &in);
+ error = xfs_growfs_rt(ip->i_mount, &in);
mnt_drop_write_file(filp);
return error;
}
@@ -507,7 +508,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_FSBULKSTAT_32:
case XFS_IOC_FSBULKSTAT_SINGLE_32:
case XFS_IOC_FSINUMBERS_32:
- return xfs_compat_ioc_fsbulkstat(mp, cmd, arg);
+ return xfs_compat_ioc_fsbulkstat(filp, cmd, arg);
case XFS_IOC_FD_TO_HANDLE_32:
case XFS_IOC_PATH_TO_HANDLE_32:
case XFS_IOC_PATH_TO_FSHANDLE_32: {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7b9ff824e82d..e17ab7f42928 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -194,25 +194,21 @@ xfs_iomap_write_direct(
struct xfs_trans *tp;
xfs_filblks_t resaligned;
int nimaps;
- int quota_flag;
- uint qblocks, resblks;
- unsigned int resrtextents = 0;
+ unsigned int dblocks, rblocks;
+ bool force = false;
int error;
int bmapi_flags = XFS_BMAPI_PREALLOC;
- uint tflags = 0;
ASSERT(count_fsb > 0);
resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
xfs_get_extsz_hint(ip));
if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
- resrtextents = qblocks = resaligned;
- resrtextents /= mp->m_sb.sb_rextsize;
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- quota_flag = XFS_QMOPT_RES_RTBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resaligned;
} else {
- resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
- quota_flag = XFS_QMOPT_RES_REGBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ rblocks = 0;
}
error = xfs_qm_dqattach(ip);
@@ -235,23 +231,21 @@ xfs_iomap_write_direct(
if (IS_DAX(VFS_I(ip))) {
bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
if (imap->br_state == XFS_EXT_UNWRITTEN) {
- tflags |= XFS_TRANS_RESERVE;
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ force = true;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
- tflags, &tp);
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+ rblocks, force, &tp);
if (error)
return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* From this point onwards we overwrite the imap pointer that the
* caller gave to us.
@@ -260,7 +254,7 @@ xfs_iomap_write_direct(
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
imap, &nimaps);
if (error)
- goto out_res_cancel;
+ goto out_trans_cancel;
/*
* Complete the transaction
@@ -284,8 +278,6 @@ out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
-out_res_cancel:
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
out_trans_cancel:
xfs_trans_cancel(tp);
goto out_unlock;
@@ -548,16 +540,13 @@ xfs_iomap_write_unwritten(
* here as we might be asked to write out the same inode that we
* complete here and might deadlock on the iolock.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
- XFS_TRANS_RESERVE, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
+ 0, true, &tp);
if (error)
return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
- XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
@@ -784,15 +773,28 @@ xfs_direct_write_iomap_begin(
goto allocate_blocks;
/*
- * NOWAIT IO needs to span the entire requested IO with a single map so
- * that we avoid partial IO failures due to the rest of the IO range not
- * covered by this map triggering an EAGAIN condition when it is
- * subsequently mapped and aborting the IO.
+ * NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with
+ * a single map so that we avoid partial IO failures due to the rest of
+ * the I/O range not covered by this map triggering an EAGAIN condition
+ * when it is subsequently mapped and aborting the I/O.
*/
- if ((flags & IOMAP_NOWAIT) &&
- !imap_spans_range(&imap, offset_fsb, end_fsb)) {
+ if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) {
error = -EAGAIN;
- goto out_unlock;
+ if (!imap_spans_range(&imap, offset_fsb, end_fsb))
+ goto out_unlock;
+ }
+
+ /*
+ * For overwrite only I/O, we cannot convert unwritten extents without
+ * requiring sub-block zeroing. This can only be done under an
+ * exclusive IOLOCK, hence return -EAGAIN if this is not a written
+ * extent to tell the caller to try again.
+ */
+ if (flags & IOMAP_OVERWRITE_ONLY) {
+ error = -EAGAIN;
+ if (imap.br_state != XFS_EXT_NORM &&
+ ((offset | length) & mp->m_blockmask))
+ goto out_unlock;
}
xfs_iunlock(ip, lockmode);
@@ -801,7 +803,7 @@ xfs_direct_write_iomap_begin(
allocate_blocks:
error = -EAGAIN;
- if (flags & IOMAP_NOWAIT)
+ if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY))
goto out_unlock;
/*
@@ -842,7 +844,8 @@ out_found_cow:
return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
out_unlock:
- xfs_iunlock(ip, lockmode);
+ if (lockmode)
+ xfs_iunlock(ip, lockmode);
return error;
}
@@ -870,6 +873,9 @@ xfs_buffered_write_iomap_begin(
int allocfork = XFS_DATA_FORK;
int error = 0;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
return xfs_direct_write_iomap_begin(inode, offset, count,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 67c8dc9de8aa..66ebccb5a6ff 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -128,6 +128,7 @@ xfs_cleanup_inode(
STATIC int
xfs_generic_create(
+ struct user_namespace *mnt_userns,
struct inode *dir,
struct dentry *dentry,
umode_t mode,
@@ -161,9 +162,10 @@ xfs_generic_create(
goto out_free_acl;
if (!tmpfile) {
- error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+ error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev,
+ &ip);
} else {
- error = xfs_create_tmpfile(XFS_I(dir), mode, &ip);
+ error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip);
}
if (unlikely(error))
goto out_free_acl;
@@ -220,31 +222,35 @@ xfs_generic_create(
STATIC int
xfs_vn_mknod(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode,
- dev_t rdev)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t rdev)
{
- return xfs_generic_create(dir, dentry, mode, rdev, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false);
}
STATIC int
xfs_vn_create(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode,
- bool flags)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ bool flags)
{
- return xfs_generic_create(dir, dentry, mode, 0, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false);
}
STATIC int
xfs_vn_mkdir(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
- return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0,
+ false);
}
STATIC struct dentry *
@@ -361,9 +367,10 @@ xfs_vn_unlink(
STATIC int
xfs_vn_symlink(
- struct inode *dir,
- struct dentry *dentry,
- const char *symname)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ const char *symname)
{
struct inode *inode;
struct xfs_inode *cip = NULL;
@@ -377,7 +384,7 @@ xfs_vn_symlink(
if (unlikely(error))
goto out;
- error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
+ error = xfs_symlink(mnt_userns, XFS_I(dir), &name, symname, mode, &cip);
if (unlikely(error))
goto out;
@@ -403,11 +410,12 @@ xfs_vn_symlink(
STATIC int
xfs_vn_rename(
- struct inode *odir,
- struct dentry *odentry,
- struct inode *ndir,
- struct dentry *ndentry,
- unsigned int flags)
+ struct user_namespace *mnt_userns,
+ struct inode *odir,
+ struct dentry *odentry,
+ struct inode *ndir,
+ struct dentry *ndentry,
+ unsigned int flags)
{
struct inode *new_inode = d_inode(ndentry);
int omode = 0;
@@ -431,8 +439,8 @@ xfs_vn_rename(
if (unlikely(error))
return error;
- return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)),
- XFS_I(ndir), &nname,
+ return xfs_rename(mnt_userns, XFS_I(odir), &oname,
+ XFS_I(d_inode(odentry)), XFS_I(ndir), &nname,
new_inode ? XFS_I(new_inode) : NULL, flags);
}
@@ -529,6 +537,7 @@ xfs_stat_blksize(
STATIC int
xfs_vn_getattr(
+ struct user_namespace *mnt_userns,
const struct path *path,
struct kstat *stat,
u32 request_mask,
@@ -547,8 +556,8 @@ xfs_vn_getattr(
stat->dev = inode->i_sb->s_dev;
stat->mode = inode->i_mode;
stat->nlink = inode->i_nlink;
- stat->uid = inode->i_uid;
- stat->gid = inode->i_gid;
+ stat->uid = i_uid_into_mnt(mnt_userns, inode);
+ stat->gid = i_gid_into_mnt(mnt_userns, inode);
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime;
@@ -626,8 +635,9 @@ xfs_setattr_time(
static int
xfs_vn_change_ok(
- struct dentry *dentry,
- struct iattr *iattr)
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct iattr *iattr)
{
struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount;
@@ -637,7 +647,7 @@ xfs_vn_change_ok(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- return setattr_prepare(dentry, iattr);
+ return setattr_prepare(mnt_userns, dentry, iattr);
}
/*
@@ -648,6 +658,7 @@ xfs_vn_change_ok(
*/
static int
xfs_setattr_nonsize(
+ struct user_namespace *mnt_userns,
struct xfs_inode *ip,
struct iattr *iattr)
{
@@ -700,13 +711,11 @@ xfs_setattr_nonsize(
return error;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
+ capable(CAP_FOWNER), &tp);
if (error)
goto out_dqrele;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* Change file ownership. Must be the owner or privileged.
*/
@@ -723,21 +732,6 @@ xfs_setattr_nonsize(
uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
/*
- * Do a quota reservation only if uid/gid is actually
- * going to change.
- */
- if (XFS_IS_QUOTA_RUNNING(mp) &&
- ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
- (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
- ASSERT(tp);
- error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- NULL, capable(CAP_FOWNER) ?
- XFS_QMOPT_FORCE_RES : 0);
- if (error) /* out of quota */
- goto out_cancel;
- }
-
- /*
* CAP_FSETID overrides the following restrictions:
*
* The set-user-ID and set-group-ID bits of a file will be
@@ -786,8 +780,6 @@ xfs_setattr_nonsize(
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
/*
* Release any dquot(s) the inode had kept before chown.
*/
@@ -807,16 +799,13 @@ xfs_setattr_nonsize(
* Posix ACL code seems to care about this issue either.
*/
if (mask & ATTR_MODE) {
- error = posix_acl_chmod(inode, inode->i_mode);
+ error = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
if (error)
return error;
}
return 0;
-out_cancel:
- xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -831,6 +820,7 @@ out_dqrele:
*/
STATIC int
xfs_setattr_size(
+ struct user_namespace *mnt_userns,
struct xfs_inode *ip,
struct iattr *iattr)
{
@@ -846,7 +836,7 @@ xfs_setattr_size(
ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
- ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+ ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
oldsize = inode->i_size;
newsize = iattr->ia_size;
@@ -862,7 +852,7 @@ xfs_setattr_size(
* Use the regular setattr path to update the timestamps.
*/
iattr->ia_valid &= ~ATTR_SIZE;
- return xfs_setattr_nonsize(ip, iattr);
+ return xfs_setattr_nonsize(mnt_userns, ip, iattr);
}
/*
@@ -1031,6 +1021,7 @@ out_trans_cancel:
int
xfs_vn_setattr_size(
+ struct user_namespace *mnt_userns,
struct dentry *dentry,
struct iattr *iattr)
{
@@ -1039,14 +1030,15 @@ xfs_vn_setattr_size(
trace_xfs_setattr(ip);
- error = xfs_vn_change_ok(dentry, iattr);
+ error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (error)
return error;
- return xfs_setattr_size(ip, iattr);
+ return xfs_setattr_size(mnt_userns, ip, iattr);
}
STATIC int
xfs_vn_setattr(
+ struct user_namespace *mnt_userns,
struct dentry *dentry,
struct iattr *iattr)
{
@@ -1066,14 +1058,14 @@ xfs_vn_setattr(
return error;
}
- error = xfs_vn_setattr_size(dentry, iattr);
+ error = xfs_vn_setattr_size(mnt_userns, dentry, iattr);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
} else {
trace_xfs_setattr(ip);
- error = xfs_vn_change_ok(dentry, iattr);
+ error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (!error)
- error = xfs_setattr_nonsize(ip, iattr);
+ error = xfs_setattr_nonsize(mnt_userns, ip, iattr);
}
return error;
@@ -1144,11 +1136,12 @@ xfs_vn_fiemap(
STATIC int
xfs_vn_tmpfile(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
- return xfs_generic_create(dir, dentry, mode, 0, true);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true);
}
static const struct inode_operations xfs_inode_operations = {
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 99ca745c1071..278949056048 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -14,6 +14,7 @@ extern const struct file_operations xfs_dir_file_operations;
extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
-extern int xfs_vn_setattr_size(struct dentry *dentry, struct iattr *vap);
+int xfs_vn_setattr_size(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *vap);
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 16ca97a7ff00..ca310a125d1e 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -54,10 +54,12 @@ struct xfs_bstat_chunk {
STATIC int
xfs_bulkstat_one_int(
struct xfs_mount *mp,
+ struct user_namespace *mnt_userns,
struct xfs_trans *tp,
xfs_ino_t ino,
struct xfs_bstat_chunk *bc)
{
+ struct user_namespace *sb_userns = mp->m_super->s_user_ns;
struct xfs_icdinode *dic; /* dinode core info pointer */
struct xfs_inode *ip; /* incore inode pointer */
struct inode *inode;
@@ -86,8 +88,8 @@ xfs_bulkstat_one_int(
*/
buf->bs_projectid = ip->i_d.di_projid;
buf->bs_ino = ino;
- buf->bs_uid = i_uid_read(inode);
- buf->bs_gid = i_gid_read(inode);
+ buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode));
+ buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode));
buf->bs_size = dic->di_size;
buf->bs_nlink = inode->i_nlink;
@@ -173,7 +175,8 @@ xfs_bulkstat_one(
if (!bc.buf)
return -ENOMEM;
- error = xfs_bulkstat_one_int(breq->mp, NULL, breq->startino, &bc);
+ error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, NULL,
+ breq->startino, &bc);
kmem_free(bc.buf);
@@ -194,9 +197,10 @@ xfs_bulkstat_iwalk(
xfs_ino_t ino,
void *data)
{
+ struct xfs_bstat_chunk *bc = data;
int error;
- error = xfs_bulkstat_one_int(mp, tp, ino, data);
+ error = xfs_bulkstat_one_int(mp, bc->breq->mnt_userns, tp, ino, data);
/* bulkstat just skips over missing inodes */
if (error == -ENOENT || error == -EINVAL)
return 0;
@@ -239,6 +243,11 @@ xfs_bulkstat(
};
int error;
+ if (breq->mnt_userns != &init_user_ns) {
+ xfs_warn_ratelimited(breq->mp,
+ "bulkstat not supported inside of idmapped mounts.");
+ return -EINVAL;
+ }
if (xfs_bulkstat_already_done(breq->mp, breq->startino))
return 0;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 96a1e2a9be3f..7078d10c9b12 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -8,6 +8,7 @@
/* In-memory representation of a userspace request for batch inode data. */
struct xfs_ibulk {
struct xfs_mount *mp;
+ struct user_namespace *mnt_userns;
void __user *ubuffer; /* user output buffer */
xfs_ino_t startino; /* start with this inode */
unsigned int icount; /* number of elements in ubuffer */
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index eae3aff9bc97..c4a340f1f1e1 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -618,15 +618,12 @@ xfs_iwalk_threaded(
{
struct xfs_pwork_ctl pctl;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
- unsigned int nr_threads;
int error;
ASSERT(agno < mp->m_sb.sb_agcount);
ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
- nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
- error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
- nr_threads);
+ error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk");
if (error)
return error;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 5b7a1e201559..af6be9b9ccdf 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -98,8 +98,7 @@ typedef __u32 xfs_nlink_t;
#define xfs_rotorstep xfs_params.rotorstep.val
#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
-#define xfs_eofb_secs xfs_params.eofb_timer.val
-#define xfs_cowb_secs xfs_params.cowb_timer.val
+#define xfs_blockgc_secs xfs_params.blockgc_timer.val
#define current_cpu() (raw_smp_processor_id())
#define current_set_flags_nested(sp, f) \
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index fa2d05e65ff1..06041834daa3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -91,6 +91,9 @@ STATIC int
xlog_iclogs_empty(
struct xlog *log);
+static int
+xfs_log_cover(struct xfs_mount *);
+
static void
xlog_grant_sub_space(
struct xlog *log,
@@ -347,6 +350,25 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
tic->t_res_num++;
}
+bool
+xfs_log_writable(
+ struct xfs_mount *mp)
+{
+ /*
+ * Never write to the log on norecovery mounts, if the block device is
+ * read-only, or if the filesystem is shutdown. Read-only mounts still
+ * allow internal writes for log recovery and unmount purposes, so don't
+ * restrict that case here.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return false;
+ if (xfs_readonly_buftarg(mp->m_log->l_targ))
+ return false;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return false;
+ return true;
+}
+
/*
* Replenish the byte reservation required by moving the grant write head.
*/
@@ -741,7 +763,7 @@ xfs_log_mount_finish(
xfs_log_force(mp, XFS_LOG_SYNC);
xfs_ail_push_all_sync(mp->m_ail);
}
- xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buftarg_drain(mp->m_ddev_targp);
if (readonly)
mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -886,15 +908,8 @@ xfs_log_unmount_write(
{
struct xlog *log = mp->m_log;
- /*
- * Don't write out unmount record on norecovery mounts or ro devices.
- * Or, if we are doing a forced umount (typically because of IO errors).
- */
- if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
- xfs_readonly_buftarg(log->l_targ)) {
- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ if (!xfs_log_writable(mp))
return;
- }
xfs_log_force(mp, XFS_LOG_SYNC);
@@ -924,10 +939,9 @@ xfs_log_unmount_write(
* To do this, we first need to shut down the background log work so it is not
* trying to cover the log as we clean up. We then need to unpin all objects in
* the log so we can then flush them out. Once they have completed their IO and
- * run the callbacks removing themselves from the AIL, we can write the unmount
- * record.
+ * run the callbacks removing themselves from the AIL, we can cover the log.
*/
-void
+int
xfs_log_quiesce(
struct xfs_mount *mp)
{
@@ -936,16 +950,24 @@ xfs_log_quiesce(
/*
* The superblock buffer is uncached and while xfs_ail_push_all_sync()
- * will push it, xfs_wait_buftarg() will not wait for it. Further,
+ * will push it, xfs_buftarg_wait() will not wait for it. Further,
* xfs_buf_iowait() cannot be used because it was pushed with the
* XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
* the IO to complete.
*/
xfs_ail_push_all_sync(mp->m_ail);
- xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buftarg_wait(mp->m_ddev_targp);
xfs_buf_lock(mp->m_sb_bp);
xfs_buf_unlock(mp->m_sb_bp);
+ return xfs_log_cover(mp);
+}
+
+void
+xfs_log_clean(
+ struct xfs_mount *mp)
+{
+ xfs_log_quiesce(mp);
xfs_log_unmount_write(mp);
}
@@ -960,7 +982,9 @@ void
xfs_log_unmount(
struct xfs_mount *mp)
{
- xfs_log_quiesce(mp);
+ xfs_log_clean(mp);
+
+ xfs_buftarg_drain(mp->m_ddev_targp);
xfs_trans_ail_destroy(mp);
@@ -1037,17 +1061,15 @@ xfs_log_space_wake(
* there's no point in running a dummy transaction at this point because we
* can't start trying to idle the log until both the CIL and AIL are empty.
*/
-static int
-xfs_log_need_covered(xfs_mount_t *mp)
+static bool
+xfs_log_need_covered(
+ struct xfs_mount *mp)
{
- struct xlog *log = mp->m_log;
- int needed = 0;
-
- if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
- return 0;
+ struct xlog *log = mp->m_log;
+ bool needed = false;
if (!xlog_cil_empty(log))
- return 0;
+ return false;
spin_lock(&log->l_icloglock);
switch (log->l_covered_state) {
@@ -1062,14 +1084,14 @@ xfs_log_need_covered(xfs_mount_t *mp)
if (!xlog_iclogs_empty(log))
break;
- needed = 1;
+ needed = true;
if (log->l_covered_state == XLOG_STATE_COVER_NEED)
log->l_covered_state = XLOG_STATE_COVER_DONE;
else
log->l_covered_state = XLOG_STATE_COVER_DONE2;
break;
default:
- needed = 1;
+ needed = true;
break;
}
spin_unlock(&log->l_icloglock);
@@ -1077,6 +1099,60 @@ xfs_log_need_covered(xfs_mount_t *mp)
}
/*
+ * Explicitly cover the log. This is similar to background log covering but
+ * intended for usage in quiesce codepaths. The caller is responsible to ensure
+ * the log is idle and suitable for covering. The CIL, iclog buffers and AIL
+ * must all be empty.
+ */
+static int
+xfs_log_cover(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+ bool need_covered;
+
+ ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) &&
+ !xfs_ail_min_lsn(mp->m_log->l_ailp)) ||
+ XFS_FORCED_SHUTDOWN(mp));
+
+ if (!xfs_log_writable(mp))
+ return 0;
+
+ /*
+ * xfs_log_need_covered() is not idempotent because it progresses the
+ * state machine if the log requires covering. Therefore, we must call
+ * this function once and use the result until we've issued an sb sync.
+ * Do so first to make that abundantly clear.
+ *
+ * Fall into the covering sequence if the log needs covering or the
+ * mount has lazy superblock accounting to sync to disk. The sb sync
+ * used for covering accumulates the in-core counters, so covering
+ * handles this for us.
+ */
+ need_covered = xfs_log_need_covered(mp);
+ if (!need_covered && !xfs_sb_version_haslazysbcount(&mp->m_sb))
+ return 0;
+
+ /*
+ * To cover the log, commit the superblock twice (at most) in
+ * independent checkpoints. The first serves as a reference for the
+ * tail pointer. The sync transaction and AIL push empties the AIL and
+ * updates the in-core tail to the LSN of the first checkpoint. The
+ * second commit updates the on-disk tail with the in-core LSN,
+ * covering the log. Push the AIL one more time to leave it empty, as
+ * we found it.
+ */
+ do {
+ error = xfs_sync_sb(mp, true);
+ if (error)
+ break;
+ xfs_ail_push_all_sync(mp->m_ail);
+ } while (xfs_log_need_covered(mp));
+
+ return error;
+}
+
+/*
* We may be holding the log iclog lock upon entering this routine.
*/
xfs_lsn_t
@@ -1259,7 +1335,7 @@ xfs_log_worker(
struct xfs_mount *mp = log->l_mp;
/* dgc: errors ignored - not fatal and nowhere to report them */
- if (xfs_log_need_covered(mp)) {
+ if (xfs_fs_writable(mp, SB_FREEZE_WRITE) && xfs_log_need_covered(mp)) {
/*
* Dump a transaction into the log that contains no real change.
* This is needed to stamp the current tail LSN into the log
@@ -1416,8 +1492,9 @@ xlog_alloc_log(
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
- mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM |
+ WQ_HIGHPRI),
+ 0, mp->m_super->s_id);
if (!log->l_ioend_workqueue)
goto out_free_iclog;
@@ -2538,12 +2615,15 @@ xlog_covered_state(
int iclogs_changed)
{
/*
- * We usually go to NEED. But we go to NEED2 if the changed indicates we
- * are done writing the dummy record. If we are done with the second
- * dummy recored (DONE2), then we go to IDLE.
+ * We go to NEED for any non-covering writes. We go to NEED2 if we just
+ * wrote the first covering record (DONE). We go to IDLE if we just
+ * wrote the second covering record (DONE2) and remain in IDLE until a
+ * non-covering write occurs.
*/
switch (prev_state) {
case XLOG_STATE_COVER_IDLE:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_IDLE;
case XLOG_STATE_COVER_NEED:
case XLOG_STATE_COVER_NEED2:
break;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 58c3fcbec94a..044e02cb8921 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -127,6 +127,7 @@ int xfs_log_reserve(struct xfs_mount *mp,
int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
+bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
@@ -137,7 +138,8 @@ void xlog_cil_process_committed(struct list_head *list);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
-void xfs_log_quiesce(struct xfs_mount *mp);
+int xfs_log_quiesce(struct xfs_mount *mp);
+void xfs_log_clean(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
bool xfs_log_in_recovery(struct xfs_mount *);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 7110507a2b6b..52370d0a3f43 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -126,6 +126,7 @@ __xfs_free_perag(
{
struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+ ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
ASSERT(atomic_read(&pag->pag_ref) == 0);
kmem_free(pag);
}
@@ -146,6 +147,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -201,6 +203,7 @@ xfs_initialize_perag(
pag->pag_agno = index;
pag->pag_mount = mp;
spin_lock_init(&pag->pag_ici_lock);
+ INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
error = xfs_buf_hash_init(pag);
@@ -946,7 +949,7 @@ xfs_mountfs(
*/
if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
XFS_MOUNT_RDONLY) {
- xfs_quiesce_attr(mp);
+ xfs_log_clean(mp);
}
/*
@@ -1023,8 +1026,8 @@ xfs_mountfs(
xfs_log_mount_cancel(mp);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_wait_buftarg(mp->m_logdev_targp);
- xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buftarg_drain(mp->m_logdev_targp);
+ xfs_buftarg_drain(mp->m_ddev_targp);
out_free_perag:
xfs_free_perag(mp);
out_free_dir:
@@ -1054,7 +1057,7 @@ xfs_unmountfs(
uint64_t resblks;
int error;
- xfs_stop_block_reaping(mp);
+ xfs_blockgc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
@@ -1124,12 +1127,6 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
- error = xfs_log_sbcount(mp);
- if (error)
- xfs_warn(mp, "Unable to update superblock counters. "
- "Freespace may not be correct on next mount.");
-
-
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1165,32 +1162,6 @@ xfs_fs_writable(
}
/*
- * xfs_log_sbcount
- *
- * Sync the superblock counters to disk.
- *
- * Note this code can be called during the process of freezing, so we use the
- * transaction allocator that does not block when the transaction subsystem is
- * in its frozen state.
- */
-int
-xfs_log_sbcount(xfs_mount_t *mp)
-{
- /* allow this to proceed during the freeze sequence... */
- if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
- return 0;
-
- /*
- * we don't need to do this if we are updating the superblock
- * counters on every modification.
- */
- if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
- return 0;
-
- return xfs_sync_sb(mp, true);
-}
-
-/*
* Deltas for the block count can vary from 1 to very large, but lock contention
* only occurs on frequent small block count updates such as in the delayed
* allocation path for buffered writes (page a time updates). Hence we set
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index dfa429b77ee2..659ad95fe3e0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -93,7 +93,7 @@ typedef struct xfs_mount {
struct workqueue_struct *m_unwritten_workqueue;
struct workqueue_struct *m_cil_workqueue;
struct workqueue_struct *m_reclaim_workqueue;
- struct workqueue_struct *m_eofblocks_workqueue;
+ struct workqueue_struct *m_blockgc_workqueue;
struct workqueue_struct *m_sync_workqueue;
int m_bsize; /* fs logical block size */
@@ -177,10 +177,6 @@ typedef struct xfs_mount {
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
struct delayed_work m_reclaim_work; /* background inode reclaim */
- struct delayed_work m_eofblocks_work; /* background eof blocks
- trimming */
- struct delayed_work m_cowblocks_work; /* background cow blocks
- trimming */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
struct xfs_kobj m_error_meta_kobj;
@@ -369,6 +365,9 @@ typedef struct xfs_perag {
/* Blocks reserved for the reverse mapping btree. */
struct xfs_ag_resv pag_rmapbt_resv;
+ /* background prealloc block trimming */
+ struct delayed_work pag_blockgc_work;
+
/* reference count */
uint8_t pagf_refcount_level;
@@ -399,7 +398,6 @@ int xfs_buf_hash_init(xfs_perag_t *pag);
void xfs_buf_hash_destroy(xfs_perag_t *pag);
extern void xfs_uuid_table_free(void);
-extern int xfs_log_sbcount(xfs_mount_t *);
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index a06661dac5be..34c3b16f834f 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -294,7 +294,7 @@ int
xfs_mru_cache_init(void)
{
xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 1);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
if (!xfs_mru_reap_wq)
return -ENOMEM;
return 0;
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
index b03333f1c84a..c283b801cc5d 100644
--- a/fs/xfs/xfs_pwork.c
+++ b/fs/xfs/xfs_pwork.c
@@ -61,16 +61,18 @@ xfs_pwork_init(
struct xfs_mount *mp,
struct xfs_pwork_ctl *pctl,
xfs_pwork_work_fn work_fn,
- const char *tag,
- unsigned int nr_threads)
+ const char *tag)
{
+ unsigned int nr_threads = 0;
+
#ifdef DEBUG
if (xfs_globals.pwork_threads >= 0)
nr_threads = xfs_globals.pwork_threads;
#endif
trace_xfs_pwork_init(mp, nr_threads, current->pid);
- pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag,
+ pctl->wq = alloc_workqueue("%s-%d",
+ WQ_UNBOUND | WQ_SYSFS | WQ_FREEZABLE, nr_threads, tag,
current->pid);
if (!pctl->wq)
return -ENOMEM;
@@ -117,20 +119,3 @@ xfs_pwork_poll(
atomic_read(&pctl->nr_work) == 0, HZ) == 0)
touch_softlockup_watchdog();
}
-
-/*
- * Return the amount of parallelism that the data device can handle, or 0 for
- * no limit.
- */
-unsigned int
-xfs_pwork_guess_datadev_parallelism(
- struct xfs_mount *mp)
-{
- struct xfs_buftarg *btp = mp->m_ddev_targp;
-
- /*
- * For now we'll go with the most conservative setting possible,
- * which is two threads for an SSD and 1 thread everywhere else.
- */
- return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1;
-}
diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
index 8133124cf3bb..c0ef81fc85dd 100644
--- a/fs/xfs/xfs_pwork.h
+++ b/fs/xfs/xfs_pwork.h
@@ -51,11 +51,9 @@ xfs_pwork_want_abort(
}
int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
- xfs_pwork_work_fn work_fn, const char *tag,
- unsigned int nr_threads);
+ xfs_pwork_work_fn work_fn, const char *tag);
void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
void xfs_pwork_poll(struct xfs_pwork_ctl *pctl);
-unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
#endif /* __XFS_PWORK_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index c134eb4aeaa8..bfa4164990b1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -787,7 +787,8 @@ xfs_qm_qino_alloc(
return error;
if (need_alloc) {
- error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ipp);
+ error = xfs_dir_ialloc(&init_user_ns, &tp, NULL, S_IFREG, 1, 0,
+ 0, ipp);
if (error) {
xfs_trans_cancel(tp);
return error;
@@ -1786,105 +1787,35 @@ xfs_qm_vop_chown(
xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
- * Take an extra reference, because the inode is going to keep
- * this dquot pointer even after the trans_commit.
+ * Back when we made quota reservations for the chown, we reserved the
+ * ondisk blocks + delalloc blocks with the new dquot. Now that we've
+ * switched the dquots, decrease the new dquot's block reservation
+ * (having already bumped up the real counter) so that we don't have
+ * any reservation to give back when we commit.
*/
- *IO_olddq = xfs_qm_dqhold(newdq);
+ xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS,
+ -ip->i_delayed_blks);
- return prevdq;
-}
-
-/*
- * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
- */
-int
-xfs_qm_vop_chown_reserve(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- struct xfs_dquot *udqp,
- struct xfs_dquot *gdqp,
- struct xfs_dquot *pdqp,
- uint flags)
-{
- struct xfs_mount *mp = ip->i_mount;
- uint64_t delblks;
- unsigned int blkflags;
- struct xfs_dquot *udq_unres = NULL;
- struct xfs_dquot *gdq_unres = NULL;
- struct xfs_dquot *pdq_unres = NULL;
- struct xfs_dquot *udq_delblks = NULL;
- struct xfs_dquot *gdq_delblks = NULL;
- struct xfs_dquot *pdq_delblks = NULL;
- int error;
-
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
- delblks = ip->i_delayed_blks;
- blkflags = XFS_IS_REALTIME_INODE(ip) ?
- XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
-
- if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- i_uid_read(VFS_I(ip)) != udqp->q_id) {
- udq_delblks = udqp;
- /*
- * If there are delayed allocation blocks, then we have to
- * unreserve those from the old dquot, and add them to the
- * new dquot.
- */
- if (delblks) {
- ASSERT(ip->i_udquot);
- udq_unres = ip->i_udquot;
- }
- }
- if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
- i_gid_read(VFS_I(ip)) != gdqp->q_id) {
- gdq_delblks = gdqp;
- if (delblks) {
- ASSERT(ip->i_gdquot);
- gdq_unres = ip->i_gdquot;
- }
- }
-
- if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
- ip->i_d.di_projid != pdqp->q_id) {
- pdq_delblks = pdqp;
- if (delblks) {
- ASSERT(ip->i_pdquot);
- pdq_unres = ip->i_pdquot;
- }
- }
-
- error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- udq_delblks, gdq_delblks, pdq_delblks,
- ip->i_d.di_nblocks, 1, flags | blkflags);
- if (error)
- return error;
+ /*
+ * Give the incore reservation for delalloc blocks back to the old
+ * dquot. We don't normally handle delalloc quota reservations
+ * transactionally, so just lock the dquot and subtract from the
+ * reservation. Dirty the transaction because it's too late to turn
+ * back now.
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ xfs_dqlock(prevdq);
+ ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
+ prevdq->q_blk.reserved -= ip->i_delayed_blks;
+ xfs_dqunlock(prevdq);
/*
- * Do the delayed blks reservations/unreservations now. Since, these
- * are done without the help of a transaction, if a reservation fails
- * its previous reservations won't be automatically undone by trans
- * code. So, we have to do it manually here.
+ * Take an extra reference, because the inode is going to keep
+ * this dquot pointer even after the trans_commit.
*/
- if (delblks) {
- /*
- * Do the reservations first. Unreservation can't fail.
- */
- ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
- ASSERT(udq_unres || gdq_unres || pdq_unres);
- error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- udq_delblks, gdq_delblks, pdq_delblks,
- (xfs_qcnt_t)delblks, 0, flags | blkflags);
- if (error)
- return error;
- xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- udq_unres, gdq_unres, pdq_unres,
- -((xfs_qcnt_t)delblks), 0, blkflags);
- }
+ *IO_olddq = xfs_qm_dqhold(newdq);
- return 0;
+ return prevdq;
}
int
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 5a62398940d0..d00d01302545 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -81,11 +81,14 @@ extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
-extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
- struct xfs_inode *, int64_t, long, uint);
+int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
+ int64_t dblocks, int64_t rblocks, bool force);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
struct xfs_mount *, struct xfs_dquot *,
struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint);
+int xfs_trans_reserve_quota_icreate(struct xfs_trans *tp,
+ struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp, int64_t dblocks);
extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t,
prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
@@ -95,9 +98,6 @@ extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
-extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
- struct xfs_dquot *, struct xfs_dquot *,
- struct xfs_dquot *, uint);
extern int xfs_qm_dqattach(struct xfs_inode *);
extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc);
extern void xfs_qm_dqdetach(struct xfs_inode *);
@@ -108,6 +108,11 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *);
extern void xfs_qm_unmount(struct xfs_mount *);
extern void xfs_qm_unmount_quotas(struct xfs_mount *);
+static inline int
+xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
+}
#else
static inline int
xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
@@ -121,11 +126,12 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
}
#define xfs_trans_dup_dqinfo(tp, tp2)
#define xfs_trans_free_dqinfo(tp)
-#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
+#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0)
#define xfs_trans_apply_dquot_deltas(tp)
#define xfs_trans_unreserve_and_mod_dquots(tp)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
- struct xfs_inode *ip, int64_t nblks, long ninos, uint flags)
+ struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
+ bool force)
{
return 0;
}
@@ -136,26 +142,39 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
{
return 0;
}
+
+static inline int
+xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return 0;
+}
+
+static inline int
+xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks)
+{
+ return 0;
+}
+
#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
#define xfs_qm_vop_rename_dqattach(it) (0)
#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
-#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
#define xfs_qm_dqattach(ip) (0)
#define xfs_qm_dqattach_locked(ip, fl) (0)
#define xfs_qm_dqdetach(ip)
-#define xfs_qm_dqrele(d)
-#define xfs_qm_statvfs(ip, s)
+#define xfs_qm_dqrele(d) do { (d) = (d); } while(0)
+#define xfs_qm_statvfs(ip, s) do { } while(0)
#define xfs_qm_newmount(mp, a, b) (0)
#define xfs_qm_mount_quotas(mp)
#define xfs_qm_unmount(mp)
#define xfs_qm_unmount_quotas(mp)
#endif /* CONFIG_XFS_QUOTA */
-#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
- xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
-#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \
- xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
- f | XFS_QMOPT_RES_REGBLKS)
+static inline int
+xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return xfs_quota_reserve_blkres(ip, -blocks);
+}
extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 6fa05fb78189..725c7d8e4438 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -376,16 +376,14 @@ xfs_reflink_allocate_cow(
resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
xfs_iunlock(ip, *lockmode);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
- *lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, *lockmode);
+ *lockmode = 0;
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
+ false, &tp);
if (error)
return error;
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out_trans_cancel;
+ *lockmode = XFS_ILOCK_EXCL;
/*
* Check for an overlapping extent again now that we dropped the ilock.
@@ -398,20 +396,13 @@ xfs_reflink_allocate_cow(
goto convert;
}
- error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
-
- xfs_trans_ijoin(tp, ip, 0);
-
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
&nimaps);
if (error)
- goto out_unreserve;
+ goto out_trans_cancel;
xfs_inode_set_cowblocks_tag(ip);
error = xfs_trans_commit(tp);
@@ -436,9 +427,6 @@ convert:
trace_xfs_reflink_convert_cow(ip, cmap);
return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
-out_unreserve:
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
@@ -508,9 +496,8 @@ xfs_reflink_cancel_cow_blocks(
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
/* Remove the quota reservation */
- error = xfs_trans_reserve_quota_nblks(NULL, ip,
- -(long)del.br_blockcount, 0,
- XFS_QMOPT_RES_REGBLKS);
+ error = xfs_quota_unreserve_blkres(ip,
+ del.br_blockcount);
if (error)
break;
} else {
@@ -628,6 +615,11 @@ xfs_reflink_end_cow_extent(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ goto out_cancel;
+
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
@@ -997,22 +989,47 @@ xfs_reflink_remap_extent(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
xfs_off_t newlen;
- int64_t qres, qdelta;
+ int64_t qdelta = 0;
unsigned int resblks;
+ bool quota_reserved = true;
bool smap_real;
bool dmap_written = xfs_bmap_is_written_extent(dmap);
+ int iext_delta = 0;
int nimaps;
int error;
- /* Start a rolling transaction to switch the mappings */
+ /*
+ * Start a rolling transaction to switch the mappings.
+ *
+ * Adding a written extent to the extent map can cause a bmbt split,
+ * and removing a mapped extent from the extent can cause a bmbt split.
+ * The two operations cannot both cause a split since they operate on
+ * the same index in the bmap btree, so we only need a reservation for
+ * one bmbt split if either thing is happening. However, we haven't
+ * locked the inode yet, so we reserve assuming this is the case.
+ *
+ * The first allocation call tries to reserve enough space to handle
+ * mapping dmap into a sparse part of the file plus the bmbt split. We
+ * haven't locked the inode or read the existing mapping yet, so we do
+ * not know for sure that we need the space. This should succeed most
+ * of the time.
+ *
+ * If the first attempt fails, try again but reserving only enough
+ * space to handle a bmbt split. This is the hard minimum requirement,
+ * and we revisit quota reservations later when we know more about what
+ * we're remapping.
+ */
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ resblks + dmap->br_blockcount, 0, false, &tp);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ quota_reserved = false;
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ resblks, 0, false, &tp);
+ }
if (error)
goto out;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* Read what's currently mapped in the destination file into smap.
* If smap isn't a hole, we will have to remove it before we can add
@@ -1060,15 +1077,9 @@ xfs_reflink_remap_extent(
}
/*
- * Compute quota reservation if we think the quota block counter for
+ * Increase quota reservation if we think the quota block counter for
* this file could increase.
*
- * Adding a written extent to the extent map can cause a bmbt split,
- * and removing a mapped extent from the extent can cause a bmbt split.
- * The two operations cannot both cause a split since they operate on
- * the same index in the bmap btree, so we only need a reservation for
- * one bmbt split if either thing is happening.
- *
* If we are mapping a written extent into the file, we need to have
* enough quota block count reservation to handle the blocks in that
* extent. We log only the delta to the quota block counts, so if the
@@ -1081,19 +1092,29 @@ xfs_reflink_remap_extent(
* count. This is suboptimal, but the VFS flushed the dest range
* before we started. That should have removed all the delalloc
* reservations, but we code defensively.
+ *
+ * xfs_trans_alloc_inode above already tried to grab an even larger
+ * quota reservation, and kicked off a blockgc scan if it couldn't.
+ * If we can't get a potentially smaller quota reservation now, we're
+ * done.
*/
- qres = qdelta = 0;
- if (smap_real || dmap_written)
- qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- if (!smap_real && dmap_written)
- qres += dmap->br_blockcount;
- if (qres > 0) {
- error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
- XFS_QMOPT_RES_REGBLKS);
+ if (!quota_reserved && !smap_real && dmap_written) {
+ error = xfs_trans_reserve_quota_nblks(tp, ip,
+ dmap->br_blockcount, 0, false);
if (error)
goto out_cancel;
}
+ if (smap_real)
+ ++iext_delta;
+
+ if (dmap_written)
+ ++iext_delta;
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
+ if (error)
+ goto out_cancel;
+
if (smap_real) {
/*
* If the extent we're unmapping is backed by storage (written
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index b4999fb01ff7..161b0e8992ba 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -804,6 +804,11 @@ xfs_growfs_rt_alloc(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto out_trans_cancel;
+
/*
* Allocate blocks to the bitmap file.
*/
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 813be879a5e5..e5e0713bebcd 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -35,6 +35,7 @@
#include "xfs_refcount_item.h"
#include "xfs_bmap_item.h"
#include "xfs_reflink.h"
+#include "xfs_pwork.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -342,7 +343,7 @@ void
xfs_blkdev_issue_flush(
xfs_buftarg_t *buftarg)
{
- blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS);
+ blkdev_issue_flush(buftarg->bt_bdev);
}
STATIC void
@@ -495,40 +496,44 @@ xfs_init_mount_workqueues(
struct xfs_mount *mp)
{
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 1, mp->m_super->s_id);
if (!mp->m_buf_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
- WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
0, mp->m_super->s_id);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
- mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
- if (!mp->m_eofblocks_workqueue)
+ mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s",
+ WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM,
+ 0, mp->m_super->s_id);
+ if (!mp->m_blockgc_workqueue)
goto out_destroy_reclaim;
- mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
- mp->m_super->s_id);
+ mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
if (!mp->m_sync_workqueue)
goto out_destroy_eofb;
return 0;
out_destroy_eofb:
- destroy_workqueue(mp->m_eofblocks_workqueue);
+ destroy_workqueue(mp->m_blockgc_workqueue);
out_destroy_reclaim:
destroy_workqueue(mp->m_reclaim_workqueue);
out_destroy_cil:
@@ -546,7 +551,7 @@ xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
destroy_workqueue(mp->m_sync_workqueue);
- destroy_workqueue(mp->m_eofblocks_workqueue);
+ destroy_workqueue(mp->m_blockgc_workqueue);
destroy_workqueue(mp->m_reclaim_workqueue);
destroy_workqueue(mp->m_cil_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
@@ -868,39 +873,6 @@ xfs_restore_resvblks(struct xfs_mount *mp)
}
/*
- * Trigger writeback of all the dirty metadata in the file system.
- *
- * This ensures that the metadata is written to their location on disk rather
- * than just existing in transactions in the log. This means after a quiesce
- * there is no log replay required to write the inodes to disk - this is the
- * primary difference between a sync and a quiesce.
- *
- * We cancel log work early here to ensure all transactions the log worker may
- * run have finished before we clean up and log the superblock and write an
- * unmount record. The unfreeze process is responsible for restarting the log
- * worker correctly.
- */
-void
-xfs_quiesce_attr(
- struct xfs_mount *mp)
-{
- int error = 0;
-
- cancel_delayed_work_sync(&mp->m_log->l_work);
-
- /* force the log to unpin objects from the now complete transactions */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
-
- /* Push the superblock and write an unmount record */
- error = xfs_log_sbcount(mp);
- if (error)
- xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
- "Frozen image may not be consistent.");
- xfs_log_quiesce(mp);
-}
-
-/*
* Second stage of a freeze. The data is already frozen so we only
* need to take care of the metadata. Once that's done sync the superblock
* to the log to dirty it in case of a crash while frozen. This ensures that we
@@ -920,10 +892,9 @@ xfs_fs_freeze(
* set a GFP_NOFS context here to avoid recursion deadlocks.
*/
flags = memalloc_nofs_save();
- xfs_stop_block_reaping(mp);
+ xfs_blockgc_stop(mp);
xfs_save_resvblks(mp);
- xfs_quiesce_attr(mp);
- ret = xfs_sync_sb(mp, true);
+ ret = xfs_log_quiesce(mp);
memalloc_nofs_restore(flags);
return ret;
}
@@ -936,7 +907,7 @@ xfs_fs_unfreeze(
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
- xfs_start_block_reaping(mp);
+ xfs_blockgc_start(mp);
return 0;
}
@@ -1720,7 +1691,7 @@ xfs_remount_rw(
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
- xfs_start_block_reaping(mp);
+ xfs_blockgc_start(mp);
/* Create the per-AG metadata reservation pool .*/
error = xfs_fs_reserve_ag_blocks(mp);
@@ -1740,10 +1711,10 @@ xfs_remount_ro(
* Cancel background eofb scanning so it cannot race with the final
* log force+buftarg wait and deadlock the remount.
*/
- xfs_stop_block_reaping(mp);
+ xfs_blockgc_stop(mp);
/* Get rid of any leftover CoW reservations... */
- error = xfs_icache_free_cowblocks(mp, NULL);
+ error = xfs_blockgc_free_space(mp, NULL);
if (error) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
@@ -1765,7 +1736,7 @@ xfs_remount_ro(
*/
xfs_save_resvblks(mp);
- xfs_quiesce_attr(mp);
+ xfs_log_clean(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
return 0;
@@ -1872,8 +1843,6 @@ static int xfs_init_fs_context(
mutex_init(&mp->m_growlock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
- INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
- INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
/*
* We don't create the finobt per-ag space reservation until after log
@@ -1912,7 +1881,7 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("xfs");
@@ -2119,11 +2088,12 @@ xfs_init_workqueues(void)
* max_active value for this workqueue.
*/
xfs_alloc_wq = alloc_workqueue("xfsalloc",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
if (!xfs_alloc_wq)
return -ENOMEM;
- xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0);
+ xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND),
+ 0);
if (!xfs_discard_wq)
goto out_free_alloc_wq;
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index b552cf6d3379..1ca484b8357f 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -75,6 +75,12 @@ extern void xfs_qm_exit(void);
XFS_ASSERT_FATAL_STRING \
XFS_DBG_STRING /* DBG must be last */
+#ifdef DEBUG
+# define XFS_WQFLAGS(wqflags) (WQ_SYSFS | (wqflags))
+#else
+# define XFS_WQFLAGS(wqflags) (wqflags)
+#endif
+
struct xfs_inode;
struct xfs_mount;
struct xfs_buftarg;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 1f43fd7f3209..1379013d74b8 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -134,6 +134,7 @@ xfs_readlink(
int
xfs_symlink(
+ struct user_namespace *mnt_userns,
struct xfs_inode *dp,
struct xfs_name *link_name,
const char *target_path,
@@ -197,9 +198,10 @@ xfs_symlink(
fs_blocks = xfs_symlink_blocks(mp, pathlen);
resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp,
+ pdqp, resblks, &tp);
if (error)
- goto out_release_inode;
+ goto out_release_dquots;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -212,19 +214,16 @@ xfs_symlink(
goto out_trans_cancel;
}
- /*
- * Reserve disk quota : blocks and inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
+ error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
goto out_trans_cancel;
/*
* Allocate an inode for the symlink.
*/
- error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
- prid, &ip);
+ error = xfs_dir_ialloc(mnt_userns, &tp, dp, S_IFLNK | (mode & ~S_IFMT),
+ 1, 0, prid, &ip);
if (error)
goto out_trans_cancel;
@@ -300,6 +299,7 @@ xfs_symlink(
}
ASSERT(pathlen == 0);
}
+ i_size_write(VFS_I(ip), ip->i_d.di_size);
/*
* Create the directory entry for the symlink.
@@ -342,7 +342,7 @@ out_release_inode:
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index b1fa091427e6..2586b7e393f3 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -7,8 +7,9 @@
/* Kernel only symlink definitions */
-int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
- const char *target_path, umode_t mode, struct xfs_inode **ipp);
+int xfs_symlink(struct user_namespace *mnt_userns, struct xfs_inode *dp,
+ struct xfs_name *link_name, const char *target_path,
+ umode_t mode, struct xfs_inode **ipp);
int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link);
int xfs_readlink(struct xfs_inode *ip, char *link);
int xfs_inactive_symlink(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index fac9de7ee6d0..546a6cd96729 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -51,7 +51,7 @@ xfs_panic_mask_proc_handler(
#endif /* CONFIG_PROC_FS */
STATIC int
-xfs_deprecate_irix_sgid_inherit_proc_handler(
+xfs_deprecated_dointvec_minmax(
struct ctl_table *ctl,
int write,
void *buffer,
@@ -59,24 +59,8 @@ xfs_deprecate_irix_sgid_inherit_proc_handler(
loff_t *ppos)
{
if (write) {
- printk_once(KERN_WARNING
- "XFS: " "%s sysctl option is deprecated.\n",
- ctl->procname);
- }
- return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
-}
-
-STATIC int
-xfs_deprecate_irix_symlink_mode_proc_handler(
- struct ctl_table *ctl,
- int write,
- void *buffer,
- size_t *lenp,
- loff_t *ppos)
-{
- if (write) {
- printk_once(KERN_WARNING
- "XFS: " "%s sysctl option is deprecated.\n",
+ printk_ratelimited(KERN_WARNING
+ "XFS: %s sysctl option is deprecated.\n",
ctl->procname);
}
return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
@@ -88,7 +72,7 @@ static struct ctl_table xfs_table[] = {
.data = &xfs_params.sgid_inherit.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.sgid_inherit.min,
.extra2 = &xfs_params.sgid_inherit.max
},
@@ -97,7 +81,7 @@ static struct ctl_table xfs_table[] = {
.data = &xfs_params.symlink_mode.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.symlink_mode.min,
.extra2 = &xfs_params.symlink_mode.max
},
@@ -194,21 +178,21 @@ static struct ctl_table xfs_table[] = {
},
{
.procname = "speculative_prealloc_lifetime",
- .data = &xfs_params.eofb_timer.val,
+ .data = &xfs_params.blockgc_timer.val,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &xfs_params.eofb_timer.min,
- .extra2 = &xfs_params.eofb_timer.max,
+ .extra1 = &xfs_params.blockgc_timer.min,
+ .extra2 = &xfs_params.blockgc_timer.max,
},
{
.procname = "speculative_cow_prealloc_lifetime",
- .data = &xfs_params.cowb_timer.val,
+ .data = &xfs_params.blockgc_timer.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &xfs_params.cowb_timer.min,
- .extra2 = &xfs_params.cowb_timer.max,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
+ .extra1 = &xfs_params.blockgc_timer.min,
+ .extra2 = &xfs_params.blockgc_timer.max,
},
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 8abf4640f1d5..7692e76ead33 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -35,8 +35,7 @@ typedef struct xfs_param {
xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
- xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */
- xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */
+ xfs_sysctl_val_t blockgc_timer; /* Interval between blockgc scans */
} xfs_param_t;
/*
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 120398a37c2a..9b8d703dc9fd 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -29,6 +29,7 @@
#include "xfs_filestream.h"
#include "xfs_fsmap.h"
#include "xfs_btree_staging.h"
+#include "xfs_icache.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 5a263ae3d4f0..e74bbb648f83 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -37,6 +37,7 @@ struct xfs_trans_res;
struct xfs_inobt_rec_incore;
union xfs_btree_ptr;
struct xfs_dqtrx;
+struct xfs_eofblocks;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -154,10 +155,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_put);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
-DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
-DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
-DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks);
-DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_blockgc);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_blockgc);
DECLARE_EVENT_CLASS(xfs_ag_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
@@ -358,7 +357,7 @@ DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
-DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
+DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
/* not really buffer traces, but the buf provides useful information */
@@ -1287,8 +1286,8 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
)
DECLARE_EVENT_CLASS(xfs_file_class,
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
- TP_ARGS(ip, count, offset),
+ TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),
+ TP_ARGS(iocb, iter),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
@@ -1297,11 +1296,11 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__field(size_t, count)
),
TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->size = ip->i_d.di_size;
- __entry->offset = offset;
- __entry->count = count;
+ __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
+ __entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino;
+ __entry->size = XFS_I(file_inode(iocb->ki_filp))->i_d.di_size;
+ __entry->offset = iocb->ki_pos;
+ __entry->count = iov_iter_count(iter);
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1313,14 +1312,16 @@ DECLARE_EVENT_CLASS(xfs_file_class,
#define DEFINE_RW_EVENT(name) \
DEFINE_EVENT(xfs_file_class, name, \
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
- TP_ARGS(ip, count, offset))
+ TP_PROTO(struct kiocb *iocb, struct iov_iter *iter), \
+ TP_ARGS(iocb, iter))
DEFINE_RW_EVENT(xfs_file_buffered_read);
DEFINE_RW_EVENT(xfs_file_direct_read);
DEFINE_RW_EVENT(xfs_file_dax_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
+DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
+
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
@@ -3294,8 +3295,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
-DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
-
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
@@ -3888,6 +3887,47 @@ DEFINE_EVENT(xfs_timestamp_range_class, name, \
DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range);
DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
+DECLARE_EVENT_CLASS(xfs_eofblocks_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb,
+ unsigned long caller_ip),
+ TP_ARGS(mp, eofb, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(__u32, flags)
+ __field(uint32_t, uid)
+ __field(uint32_t, gid)
+ __field(prid_t, prid)
+ __field(__u64, min_file_size)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->flags = eofb ? eofb->eof_flags : 0;
+ __entry->uid = eofb ? from_kuid(mp->m_super->s_user_ns,
+ eofb->eof_uid) : 0;
+ __entry->gid = eofb ? from_kgid(mp->m_super->s_user_ns,
+ eofb->eof_gid) : 0;
+ __entry->prid = eofb ? eofb->eof_prid : 0;
+ __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->flags,
+ __entry->uid,
+ __entry->gid,
+ __entry->prid,
+ __entry->min_file_size,
+ (char *)__entry->caller_ip)
+);
+#define DEFINE_EOFBLOCKS_EVENT(name) \
+DEFINE_EVENT(xfs_eofblocks_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, eofb, caller_ip))
+DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks);
+DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index e72730f85af1..b22a09e9daee 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -20,6 +20,10 @@
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_icache.h"
kmem_zone_t *xfs_trans_zone;
@@ -68,6 +72,7 @@ xfs_trans_free(
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
trace_xfs_trans_free(tp, _RET_IP_);
+ xfs_trans_clear_context(tp);
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
@@ -119,7 +124,8 @@ xfs_trans_dup(
ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
tp->t_rtx_res = tp->t_rtx_res_used;
- ntp->t_pflags = tp->t_pflags;
+
+ xfs_trans_switch_context(tp, ntp);
/* move deferred ops over to the new tp */
xfs_defer_move(ntp, tp);
@@ -153,9 +159,6 @@ xfs_trans_reserve(
int error = 0;
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
- /* Mark this thread as being in a transaction */
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
/*
* Attempt to reserve the needed disk blocks by decrementing
* the number needed from the number available. This will
@@ -163,10 +166,8 @@ xfs_trans_reserve(
*/
if (blocks > 0) {
error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
- if (error != 0) {
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ if (error != 0)
return -ENOSPC;
- }
tp->t_blk_res += blocks;
}
@@ -240,9 +241,6 @@ undo_blocks:
xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
-
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
return error;
}
@@ -256,6 +254,7 @@ xfs_trans_alloc(
struct xfs_trans **tpp)
{
struct xfs_trans *tp;
+ bool want_retry = true;
int error;
/*
@@ -263,9 +262,11 @@ xfs_trans_alloc(
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
+retry:
tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
+ xfs_trans_set_context(tp);
/*
* Zero-reservation ("empty") transactions can't modify anything, so
@@ -285,6 +286,22 @@ xfs_trans_alloc(
tp->t_firstblock = NULLFSBLOCK;
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+ if (error == -ENOSPC && want_retry) {
+ xfs_trans_cancel(tp);
+
+ /*
+ * We weren't able to reserve enough space for the transaction.
+ * Flush the other speculative space allocations to free space.
+ * Do not perform a synchronous scan because callers can hold
+ * other locks.
+ */
+ error = xfs_blockgc_free_space(mp, NULL);
+ if (error)
+ return error;
+
+ want_retry = false;
+ goto retry;
+ }
if (error) {
xfs_trans_cancel(tp);
return error;
@@ -878,7 +895,6 @@ __xfs_trans_commit(
xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free(tp);
/*
@@ -910,7 +926,6 @@ out_unreserve:
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, !!error);
xfs_trans_free(tp);
@@ -970,9 +985,6 @@ xfs_trans_cancel(
tp->t_ticket = NULL;
}
- /* mark this thread as no longer being in a transaction */
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
xfs_trans_free_items(tp, dirty);
xfs_trans_free(tp);
}
@@ -1024,3 +1036,183 @@ xfs_trans_roll(
tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
return xfs_trans_reserve(*tpp, &tres, 0, 0);
}
+
+/*
+ * Allocate an transaction, lock and join the inode to it, and reserve quota.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The caller is responsible for
+ * releasing ILOCK_EXCL if a new transaction is returned.
+ */
+int
+xfs_trans_alloc_inode(
+ struct xfs_inode *ip,
+ struct xfs_trans_res *resv,
+ unsigned int dblocks,
+ unsigned int rblocks,
+ bool force,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, resv, dblocks,
+ rblocks / mp->m_sb.sb_rextsize,
+ force ? XFS_TRANS_RESERVE : 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_blockgc_free_quota(ip, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_cancel;
+
+ *tpp = tp;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Allocate an transaction in preparation for inode creation by reserving quota
+ * against the given dquots. Callers are not required to hold any inode locks.
+ */
+int
+xfs_trans_alloc_icreate(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *resv,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ unsigned int dblocks,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp);
+ if (error)
+ return error;
+
+ error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, dblocks);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ *tpp = tp;
+ return 0;
+}
+
+/*
+ * Allocate an transaction, lock and join the inode to it, and reserve quota
+ * in preparation for inode attribute changes that include uid, gid, or prid
+ * changes.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The ILOCK will be dropped when the
+ * transaction is committed or cancelled.
+ */
+int
+xfs_trans_alloc_ichange(
+ struct xfs_inode *ip,
+ struct xfs_dquot *new_udqp,
+ struct xfs_dquot *new_gdqp,
+ struct xfs_dquot *new_pdqp,
+ bool force,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ /*
+ * For each quota type, skip quota reservations if the inode's dquots
+ * now match the ones that came from the caller, or the caller didn't
+ * pass one in. The inode's dquots can change if we drop the ILOCK to
+ * perform a blockgc scan, so we must preserve the caller's arguments.
+ */
+ udqp = (new_udqp != ip->i_udquot) ? new_udqp : NULL;
+ gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL;
+ pdqp = (new_pdqp != ip->i_pdquot) ? new_pdqp : NULL;
+ if (udqp || gdqp || pdqp) {
+ unsigned int qflags = XFS_QMOPT_RES_REGBLKS;
+
+ if (force)
+ qflags |= XFS_QMOPT_FORCE_RES;
+
+ /*
+ * Reserve enough quota to handle blocks on disk and reserved
+ * for a delayed allocation. We'll actually transfer the
+ * delalloc reservation between dquots at chown time, even
+ * though that part is only semi-transactional.
+ */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp,
+ pdqp, ip->i_d.di_nblocks + ip->i_delayed_blks,
+ 1, qflags);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_cancel;
+ }
+
+ *tpp = tp;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 084658946cc8..9dd745cf77c9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -268,4 +268,47 @@ xfs_trans_item_relog(
return lip->li_ops->iop_relog(lip, tp);
}
+struct xfs_dquot;
+
+int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv,
+ unsigned int dblocks, unsigned int rblocks, bool force,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
+ struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp, unsigned int dblocks,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
+ struct xfs_trans **tpp);
+
+static inline void
+xfs_trans_set_context(
+ struct xfs_trans *tp)
+{
+ ASSERT(current->journal_info == NULL);
+ tp->t_pflags = memalloc_nofs_save();
+ current->journal_info = tp;
+}
+
+static inline void
+xfs_trans_clear_context(
+ struct xfs_trans *tp)
+{
+ if (current->journal_info == tp) {
+ memalloc_nofs_restore(tp->t_pflags);
+ current->journal_info = NULL;
+ }
+}
+
+static inline void
+xfs_trans_switch_context(
+ struct xfs_trans *old_tp,
+ struct xfs_trans *new_tp)
+{
+ ASSERT(current->journal_info == old_tp);
+ new_tp->t_pflags = old_tp->t_pflags;
+ old_tp->t_pflags = 0;
+ current->journal_info = new_tp;
+}
+
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 28b8ac701919..48e09ea30ee5 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -16,6 +16,7 @@
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
+#include "xfs_error.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -691,9 +692,11 @@ xfs_trans_dqresv(
nblks);
xfs_trans_mod_dquot(tp, dqp, XFS_TRANS_DQ_RES_INOS, ninos);
}
- ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
- ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
- ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
+
+ if (XFS_IS_CORRUPT(mp, dqp->q_blk.reserved < dqp->q_blk.count) ||
+ XFS_IS_CORRUPT(mp, dqp->q_rtb.reserved < dqp->q_rtb.count) ||
+ XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count))
+ goto error_corrupt;
xfs_dqunlock(dqp);
return 0;
@@ -703,6 +706,10 @@ error_return:
if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ)
return -ENOSPC;
return -EDQUOT;
+error_corrupt:
+ xfs_dqunlock(dqp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return -EFSCORRUPTED;
}
@@ -780,28 +787,60 @@ int
xfs_trans_reserve_quota_nblks(
struct xfs_trans *tp,
struct xfs_inode *ip,
- int64_t nblks,
- long ninos,
- uint flags)
+ int64_t dblocks,
+ int64_t rblocks,
+ bool force)
{
struct xfs_mount *mp = ip->i_mount;
+ unsigned int qflags = 0;
+ int error;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return 0;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
-
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT((flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_RTBLKS ||
- (flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_BLKS);
- /*
- * Reserve nblks against these dquots, with trans as the mediator.
- */
- return xfs_trans_reserve_quota_bydquots(tp, mp,
- ip->i_udquot, ip->i_gdquot,
- ip->i_pdquot,
- nblks, ninos, flags);
+ if (force)
+ qflags |= XFS_QMOPT_FORCE_RES;
+
+ /* Reserve data device quota against the inode's dquots. */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, dblocks, 0,
+ XFS_QMOPT_RES_REGBLKS | qflags);
+ if (error)
+ return error;
+
+ /* Do the same but for realtime blocks. */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, rblocks, 0,
+ XFS_QMOPT_RES_RTBLKS | qflags);
+ if (error) {
+ xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, -dblocks, 0,
+ XFS_QMOPT_RES_REGBLKS);
+ return error;
+ }
+
+ return 0;
+}
+
+/* Change the quota reservations for an inode creation activity. */
+int
+xfs_trans_reserve_quota_icreate(
+ struct xfs_trans *tp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ int64_t dblocks)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp,
+ dblocks, 1, XFS_QMOPT_RES_REGBLKS);
}
/*
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index bca48b308c02..12be32f66dc1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -38,9 +38,10 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
}
static int
-xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
- struct inode *inode, const char *name, const void *value,
- size_t size, int flags)
+xfs_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns, struct dentry *unused,
+ struct inode *inode, const char *name, const void *value,
+ size_t size, int flags)
{
struct xfs_da_args args = {
.dp = XFS_I(inode),