diff options
Diffstat (limited to 'fs/xfs')
-rw-r--r-- | fs/xfs/libxfs/xfs_ag.c | 6 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_inode_buf.c | 10 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_rtbitmap.c | 6 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_trans_inode.c | 2 | ||||
-rw-r--r-- | fs/xfs/scrub/xfile.c | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_bmap_util.c | 7 | ||||
-rw-r--r-- | fs/xfs/xfs_buf.c | 22 | ||||
-rw-r--r-- | fs/xfs/xfs_buf.h | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_discard.c | 266 | ||||
-rw-r--r-- | fs/xfs/xfs_discard.h | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_extent_busy.c | 35 | ||||
-rw-r--r-- | fs/xfs/xfs_extent_busy.h | 24 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_inode_item.c | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_ioctl32.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_iops.c | 13 | ||||
-rw-r--r-- | fs/xfs/xfs_itable.c | 12 | ||||
-rw-r--r-- | fs/xfs/xfs_log_cil.c | 93 | ||||
-rw-r--r-- | fs/xfs/xfs_log_priv.h | 5 | ||||
-rw-r--r-- | fs/xfs/xfs_notify_failure.c | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_rtalloc.c | 30 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c | 42 | ||||
-rw-r--r-- | fs/xfs/xfs_xattr.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_xattr.h | 2 |
24 files changed, 411 insertions, 192 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index e9cc481b4ddf..f9f4d694640d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -1001,6 +1001,12 @@ xfs_ag_shrink_space( error = -ENOSPC; goto resv_init_out; } + + /* Update perag geometry */ + pag->block_count -= delta; + __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, + &pag->agino_max); + xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); return 0; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index a35781577cad..543f3748c2a3 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -220,8 +220,10 @@ xfs_inode_from_disk( * a time before epoch is converted to a time long after epoch * on 64 bit systems. */ - inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); - inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); + inode_set_atime_to_ts(inode, + xfs_inode_from_disk_ts(from, from->di_atime)); + inode_set_mtime_to_ts(inode, + xfs_inode_from_disk_ts(from, from->di_mtime)); inode_set_ctime_to_ts(inode, xfs_inode_from_disk_ts(from, from->di_ctime)); @@ -315,8 +317,8 @@ xfs_inode_to_disk( to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); - to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); - to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); + to->di_atime = xfs_inode_to_disk_ts(ip, inode_get_atime(inode)); + to->di_mtime = xfs_inode_to_disk_ts(ip, inode_get_mtime(inode)); to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode)); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index fa180ab66b73..396648acb5be 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -970,6 +970,7 @@ xfs_rtfree_extent( xfs_mount_t *mp; /* file system mount structure */ xfs_fsblock_t sb; /* summary file block number */ struct xfs_buf *sumbp = NULL; /* summary file block buffer */ + struct timespec64 atime; mp = tp->t_mountp; @@ -999,7 +1000,10 @@ xfs_rtfree_extent( mp->m_sb.sb_rextents) { if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; - *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; + + atime = inode_get_atime(VFS_I(mp->m_rbmip)); + *((uint64_t *)&atime) = 0; + inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } return 0; diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 6b2296ff248a..70e97ea6eee7 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -65,7 +65,7 @@ xfs_trans_ichgtime( tv = current_time(inode); if (flags & XFS_ICHGTIME_MOD) - inode->i_mtime = tv; + inode_set_mtime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CHG) inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index d98e8e77c684..090c3ead43fd 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -10,7 +10,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_format.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/scrub.h" diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fcefab687285..40e0a1f1f753 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1644,7 +1644,7 @@ xfs_swap_extents( uint64_t f; int resblks = 0; unsigned int flags = 0; - struct timespec64 ctime; + struct timespec64 ctime, mtime; /* * Lock the inodes against other IO, page faults and truncate to @@ -1758,10 +1758,11 @@ xfs_swap_extents( * under it. */ ctime = inode_get_ctime(VFS_I(ip)); + mtime = inode_get_mtime(VFS_I(ip)); if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) || (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) || - (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || - (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + (sbp->bs_mtime.tv_sec != mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) { error = -EBUSY; goto out_trans_cancel; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9e7ba04572db..545c7991b9b5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1943,8 +1943,6 @@ void xfs_free_buftarg( struct xfs_buftarg *btp) { - struct block_device *bdev = btp->bt_bdev; - shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); @@ -1952,8 +1950,8 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ - if (bdev != btp->bt_mount->m_super->s_bdev) - blkdev_put(bdev, btp->bt_mount->m_super); + if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) + bdev_release(btp->bt_bdev_handle); kmem_free(btp); } @@ -1988,16 +1986,15 @@ xfs_setsize_buftarg( */ STATIC int xfs_setsize_buftarg_early( - xfs_buftarg_t *btp, - struct block_device *bdev) + xfs_buftarg_t *btp) { - return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); + return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); } struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev) + struct bdev_handle *bdev_handle) { xfs_buftarg_t *btp; const struct dax_holder_operations *ops = NULL; @@ -2008,9 +2005,10 @@ xfs_alloc_buftarg( btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; - btp->bt_dev = bdev->bd_dev; - btp->bt_bdev = bdev; - btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, + btp->bt_bdev_handle = bdev_handle; + btp->bt_dev = bdev_handle->bdev->bd_dev; + btp->bt_bdev = bdev_handle->bdev; + btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); /* @@ -2020,7 +2018,7 @@ xfs_alloc_buftarg( ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, DEFAULT_RATELIMIT_BURST); - if (xfs_setsize_buftarg_early(btp, bdev)) + if (xfs_setsize_buftarg_early(btp)) goto error_free; if (list_lru_init(&btp->bt_lru)) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 702e7d9ea2ac..c86e16419656 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -98,6 +98,7 @@ typedef unsigned int xfs_buf_flags_t; */ typedef struct xfs_buftarg { dev_t bt_dev; + struct bdev_handle *bt_bdev_handle; struct block_device *bt_bdev; struct dax_device *bt_daxdev; u64 bt_dax_part_off; @@ -364,7 +365,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) * Handling of buftargs. */ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, - struct block_device *bdev); + struct bdev_handle *bdev_handle); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index afc4c78b9eed..d5787991bb5b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2010, 2023 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" @@ -19,21 +19,147 @@ #include "xfs_log.h" #include "xfs_ag.h" -STATIC int -xfs_trim_extents( +/* + * Notes on an efficient, low latency fstrim algorithm + * + * We need to walk the filesystem free space and issue discards on the free + * space that meet the search criteria (size and location). We cannot issue + * discards on extents that might be in use, or are so recently in use they are + * still marked as busy. To serialise against extent state changes whilst we are + * gathering extents to trim, we must hold the AGF lock to lock out other + * allocations and extent free operations that might change extent state. + * + * However, we cannot just hold the AGF for the entire AG free space walk whilst + * we issue discards on each free space that is found. Storage devices can have + * extremely slow discard implementations (e.g. ceph RBD) and so walking a + * couple of million free extents and issuing synchronous discards on each + * extent can take a *long* time. Whilst we are doing this walk, nothing else + * can access the AGF, and we can stall transactions and hence the log whilst + * modifications wait for the AGF lock to be released. This can lead hung tasks + * kicking the hung task timer and rebooting the system. This is bad. + * + * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI + * lock, gathers a range of inode cluster buffers that are allocated, drops the + * AGI lock and then reads all the inode cluster buffers and processes them. It + * loops doing this, using a cursor to keep track of where it is up to in the AG + * for each iteration to restart the INOBT lookup from. + * + * We can't do this exactly with free space - once we drop the AGF lock, the + * state of the free extent is out of our control and we cannot run a discard + * safely on it in this situation. Unless, of course, we've marked the free + * extent as busy and undergoing a discard operation whilst we held the AGF + * locked. + * + * This is exactly how online discard works - free extents are marked busy when + * they are freed, and once the extent free has been committed to the journal, + * the busy extent record is marked as "undergoing discard" and the discard is + * then issued on the free extent. Once the discard completes, the busy extent + * record is removed and the extent is able to be allocated again. + * + * In the context of fstrim, if we find a free extent we need to discard, we + * don't have to discard it immediately. All we need to do it record that free + * extent as being busy and under discard, and all the allocation routines will + * now avoid trying to allocate it. Hence if we mark the extent as busy under + * the AGF lock, we can safely discard it without holding the AGF lock because + * nothing will attempt to allocate that free space until the discard completes. + * + * This also allows us to issue discards asynchronously like we do with online + * discard, and so for fast devices fstrim will run much faster as we can have + * multiple discard operations in flight at once, as well as pipeline the free + * extent search so that it overlaps in flight discard IO. + */ + +struct workqueue_struct *xfs_discard_wq; + +static void +xfs_discard_endio_work( + struct work_struct *work) +{ + struct xfs_busy_extents *extents = + container_of(work, struct xfs_busy_extents, endio_work); + + xfs_extent_busy_clear(extents->mount, &extents->extent_list, false); + kmem_free(extents->owner); +} + +/* + * Queue up the actual completion to a thread to avoid IRQ-safe locking for + * pagb_lock. + */ +static void +xfs_discard_endio( + struct bio *bio) +{ + struct xfs_busy_extents *extents = bio->bi_private; + + INIT_WORK(&extents->endio_work, xfs_discard_endio_work); + queue_work(xfs_discard_wq, &extents->endio_work); + bio_put(bio); +} + +/* + * Walk the discard list and issue discards on all the busy extents in the + * list. We plug and chain the bios so that we only need a single completion + * call to clear all the busy extents once the discards are complete. + */ +int +xfs_discard_extents( + struct xfs_mount *mp, + struct xfs_busy_extents *extents) +{ + struct xfs_extent_busy *busyp; + struct bio *bio = NULL; + struct blk_plug plug; + int error = 0; + + blk_start_plug(&plug); + list_for_each_entry(busyp, &extents->extent_list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, &bio); + if (error && error != -EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llx,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + break; + } + } + + if (bio) { + bio->bi_private = extents; + bio->bi_end_io = xfs_discard_endio; + submit_bio(bio); + } else { + xfs_discard_endio_work(&extents->endio_work); + } + blk_finish_plug(&plug); + + return error; +} + + +static int +xfs_trim_gather_extents( struct xfs_perag *pag, xfs_daddr_t start, xfs_daddr_t end, xfs_daddr_t minlen, + struct xfs_alloc_rec_incore *tcur, + struct xfs_busy_extents *extents, uint64_t *blocks_trimmed) { struct xfs_mount *mp = pag->pag_mount; - struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; - struct xfs_agf *agf; int error; int i; + int batch = 100; /* * Force out the log. This means any transactions that might have freed @@ -45,20 +171,28 @@ xfs_trim_extents( error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); if (error) return error; - agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); /* - * Look up the longest btree in the AGF and start with it. + * Look up the extent length requested in the AGF and start with it. */ - error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i); + if (tcur->ar_startblock == NULLAGBLOCK) + error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i); + else + error = xfs_alloc_lookup_le(cur, tcur->ar_startblock, + tcur->ar_blockcount, &i); if (error) goto out_del_cursor; + if (i == 0) { + /* nothing of that length left in the AG, we are done */ + tcur->ar_blockcount = 0; + goto out_del_cursor; + } /* * Loop until we are done with all extents that are large - * enough to be worth discarding. + * enough to be worth discarding or we hit batch limits. */ while (i) { xfs_agblock_t fbno; @@ -73,7 +207,16 @@ xfs_trim_extents( error = -EFSCORRUPTED; break; } - ASSERT(flen <= be32_to_cpu(agf->agf_longest)); + + if (--batch <= 0) { + /* + * Update the cursor to point at this extent so we + * restart the next batch from this extent. + */ + tcur->ar_startblock = fbno; + tcur->ar_blockcount = flen; + break; + } /* * use daddr format for all range/len calculations as that is @@ -88,6 +231,7 @@ xfs_trim_extents( */ if (dlen < minlen) { trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + tcur->ar_blockcount = 0; break; } @@ -110,29 +254,103 @@ xfs_trim_extents( goto next_extent; } - trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen); - error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); - if (error) - break; + xfs_extent_busy_insert_discard(pag, fbno, flen, + &extents->extent_list); *blocks_trimmed += flen; - next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) break; - if (fatal_signal_pending(current)) { - error = -ERESTARTSYS; - break; - } + /* + * If there's no more records in the tree, we are done. Set the + * cursor block count to 0 to indicate to the caller that there + * is no more extents to search. + */ + if (i == 0) + tcur->ar_blockcount = 0; } + /* + * If there was an error, release all the gathered busy extents because + * we aren't going to issue a discard on them any more. + */ + if (error) + xfs_extent_busy_clear(mp, &extents->extent_list, false); out_del_cursor: xfs_btree_del_cursor(cur, error); xfs_buf_relse(agbp); return error; } +static bool +xfs_trim_should_stop(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + +/* + * Iterate the free list gathering extents and discarding them. We need a cursor + * for the repeated iteration of gather/discard loop, so use the longest extent + * we found in the last batch as the key to start the next. + */ +static int +xfs_trim_extents( + struct xfs_perag *pag, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, + uint64_t *blocks_trimmed) +{ + struct xfs_alloc_rec_incore tcur = { + .ar_blockcount = pag->pagf_longest, + .ar_startblock = NULLAGBLOCK, + }; + int error = 0; + + do { + struct xfs_busy_extents *extents; + + extents = kzalloc(sizeof(*extents), GFP_KERNEL); + if (!extents) { + error = -ENOMEM; + break; + } + + extents->mount = pag->pag_mount; + extents->owner = extents; + INIT_LIST_HEAD(&extents->extent_list); + + error = xfs_trim_gather_extents(pag, start, end, minlen, + &tcur, extents, blocks_trimmed); + if (error) { + kfree(extents); + break; + } + + /* + * We hand the extent list to the discard function here so the + * discarded extents can be removed from the busy extent list. + * This allows the discards to run asynchronously with gathering + * the next round of extents to discard. + * + * However, we must ensure that we do not reference the extent + * list after this function call, as it may have been freed by + * the time control returns to us. + */ + error = xfs_discard_extents(pag->pag_mount, extents); + if (error) + break; + + if (xfs_trim_should_stop()) + break; + + } while (tcur.ar_blockcount != 0); + + return error; + +} + /* * trim a range of the filesystem. * @@ -195,12 +413,12 @@ xfs_ioc_trim( for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) { error = xfs_trim_extents(pag, start, end, minlen, &blocks_trimmed); - if (error) { + if (error) last_error = error; - if (error == -ERESTARTSYS) { - xfs_perag_rele(pag); - break; - } + + if (xfs_trim_should_stop()) { + xfs_perag_rele(pag); + break; } } diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h index de92d9cc958f..2b1a85223a56 100644 --- a/fs/xfs/xfs_discard.h +++ b/fs/xfs/xfs_discard.h @@ -3,8 +3,10 @@ #define XFS_DISCARD_H 1 struct fstrim_range; -struct list_head; +struct xfs_mount; +struct xfs_busy_extents; -extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy); +int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim); #endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 7c2fdc71e42d..9ecfdcdc752f 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -19,13 +19,13 @@ #include "xfs_log.h" #include "xfs_ag.h" -void -xfs_extent_busy_insert( - struct xfs_trans *tp, +static void +xfs_extent_busy_insert_list( struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, - unsigned int flags) + unsigned int flags, + struct list_head *busy_list) { struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; @@ -40,7 +40,7 @@ xfs_extent_busy_insert( new->flags = flags; /* trace before insert to be able to see failed inserts */ - trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len); + trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len); spin_lock(&pag->pagb_lock); rbp = &pag->pagb_tree.rb_node; @@ -62,10 +62,33 @@ xfs_extent_busy_insert( rb_link_node(&new->rb_node, parent, rbp); rb_insert_color(&new->rb_node, &pag->pagb_tree); - list_add(&new->list, &tp->t_busy); + /* always process discard lists in fifo order */ + list_add_tail(&new->list, busy_list); spin_unlock(&pag->pagb_lock); } +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy); +} + +void +xfs_extent_busy_insert_discard( + struct xfs_perag *pag, + xfs_agblock_t bno, + xfs_extlen_t len, + struct list_head *busy_list) +{ + xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED, + busy_list); +} + /* * Search for a busy extent within the range of the extent we are about to * allocate. You need to be holding the busy extent tree lock when calling diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index c37bf87e6781..0639aab336f3 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -16,9 +16,6 @@ struct xfs_alloc_arg; /* * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that * have been freed but whose transactions aren't committed to disk yet. - * - * Note that we use the transaction ID to record the transaction, not the - * transaction structure itself. See xfs_extent_busy_insert() for details. */ struct xfs_extent_busy { struct rb_node rb_node; /* ag by-bno indexed search tree */ @@ -31,11 +28,32 @@ struct xfs_extent_busy { #define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ }; +/* + * List used to track groups of related busy extents all the way through + * to discard completion. + */ +struct xfs_busy_extents { + struct xfs_mount *mount; + struct list_head extent_list; + struct work_struct endio_work; + + /* + * Owner is the object containing the struct xfs_busy_extents to free + * once the busy extents have been processed. If only the + * xfs_busy_extents object needs freeing, then point this at itself. + */ + void *owner; +}; + void xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); void +xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno, + xfs_extlen_t len, struct list_head *busy_list); + +void xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, bool do_discard); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4d55f58d99b7..36f5cf802c07 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -844,8 +844,8 @@ xfs_init_new_inode( ASSERT(ip->i_nblocks == 0); tv = inode_set_ctime_current(inode); - inode->i_mtime = tv; - inode->i_atime = tv; + inode_set_mtime_to_ts(inode, tv); + inode_set_atime_to_ts(inode, tv); ip->i_extsize = 0; ip->i_diflags = 0; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 127b2410eb20..17c51804f9c6 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -526,8 +526,8 @@ xfs_inode_to_log_dinode( to->di_projid_hi = ip->i_projid >> 16; memset(to->di_pad3, 0, sizeof(to->di_pad3)); - to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); - to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); + to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode)); + to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode)); to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode)); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index c14852362fce..052d0e888c27 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -22,7 +22,7 @@ /* * On intel, even if sizes match, alignment and/or padding may differ. */ -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#if defined(CONFIG_X86_64) #define BROKEN_X86_ALIGNMENT #define __compat_packed __attribute__((packed)) #else diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1c1e6171209d..fdfda4fba12b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -572,8 +572,8 @@ xfs_vn_getattr( stat->uid = vfsuid_into_kuid(vfsuid); stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; - stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; + stat->atime = inode_get_atime(inode); + stat->mtime = inode_get_mtime(inode); stat->ctime = inode_get_ctime(inode); stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); @@ -584,6 +584,11 @@ xfs_vn_getattr( } } + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->change_cookie = inode_query_iversion(inode); + stat->result_mask |= STATX_CHANGE_COOKIE; + } + /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. @@ -1062,9 +1067,9 @@ xfs_vn_update_time( now = current_time(inode); if (flags & S_MTIME) - inode->i_mtime = now; + inode_set_mtime_to_ts(inode, now); if (flags & S_ATIME) - inode->i_atime = now; + inode_set_atime_to_ts(inode, now); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, log_flags); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f5377ba5967a..14462614fcc8 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -107,12 +107,12 @@ xfs_bulkstat_one_int( buf->bs_size = ip->i_disk_size; buf->bs_nlink = inode->i_nlink; - buf->bs_atime = inode->i_atime.tv_sec; - buf->bs_atime_nsec = inode->i_atime.tv_nsec; - buf->bs_mtime = inode->i_mtime.tv_sec; - buf->bs_mtime_nsec = inode->i_mtime.tv_nsec; - buf->bs_ctime = inode_get_ctime(inode).tv_sec; - buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec; + buf->bs_atime = inode_get_atime_sec(inode); + buf->bs_atime_nsec = inode_get_atime_nsec(inode); + buf->bs_mtime = inode_get_mtime_sec(inode); + buf->bs_mtime_nsec = inode_get_mtime_nsec(inode); + buf->bs_ctime = inode_get_ctime_sec(inode); + buf->bs_ctime_nsec = inode_get_ctime_nsec(inode); buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ebc70aaa299c..67a99d94701e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -16,8 +16,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_trace.h" - -struct workqueue_struct *xfs_discard_wq; +#include "xfs_discard.h" /* * Allocate a new ticket. Failing to get a new ticket makes it really hard to @@ -103,7 +102,7 @@ xlog_cil_ctx_alloc(void) ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); + INIT_LIST_HEAD(&ctx->busy_extents.extent_list); INIT_LIST_HEAD(&ctx->log_items); INIT_LIST_HEAD(&ctx->lv_chain); INIT_WORK(&ctx->push_work, xlog_cil_push_work); @@ -132,7 +131,7 @@ xlog_cil_push_pcp_aggregate( if (!list_empty(&cilpcp->busy_extents)) { list_splice_init(&cilpcp->busy_extents, - &ctx->busy_extents); + &ctx->busy_extents.extent_list); } if (!list_empty(&cilpcp->log_items)) list_splice_init(&cilpcp->log_items, &ctx->log_items); @@ -708,76 +707,6 @@ xlog_cil_free_logvec( } } -static void -xlog_discard_endio_work( - struct work_struct *work) -{ - struct xfs_cil_ctx *ctx = - container_of(work, struct xfs_cil_ctx, discard_endio_work); - struct xfs_mount *mp = ctx->cil->xc_log->l_mp; - - xfs_extent_busy_clear(mp, &ctx->busy_extents, false); - kmem_free(ctx); -} - -/* - * Queue up the actual completion to a thread to avoid IRQ-safe locking for - * pagb_lock. Note that we need a unbounded workqueue, otherwise we might - * get the execution delayed up to 30 seconds for weird reasons. - */ -static void -xlog_discard_endio( - struct bio *bio) -{ - struct xfs_cil_ctx *ctx = bio->bi_private; - - INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work); - queue_work(xfs_discard_wq, &ctx->discard_endio_work); - bio_put(bio); -} - -static void -xlog_discard_busy_extents( - struct xfs_mount *mp, - struct xfs_cil_ctx *ctx) -{ - struct list_head *list = &ctx->busy_extents; - struct xfs_extent_busy *busyp; - struct bio *bio = NULL; - struct blk_plug plug; - int error = 0; - - ASSERT(xfs_has_discard(mp)); - - blk_start_plug(&plug); - list_for_each_entry(busyp, list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); - - error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), - XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, &bio); - if (error && error != -EOPNOTSUPP) { - xfs_info(mp, - "discard failed for extent [0x%llx,%u], error %d", - (unsigned long long)busyp->bno, - busyp->length, - error); - break; - } - } - - if (bio) { - bio->bi_private = ctx; - bio->bi_end_io = xlog_discard_endio; - submit_bio(bio); - } else { - xlog_discard_endio_work(&ctx->discard_endio_work); - } - blk_finish_plug(&plug); -} - /* * Mark all items committed and clear busy extents. We free the log vector * chains in a separate pass so that we unpin the log items as quickly as @@ -807,8 +736,8 @@ xlog_cil_committed( xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain, ctx->start_lsn, abort); - xfs_extent_busy_sort(&ctx->busy_extents); - xfs_extent_busy_clear(mp, &ctx->busy_extents, + xfs_extent_busy_sort(&ctx->busy_extents.extent_list); + xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list, xfs_has_discard(mp) && !abort); spin_lock(&ctx->cil->xc_push_lock); @@ -817,10 +746,14 @@ xlog_cil_committed( xlog_cil_free_logvec(&ctx->lv_chain); - if (!list_empty(&ctx->busy_extents)) - xlog_discard_busy_extents(mp, ctx); - else - kmem_free(ctx); + if (!list_empty(&ctx->busy_extents.extent_list)) { + ctx->busy_extents.mount = mp; + ctx->busy_extents.owner = ctx; + xfs_discard_extents(mp, &ctx->busy_extents); + return; + } + + kmem_free(ctx); } void diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index af87648331d5..fa3ad1d7b31c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -6,6 +6,8 @@ #ifndef __XFS_LOG_PRIV_H__ #define __XFS_LOG_PRIV_H__ +#include "xfs_extent_busy.h" /* for struct xfs_busy_extents */ + struct xfs_buf; struct xlog; struct xlog_ticket; @@ -223,12 +225,11 @@ struct xfs_cil_ctx { struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ atomic_t space_used; /* aggregate size of regions */ - struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_busy_extents busy_extents; struct list_head log_items; /* log items in chkpt */ struct list_head lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ - struct work_struct discard_endio_work; struct work_struct push_work; atomic_t order_id; diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 4a9bbd3fe120..a7daa522e00f 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -126,8 +126,8 @@ xfs_dax_notify_ddev_failure( struct xfs_rmap_irec ri_low = { }; struct xfs_rmap_irec ri_high; struct xfs_agf *agf; - xfs_agblock_t agend; struct xfs_perag *pag; + xfs_agblock_t range_agend; pag = xfs_perag_get(mp, agno); error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); @@ -148,10 +148,10 @@ xfs_dax_notify_ddev_failure( ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); agf = agf_bp->b_addr; - agend = min(be32_to_cpu(agf->agf_length), + range_agend = min(be32_to_cpu(agf->agf_length) - 1, ri_high.rm_startblock); notify.startblock = ri_low.rm_startblock; - notify.blockcount = agend - ri_low.rm_startblock; + notify.blockcount = range_agend + 1 - ri_low.rm_startblock; error = xfs_rmap_query_range(cur, &ri_low, &ri_high, xfs_dax_failure_fn, ¬ify); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 16534e9873f6..2e1a4e5cd03d 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1420,25 +1420,26 @@ xfs_rtunmount_inodes( */ int /* error */ xfs_rtpick_extent( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_extlen_t len, /* allocation length (rtextents) */ - xfs_rtblock_t *pick) /* result rt extent */ -{ - xfs_rtblock_t b; /* result block */ - int log2; /* log of sequence number */ - uint64_t resid; /* residual after log removed */ - uint64_t seq; /* sequence number of file creation */ - uint64_t *seqp; /* pointer to seqno in inode */ + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick) /* result rt extent */ + { + xfs_rtblock_t b; /* result block */ + int log2; /* log of sequence number */ + uint64_t resid; /* residual after log removed */ + uint64_t seq; /* sequence number of file creation */ + struct timespec64 ts; /* temporary timespec64 storage */ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime; if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; - *seqp = 0; + seq = 0; + } else { + ts = inode_get_atime(VFS_I(mp->m_rbmip)); + seq = (uint64_t)ts.tv_sec; } - seq = *seqp; if ((log2 = xfs_highbit64(seq)) == -1) b = 0; else { @@ -1450,7 +1451,8 @@ xfs_rtpick_extent( if (b + len > mp->m_sb.sb_rextents) b = mp->m_sb.sb_rextents - len; } - *seqp = seq + 1; + ts.tv_sec = (time64_t)seq + 1; + inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); *pick = b; return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 819a3568b28f..f0ae07828153 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -361,14 +361,15 @@ STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, - struct block_device **bdevp) + struct bdev_handle **handlep) { int error = 0; - *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, - mp->m_super, &fs_holder_ops); - if (IS_ERR(*bdevp)) { - error = PTR_ERR(*bdevp); + *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, + mp->m_super, &fs_holder_ops); + if (IS_ERR(*handlep)) { + error = PTR_ERR(*handlep); + *handlep = NULL; xfs_warn(mp, "Invalid device [%s], error=%d", name, error); } @@ -433,7 +434,7 @@ xfs_open_devices( { struct super_block *sb = mp->m_super; struct block_device *ddev = sb->s_bdev; - struct block_device *logdev = NULL, *rtdev = NULL; + struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL; int error; /* @@ -446,17 +447,19 @@ xfs_open_devices( * Open real time and log devices - order is important. */ if (mp->m_logname) { - error = xfs_blkdev_get(mp, mp->m_logname, &logdev); + error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle); if (error) goto out_relock; } if (mp->m_rtname) { - error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev); + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle); if (error) goto out_close_logdev; - if (rtdev == ddev || rtdev == logdev) { + if (rtdev_handle->bdev == ddev || + (logdev_handle && + rtdev_handle->bdev == logdev_handle->bdev)) { xfs_warn(mp, "Cannot mount filesystem with identical rtdev and ddev/logdev."); error = -EINVAL; @@ -468,22 +471,25 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle); if (!mp->m_ddev_targp) goto out_close_rtdev; - if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); + if (rtdev_handle) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } - if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); + if (logdev_handle && logdev_handle->bdev != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { mp->m_logdev_targp = mp->m_ddev_targp; + /* Handle won't be used, drop it */ + if (logdev_handle) + bdev_release(logdev_handle); } error = 0; @@ -497,11 +503,11 @@ out_relock: out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - if (rtdev) - blkdev_put(rtdev, sb); + if (rtdev_handle) + bdev_release(rtdev_handle); out_close_logdev: - if (logdev && logdev != ddev) - blkdev_put(logdev, sb); + if (logdev_handle) + bdev_release(logdev_handle); goto out_relock; } diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index a3975f325f4e..987843f84d03 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -186,7 +186,7 @@ static const struct xattr_handler xfs_xattr_security_handler = { .set = xfs_xattr_set, }; -const struct xattr_handler *xfs_xattr_handlers[] = { +const struct xattr_handler * const xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h index 2b09133b1b9b..cec766cad26c 100644 --- a/fs/xfs/xfs_xattr.h +++ b/fs/xfs/xfs_xattr.h @@ -8,6 +8,6 @@ int xfs_attr_change(struct xfs_da_args *args); -extern const struct xattr_handler *xfs_xattr_handlers[]; +extern const struct xattr_handler * const xfs_xattr_handlers[]; #endif /* __XFS_XATTR_H__ */ |