aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/bio.c
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-04-26 09:13:44 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-04-26 09:13:44 -0700
commit: 85d7ab2463822a4ab096c0b7b59feec962552572 (patch)
tree: c48c09b785202f626d4b86493bdd4751ec45ad2f /fs/btrfs/bio.c
parent: 94fc0792661a96d64a4bb79cf10d0793ecadf76e (diff)
parent: f372463124df5f980de6ee0cd6000a3e43df0e01 (diff)
Merge tag 'for-6.4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
 "Mostly core changes and cleanups, some notable fixes and two
  performance improvements in directory logging.

  The IO path cleanups are removing or refactoring old code, scrub main
  loop has been completely rewritten also refactoring old code.

  There are some changes to non-btrfs code, mostly trivial, the cgroup
  punt bio logic is only moved from generic code.

  Performance improvements:

   - improve logging changes in a directory during one transaction,
     avoid iterating over items and reduce lock contention (fsync time
     4x lower)

   - when logging directory entries during one transaction, reduce
     locking of subvolume trees by checking tree-log instead
     (improvement in throughput and latency for concurrent access to a
     subvolume)

  Notable fixes:

   - dev-replace:
      - properly honor read mode when requested to avoid reading from
        source device
      - target device won't be used for eventual read repair, this is
        unreliable for NODATASUM files
      - when there are unrepaired (and unrepairable) metadata during
        replace, exit early with error and don't try to finish whole
        operation

   - scrub ioctl properly rejects unknown flags

   - fix global block reserve calculations

   - fix partial direct io write when there's a page fault in the
     middle, iomap will try to continue with partial request but the
     btrfs part did not match that, this can lead to zeros written
     instead of data

  Core changes:

   - io path:
      - continued cleanups and refactoring around bio handling
      - extent io submit path simplifications and cleanups
      - flush write path simplifications and cleanups
      - rework logic of passing sync mode of bio, with further cleanups

   - rewrite scrub code flow, restructure how the stripes are enumerated
     and verified in a more unified way

   - allow to set lower threshold for block group reclaim in debug mode
     to aid zoned mode testing

   - remove obsolete time-based delayed ref throttling logic when
     truncating items

   - DREW locks are not using percpu variables anymore

   - more warning fixes (-Wmaybe-uninitialized)

   - u64 division simplifications

   - error handling improvements

  Non-btrfs code changes:

   - push cgroup punt bio logic to btrfs code (there was no other user
     of that), the functionality can be now selected separately by
     BLK_CGROUP_PUNT_BIO

   - crc32c_impl removed after removing last uses in btrfs code

   - add btrfs_assertfail() to objtool table"

* tag 'for-6.4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (147 commits)
  btrfs: mark btrfs_assertfail() __noreturn
  btrfs: fix uninitialized variable warnings
  btrfs: use log root when iterating over index keys when logging directory
  btrfs: avoid iterating over all indexes when logging directory
  btrfs: dev-replace: error out if we have unrepaired metadata error during
  btrfs: remove pointless loop at btrfs_get_next_valid_item()
  btrfs: scrub: reject unsupported scrub flags
  btrfs: reinterpret async discard iops_limit=0 as no delay
  btrfs: set default discard iops_limit to 1000
  btrfs: remove unused raid56 functions which were dedicated for scrub
  btrfs: scrub: remove scrub_bio structure
  btrfs: scrub: remove scrub_block and scrub_sector structures
  btrfs: scrub: remove the old scrub recheck code
  btrfs: scrub: remove the old writeback infrastructure
  btrfs: scrub: remove scrub_parity structure
  btrfs: scrub: use scrub_stripe to implement RAID56 P/Q scrub
  btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure
  btrfs: scrub: introduce helper to queue a stripe for scrub
  btrfs: scrub: introduce error reporting functionality for scrub_stripe
  btrfs: scrub: introduce a writeback helper for scrub_stripe
  ...
Diffstat (limited to 'fs/btrfs/bio.c')
-rw-r--r-- fs/btrfs/bio.c 211
1 files changed, 123 insertions, 88 deletions
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 726592868e9c..5379c4714905 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -31,11 +31,11 @@ struct btrfs_failed_bio {
* Initialize a btrfs_bio structure. This skips the embedded bio itself as it
* is already initialized by the block layer.
*/
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
btrfs_bio_end_io_t end_io, void *private)
{
memset(bbio, 0, offsetof(struct btrfs_bio, bio));
- bbio->inode = inode;
+ bbio->fs_info = fs_info;
bbio->end_io = end_io;
bbio->private = private;
atomic_set(&bbio->pending_ios, 1);
@@ -48,41 +48,58 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
* Just like the underlying bio_alloc_bioset it will not fail as it is backed by
* a mempool.
*/
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
- struct btrfs_inode *inode,
- btrfs_bio_end_io_t end_io, void *private)
+struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ struct btrfs_fs_info *fs_info,
+ btrfs_bio_end_io_t end_io, void *private)
{
+ struct btrfs_bio *bbio;
struct bio *bio;
bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
- btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
- return bio;
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio, fs_info, end_io, private);
+ return bbio;
}
-static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
- struct bio *orig, u64 map_length,
- bool use_append)
+static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio)
{
- struct btrfs_bio *orig_bbio = btrfs_bio(orig);
+ struct btrfs_ordered_extent *ordered;
+ int ret;
+
+ ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
+ if (WARN_ON_ONCE(!ordered))
+ return BLK_STS_IOERR;
+ ret = btrfs_extract_ordered_extent(bbio, ordered);
+ btrfs_put_ordered_extent(ordered);
+
+ return errno_to_blk_status(ret);
+}
+
+static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
+ struct btrfs_bio *orig_bbio,
+ u64 map_length, bool use_append)
+{
+ struct btrfs_bio *bbio;
struct bio *bio;
if (use_append) {
unsigned int nr_segs;
- bio = bio_split_rw(orig, &fs_info->limits, &nr_segs,
+ bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
&btrfs_clone_bioset, map_length);
} else {
- bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS,
- &btrfs_clone_bioset);
+ bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
+ GFP_NOFS, &btrfs_clone_bioset);
}
- btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio);
-
- btrfs_bio(bio)->file_offset = orig_bbio->file_offset;
- if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED))
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
+ bbio->inode = orig_bbio->inode;
+ bbio->file_offset = orig_bbio->file_offset;
+ if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
orig_bbio->file_offset += map_length;
atomic_inc(&orig_bbio->pending_ios);
- return bio;
+ return bbio;
}
static void btrfs_orig_write_end_io(struct bio *bio);
@@ -164,7 +181,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
goto done;
}
- btrfs_submit_bio(&repair_bbio->bio, mirror);
+ btrfs_submit_bio(repair_bbio, mirror);
return;
}
@@ -224,15 +241,16 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
&btrfs_repair_bioset);
repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
- bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+ __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
repair_bbio = btrfs_bio(repair_bio);
- btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio);
+ btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
+ repair_bbio->inode = failed_bbio->inode;
repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
- btrfs_submit_bio(repair_bio, mirror);
+ btrfs_submit_bio(repair_bbio, mirror);
return fbio;
}
@@ -246,6 +264,9 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
struct btrfs_failed_bio *fbio = NULL;
u32 offset = 0;
+ /* Read-repair requires the inode field to be set by the submitter. */
+ ASSERT(inode);
+
/*
* Hand off repair bios to the repair code as there is no upper level
* submitter for them.
@@ -306,17 +327,17 @@ static void btrfs_end_bio_work(struct work_struct *work)
struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
/* Metadata reads are checked and repaired by the submitter. */
- if (bbio->bio.bi_opf & REQ_META)
- bbio->end_io(bbio);
- else
+ if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
btrfs_check_read_bio(bbio, bbio->bio.bi_private);
+ else
+ bbio->end_io(bbio);
}
static void btrfs_simple_end_io(struct bio *bio)
{
struct btrfs_bio *bbio = btrfs_bio(bio);
struct btrfs_device *dev = bio->bi_private;
- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
btrfs_bio_counter_dec(fs_info);
@@ -340,7 +361,8 @@ static void btrfs_raid56_end_io(struct bio *bio)
btrfs_bio_counter_dec(bioc->fs_info);
bbio->mirror_num = bioc->mirror_num;
- if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META))
+ if (bio_op(bio) == REQ_OP_READ && bbio->inode &&
+ !(bbio->bio.bi_opf & REQ_META))
btrfs_check_read_bio(bbio, NULL);
else
btrfs_orig_bbio_end_io(bbio);
@@ -418,7 +440,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
dev->devid, bio->bi_iter.bi_size);
btrfsic_check_bio(bio);
- submit_bio(bio);
+
+ if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
+ blkcg_punt_bio_submit(bio);
+ else
+ submit_bio(bio);
}
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
@@ -534,10 +560,10 @@ static void run_one_async_done(struct btrfs_work *work)
/*
* All of the bios that pass through here are from async helpers.
- * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
- * This changes nothing when cgroups aren't in use.
+ * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
+ * context. This changes nothing when cgroups aren't in use.
*/
- bio->bi_opf |= REQ_CGROUP_PUNT;
+ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
@@ -562,7 +588,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
* in order.
*/
if (bbio->bio.bi_opf & REQ_META) {
- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
if (btrfs_is_zoned(fs_info))
return false;
@@ -582,7 +608,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
struct btrfs_io_context *bioc,
struct btrfs_io_stripe *smap, int mirror_num)
{
- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -603,12 +629,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
return true;
}
-static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
+static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
- struct btrfs_bio *bbio = btrfs_bio(bio);
struct btrfs_inode *inode = bbio->inode;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct btrfs_bio *orig_bbio = bbio;
+ struct bio *bio = &bbio->bio;
u64 logical = bio->bi_iter.bi_sector << 9;
u64 length = bio->bi_iter.bi_size;
u64 map_length = length;
@@ -631,15 +657,15 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
map_length = min(map_length, fs_info->max_zone_append_size);
if (map_length < length) {
- bio = btrfs_split_bio(fs_info, bio, map_length, use_append);
- bbio = btrfs_bio(bio);
+ bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
+ bio = &bbio->bio;
}
/*
* Save the iter for the end_io handler and preload the checksums for
* data reads.
*/
- if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) {
+ if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) {
bbio->saved_iter = bio->bi_iter;
ret = btrfs_lookup_bio_sums(bbio);
if (ret)
@@ -650,7 +676,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
if (use_append) {
bio->bi_opf &= ~REQ_OP_WRITE;
bio->bi_opf |= REQ_OP_ZONE_APPEND;
- ret = btrfs_extract_ordered_extent(btrfs_bio(bio));
+ ret = btrfs_bio_extract_ordered_extent(bbio);
if (ret)
goto fail_put_bio;
}
@@ -659,7 +685,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
* Csum items for reloc roots have already been cloned at this
* point, so they are handled as part of the no-checksum case.
*/
- if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
+ if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
!btrfs_is_data_reloc_root(inode->root)) {
if (should_async_write(bbio) &&
@@ -686,9 +712,12 @@ fail:
return true;
}
-void btrfs_submit_bio(struct bio *bio, int mirror_num)
+void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
{
- while (!btrfs_submit_chunk(bio, mirror_num))
+ /* If bbio->inode is not populated, its file_offset must be 0. */
+ ASSERT(bbio->inode || bbio->file_offset == 0);
+
+ while (!btrfs_submit_chunk(bbio, mirror_num))
;
}
@@ -706,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num)
{
- struct btrfs_device *dev;
+ struct btrfs_io_stripe smap = { 0 };
struct bio_vec bvec;
struct bio bio;
- u64 map_length = 0;
- u64 sector;
- struct btrfs_io_context *bioc = NULL;
int ret = 0;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@@ -720,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (btrfs_repair_one_zone(fs_info, logical))
return 0;
- map_length = length;
-
/*
* Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are doing the
* read repair operation.
*/
btrfs_bio_counter_inc_blocked(fs_info);
- if (btrfs_is_parity_mirror(fs_info, logical, length)) {
- /*
- * Note that we don't use BTRFS_MAP_WRITE because it's supposed
- * to update all raid stripes, but here we just want to correct
- * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
- * stripe's dev and sector.
- */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &map_length, &bioc, 0);
- if (ret)
- goto out_counter_dec;
- ASSERT(bioc->mirror_num == 1);
- } else {
- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bioc, mirror_num);
- if (ret)
- goto out_counter_dec;
- /*
- * This happens when dev-replace is also running, and the
- * mirror_num indicates the dev-replace target.
- *
- * In this case, we don't need to do anything, as the read
- * error just means the replace progress hasn't reached our
- * read range, and later replace routine would handle it well.
- */
- if (mirror_num != bioc->mirror_num)
- goto out_counter_dec;
- }
-
- sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
- dev = bioc->stripes[bioc->mirror_num - 1].dev;
- btrfs_put_bioc(bioc);
+ ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+ if (ret < 0)
+ goto out_counter_dec;
- if (!dev || !dev->bdev ||
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+ if (!smap.dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
ret = -EIO;
goto out_counter_dec;
}
- bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = sector;
+ bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+ bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
__bio_add_page(&bio, page, length, pg_offset);
btrfsic_check_bio(&bio);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
goto out_bio_uninit;
}
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
- ino, start, btrfs_dev_name(dev), sector);
+ ino, start, btrfs_dev_name(smap.dev),
+ smap.physical >> SECTOR_SHIFT);
ret = 0;
out_bio_uninit:
@@ -791,6 +787,45 @@ out_counter_dec:
return ret;
}
+/*
+ * Submit a btrfs_bio based repair write.
+ *
+ * If @dev_replace is true, the write would be submitted to dev-replace target.
+ */
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
+{
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
+ u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 length = bbio->bio.bi_iter.bi_size;
+ struct btrfs_io_stripe smap = { 0 };
+ int ret;
+
+ ASSERT(fs_info);
+ ASSERT(mirror_num > 0);
+ ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
+ ASSERT(!bbio->inode);
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+ if (ret < 0)
+ goto fail;
+
+ if (dev_replace) {
+ if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) {
+ bbio->bio.bi_opf &= ~REQ_OP_WRITE;
+ bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+ ASSERT(smap.dev == fs_info->dev_replace.srcdev);
+ smap.dev = fs_info->dev_replace.tgtdev;
+ }
+ __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
+ return;
+
+fail:
+ btrfs_bio_counter_dec(fs_info);
+ btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
+}
+
int __init btrfs_bioset_init(void)
{
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,