Diffstat (limited to 'fs/btrfs')
90 files changed, 6780 insertions, 7261 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 66fa9ab2c046..4fb925e8c981 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,7 +31,7 @@ config BTRFS_FS
 	  continue to be mountable and usable by newer kernels.
 
 	  For more information, please see the web pages at
-	  http://btrfs.wiki.kernel.org.
+	  https://btrfs.readthedocs.io
 
 	  To compile this file system support as a module, choose M here. The
 	  module will be called btrfs.
@@ -48,25 +48,6 @@ config BTRFS_FS_POSIX_ACL
 
 	  If you don't know what Access Control Lists are, say N
 
-config BTRFS_FS_CHECK_INTEGRITY
-	bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
-	depends on BTRFS_FS
-	help
-	  Adds code that examines all block write requests (including
-	  writes of the super block). The goal is to verify that the
-	  state of the filesystem on disk is always consistent, i.e.,
-	  after a power-loss or kernel panic event the filesystem is
-	  in a consistent state.
-
-	  If the integrity check tool is included and activated in
-	  the mount options, plenty of kernel memory is used, and
-	  plenty of additional CPU cycles are spent. Enabling this
-	  functionality is not intended for normal use.
-
-	  In most cases, unless you are a btrfs developer who needs
-	  to verify the integrity of (super)-block write requests
-	  during the run of a regression test, say N
-
 config BTRFS_FS_RUN_SANITY_TESTS
 	bool "Btrfs will run sanity tests upon loading"
 	depends on BTRFS_FS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 90d53209755b..525af975f61c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -33,10 +33,9 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
 	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
 	   subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
-	   lru_cache.o
+	   lru_cache.o raid-stripe-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
 btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
 btrfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index ceadfc5d6c66..aa0844535644 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -3,6 +3,9 @@
 #ifndef BTRFS_ACCESSORS_H
 #define BTRFS_ACCESSORS_H
 
+#include <linux/stddef.h>
+#include <asm/unaligned.h>
+
 struct btrfs_map_token {
 	struct extent_buffer *eb;
 	char *kaddr;
@@ -34,13 +37,13 @@ static inline void put_unaligned_le8(u8 val, void *p)
 	read_extent_buffer(eb, (char *)(result),	\
 			   ((unsigned long)(ptr)) +	\
 			   offsetof(type, member),	\
-			   sizeof(((type *)0)->member)))
+			   sizeof_field(type, member)))
 
 #define write_eb_member(eb, ptr, type, member, result) (\
 	write_extent_buffer(eb, (char *)(result),	\
 			    ((unsigned long)(ptr)) +	\
 			    offsetof(type, member),	\
-			    sizeof(((type *)0)->member)))
+			    sizeof_field(type, member)))
 
 #define DECLARE_BTRFS_SETGET_BITS(bits)					\
 u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,		\
@@ -62,25 +65,25 @@ DECLARE_BTRFS_SETGET_BITS(64)
 static inline u##bits btrfs_##name(const struct extent_buffer *eb,	\
 				   const type *s)			\
 {									\
-	static_assert(sizeof(u##bits) == sizeof(((type *)0))->member);	\
+	static_assert(sizeof(u##bits) == sizeof_field(type, member));	\
 	return btrfs_get_##bits(eb, s, offsetof(type, member));	\
 }									\
 static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
 				    u##bits val)			\
 {									\
-	static_assert(sizeof(u##bits) == sizeof(((type *)0))->member);	\
+	static_assert(sizeof(u##bits) == sizeof_field(type, member));	\
 	btrfs_set_##bits(eb, s, offsetof(type, member), val);		\
 }									\
 static inline u##bits btrfs_token_##name(struct btrfs_map_token *token,	\
 					 const type *s)			\
 {									\
-	static_assert(sizeof(u##bits) == sizeof(((type *)0))->member);	\
+	static_assert(sizeof(u##bits) == sizeof_field(type, member));	\
 	return btrfs_get_token_##bits(token, s, offsetof(type, member));\
 }									\
 static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
 					  type *s, u##bits val)		\
 {									\
-	static_assert(sizeof(u##bits) == sizeof(((type *)0))->member);	\
+	static_assert(sizeof(u##bits) == sizeof_field(type, member));	\
 	btrfs_set_token_##bits(token, s, offsetof(type, member), val);	\
 }
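The accessors.h hunks above and below replace the open-coded sizeof(((type *)0)->member) pattern with sizeof_field(), which is why <linux/stddef.h> is now included. The oddly parenthesized sizeof(((type *)0))->member in the old static_assert lines still computed the member size, since the postfix -> operator binds tighter than sizeof, but the helper states the intent directly. A minimal user-space sketch for illustration; struct demo_item is a hypothetical stand-in, while the macro body matches the sizeof_field() helper in include/linux/stddef.h:

/* User-space sketch of the sizeof_field() helper adopted above.
 * "struct demo_item" is a hypothetical stand-in type. */
#include <stdio.h>

#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))

struct demo_item {
	unsigned long long total_bytes;
	unsigned char type;
};

int main(void)
{
	/* The old idiom and the helper evaluate to the same constant
	 * (8 here), with no object instance needed: */
	printf("%zu\n", sizeof(((struct demo_item *)0)->total_bytes));
	printf("%zu\n", sizeof_field(struct demo_item, total_bytes));
	return 0;
}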
@@ -111,17 +114,14 @@ static inline void btrfs_set_##name(type *s, u##bits val)		\
 static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
 					   struct btrfs_dev_item *s)
 {
-	static_assert(sizeof(u64) ==
-		      sizeof(((struct btrfs_dev_item *)0))->total_bytes);
-	return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
-					    total_bytes));
+	static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes));
+	return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes));
 }
 static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
 						struct btrfs_dev_item *s,
 						u64 val)
 {
-	static_assert(sizeof(u64) ==
-		      sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+	static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes));
 	WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
 	btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
 }
@@ -306,6 +306,14 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
 
+BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
+			 struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+
 /* struct btrfs_dev_extent */
 BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
 BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
@@ -350,6 +358,9 @@ BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 3
 BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32);
 
+BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref,
+		   root_id, 64);
+
 BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
 		   type, 8);
 BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
@@ -366,6 +377,8 @@ static inline u32 btrfs_extent_inline_ref_size(int type)
 	if (type == BTRFS_EXTENT_DATA_REF_KEY)
 		return sizeof(struct btrfs_extent_data_ref) +
 		       offsetof(struct btrfs_extent_inline_ref, offset);
+	if (type == BTRFS_EXTENT_OWNER_REF_KEY)
+		return sizeof(struct btrfs_extent_inline_ref);
 	return 0;
 }
 
@@ -967,6 +980,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
 		   flags, 64);
 BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
 		   rescan, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item,
+		   enable_gen, 64);
 
 /* btrfs_qgroup_info_item */
 BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index ce083e99ef68..9e261aac671e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -9,6 +9,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
+#include <trace/events/btrfs.h>
 #include "async-thread.h"
 #include "ctree.h"
 
@@ -242,7 +243,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
 			break;
 		trace_btrfs_ordered_sched(work);
 		spin_unlock_irqrestore(lock, flags);
-		work->ordered_func(work);
+		work->ordered_func(work, false);
 
 		/* now take the lock again and drop our item from the list */
 		spin_lock_irqsave(lock, flags);
@@ -277,7 +278,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
 			 * We don't want to call the ordered free functions with
 			 * the lock held.
 			 */
-			work->ordered_free(work);
+			work->ordered_func(work, true);
 			/* NB: work must not be dereferenced past this point. */
 			trace_btrfs_all_work_done(wq->fs_info, work);
 		}
@@ -285,7 +286,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
 	spin_unlock_irqrestore(lock, flags);
 
 	if (free_self) {
-		self->ordered_free(self);
+		self->ordered_func(self, true);
 		/* NB: self must not be dereferenced past this point. */
 		trace_btrfs_all_work_done(wq->fs_info, self);
 	}
@@ -300,7 +301,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
 
 	/*
 	 * We should not touch things inside work in the following cases:
-	 * 1) after work->func() if it has no ordered_free
+	 * 1) after work->func() if it has no ordered_func(..., true) to free
 	 *    Since the struct is freed in work->func().
 	 * 2) after setting WORK_DONE_BIT
 	 *    The work may be freed in other threads almost instantly.
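The run_ordered_work() hunks above fold the old ordered_free callback into ordered_func, which now takes a second bool argument that is true only on the final invocation, where the handler must free the work item and touch nothing else. A sketch of the resulting convention; my_ctx, my_work_func() and process_ordered() are hypothetical names, while run_one_async_done() in the fs/btrfs/bio.c hunks later in this diff is a real conversion:

/* Sketch of a hypothetical user of the consolidated callback. */
struct my_ctx {
	struct btrfs_work work;
	/* ... per-request state ... */
};

static void my_work_func(struct btrfs_work *work)
{
	/* Unordered phase, may run concurrently with other items. */
}

static void process_ordered(struct my_ctx *ctx)
{
	/* ... completion handling that must run in queue order ... */
}

static void my_ordered_func(struct btrfs_work *work, bool do_free)
{
	struct my_ctx *ctx = container_of(work, struct my_ctx, work);

	if (do_free) {
		/* Final invocation, replacing the old ordered_free:
		 * only release the structure; 'work' must not be
		 * dereferenced afterwards. */
		kfree(ctx);
		return;
	}

	/* Ordered phase, replacing the old ordered_func(work): runs
	 * in queueing order once my_work_func() has completed. */
	process_ordered(ctx);
}

static void my_submit(struct btrfs_workqueue *wq, struct my_ctx *ctx)
{
	/* The old btrfs_init_work(work, func, ordered_func, ordered_free)
	 * collapses to a three-argument form: */
	btrfs_init_work(&ctx->work, my_work_func, my_ordered_func);
	btrfs_queue_work(wq, &ctx->work);
}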
@@ -329,11 +330,10 @@ static void btrfs_work_helper(struct work_struct *normal_work)
 }
 
 void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
-		     btrfs_func_t ordered_func, btrfs_func_t ordered_free)
+		     btrfs_ordered_func_t ordered_func)
 {
 	work->func = func;
 	work->ordered_func = ordered_func;
-	work->ordered_free = ordered_free;
 	INIT_WORK(&work->normal_work, btrfs_work_helper);
 	INIT_LIST_HEAD(&work->ordered_list);
 	work->flags = 0;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 30f66c5e2e6e..62b8a0d57898 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -13,11 +13,11 @@ struct btrfs_fs_info;
 struct btrfs_workqueue;
 struct btrfs_work;
 typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool);
 
 struct btrfs_work {
 	btrfs_func_t func;
-	btrfs_func_t ordered_func;
-	btrfs_func_t ordered_free;
+	btrfs_ordered_func_t ordered_func;
 
 	/* Don't touch things below */
 	struct work_struct normal_work;
@@ -35,7 +35,7 @@ struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
 		struct btrfs_fs_info *fs_info, const char *name,
 		unsigned int flags);
 void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
-		     btrfs_func_t ordered_func, btrfs_func_t ordered_free);
+		     btrfs_ordered_func_t ordered_func);
 void btrfs_queue_work(struct btrfs_workqueue *wq,
 		      struct btrfs_work *work);
 void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 79336fa853db..beed7e459dab 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1129,6 +1129,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
 						       count, sc, GFP_NOFS);
 			break;
 		}
+		case BTRFS_EXTENT_OWNER_REF_KEY:
+			ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA));
+			break;
 		default:
 			WARN_ON(1);
 		}
@@ -2998,7 +3001,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
 }
 
 void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
-			      struct btrfs_backref_cache *cache, int is_reloc)
+			      struct btrfs_backref_cache *cache, bool is_reloc)
 {
 	int i;
 
@@ -3196,12 +3199,14 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
  * We still need to do a tree search to find out the parents. This is for
  * TREE_BLOCK_REF backref (keyed or inlined).
  *
+ * @trans:	Transaction handle.
  * @ref_key:	The same as @ref_key in handle_direct_tree_backref()
  * @tree_key:	The first key of this tree block.
  * @path:	A clean (released) path, to avoid allocating path every time
  *		the function get called.
  */
-static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
+static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
+					struct btrfs_backref_cache *cache,
 					struct btrfs_path *path,
 					struct btrfs_key *ref_key,
 					struct btrfs_key *tree_key,
@@ -3315,7 +3320,7 @@ static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
 		 * If we know the block isn't shared we can avoid
 		 * checking its backrefs.
 		 */
-		if (btrfs_block_can_be_shared(root, eb))
+		if (btrfs_block_can_be_shared(trans, root, eb))
 			upper->checked = 0;
 		else
 			upper->checked = 1;
@@ -3363,17 +3368,18 @@ out:
 * links aren't yet bi-directional. Needs to finish such links.
 * Use btrfs_backref_finish_upper_links() to finish such linkage.
 *
+ * @trans:	Transaction handle.
* @path: Released path for indirect tree backref lookup * @iter: Released backref iter for extent tree search * @node_key: The first key of the tree block */ -int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, +int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_backref_iter *iter, struct btrfs_key *node_key, struct btrfs_backref_node *cur) { - struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_backref_edge *edge; struct btrfs_backref_node *exist; int ret; @@ -3462,25 +3468,21 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, ret = handle_direct_tree_backref(cache, &key, cur); if (ret < 0) goto out; - continue; - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - ret = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_handle_fs_error(fs_info, ret, NULL); - goto out; - } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { - continue; + } else if (key.type == BTRFS_TREE_BLOCK_REF_KEY) { + /* + * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref + * offset means the root objectid. We need to search + * the tree to get its parent bytenr. + */ + ret = handle_indirect_tree_backref(trans, cache, path, + &key, node_key, cur); + if (ret < 0) + goto out; } - /* - * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset - * means the root objectid. We need to search the tree to get - * its parent bytenr. + * Unrecognized tree backref items (if it can pass tree-checker) + * would be ignored. */ - ret = handle_indirect_tree_backref(cache, path, &key, node_key, - cur); - if (ret < 0) - goto out; } ret = 0; cur->checked = 1; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 1616e3e3f1e4..ab4ca0eda605 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -247,7 +247,7 @@ struct prelim_ref { struct rb_node rbnode; u64 root_id; struct btrfs_key key_for_search; - int level; + u8 level; int count; struct extent_inode_elem *inode_list; u64 parent; @@ -440,11 +440,11 @@ struct btrfs_backref_cache { * Reloction backref cache require more info for reloc root compared * to generic backref cache. 
*/ - unsigned int is_reloc; + bool is_reloc; }; void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, - struct btrfs_backref_cache *cache, int is_reloc); + struct btrfs_backref_cache *cache, bool is_reloc); struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_cache *cache, u64 bytenr, int level); struct btrfs_backref_edge *btrfs_backref_alloc_edge( @@ -533,14 +533,15 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, void btrfs_backref_release_cache(struct btrfs_backref_cache *cache); static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info, - u64 bytenr, int errno) + u64 bytenr, int error) { - btrfs_panic(fs_info, errno, + btrfs_panic(fs_info, error, "Inconsistency in backref cache found at offset %llu", bytenr); } -int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, +int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_backref_iter *iter, struct btrfs_key *node_key, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 12b12443efaa..4f3b693a16b1 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -10,11 +10,11 @@ #include "volumes.h" #include "raid56.h" #include "async-thread.h" -#include "check-integrity.h" #include "dev-replace.h" #include "rcu-string.h" #include "zoned.h" #include "file-item.h" +#include "raid-stripe-tree.h" static struct bio_set btrfs_bioset; static struct bio_set btrfs_clone_bioset; @@ -416,6 +416,9 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; + if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -427,6 +430,8 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); + } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } /* Pass on control to the original bio this one was cloned from */ @@ -463,8 +468,6 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); - btrfsic_check_bio(bio); - if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else @@ -490,6 +493,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) bio->bi_private = &bioc->stripes[dev_nr]; bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; bioc->stripes[dev_nr].bioc = bioc; + bioc->size = bio->bi_iter.bi_size; btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } @@ -499,6 +503,8 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, if (!bioc) { /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; + if (bio_op(bio) != REQ_OP_READ) + btrfs_bio(bio)->orig_physical = smap->physical; bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; if (bio_op(bio) != REQ_OP_READ) btrfs_bio(bio)->orig_physical = smap->physical; @@ -568,13 +574,20 @@ static void run_one_async_start(struct btrfs_work *work) * * At IO completion time the csums attached on the ordered extent record are * inserted into the tree. + * + * If called with @do_free == true, then it will free the work struct. 
*/ -static void run_one_async_done(struct btrfs_work *work) +static void run_one_async_done(struct btrfs_work *work, bool do_free) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); struct bio *bio = &async->bbio->bio; + if (do_free) { + kfree(container_of(work, struct async_submit_bio, work)); + return; + } + /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { btrfs_orig_bbio_end_io(async->bbio); @@ -590,11 +603,6 @@ static void run_one_async_done(struct btrfs_work *work) __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } -static void run_one_async_free(struct btrfs_work *work) -{ - kfree(container_of(work, struct async_submit_bio, work)); -} - static bool should_async_write(struct btrfs_bio *bbio) { /* Submit synchronously if the checksum implementation is fast. */ @@ -636,8 +644,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, async->smap = *smap; async->mirror_num = mirror_num; - btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, - run_one_async_free); + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done); btrfs_queue_work(fs_info->workers, &async->work); return true; } @@ -657,9 +664,11 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; + smap.is_scrub = !bbio->inode; + btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); + &bioc, &smap, &mirror_num); if (error) { ret = errno_to_blk_status(error); goto fail; @@ -691,6 +700,18 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bio->bi_opf |= REQ_OP_ZONE_APPEND; } + if (is_data_bbio(bbio) && bioc && + btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { + /* + * No locking for the list update, as we only add to + * the list in the I/O submission path, and list + * iteration only happens in the completion path, which + * can't happen until after the last submission. + */ + btrfs_get_bioc(bioc); + list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list); + } + /* * Csum items for reloc roots have already been cloned at this * point, so they are handled as part of the no-checksum case. @@ -779,8 +800,6 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; __bio_add_page(&bio, page, length, pg_offset); - - btrfsic_check_bio(&bio); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? */ diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 82324c327a50..6e5dc68ff661 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -504,13 +504,20 @@ static void fragment_free_space(struct btrfs_block_group *block_group) #endif /* - * This is only called by btrfs_cache_block_group, since we could have freed - * extents we need to check the pinned_extents for any extents that can't be - * used yet since their free space will be released as soon as the transaction - * commits. + * Add a free space range to the in memory free space cache of a block group. + * This checks if the range contains super block locations and any such + * locations are not added to the free space cache. + * + * @block_group: The target block group. + * @start: Start offset of the range. + * @end: End offset of the range (exclusive). 
+ * @total_added_ret: Optional pointer to return the total amount of space + * added to the block group's free space cache. + * + * Returns 0 on success or < 0 on error. */ -int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end, - u64 *total_added_ret) +int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, + u64 end, u64 *total_added_ret) { struct btrfs_fs_info *info = block_group->fs_info; u64 extent_start, extent_end, size; @@ -520,11 +527,10 @@ int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end *total_added_ret = 0; while (start < end) { - ret = find_first_extent_bit(&info->excluded_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL); - if (ret) + if (!find_first_extent_bit(&info->excluded_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL)) break; if (extent_start <= start) { @@ -799,8 +805,8 @@ next: key.type == BTRFS_METADATA_ITEM_KEY) { u64 space_added; - ret = add_new_free_space(block_group, last, key.objectid, - &space_added); + ret = btrfs_add_new_free_space(block_group, last, + key.objectid, &space_added); if (ret) goto out; total_found += space_added; @@ -821,14 +827,20 @@ next: path->slots[0]++; } - ret = add_new_free_space(block_group, last, - block_group->start + block_group->length, - NULL); + ret = btrfs_add_new_free_space(block_group, last, + block_group->start + block_group->length, + NULL); out: btrfs_free_path(path); return ret; } +static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) +{ + clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, + bg->start + bg->length - 1, EXTENT_UPTODATE); +} + static noinline void caching_thread(struct btrfs_work *work) { struct btrfs_block_group *block_group; @@ -923,7 +935,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) caching_ctl->block_group = cache; refcount_set(&caching_ctl->count, 2); atomic_set(&caching_ctl->progress, 0); - btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); + btrfs_init_work(&caching_ctl->work, caching_thread, NULL); spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { @@ -1274,7 +1286,7 @@ out: /* Once for the lookup reference */ btrfs_put_block_group(block_group); if (remove_rsv) - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); btrfs_free_path(path); return ret; } @@ -2098,8 +2110,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) if (cache->start < BTRFS_SUPER_INFO_OFFSET) { stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; cache->bytes_super += stripe_len; - ret = btrfs_add_excluded_extent(fs_info, cache->start, - stripe_len); + ret = set_extent_bit(&fs_info->excluded_extents, cache->start, + cache->start + stripe_len - 1, + EXTENT_UPTODATE, NULL); if (ret) return ret; } @@ -2125,8 +2138,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) cache->start + cache->length - logical[nr]); cache->bytes_super += len; - ret = btrfs_add_excluded_extent(fs_info, logical[nr], - len); + ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], + logical[nr] + len - 1, + EXTENT_UPTODATE, NULL); if (ret) { kfree(logical); return ret; @@ -2319,8 +2333,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { cache->cached = BTRFS_CACHE_FINISHED; - ret = add_new_free_space(cache, cache->start, - cache->start + 
cache->length, NULL); + ret = btrfs_add_new_free_space(cache, cache->start, + cache->start + cache->length, NULL); btrfs_free_excluded_extents(cache); if (ret) goto error; @@ -2587,7 +2601,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); btrfs_set_dev_extent_length(leaf, extent, num_bytes); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -2695,7 +2709,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. */ next: - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); } @@ -2767,7 +2781,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); + ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); btrfs_free_excluded_extents(cache); if (ret) { btrfs_put_block_group(cache); @@ -2805,8 +2819,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran #endif list_add_tail(&cache->bg_list, &trans->new_bgs); - trans->delayed_ref_updates++; - btrfs_update_delayed_refs_rsv(trans); + btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); return cache; @@ -3011,11 +3024,19 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); fail: btrfs_release_path(path); - /* We didn't update the block group item, need to revert @commit_used. */ - if (ret < 0) { + /* + * We didn't update the block group item, need to revert commit_used + * unless the block group item didn't exist yet - this is to prevent a + * race with a concurrent insertion of the block group item, with + * insert_block_group_item(), that happened just after we attempted to + * update. In that case we would reset commit_used to 0 just after the + * insertion set it to a value greater than 0 - if the block group later + * becomes with 0 used bytes, we would incorrectly skip its update. + */ + if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); cache->commit_used = old_commit_used; spin_unlock(&cache->lock); @@ -3029,7 +3050,6 @@ static int cache_save_setup(struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; @@ -3081,7 +3101,7 @@ again: * time. */ BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { /* * So theoretically we could recover from this, simply set the @@ -3348,7 +3368,7 @@ again: if (should_put) btrfs_put_block_group(cache); if (drop_reserve) - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); /* * Avoid blocking other tasks for too long. 
It might even save * us from writing caches for block groups that are going to be @@ -3452,8 +3472,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache_save_setup(cache, trans, path); if (!ret) - ret = btrfs_run_delayed_refs(trans, - (unsigned long) -1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL; @@ -3496,7 +3515,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) /* If its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -3521,12 +3540,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_block_group *cache = NULL; - u64 total = num_bytes; + struct btrfs_space_info *space_info; + struct btrfs_block_group *cache; u64 old_val; - u64 byte_in_group; + bool reclaim = false; + bool bg_already_dirty = true; int factor; - int ret = 0; /* Block accounting for super block */ spin_lock(&info->delalloc_root_lock); @@ -3538,97 +3557,86 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_root_lock); - while (total) { - struct btrfs_space_info *space_info; - bool reclaim = false; - - cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) { - ret = -ENOENT; - break; - } - space_info = cache->space_info; - factor = btrfs_bg_type_to_factor(cache->flags); + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -ENOENT; - /* - * If this block group has free space cache written out, we - * need to make sure to load it if we are removing space. This - * is because we need the unpinning stage to actually add the - * space back to the block group, otherwise we will leak space. - */ - if (!alloc && !btrfs_block_group_done(cache)) - btrfs_cache_block_group(cache, true); + /* An extent can not span multiple block groups. */ + ASSERT(bytenr + num_bytes <= cache->start + cache->length); - byte_in_group = bytenr - cache->start; - WARN_ON(byte_in_group > cache->length); + space_info = cache->space_info; + factor = btrfs_bg_type_to_factor(cache->flags); - spin_lock(&space_info->lock); - spin_lock(&cache->lock); + /* + * If this block group has free space cache written out, we need to make + * sure to load it if we are removing space. This is because we need + * the unpinning stage to actually add the space back to the block group, + * otherwise we will leak space. 
+ */ + if (!alloc && !btrfs_block_group_done(cache)) + btrfs_cache_block_group(cache, true); - if (btrfs_test_opt(info, SPACE_CACHE) && - cache->disk_cache_state < BTRFS_DC_CLEAR) - cache->disk_cache_state = BTRFS_DC_CLEAR; + spin_lock(&space_info->lock); + spin_lock(&cache->lock); - old_val = cache->used; - num_bytes = min(total, cache->length - byte_in_group); - if (alloc) { - old_val += num_bytes; - cache->used = old_val; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->bytes_used += num_bytes; - space_info->disk_used += num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - old_val -= num_bytes; - cache->used = old_val; - cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, space_info, - num_bytes); - space_info->bytes_used -= num_bytes; - space_info->disk_used -= num_bytes * factor; + if (btrfs_test_opt(info, SPACE_CACHE) && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; - reclaim = should_reclaim_block_group(cache, num_bytes); + old_val = cache->used; + if (alloc) { + old_val += num_bytes; + cache->used = old_val; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->bytes_used += num_bytes; + space_info->disk_used += num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + } else { + old_val -= num_bytes; + cache->used = old_val; + cache->pinned += num_bytes; + btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); + space_info->bytes_used -= num_bytes; + space_info->disk_used -= num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); + reclaim = should_reclaim_block_group(cache, num_bytes); - set_extent_bit(&trans->transaction->pinned_extents, - bytenr, bytenr + num_bytes - 1, - EXTENT_DIRTY, NULL); - } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &trans->transaction->dirty_bgs); - trans->delayed_ref_updates++; - btrfs_get_block_group(cache); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); + set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + } - /* - * No longer have used bytes in this block group, queue it for - * deletion. We do this after adding the block group to the - * dirty list to avoid races between cleaner kthread and space - * cache writeout. - */ - if (!alloc && old_val == 0) { - if (!btrfs_test_opt(info, DISCARD_ASYNC)) - btrfs_mark_bg_unused(cache); - } else if (!alloc && reclaim) { - btrfs_mark_bg_to_reclaim(cache); - } + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); + bg_already_dirty = false; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_put_block_group(cache); - total -= num_bytes; - bytenr += num_bytes; + /* + * No longer have used bytes in this block group, queue it for deletion. + * We do this after adding the block group to the dirty list to avoid + * races between cleaner kthread and space cache writeout. 
+ */ + if (!alloc && old_val == 0) { + if (!btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(cache); + } else if (!alloc && reclaim) { + btrfs_mark_bg_to_reclaim(cache); } + btrfs_put_block_group(cache); + /* Modified block groups are accounted for in the delayed_refs_rsv. */ - btrfs_update_delayed_refs_rsv(trans); - return ret; + if (!bg_already_dirty) + btrfs_inc_delayed_refs_rsv_bg_updates(info); + + return 0; } /* @@ -4075,7 +4083,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (IS_ERR(ret_bg)) { ret = PTR_ERR(ret_bg); - } else if (from_extent_allocation) { + } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) { /* * New block group is likely to be used soon. Try to activate * it now. Failure is OK for now. @@ -4273,6 +4281,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; + if (btrfs_is_zoned(info)) { + if (info->active_meta_bg) { + btrfs_put_block_group(info->active_meta_bg); + info->active_meta_bg = NULL; + } + if (info->active_system_bg) { + btrfs_put_block_group(info->active_system_bg); + info->active_system_bg = NULL; + } + } + write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 74b61e663028..2bdbcb834f95 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -291,8 +291,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); -int add_new_free_space(struct btrfs_block_group *block_group, - u64 start, u64 end, u64 *total_added_ret); +int btrfs_add_new_free_space(struct btrfs_block_group *block_group, + u64 start, u64 end, u64 *total_added_ret); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 77684c5e0c8b..ceb5f586a2d5 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -221,7 +221,8 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, if (num_bytes == 0) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + num_bytes, flush); if (!ret) btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -261,7 +262,8 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, if (!ret) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + num_bytes, flush); if (!ret) { btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -279,10 +281,10 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *target = NULL; /* - * If we are the delayed_rsv then push to the global rsv, otherwise dump - * into the delayed rsv if it is not full. + * If we are a delayed block reserve then push to the global rsv, + * otherwise dump into the global delayed reserve if it is not full. 
*/ - if (block_rsv == delayed_rsv) + if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS) target = global_rsv; else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv)) target = delayed_rsv; @@ -354,6 +356,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) min_items++; } + if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { + num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item); + min_items++; + } + /* * But we also want to reserve enough space so we can do the fallback * global reserve for an unlink, which is an additional @@ -405,6 +412,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_EXTENT_TREE_OBJECTID: case BTRFS_FREE_SPACE_TREE_OBJECTID: case BTRFS_BLOCK_GROUP_TREE_OBJECTID: + case BTRFS_RAID_STRIPE_TREE_OBJECTID: root->block_rsv = &fs_info->delayed_refs_rsv; break; case BTRFS_ROOT_TREE_OBJECTID: @@ -517,8 +525,8 @@ again: block_rsv->type, ret); } try_reserve: - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; /* @@ -539,7 +547,7 @@ try_reserve: * one last time to force a reservation if there's enough actual space * on disk to make the reservation. */ - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize, BTRFS_RESERVE_FLUSH_EMERGENCY); if (!ret) return block_rsv; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d47a927b3504..5572ae52444e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -8,6 +8,8 @@ #include <linux/hash.h> #include <linux/refcount.h> +#include <linux/fscrypt.h> +#include <trace/events/btrfs.h> #include "extent_map.h" #include "extent_io.h" #include "ordered-data.h" @@ -79,11 +81,21 @@ struct btrfs_inode { */ struct btrfs_key location; + /* Cached value of inode property 'compression'. */ + u8 prop_compress; + + /* + * Force compression on the file using the defrag ioctl, could be + * different from prop_compress and takes precedence if set. + */ + u8 defrag_compress; + /* * Lock for counters and all fields used to determine if the inode is in * the log or not (last_trans, last_sub_trans, last_log_commit, - * logged_trans), to access/update new_delalloc_bytes and to update the - * VFS' inode number of bytes used. + * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes, + * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to + * update the VFS' inode number of bytes used. */ spinlock_t lock; @@ -102,8 +114,18 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; + /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. Protected by 'lock'. + */ + unsigned outstanding_extents; + /* used to order data wrt metadata */ - struct btrfs_ordered_inode_tree ordered_tree; + spinlock_t ordered_tree_lock; + struct rb_root ordered_tree; + struct rb_node *ordered_tree_last; /* list of all the delalloc inodes in the FS. 
There are times we need * to write all the delalloc pages to disk, and this list is used @@ -122,28 +144,31 @@ struct btrfs_inode { u64 generation; /* - * transid of the trans_handle that last modified this inode + * ID of the transaction handle that last modified this inode. + * Protected by 'lock'. */ u64 last_trans; /* - * transid that last logged this inode + * ID of the transaction that last logged this inode. + * Protected by 'lock'. */ u64 logged_trans; /* - * log transid when this inode was last modified + * Log transaction ID when this inode was last modified. + * Protected by 'lock'. */ int last_sub_trans; - /* a local copy of root's last_log_commit */ + /* A local copy of root's last_log_commit. Protected by 'lock'. */ int last_log_commit; union { /* * Total number of bytes pending delalloc, used by stat to * calculate the real block usage of the file. This is used - * only for files. + * only for files. Protected by 'lock'. */ u64 delalloc_bytes; /* @@ -161,7 +186,7 @@ struct btrfs_inode { * Total number of bytes pending delalloc that fall within a file * range that is either a hole or beyond EOF (and no prealloc extent * exists in the range). This is always <= delalloc_bytes and this - * is used only for files. + * is used only for files. Protected by 'lock'. */ u64 new_delalloc_bytes; /* @@ -172,15 +197,15 @@ struct btrfs_inode { }; /* - * total number of bytes pending defrag, used by stat to check whether - * it needs COW. + * Total number of bytes pending defrag, used by stat to check whether + * it needs COW. Protected by 'lock'. */ u64 defrag_bytes; /* - * the size of the file stored in the metadata on disk. data=ordered + * The size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk - * because not all the blocks are written yet. + * because not all the blocks are written yet. Protected by 'lock'. */ u64 disk_i_size; @@ -214,7 +239,7 @@ struct btrfs_inode { /* * Number of bytes outstanding that are going to need csums. This is - * used in ENOSPC accounting. + * used in ENOSPC accounting. Protected by 'lock'. */ u64 csum_bytes; @@ -223,30 +248,13 @@ struct btrfs_inode { /* Read-only compatibility flags, upper half of inode_item::flags */ u32 ro_flags; - /* - * Counters to keep track of the number of extent item's we may use due - * to delalloc and such. outstanding_extents is the number of extent - * items we think we'll end up using, and reserved_extents is the number - * of extent items we've reserved metadata for. - */ - unsigned outstanding_extents; - struct btrfs_block_rsv block_rsv; - /* - * Cached values of inode properties - */ - unsigned prop_compress; /* per-file compression algorithm */ - /* - * Force compression on the file using the defrag ioctl, could be - * different from prop_compress and takes precedence if set - */ - unsigned defrag_compress; - struct btrfs_delayed_node *delayed_node; /* File creation time. 
*/ - struct timespec64 i_otime; + u64 i_otime_sec; + u32 i_otime_nsec; /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; @@ -387,7 +395,7 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && - inode->last_sub_trans <= inode->root->last_log_commit) + inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root)) ret = true; spin_unlock(&inode->lock); return ret; @@ -481,9 +489,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); + struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); + struct btrfs_inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); @@ -498,12 +506,8 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, struct writeback_control *wbc); + u64 start, u64 end, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page); -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c deleted file mode 100644 index 3caf339c4bb3..000000000000 --- a/fs/btrfs/check-integrity.c +++ /dev/null @@ -1,2871 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - */ - -/* - * This module can be used to catch cases when the btrfs kernel - * code executes write requests to the disk that bring the file - * system in an inconsistent state. In such a state, a power-loss - * or kernel panic event would cause that the data on disk is - * lost or at least damaged. - * - * Code is added that examines all block write requests during - * runtime (including writes of the super block). Three rules - * are verified and an error is printed on violation of the - * rules: - * 1. It is not allowed to write a disk block which is - * currently referenced by the super block (either directly - * or indirectly). - * 2. When a super block is written, it is verified that all - * referenced (directly or indirectly) blocks fulfill the - * following requirements: - * 2a. All referenced blocks have either been present when - * the file system was mounted, (i.e., they have been - * referenced by the super block) or they have been - * written since then and the write completion callback - * was called and no write error was indicated and a - * FLUSH request to the device where these blocks are - * located was received and completed. - * 2b. All referenced blocks need to have a generation - * number which is equal to the parent's number. 
- * - * One issue that was found using this module was that the log - * tree on disk became temporarily corrupted because disk blocks - * that had been in use for the log tree had been freed and - * reused too early, while being referenced by the written super - * block. - * - * The search term in the kernel log that can be used to filter - * on the existence of detected integrity issues is - * "btrfs: attempt". - * - * The integrity check is enabled via mount options. These - * mount options are only supported if the integrity check - * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. - * - * Example #1, apply integrity checks to all metadata: - * mount /dev/sdb1 /mnt -o check_int - * - * Example #2, apply integrity checks to all metadata and - * to data extents: - * mount /dev/sdb1 /mnt -o check_int_data - * - * Example #3, apply integrity checks to all metadata and dump - * the tree that the super block references to kernel messages - * each time after a super block was written: - * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 - * - * If the integrity check tool is included and activated in - * the mount options, plenty of kernel memory is used, and - * plenty of additional CPU cycles are spent. Enabling this - * functionality is not intended for normal use. In most - * cases, unless you are a btrfs developer who needs to verify - * the integrity of (super)-block write requests, do not - * enable the config option BTRFS_FS_CHECK_INTEGRITY to - * include and compile the integrity check tool. - * - * Expect millions of lines of information in the kernel log with an - * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the - * kernel config to at least 26 (which is 64MB). Usually the value is - * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be - * changed like this before LOG_BUF_SHIFT can be set to a high value: - * config LOG_BUF_SHIFT - * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" - * range 12 30 - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/mutex.h> -#include <linux/blkdev.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <crypto/hash.h> -#include "messages.h" -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "extent_io.h" -#include "volumes.h" -#include "print-tree.h" -#include "locking.h" -#include "check-integrity.h" -#include "rcu-string.h" -#include "compression.h" -#include "accessors.h" - -#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 -#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 -#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 -#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 -#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 -#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, - * excluding " [...]" */ -#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) - -/* - * The definition of the bitmask fields for the print_mask. - * They are specified with the mount option check_integrity_print_mask. 
- */ -#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 -#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 -#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 -#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 -#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 -#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 -#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 -#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 -#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 -#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 -#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 -#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 -#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 -#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000 - -struct btrfsic_dev_state; -struct btrfsic_state; - -struct btrfsic_block { - u32 magic_num; /* only used for debug purposes */ - unsigned int is_metadata:1; /* if it is meta-data, not data-data */ - unsigned int is_superblock:1; /* if it is one of the superblocks */ - unsigned int is_iodone:1; /* if is done by lower subsystem */ - unsigned int iodone_w_error:1; /* error was indicated to endio */ - unsigned int never_written:1; /* block was added because it was - * referenced, not because it was - * written */ - unsigned int mirror_num; /* large enough to hold - * BTRFS_SUPER_MIRROR_MAX */ - struct btrfsic_dev_state *dev_state; - u64 dev_bytenr; /* key, physical byte num on disk */ - u64 logical_bytenr; /* logical byte num on disk */ - u64 generation; - struct btrfs_disk_key disk_key; /* extra info to print in case of - * issues, will not always be correct */ - struct list_head collision_resolving_node; /* list node */ - struct list_head all_blocks_node; /* list node */ - - /* the following two lists contain block_link items */ - struct list_head ref_to_list; /* list */ - struct list_head ref_from_list; /* list */ - struct btrfsic_block *next_in_same_bio; - void *orig_bio_private; - bio_end_io_t *orig_bio_end_io; - blk_opf_t submit_bio_bh_rw; - u64 flush_gen; /* only valid if !never_written */ -}; - -/* - * Elements of this type are allocated dynamically and required because - * each block object can refer to and can be ref from multiple blocks. - * The key to lookup them in the hashtable is the dev_bytenr of - * the block ref to plus the one from the block referred from. - * The fact that they are searchable via a hashtable and that a - * ref_cnt is maintained is not required for the btrfs integrity - * check algorithm itself, it is only used to make the output more - * beautiful in case that an error is detected (an error is defined - * as a write operation to a block while that block is still referenced). 
- */ -struct btrfsic_block_link { - u32 magic_num; /* only used for debug purposes */ - u32 ref_cnt; - struct list_head node_ref_to; /* list node */ - struct list_head node_ref_from; /* list node */ - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block *block_ref_to; - struct btrfsic_block *block_ref_from; - u64 parent_generation; -}; - -struct btrfsic_dev_state { - u32 magic_num; /* only used for debug purposes */ - struct block_device *bdev; - struct btrfsic_state *state; - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block dummy_block_for_bio_bh_flush; - u64 last_flush_gen; -}; - -struct btrfsic_block_hashtable { - struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_link_hashtable { - struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; -}; - -struct btrfsic_dev_state_hashtable { - struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_data_ctx { - u64 start; /* virtual bytenr */ - u64 dev_bytenr; /* physical bytenr on device */ - u32 len; - struct btrfsic_dev_state *dev; - char **datav; - struct page **pagev; - void *mem_to_free; -}; - -/* This structure is used to implement recursion without occupying - * any stack space, refer to btrfsic_process_metablock() */ -struct btrfsic_stack_frame { - u32 magic; - u32 nr; - int error; - int i; - int limit_nesting; - int num_copies; - int mirror_num; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx *block_ctx; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfs_header *hdr; - struct btrfsic_stack_frame *prev; -}; - -/* Some state per mounted filesystem */ -struct btrfsic_state { - u32 print_mask; - int include_extent_data; - struct list_head all_blocks_list; - struct btrfsic_block_hashtable block_hashtable; - struct btrfsic_block_link_hashtable block_link_hashtable; - struct btrfs_fs_info *fs_info; - u64 max_superblock_generation; - struct btrfsic_block *latest_superblock; - u32 metablock_size; - u32 datablock_size; -}; - -static int btrfsic_process_metablock(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - int limit_nesting, int force_iodone_flag); -static void btrfsic_read_from_block_data( - struct btrfsic_block_data_ctx *block_ctx, - void *dst, u32 offset, size_t len); -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx - *block_ctx, u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation); -static int btrfsic_handle_extent_data(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag); -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num); -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const block, - struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp); -static int 
btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level); -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level); -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block); -static void btrfsic_dump_tree(const struct btrfsic_state *state); -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level); -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation); -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created); -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super); -static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev); -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr); - -static struct mutex btrfsic_mutex; -static int btrfsic_is_initialized; -static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; - - -static void btrfsic_block_init(struct btrfsic_block *b) -{ - b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; - b->dev_state = NULL; - b->dev_bytenr = 0; - b->logical_bytenr = 0; - b->generation = BTRFSIC_GENERATION_UNKNOWN; - b->disk_key.objectid = 0; - b->disk_key.type = 0; - b->disk_key.offset = 0; - b->is_metadata = 0; - b->is_superblock = 0; - b->is_iodone = 0; - b->iodone_w_error = 0; - b->never_written = 0; - b->mirror_num = 0; - b->next_in_same_bio = NULL; - b->orig_bio_private = NULL; - b->orig_bio_end_io = NULL; - INIT_LIST_HEAD(&b->collision_resolving_node); - INIT_LIST_HEAD(&b->all_blocks_node); - INIT_LIST_HEAD(&b->ref_to_list); - INIT_LIST_HEAD(&b->ref_from_list); - b->submit_bio_bh_rw = 0; - b->flush_gen = 0; -} - -static struct btrfsic_block *btrfsic_block_alloc(void) -{ - struct btrfsic_block *b; - - b = kzalloc(sizeof(*b), GFP_NOFS); - if (NULL != b) - btrfsic_block_init(b); - - return b; -} - -static void btrfsic_block_free(struct btrfsic_block *b) -{ - BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); - kfree(b); -} - -static void btrfsic_block_link_init(struct btrfsic_block_link *l) -{ - l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; - l->ref_cnt = 1; - INIT_LIST_HEAD(&l->node_ref_to); - INIT_LIST_HEAD(&l->node_ref_from); - INIT_LIST_HEAD(&l->collision_resolving_node); - l->block_ref_to = NULL; - l->block_ref_from = NULL; -} - -static struct btrfsic_block_link *btrfsic_block_link_alloc(void) -{ - struct btrfsic_block_link *l; - - l = kzalloc(sizeof(*l), GFP_NOFS); - if (NULL != l) - btrfsic_block_link_init(l); - - return l; -} - -static void 
btrfsic_block_link_free(struct btrfsic_block_link *l) -{ - BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); - kfree(l); -} - -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) -{ - ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; - ds->bdev = NULL; - ds->state = NULL; - INIT_LIST_HEAD(&ds->collision_resolving_node); - ds->last_flush_gen = 0; - btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); - ds->dummy_block_for_bio_bh_flush.is_iodone = 1; - ds->dummy_block_for_bio_bh_flush.dev_state = ds; -} - -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) -{ - struct btrfsic_dev_state *ds; - - ds = kzalloc(sizeof(*ds), GFP_NOFS); - if (NULL != ds) - btrfsic_dev_state_init(ds); - - return ds; -} - -static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) -{ - BUG_ON(!(NULL == ds || - BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); - kfree(ds); -} - -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(b->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)b->dev_state->bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - - list_add(&b->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) -{ - list_del(&b->collision_resolving_node); -} - -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct btrfsic_block *b; - - list_for_each_entry(b, h->table + hashval, collision_resolving_node) { - if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) - return b; - } - - return NULL; -} - -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ - ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ - ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) - & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - list_add(&l->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) -{ - list_del(&l->collision_resolving_node); -} - -static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ - ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ - ((unsigned int)((uintptr_t)bdev_ref_to)) ^ - ((unsigned int)((uintptr_t)bdev_ref_from))) & - (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct btrfsic_block_link *l; - - list_for_each_entry(l, h->table + 
hashval, collision_resolving_node) { - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - if (l->block_ref_to->dev_state->bdev == bdev_ref_to && - l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && - l->block_ref_from->dev_state->bdev == bdev_ref_from && - l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) - return l; - } - - return NULL; -} - -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - - list_add(&ds->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) -{ - list_del(&ds->collision_resolving_node); -} - -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1); - struct btrfsic_dev_state *ds; - - list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { - if (ds->bdev->bd_dev == dev) - return ds; - } - - return NULL; -} - -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices) -{ - struct btrfs_super_block *selected_super; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - struct btrfsic_dev_state *selected_dev_state = NULL; - int ret = 0; - int pass; - - selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); - if (!selected_super) - return -ENOMEM; - - list_for_each_entry(device, dev_head, dev_list) { - int i; - struct btrfsic_dev_state *dev_state; - - if (!device->bdev || !device->name) - continue; - - dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev); - BUG_ON(NULL == dev_state); - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - ret = btrfsic_process_superblock_dev_mirror( - state, dev_state, device, i, - &selected_dev_state, selected_super); - if (0 != ret && 0 == i) { - kfree(selected_super); - return ret; - } - } - } - - if (NULL == state->latest_superblock) { - pr_info("btrfsic: no superblock found!\n"); - kfree(selected_super); - return -1; - } - - for (pass = 0; pass < 3; pass++) { - int num_copies; - int mirror_num; - u64 next_bytenr; - - switch (pass) { - case 0: - next_bytenr = btrfs_super_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("root@%llu\n", next_bytenr); - break; - case 1: - next_bytenr = btrfs_super_chunk_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("chunk@%llu\n", next_bytenr); - break; - case 2: - next_bytenr = btrfs_super_log_root(selected_super); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("log@%llu\n", next_bytenr); - break; - } - - num_copies = btrfs_num_copies(state->fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct 
btrfsic_block_link *l; - - ret = btrfsic_map_block(state, next_bytenr, - state->metablock_size, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n", - next_bytenr, mirror_num); - kfree(selected_super); - return -1; - } - - next_block = btrfsic_block_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - &state->block_hashtable); - BUG_ON(NULL == next_block); - - l = btrfsic_block_link_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - state->latest_superblock->dev_state-> - bdev, - state->latest_superblock->dev_bytenr, - &state->block_link_hashtable); - BUG_ON(NULL == l); - - ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)PAGE_SIZE) { - pr_info("btrfsic: read @logical %llu failed!\n", - tmp_next_block_ctx.start); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - kfree(selected_super); - return -1; - } - - ret = btrfsic_process_metablock(state, - next_block, - &tmp_next_block_ctx, - BTRFS_MAX_LEVEL + 3, 1); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - } - } - - kfree(selected_super); - return ret; -} - -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfs_super_block *super_tmp; - u64 dev_bytenr; - struct btrfsic_block *superblock_tmp; - int pass; - struct block_device *const superblock_bdev = device->bdev; - struct page *page; - struct address_space *mapping = superblock_bdev->bd_inode->i_mapping; - int ret = 0; - - /* super block bytenr is always the unmapped device bytenr */ - dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) - return -1; - - page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS); - if (IS_ERR(page)) - return -1; - - super_tmp = page_address(page); - - if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - btrfs_super_magic(super_tmp) != BTRFS_MAGIC || - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || - btrfs_super_nodesize(super_tmp) != state->metablock_size || - btrfs_super_sectorsize(super_tmp) != state->datablock_size) { - ret = 0; - goto out; - } - - superblock_tmp = - btrfsic_block_hashtable_lookup(superblock_bdev, - dev_bytenr, - &state->block_hashtable); - if (NULL == superblock_tmp) { - superblock_tmp = btrfsic_block_alloc(); - if (NULL == superblock_tmp) { - ret = -1; - goto out; - } - /* for superblock, only the dev_bytenr makes sense */ - superblock_tmp->dev_bytenr = dev_bytenr; - superblock_tmp->dev_state = dev_state; - superblock_tmp->logical_bytenr = dev_bytenr; - superblock_tmp->generation = btrfs_super_generation(super_tmp); - superblock_tmp->is_metadata = 1; - superblock_tmp->is_superblock = 1; - superblock_tmp->is_iodone = 1; - superblock_tmp->never_written = 0; - superblock_tmp->mirror_num = 1 + superblock_mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - btrfs_info_in_rcu(fs_info, - "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", - superblock_bdev, - btrfs_dev_name(device), dev_bytenr, - dev_state->bdev, dev_bytenr, - superblock_mirror_num); - list_add(&superblock_tmp->all_blocks_node, - &state->all_blocks_list); - btrfsic_block_hashtable_add(superblock_tmp, - 
&state->block_hashtable); - } - - /* select the one with the highest generation field */ - if (btrfs_super_generation(super_tmp) > - state->max_superblock_generation || - 0 == state->max_superblock_generation) { - memcpy(selected_super, super_tmp, sizeof(*selected_super)); - *selected_dev_state = dev_state; - state->max_superblock_generation = - btrfs_super_generation(super_tmp); - state->latest_superblock = superblock_tmp; - } - - for (pass = 0; pass < 3; pass++) { - u64 next_bytenr; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; - - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; - switch (pass) { - case 0: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_TREE_OBJECTID); - additional_string = "initial root "; - next_bytenr = btrfs_super_root(super_tmp); - break; - case 1: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "initial chunk "; - next_bytenr = btrfs_super_chunk_root(super_tmp); - break; - case 2: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_TREE_LOG_OBJECTID); - additional_string = "initial log "; - next_bytenr = btrfs_super_log_root(super_tmp); - if (0 == next_bytenr) - continue; - break; - } - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - - if (btrfsic_map_block(state, next_bytenr, - state->metablock_size, - &tmp_next_block_ctx, - mirror_num)) { - pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n", - next_bytenr, mirror_num); - ret = -1; - goto out; - } - - next_block = btrfsic_block_lookup_or_add( - state, &tmp_next_block_ctx, - additional_string, 1, 1, 0, - mirror_num, NULL); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - ret = -1; - goto out; - } - - next_block->disk_key = tmp_disk_key; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, &tmp_next_block_ctx, - next_block, superblock_tmp, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) { - ret = -1; - goto out; - } - } - } - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) - btrfsic_dump_tree_sub(state, superblock_tmp, 0); - -out: - put_page(page); - return ret; -} - -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) -{ - struct btrfsic_stack_frame *sf; - - sf = kzalloc(sizeof(*sf), GFP_NOFS); - if (sf) - sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; - return sf; -} - -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) -{ - BUG_ON(!(NULL == sf || - BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); - kfree(sf); -} - -static noinline_for_stack int btrfsic_process_metablock( - struct btrfsic_state *state, - struct btrfsic_block *const first_block, - struct btrfsic_block_data_ctx *const first_block_ctx, - int first_limit_nesting, int force_iodone_flag) -{ - struct btrfsic_stack_frame initial_stack_frame = { 0 }; - struct btrfsic_stack_frame *sf; - struct btrfsic_stack_frame *next_stack; - struct btrfs_header *const first_hdr = - (struct btrfs_header *)first_block_ctx->datav[0]; - - BUG_ON(!first_hdr); - sf = 
&initial_stack_frame; - sf->error = 0; - sf->i = -1; - sf->limit_nesting = first_limit_nesting; - sf->block = first_block; - sf->block_ctx = first_block_ctx; - sf->next_block = NULL; - sf->hdr = first_hdr; - sf->prev = NULL; - -continue_with_new_stack_frame: - sf->block->generation = btrfs_stack_header_generation(sf->hdr); - if (0 == sf->hdr->level) { - struct btrfs_leaf *const leafhdr = - (struct btrfs_leaf *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = btrfs_stack_header_nritems(&leafhdr->header); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("leaf %llu items %d generation %llu owner %llu\n", - sf->block_ctx->start, sf->nr, - btrfs_stack_header_generation( - &leafhdr->header), - btrfs_stack_header_owner( - &leafhdr->header)); - } - -continue_with_current_leaf_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_item disk_item; - u32 disk_item_offset = - (uintptr_t)(leafhdr->items + sf->i) - - (uintptr_t)leafhdr; - struct btrfs_disk_key *disk_key; - u8 type; - u32 item_offset; - u32 item_size; - - if (disk_item_offset + sizeof(struct btrfs_item) > - sf->block_ctx->len) { -leaf_item_out_of_bounce_error: - pr_info( - "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n", - sf->block_ctx->start, - sf->block_ctx->dev->bdev); - goto one_stack_frame_backwards; - } - btrfsic_read_from_block_data(sf->block_ctx, - &disk_item, - disk_item_offset, - sizeof(struct btrfs_item)); - item_offset = btrfs_stack_item_offset(&disk_item); - item_size = btrfs_stack_item_size(&disk_item); - disk_key = &disk_item.key; - type = btrfs_disk_key_type(disk_key); - - if (BTRFS_ROOT_ITEM_KEY == type) { - struct btrfs_root_item root_item; - u32 root_item_offset; - u64 next_bytenr; - - root_item_offset = item_offset + - offsetof(struct btrfs_leaf, items); - if (root_item_offset + item_size > - sf->block_ctx->len) - goto leaf_item_out_of_bounce_error; - btrfsic_read_from_block_data( - sf->block_ctx, &root_item, - root_item_offset, - item_size); - next_bytenr = btrfs_root_bytenr(&root_item); - - sf->error = - btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - disk_key, - btrfs_root_generation( - &root_item)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.datav[0]; - - next_stack = - btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - sf->error = -1; - btrfsic_release_block_ctx( - &sf-> - next_block_ctx); - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = - &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - } else if (BTRFS_EXTENT_DATA_KEY == type && - state->include_extent_data) { - sf->error = btrfsic_handle_extent_data( - state, - sf->block, - sf->block_ctx, - item_offset, - force_iodone_flag); - if (sf->error) - goto one_stack_frame_backwards; - } - - goto continue_with_current_leaf_stack_frame; - } - } else { - struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = btrfs_stack_header_nritems(&nodehdr->header); - - 
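The leaf and node loops around this point implement what the struct btrfsic_stack_frame comment promises: call recursion is replaced by heap-allocated frames chained through ->prev, so kernel stack usage stays flat no matter how deep the tree is. A minimal self-contained sketch of the same pattern (the demo_* names are illustrative, not btrfs types):

	struct demo_node {
		int nr_children;
		struct demo_node **children;
	};

	struct demo_frame {
		struct demo_node *node;
		int i;				/* next child to visit */
		struct demo_frame *prev;	/* stands in for the call stack */
	};

	static int demo_walk(struct demo_node *root)
	{
		struct demo_frame first = { .node = root };
		struct demo_frame *sf = &first;
		int err = 0;

		while (sf) {
			if (!err && sf->i < sf->node->nr_children) {
				/* "recursive call": push a new heap frame */
				struct demo_frame *next = kzalloc(sizeof(*next), GFP_NOFS);

				if (!next) {
					err = -ENOMEM;
					continue;
				}
				next->node = sf->node->children[sf->i++];
				next->prev = sf;
				sf = next;
			} else {
				/* "return": pop back to the caller's frame */
				struct demo_frame *prev = sf->prev;

				if (sf != &first)
					kfree(sf);
				sf = prev;
			}
		}
		return err;
	}

On error the same loop unwinds and frees every frame, which is what the one_stack_frame_backwards label below does for the real walk.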
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("node %llu level %d items %d generation %llu owner %llu\n", - sf->block_ctx->start, - nodehdr->header.level, sf->nr, - btrfs_stack_header_generation( - &nodehdr->header), - btrfs_stack_header_owner( - &nodehdr->header)); - } - -continue_with_current_node_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_key_ptr key_ptr; - u32 key_ptr_offset; - u64 next_bytenr; - - key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - - (uintptr_t)nodehdr; - if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > - sf->block_ctx->len) { - pr_info( - "btrfsic: node item out of bounce at logical %llu, dev %pg\n", - sf->block_ctx->start, - sf->block_ctx->dev->bdev); - goto one_stack_frame_backwards; - } - btrfsic_read_from_block_data( - sf->block_ctx, &key_ptr, key_ptr_offset, - sizeof(struct btrfs_key_ptr)); - next_bytenr = btrfs_stack_key_blockptr(&key_ptr); - - sf->error = btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - &key_ptr.key, - btrfs_stack_key_generation(&key_ptr)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.datav[0]; - - next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - sf->error = -1; - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - - goto continue_with_current_node_stack_frame; - } - } - -one_stack_frame_backwards: - if (NULL != sf->prev) { - struct btrfsic_stack_frame *const prev = sf->prev; - - /* the one for the initial block is freed in the caller */ - btrfsic_release_block_ctx(sf->block_ctx); - - if (sf->error) { - prev->error = sf->error; - btrfsic_stack_frame_free(sf); - sf = prev; - goto one_stack_frame_backwards; - } - - btrfsic_stack_frame_free(sf); - sf = prev; - goto continue_with_new_stack_frame; - } else { - BUG_ON(&initial_stack_frame != sf); - } - - return sf->error; -} - -static void btrfsic_read_from_block_data( - struct btrfsic_block_data_ctx *block_ctx, - void *dstv, u32 offset, size_t len) -{ - size_t cur; - size_t pgoff; - char *kaddr; - char *dst = (char *)dstv; - size_t start_offset = offset_in_page(block_ctx->start); - unsigned long i = (start_offset + offset) >> PAGE_SHIFT; - - WARN_ON(offset + len > block_ctx->len); - pgoff = offset_in_page(start_offset + offset); - - while (len > 0) { - cur = min(len, ((size_t)PAGE_SIZE - pgoff)); - BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); - kaddr = block_ctx->datav[i]; - memcpy(dst, kaddr + pgoff, cur); - - dst += cur; - len -= cur; - pgoff = 0; - i++; - } -} - -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 
parent_generation) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfsic_block *next_block = NULL; - int ret; - struct btrfsic_block_link *l; - int did_alloc_block_link; - int block_was_created; - - *next_blockp = NULL; - if (0 == *num_copiesp) { - *num_copiesp = btrfs_num_copies(fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, *num_copiesp); - *mirror_nump = 1; - } - - if (*mirror_nump > *num_copiesp) - return 0; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n", - *mirror_nump); - ret = btrfsic_map_block(state, next_bytenr, - state->metablock_size, - next_block_ctx, *mirror_nump); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, *mirror_nump); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - next_block = btrfsic_block_lookup_or_add(state, - next_block_ctx, "referenced ", - 1, force_iodone_flag, - !force_iodone_flag, - *mirror_nump, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - if (block_was_created) { - l = NULL; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) - pr_info( -"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", - next_bytenr, next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, - next_block), - next_block->logical_bytenr); - else - pr_info( - "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n", - next_bytenr, next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, - next_block)); - } - next_block->logical_bytenr = next_bytenr; - - next_block->mirror_num = *mirror_nump; - l = btrfsic_block_link_hashtable_lookup( - next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_link_hashtable); - } - - next_block->disk_key = *disk_key; - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (NULL == l) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - did_alloc_block_link = 1; - l->block_ref_to = next_block; - l->block_ref_from = block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - did_alloc_block_link = 0; - if (0 == limit_nesting) { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - } - - if (limit_nesting > 0 && did_alloc_block_link) { - ret = btrfsic_read_block(state, next_block_ctx); - if (ret < (int)next_block_ctx->len) { - pr_info("btrfsic: read block @logical %llu failed!\n", - next_bytenr); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - *next_blockp = next_block; - } else { - *next_blockp = NULL; - } - 
(*mirror_nump)++; - - return 0; -} - -static int btrfsic_handle_extent_data( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfs_file_extent_item file_extent_item; - u64 file_extent_item_offset; - u64 next_bytenr; - u64 num_bytes; - u64 generation; - struct btrfsic_block_link *l; - int ret; - - file_extent_item_offset = offsetof(struct btrfs_leaf, items) + - item_offset; - if (file_extent_item_offset + - offsetof(struct btrfs_file_extent_item, disk_num_bytes) > - block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", - block_ctx->start, block_ctx->dev->bdev); - return -1; - } - - btrfsic_read_from_block_data(block_ctx, &file_extent_item, - file_extent_item_offset, - offsetof(struct btrfs_file_extent_item, disk_num_bytes)); - if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || - btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("extent_data: type %u, disk_bytenr = %llu\n", - file_extent_item.type, - btrfs_stack_file_extent_disk_bytenr( - &file_extent_item)); - return 0; - } - - if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > - block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", - block_ctx->start, block_ctx->dev->bdev); - return -1; - } - btrfsic_read_from_block_data(block_ctx, &file_extent_item, - file_extent_item_offset, - sizeof(struct btrfs_file_extent_item)); - next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); - if (btrfs_stack_file_extent_compression(&file_extent_item) == - BTRFS_COMPRESS_NONE) { - next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); - num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); - } else { - num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); - } - generation = btrfs_stack_file_extent_generation(&file_extent_item); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n", - file_extent_item.type, - btrfs_stack_file_extent_disk_bytenr(&file_extent_item), - btrfs_stack_file_extent_offset(&file_extent_item), - num_bytes); - while (num_bytes > 0) { - u32 chunk_len; - int num_copies; - int mirror_num; - - if (num_bytes > state->datablock_size) - chunk_len = state->datablock_size; - else - chunk_len = num_bytes; - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - state->datablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfsic_block *next_block; - int block_was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n", - mirror_num); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("\tdisk_bytenr = %llu, num_bytes %u\n", - next_bytenr, chunk_len); - ret = btrfsic_map_block(state, next_bytenr, - chunk_len, &next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &next_block_ctx, - "referenced ", - 0, - 
force_iodone_flag, - !force_iodone_flag, - mirror_num, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(&next_block_ctx); - return -1; - } - if (!block_was_created) { - if ((state->print_mask & - BTRFSIC_PRINT_MASK_VERBOSE) && - next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - pr_info( -"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n", - next_bytenr, - next_block_ctx.dev->bdev, - next_block_ctx.dev_bytenr, - mirror_num, - next_block->logical_bytenr); - } - next_block->logical_bytenr = next_bytenr; - next_block->mirror_num = mirror_num; - } - - l = btrfsic_block_link_lookup_or_add(state, - &next_block_ctx, - next_block, block, - generation); - btrfsic_release_block_ctx(&next_block_ctx); - if (NULL == l) - return -1; - } - - next_bytenr += chunk_len; - num_bytes -= chunk_len; - } - - return 0; -} - -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - int ret; - u64 length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap, *map; - struct btrfs_device *device; - - length = len; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc, - NULL, &mirror_num, 0); - if (ret) { - block_ctx_out->start = 0; - block_ctx_out->dev_bytenr = 0; - block_ctx_out->len = 0; - block_ctx_out->dev = NULL; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - - return ret; - } - - if (bioc) - map = &bioc->stripes[0]; - else - map = &smap; - - device = map->dev; - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || - !device->bdev || !device->name) - block_ctx_out->dev = NULL; - else - block_ctx_out->dev = btrfsic_dev_state_lookup( - device->bdev->bd_dev); - block_ctx_out->dev_bytenr = map->physical; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - - kfree(bioc); - if (NULL == block_ctx_out->dev) { - ret = -ENXIO; - pr_info("btrfsic: error, cannot lookup dev (#1)!\n"); - } - - return ret; -} - -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) -{ - if (block_ctx->mem_to_free) { - unsigned int num_pages; - - BUG_ON(!block_ctx->datav); - BUG_ON(!block_ctx->pagev); - num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> - PAGE_SHIFT; - /* Pages must be unmapped in reverse order */ - while (num_pages > 0) { - num_pages--; - if (block_ctx->datav[num_pages]) - block_ctx->datav[num_pages] = NULL; - if (block_ctx->pagev[num_pages]) { - __free_page(block_ctx->pagev[num_pages]); - block_ctx->pagev[num_pages] = NULL; - } - } - - kfree(block_ctx->mem_to_free); - block_ctx->mem_to_free = NULL; - block_ctx->pagev = NULL; - block_ctx->datav = NULL; - } -} - -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx) -{ - unsigned int num_pages; - unsigned int i; - size_t size; - u64 dev_bytenr; - int ret; - - BUG_ON(block_ctx->datav); - BUG_ON(block_ctx->pagev); - BUG_ON(block_ctx->mem_to_free); - if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) { - pr_info("btrfsic: read_block() with unaligned bytenr %llu\n", - block_ctx->dev_bytenr); - return -1; - } - - num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> - PAGE_SHIFT; - size = sizeof(*block_ctx->datav) + 
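	/* note: a single allocation backs both arrays here; the datav
	 * pointer array comes first and pagev points into the same
	 * buffer, which is why btrfsic_release_block_ctx() only has to
	 * kfree(mem_to_free) to release both */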
sizeof(*block_ctx->pagev); - block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS); - if (!block_ctx->mem_to_free) - return -ENOMEM; - block_ctx->datav = block_ctx->mem_to_free; - block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); - ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev); - if (ret) - return ret; - - dev_bytenr = block_ctx->dev_bytenr; - for (i = 0; i < num_pages;) { - struct bio *bio; - unsigned int j; - - bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, - REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT; - - for (j = i; j < num_pages; j++) { - ret = bio_add_page(bio, block_ctx->pagev[j], - PAGE_SIZE, 0); - if (PAGE_SIZE != ret) - break; - } - if (j == i) { - pr_info("btrfsic: error, failed to add a single page!\n"); - return -1; - } - if (submit_bio_wait(bio)) { - pr_info("btrfsic: read error at logical %llu dev %pg!\n", - block_ctx->start, block_ctx->dev->bdev); - bio_put(bio); - return -1; - } - bio_put(bio); - dev_bytenr += (j - i) * PAGE_SIZE; - i = j; - } - for (i = 0; i < num_pages; i++) - block_ctx->datav[i] = page_address(block_ctx->pagev[i]); - - return block_ctx->len; -} - -static void btrfsic_dump_database(struct btrfsic_state *state) -{ - const struct btrfsic_block *b_all; - - BUG_ON(NULL == state); - - pr_info("all_blocks_list:\n"); - list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { - const struct btrfsic_block_link *l; - - pr_info("%c-block @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num); - - list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { - pr_info( - " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - } - - list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { - pr_info( - " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - } - - pr_info("\n"); - } -} - -/* - * Test whether the disk block contains a tree block (leaf or node) - * (note that this test fails for the super block) - */ -static noinline_for_stack int btrfsic_test_for_metadata( - struct btrfsic_state *state, - char **datav, unsigned int num_pages) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - struct btrfs_header *h; - u8 csum[BTRFS_CSUM_SIZE]; - unsigned int i; - - if (num_pages * PAGE_SIZE < state->metablock_size) - return 1; /* not metadata */ - num_pages = state->metablock_size >> PAGE_SHIFT; - h = (struct btrfs_header *)datav[0]; - - if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) - return 1; - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - - for (i = 0; i < num_pages; i++) { - u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); - size_t sublen = i ? 
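		/* the first page excludes the leading BTRFS_CSUM_SIZE
		 * bytes: that is where the header stores the checksum
		 * itself, so it cannot be part of the checksummed data */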
PAGE_SIZE : - (PAGE_SIZE - BTRFS_CSUM_SIZE); - - crypto_shash_update(shash, data, sublen); - } - crypto_shash_final(shash, csum); - if (memcmp(csum, h->csum, fs_info->csum_size)) - return 1; - - return 0; /* is metadata */ -} - -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char **mapped_datav, - unsigned int num_pages, - struct bio *bio, int *bio_is_patched, - blk_opf_t submit_bio_bh_rw) -{ - int is_metadata; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx block_ctx; - int ret; - struct btrfsic_state *state = dev_state->state; - struct block_device *bdev = dev_state->bdev; - unsigned int processed_len; - - if (NULL != bio_is_patched) - *bio_is_patched = 0; - -again: - if (num_pages == 0) - return; - - processed_len = 0; - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, - num_pages)); - - block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, - &state->block_hashtable); - if (NULL != block) { - u64 bytenr = 0; - struct btrfsic_block_link *l, *tmp; - - if (block->is_superblock) { - bytenr = btrfs_super_bytenr((struct btrfs_super_block *) - mapped_datav[0]); - if (num_pages * PAGE_SIZE < - BTRFS_SUPER_INFO_SIZE) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - is_metadata = 1; - BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE)); - processed_len = BTRFS_SUPER_INFO_SIZE; - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { - pr_info("[before new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } - if (is_metadata) { - if (!block->is_superblock) { - if (num_pages * PAGE_SIZE < - state->metablock_size) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - processed_len = state->metablock_size; - bytenr = btrfs_stack_header_bytenr( - (struct btrfs_header *) - mapped_datav[0]); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, - dev_state, - dev_bytenr); - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { - if (block->logical_bytenr != bytenr && - !(!block->is_metadata && - block->logical_bytenr == 0)) - pr_info( -"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", - bytenr, dev_state->bdev, - dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, - block), - block->logical_bytenr); - else - pr_info( - "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", - bytenr, dev_state->bdev, - dev_bytenr, block->mirror_num, - btrfsic_get_block_type(state, - block)); - } - block->logical_bytenr = bytenr; - } else { - if (num_pages * PAGE_SIZE < - state->datablock_size) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - processed_len = state->datablock_size; - bytenr = block->logical_bytenr; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", - bytenr, dev_state->bdev, dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("ref_to_list: %cE, ref_from_list: %cE\n", - list_empty(&block->ref_to_list) ? ' ' : '!', - list_empty(&block->ref_from_list) ? 
' ' : '!'); - if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { - pr_info( -"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", - btrfsic_get_block_type(state, block), bytenr, - dev_state->bdev, dev_bytenr, block->mirror_num, - block->generation, - btrfs_disk_key_objectid(&block->disk_key), - block->disk_key.type, - btrfs_disk_key_offset(&block->disk_key), - btrfs_stack_header_generation( - (struct btrfs_header *) mapped_datav[0]), - state->max_superblock_generation); - btrfsic_dump_tree(state); - } - - if (!block->is_iodone && !block->never_written) { - pr_info( -"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", - btrfsic_get_block_type(state, block), bytenr, - dev_state->bdev, dev_bytenr, block->mirror_num, - block->generation, - btrfs_stack_header_generation( - (struct btrfs_header *) - mapped_datav[0])); - /* it would not be safe to go on */ - btrfsic_dump_tree(state); - goto continue_loop; - } - - /* - * Clear all references of this block. Do not free - * the block itself even if is not referenced anymore - * because it still carries valuable information - * like whether it was ever written and IO completed. - */ - list_for_each_entry_safe(l, tmp, &block->ref_to_list, - node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - l->ref_cnt--; - if (0 == l->ref_cnt) { - list_del(&l->node_ref_to); - list_del(&l->node_ref_from); - btrfsic_block_link_hashtable_remove(l); - btrfsic_block_link_free(l); - } - } - - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.pagev = NULL; - block_ctx.mem_to_free = NULL; - block_ctx.datav = mapped_datav; - - if (is_metadata || state->include_extent_data) { - block->never_written = 0; - block->iodone_w_error = 0; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_private = - bio->bi_private; - block->orig_bio_end_io = - bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_private = - chained_block->orig_bio_private; - block->orig_bio_end_io = - chained_block->orig_bio_end_io; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else { - block->is_iodone = 1; - block->orig_bio_private = NULL; - block->orig_bio_end_io = NULL; - block->next_in_same_bio = NULL; - } - } - - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (is_metadata) { - block->logical_bytenr = bytenr; - block->is_metadata = 1; - if (block->is_superblock) { - BUG_ON(PAGE_SIZE != - BTRFS_SUPER_INFO_SIZE); - ret = btrfsic_process_written_superblock( - state, - block, - (struct btrfs_super_block *) - mapped_datav[0]); - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { - pr_info("[after new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } else { - block->mirror_num = 0; /* unknown */ - ret = btrfsic_process_metablock( - state, - block, - &block_ctx, - 0, 0); - } - if (ret) - pr_info("btrfsic: btrfsic_process_metablock(root 
@%llu) failed!\n", - dev_bytenr); - } else { - block->is_metadata = 0; - block->mirror_num = 0; /* unknown */ - block->generation = BTRFSIC_GENERATION_UNKNOWN; - if (!state->include_extent_data - && list_empty(&block->ref_from_list)) { - /* - * disk block is overwritten with extent - * data (not meta data) and we are configured - * to not include extent data: take the - * chance and free the block's memory - */ - btrfsic_block_hashtable_remove(block); - list_del(&block->all_blocks_node); - btrfsic_block_free(block); - } - } - btrfsic_release_block_ctx(&block_ctx); - } else { - /* block has not been found in hash table */ - u64 bytenr; - - if (!is_metadata) { - processed_len = state->datablock_size; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block (%pg/%llu/?) !found in hash table, D\n", - dev_state->bdev, dev_bytenr); - if (!state->include_extent_data) { - /* ignore that written D block */ - goto continue_loop; - } - - /* this is getting ugly for the - * include_extent_data case... */ - bytenr = 0; /* unknown */ - } else { - processed_len = state->metablock_size; - bytenr = btrfs_stack_header_bytenr( - (struct btrfs_header *) - mapped_datav[0]); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block @%llu (%pg/%llu/?) !found in hash table, M\n", - bytenr, dev_state->bdev, dev_bytenr); - } - - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.pagev = NULL; - block_ctx.mem_to_free = NULL; - block_ctx.datav = mapped_datav; - - block = btrfsic_block_alloc(); - if (NULL == block) { - btrfsic_release_block_ctx(&block_ctx); - goto continue_loop; - } - block->dev_state = dev_state; - block->dev_bytenr = dev_bytenr; - block->logical_bytenr = bytenr; - block->is_metadata = is_metadata; - block->never_written = 0; - block->iodone_w_error = 0; - block->mirror_num = 0; /* unknown */ - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_private = - chained_block->orig_bio_private; - block->orig_bio_end_io = - chained_block->orig_bio_end_io; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else { - block->is_iodone = 1; - block->orig_bio_private = NULL; - block->orig_bio_end_io = NULL; - block->next_in_same_bio = NULL; - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("new written %c-block @%llu (%pg/%llu/%d)\n", - is_metadata ? 
'M' : 'D', - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - - if (is_metadata) { - ret = btrfsic_process_metablock(state, block, - &block_ctx, 0, 0); - if (ret) - pr_info("btrfsic: process_metablock(root @%llu) failed!\n", - dev_bytenr); - } - btrfsic_release_block_ctx(&block_ctx); - } - -continue_loop: - BUG_ON(!processed_len); - dev_bytenr += processed_len; - mapped_datav += processed_len >> PAGE_SHIFT; - num_pages -= processed_len >> PAGE_SHIFT; - goto again; -} - -static void btrfsic_bio_end_io(struct bio *bp) -{ - struct btrfsic_block *block = bp->bi_private; - int iodone_w_error; - - /* mutex is not held! This is not save if IO is not yet completed - * on umount */ - iodone_w_error = 0; - if (bp->bi_status) - iodone_w_error = 1; - - BUG_ON(NULL == block); - bp->bi_private = block->orig_bio_private; - bp->bi_end_io = block->orig_bio_end_io; - - do { - struct btrfsic_block *next_block; - struct btrfsic_dev_state *const dev_state = block->dev_state; - - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n", - bp->bi_status, - btrfsic_get_block_type(dev_state->state, block), - block->logical_bytenr, dev_state->bdev, - block->dev_bytenr, block->mirror_num); - next_block = block->next_in_same_bio; - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_PREFLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io() new %pg flush_gen=%llu\n", - dev_state->bdev, - dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is - * on disk */ - block->is_iodone = 1; /* for FLUSH, this releases the block */ - block = next_block; - } while (NULL != block); - - bp->bi_end_io(bp); -} - -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const superblock, - struct btrfs_super_block *const super_hdr) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - int pass; - - superblock->generation = btrfs_super_generation(super_hdr); - if (!(superblock->generation > state->max_superblock_generation || - 0 == state->max_superblock_generation)) { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info( - "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n", - superblock->logical_bytenr, - superblock->dev_state->bdev, - superblock->dev_bytenr, superblock->mirror_num, - btrfs_super_generation(super_hdr), - state->max_superblock_generation); - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info( - "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n", - superblock->logical_bytenr, - superblock->dev_state->bdev, - superblock->dev_bytenr, superblock->mirror_num, - btrfs_super_generation(super_hdr), - state->max_superblock_generation); - - state->max_superblock_generation = - btrfs_super_generation(super_hdr); - state->latest_superblock = superblock; - } - - for (pass = 0; pass < 3; pass++) { - int ret; - u64 next_bytenr; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key = {0}; - - 
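The three passes below walk the tree roots that every superblock records. A sketch of what each pass reads, using the same accessors as the surrounding code; a zero log root means no log tree exists and pass 2 is skipped:

	u64 root  = btrfs_super_root(super_hdr);	/* pass 0: root tree  */
	u64 chunk = btrfs_super_chunk_root(super_hdr);	/* pass 1: chunk tree */
	u64 log   = btrfs_super_log_root(super_hdr);	/* pass 2: log tree, may be 0 */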
btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_ITEM_KEY); - btrfs_set_disk_key_objectid(&tmp_disk_key, 0); - - switch (pass) { - case 0: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_TREE_OBJECTID); - additional_string = "root "; - next_bytenr = btrfs_super_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("root@%llu\n", next_bytenr); - break; - case 1: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "chunk "; - next_bytenr = btrfs_super_chunk_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("chunk@%llu\n", next_bytenr); - break; - case 2: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_TREE_LOG_OBJECTID); - additional_string = "log "; - next_bytenr = btrfs_super_log_root(super_hdr); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("log@%llu\n", next_bytenr); - break; - } - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - BTRFS_SUPER_INFO_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - int was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num); - ret = btrfsic_map_block(state, next_bytenr, - BTRFS_SUPER_INFO_SIZE, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &tmp_next_block_ctx, - additional_string, - 1, 0, 1, - mirror_num, - &was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - return -1; - } - - next_block->disk_key = tmp_disk_key; - if (was_created) - next_block->generation = - BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, - &tmp_next_block_ctx, - next_block, - superblock, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) - return -1; - } - } - - if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0))) - btrfsic_dump_tree(state); - - return 0; -} - -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level) -{ - const struct btrfsic_block_link *l; - int ret = 0; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* - * Note that this situation can happen and does not - * indicate an error in regular cases. It happens - * when disk blocks are freed and later reused. - * The check-integrity module is not aware of any - * block free operations, it just recognizes block - * write operations. Therefore it keeps the linkage - * information for a block until a block is - * rewritten. This can temporarily cause incorrect - * and even circular linkage information. This - * causes no harm unless such blocks are referenced - * by the most recent super block. - */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic: abort cyclic linkage (case 1).\n"); - - return ret; - } - - /* - * This algorithm is recursive because the amount of used stack - * space is very small and the max recursion depth is limited. 
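 * (the recursion_level test near the top of this function caps the
 * depth at 3 + BTRFS_MAX_LEVEL nested calls, i.e. at most 11 small
 * frames with BTRFS_MAX_LEVEL being 8)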
- */ - list_for_each_entry(l, &block->ref_to_list, node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - if (l->block_ref_to->never_written) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (!l->block_ref_to->is_iodone) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->block_ref_to->iodone_w_error) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->parent_generation != - l->block_ref_to->generation && - BTRFSIC_GENERATION_UNKNOWN != - l->parent_generation && - BTRFSIC_GENERATION_UNKNOWN != - l->block_ref_to->generation) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - l->block_ref_to->generation, - l->parent_generation); - ret = -1; - } else if (l->block_ref_to->flush_gen > - l->block_ref_to->dev_state->last_flush_gen) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, block->flush_gen, - l->block_ref_to->dev_state->last_flush_gen); - ret = -1; - } else if (-1 == btrfsic_check_all_ref_blocks(state, - l->block_ref_to, - recursion_level + - 1)) { - ret = -1; - } - } - - return ret; -} - -static int btrfsic_is_block_ref_by_superblock( - const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level) -{ - const struct btrfsic_block_link *l; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* refer to comment at "abort cyclic linkage (case 1)" */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic: abort cyclic linkage (case 2).\n"); - - return 0; - } - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. 
- */ - list_for_each_entry(l, &block->ref_from_list, node_ref_from) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - if (l->block_ref_from->is_superblock && - state->latest_superblock->dev_bytenr == - l->block_ref_from->dev_bytenr && - state->latest_superblock->dev_state->bdev == - l->block_ref_from->dev_state->bdev) - return 1; - else if (btrfsic_is_block_ref_by_superblock(state, - l->block_ref_from, - recursion_level + - 1)) - return 1; - } - - return 0; -} - -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block) -{ - if (block->is_superblock && - state->latest_superblock->dev_bytenr == block->dev_bytenr && - state->latest_superblock->dev_state->bdev == block->dev_state->bdev) - return 'S'; - else if (block->is_superblock) - return 's'; - else if (block->is_metadata) - return 'M'; - else - return 'D'; -} - -static void btrfsic_dump_tree(const struct btrfsic_state *state) -{ - btrfsic_dump_tree_sub(state, state->latest_superblock, 0); -} - -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level) -{ - const struct btrfsic_block_link *l; - int indent_add; - static char buf[80]; - int cursor_position; - - /* - * Should better fill an on-stack buffer with a complete line and - * dump it at once when it is time to print a newline character. - */ - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. 
- */ - indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)", - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num); - if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - return; - } - printk(buf); - indent_level += indent_add; - if (list_empty(&block->ref_to_list)) { - printk("\n"); - return; - } - if (block->mirror_num > 1 && - !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { - printk(" [...]\n"); - return; - } - - cursor_position = indent_level; - list_for_each_entry(l, &block->ref_to_list, node_ref_to) { - while (cursor_position < indent_level) { - printk(" "); - cursor_position++; - } - if (l->ref_cnt > 1) - indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); - else - indent_add = sprintf(buf, " --> "); - if (indent_level + indent_add > - BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - cursor_position = 0; - continue; - } - - printk(buf); - - btrfsic_dump_tree_sub(state, l->block_ref_to, - indent_level + indent_add); - cursor_position = 0; - } -} - -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation) -{ - struct btrfsic_block_link *l; - - l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - from_block->dev_state->bdev, - from_block->dev_bytenr, - &state->block_link_hashtable); - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (!l) - return NULL; - - l->block_ref_to = next_block; - l->block_ref_from = from_block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &from_block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - - return l; -} - -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created) -{ - struct btrfsic_block *block; - - block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_hashtable); - if (NULL == block) { - struct btrfsic_dev_state *dev_state; - - block = btrfsic_block_alloc(); - if (!block) - return NULL; - - dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev); - if (NULL == dev_state) { - pr_info("btrfsic: error, lookup dev_state failed!\n"); - btrfsic_block_free(block); - return NULL; - } - block->dev_state = dev_state; - block->dev_bytenr = block_ctx->dev_bytenr; - block->logical_bytenr = block_ctx->start; - block->is_metadata = is_metadata; - block->is_iodone = is_iodone; - block->never_written = never_written; - block->mirror_num = mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n", - additional_string, - btrfsic_get_block_type(state, block), - block->logical_bytenr, dev_state->bdev, - block->dev_bytenr, mirror_num); - list_add(&block->all_blocks_node, 
&state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - if (NULL != was_created) - *was_created = 1; - } else { - if (NULL != was_created) - *was_created = 0; - } - - return block; -} - -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfsic_block_data_ctx block_ctx; - int num_copies; - int mirror_num; - int match = 0; - int ret; - - num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, state->metablock_size, - &block_ctx, mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n", - bytenr, mirror_num); - continue; - } - - if (dev_state->bdev == block_ctx.dev->bdev && - dev_bytenr == block_ctx.dev_bytenr) { - match++; - btrfsic_release_block_ctx(&block_ctx); - break; - } - btrfsic_release_block_ctx(&block_ctx); - } - - if (WARN_ON(!match)) { - pr_info( -"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n", - bytenr, dev_state->bdev, dev_bytenr); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, - state->metablock_size, - &block_ctx, mirror_num); - if (ret) - continue; - - pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n", - bytenr, block_ctx.dev->bdev, - block_ctx.dev_bytenr, mirror_num); - } - } -} - -static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) -{ - return btrfsic_dev_state_hashtable_lookup(dev, - &btrfsic_dev_state_hashtable); -} - -static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) -{ - unsigned int segs = bio_segments(bio); - u64 dev_bytenr = 512 * bio->bi_iter.bi_sector; - u64 cur_bytenr = dev_bytenr; - struct bvec_iter iter; - struct bio_vec bvec; - char **mapped_datav; - int bio_is_patched = 0; - int i = 0; - - if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info( -"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - - mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); - if (!mapped_datav) - return; - - bio_for_each_segment(bvec, bio, iter) { - BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = page_address(bvec.bv_page); - i++; - - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) - pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec.bv_len, bvec.bv_offset); - cur_bytenr += bvec.bv_len; - } - - btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, - bio, &bio_is_patched, bio->bi_opf); - kfree(mapped_datav); -} - -static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) -{ - if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); - - if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - 
block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } else if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) { - pr_info( -"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->bdev); - } -} - -void btrfsic_check_bio(struct bio *bio) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) - return; - - /* - * We can be called before btrfsic_mount, so there might not be a - * dev_state. - */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); - mutex_lock(&btrfsic_mutex); - if (dev_state) { - if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio)) - btrfsic_check_write_bio(bio, dev_state); - else if (bio->bi_opf & REQ_PREFLUSH) - btrfsic_check_flush_bio(bio, dev_state); - } - mutex_unlock(&btrfsic_mutex); -} - -int btrfsic_mount(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask) -{ - int ret; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!PAGE_ALIGNED(fs_info->nodesize)) { - pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", - fs_info->nodesize, PAGE_SIZE); - return -1; - } - if (!PAGE_ALIGNED(fs_info->sectorsize)) { - pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", - fs_info->sectorsize, PAGE_SIZE); - return -1; - } - state = kvzalloc(sizeof(*state), GFP_KERNEL); - if (!state) - return -ENOMEM; - - if (!btrfsic_is_initialized) { - mutex_init(&btrfsic_mutex); - btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); - btrfsic_is_initialized = 1; - } - mutex_lock(&btrfsic_mutex); - state->fs_info = fs_info; - state->print_mask = print_mask; - state->include_extent_data = including_extent_data; - state->metablock_size = fs_info->nodesize; - state->datablock_size = fs_info->sectorsize; - INIT_LIST_HEAD(&state->all_blocks_list); - btrfsic_block_hashtable_init(&state->block_hashtable); - btrfsic_block_link_hashtable_init(&state->block_link_hashtable); - state->max_superblock_generation = 0; - state->latest_superblock = NULL; - - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_alloc(); - if (NULL == ds) { - mutex_unlock(&btrfsic_mutex); - return -ENOMEM; - } - ds->bdev = device->bdev; - ds->state = state; - btrfsic_dev_state_hashtable_add(ds, - &btrfsic_dev_state_hashtable); - } - - ret = btrfsic_process_superblock(state, fs_devices); - if (0 != ret) { - mutex_unlock(&btrfsic_mutex); - btrfsic_unmount(fs_devices); - return ret; - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) - btrfsic_dump_database(state); - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) - btrfsic_dump_tree(state); - - mutex_unlock(&btrfsic_mutex); - return 0; -} - -void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) -{ - struct btrfsic_block *b_all, *tmp_all; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!btrfsic_is_initialized) - return; - - mutex_lock(&btrfsic_mutex); - - state = NULL; - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if 
(!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_hashtable_lookup( - device->bdev->bd_dev, - &btrfsic_dev_state_hashtable); - if (NULL != ds) { - state = ds->state; - btrfsic_dev_state_hashtable_remove(ds); - btrfsic_dev_state_free(ds); - } - } - - if (NULL == state) { - pr_info("btrfsic: error, cannot find state information on umount!\n"); - mutex_unlock(&btrfsic_mutex); - return; - } - - /* - * Don't care about keeping the lists' state up to date, - * just free all memory that was allocated dynamically. - * Free the blocks and the block_links. - */ - list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list, - all_blocks_node) { - struct btrfsic_block_link *l, *tmp; - - list_for_each_entry_safe(l, tmp, &b_all->ref_to_list, - node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - - l->ref_cnt--; - if (0 == l->ref_cnt) - btrfsic_block_link_free(l); - } - - if (b_all->is_iodone || b_all->never_written) - btrfsic_block_free(b_all); - else - pr_info( -"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num); - } - - mutex_unlock(&btrfsic_mutex); - - kvfree(state); -} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h deleted file mode 100644 index e4c8aed7996f..000000000000 --- a/fs/btrfs/check-integrity.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - */ - -#ifndef BTRFS_CHECK_INTEGRITY_H -#define BTRFS_CHECK_INTEGRITY_H - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -void btrfsic_check_bio(struct bio *bio); -#else -static inline void btrfsic_check_bio(struct bio *bio) { } -#endif - -int btrfsic_mount(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask); -void btrfsic_unmount(struct btrfs_fs_devices *fs_devices); - -#endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8818ed5c390f..19b22b4653c8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -193,12 +193,12 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; - const int errno = blk_status_to_errno(cb->bbio.bio.bi_status); + const int error = blk_status_to_errno(cb->bbio.bio.bi_status); int i; int ret; - if (errno) - mapping_set_error(inode->i_mapping, errno); + if (error) + mapping_set_error(inode->i_mapping, error); folio_batch_init(&fbatch); while (index <= end_index) { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a4cb4b642987..35c1d24d4a78 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -230,9 +230,9 @@ noinline void btrfs_release_path(struct btrfs_path *p) * cause could be a bug, eg. due to ENOSPC, and not for common errors that are * caused by external factors. 
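
A recurring cleanup in this diff is renaming local errno identifiers to error (the compression.c hunk above, abort_should_print_stack() just below). One likely motivation, illustrated here in userspace with a toy describe() helper: errno is, at least with glibc, a macro from <errno.h>, which makes it a hazardous name for an ordinary variable.

/*
 * With glibc, errno is roughly (*__errno_location()), so a declaration
 * like the commented-out one below stops compiling once <errno.h> is
 * included. The kernel has no such macro, but the rename avoids the same
 * mental collision with the libc global.
 */
#include <errno.h>
#include <stdio.h>

/* static const char *describe(int errno);  <- breaks under glibc */

static const char *describe(int error)
{
	switch (error) {
	case EIO:
		return "I/O error";
	case ENOMEM:
		return "out of memory";
	default:
		return "other";
	}
}

int main(void)
{
	printf("%s\n", describe(EIO));
	return 0;
}
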
*/ -bool __cold abort_should_print_stack(int errno) +bool __cold abort_should_print_stack(int error) { - switch (errno) { + switch (error) { case -EIO: case -EROFS: case -ENOMEM: @@ -316,6 +316,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int ret = 0; int level; struct btrfs_disk_key disk_key; + u64 reloc_src_root = 0; WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); @@ -328,9 +329,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + reloc_src_root = btrfs_header_owner(buf); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, &disk_key, level, buf->start, 0, - BTRFS_NESTING_NEW_ROOT); + reloc_src_root, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -359,7 +362,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return ret; } - btrfs_mark_buffer_dirty(cow); + btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; } @@ -367,7 +370,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, /* * check if the tree block can be shared by multiple trees */ -int btrfs_block_can_be_shared(struct btrfs_root *root, +int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf) { /* @@ -376,11 +380,21 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, * not allocated by tree relocation, we know the block is not shared. */ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - buf != root->node && buf != root->commit_root && + buf != root->node && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) || - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) - return 1; + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) { + if (buf != root->commit_root) + return 1; + /* + * An extent buffer that used to be the commit root may still be + * shared because the tree height may have increased and it + * became a child of a higher level root. This can happen when + * snapshotting a subvolume created in the current transaction. + */ + if (btrfs_header_generation(buf) == trans->transid) + return 1; + } return 0; } @@ -415,10 +429,10 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, * are only allowed for blocks use full backrefs. */ - if (btrfs_block_can_be_shared(root, buf)) { + if (btrfs_block_can_be_shared(trans, root, buf)) { ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, btrfs_header_level(buf), 1, - &refs, &flags); + &refs, &flags, NULL); if (ret) return ret; if (unlikely(refs == 0)) { @@ -507,13 +521,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. 
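
The btrfs_block_can_be_shared() hunk above adds the case of a buffer that used to be the commit root yet is still shared, after snapshotting a subvolume created in the current transaction. A self-contained toy model of that predicate; the struct and field names are simplified stand-ins for the kernel's state, and only the shape of the checks is meant to match:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_buf {
	uint64_t generation;
	bool is_root_node;	/* buf == root->node */
	bool is_commit_root;	/* buf == root->commit_root */
	bool reloc_flag;	/* BTRFS_HEADER_FLAG_RELOC */
};

struct toy_root {
	bool shareable;		/* BTRFS_ROOT_SHAREABLE */
	uint64_t last_snapshot;
};

static bool can_be_shared(const struct toy_root *root,
			  const struct toy_buf *buf, uint64_t transid)
{
	if (!root->shareable || buf->is_root_node)
		return false;
	if (buf->generation > root->last_snapshot && !buf->reloc_flag)
		return false;
	if (!buf->is_commit_root)
		return true;
	/*
	 * A former commit root may still be shared when the tree gained a
	 * level in the current transaction, e.g. after snapshotting a
	 * subvolume created in this same transaction.
	 */
	return buf->generation == transid;
}

int main(void)
{
	struct toy_root root = { .shareable = true, .last_snapshot = 105 };
	struct toy_buf buf = { .generation = 105, .is_commit_root = true };

	printf("%d\n", can_be_shared(&root, &buf, 105));	/* prints 1 */
	return 0;
}
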
*/ -static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size, - enum btrfs_lock_nesting nest) +int btrfs_force_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; @@ -522,6 +536,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, int last_ref = 0; int unlock_orig = 0; u64 parent_start = 0; + u64 reloc_src_root = 0; if (*cow_ret == buf) unlock_orig = 1; @@ -540,12 +555,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) - parent_start = parent->start; - + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + reloc_src_root = btrfs_header_owner(buf); + } cow = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, &disk_key, level, - search_start, empty_size, nest); + search_start, empty_size, reloc_src_root, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -616,7 +633,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (last_ref) { ret = btrfs_tree_mod_log_free_eb(buf); if (ret) { @@ -632,7 +649,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); - btrfs_mark_buffer_dirty(cow); + btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; } @@ -668,11 +685,11 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, } /* - * cows a single block, see __btrfs_cow_block for the real work. + * COWs a single block, see btrfs_force_cow_block() for the real work. 
* This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet */ -noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, +int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, @@ -682,25 +699,37 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; - if (test_bit(BTRFS_ROOT_DELETING, &root->state)) - btrfs_err(fs_info, - "COW'ing blocks on a fs root that's being dropped"); - - if (trans->transaction != fs_info->running_transaction) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, - fs_info->running_transaction->transid); + if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, + "attempt to COW block %llu on root %llu that is being deleted", + buf->start, btrfs_root_id(root)); + return -EUCLEAN; + } - if (trans->transid != fs_info->generation) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, fs_info->generation); + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu", + buf->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; return 0; } - search_start = buf->start & ~((u64)SZ_1G - 1); + search_start = round_down(buf->start, SZ_1G); /* * Before CoWing this block for later modification, check if it's @@ -709,8 +738,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0, nest); + ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); trace_btrfs_cow_block(root, buf, *cow_ret); @@ -719,49 +748,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); /* - * helper function for defrag to decide if two blocks pointed to by a - * node are actually close by - */ -static int close_blocks(u64 blocknr, u64 other, u32 blocksize) -{ - if (blocknr < other && other - (blocknr + blocksize) < 32768) - return 1; - if (blocknr > other && blocknr - (other + blocksize) < 32768) - return 1; - return 0; -} - -#ifdef __LITTLE_ENDIAN - -/* - * Compare two keys, on little-endian the disk order is same as CPU order and - * we can avoid the conversion. 
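
One hunk above replaces the open-coded mask buf->start & ~((u64)SZ_1G - 1) with round_down(buf->start, SZ_1G). A quick standalone check that the two agree for power-of-two alignments; round_down_div below is deliberately written with division so the comparison against the mask form is an independent cross-check, not the same expression twice:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_1G 0x40000000ULL

static uint64_t round_down_div(uint64_t x, uint64_t align)
{
	return x / align * align;	/* valid for any non-zero alignment */
}

int main(void)
{
	const uint64_t samples[] = {
		0, 1, SZ_1G - 1, SZ_1G, SZ_1G + 1, 3 * SZ_1G + 123456789ULL,
	};

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint64_t x = samples[i];
		uint64_t masked = x & ~(SZ_1G - 1);	/* the old expression */

		assert(round_down_div(x, SZ_1G) == masked);
		printf("%llu -> %llu\n", (unsigned long long)x,
		       (unsigned long long)masked);
	}
	return 0;
}
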
- */ -static int comp_keys(const struct btrfs_disk_key *disk_key, - const struct btrfs_key *k2) -{ - const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; - - return btrfs_comp_cpu_keys(k1, k2); -} - -#else - -/* - * compare two keys in a memcmp fashion - */ -static int comp_keys(const struct btrfs_disk_key *disk, - const struct btrfs_key *k2) -{ - struct btrfs_key k1; - - btrfs_disk_key_to_cpu(&k1, disk); - - return btrfs_comp_cpu_keys(&k1, k2); -} -#endif - -/* * same as comp_keys only with two btrfs_key's */ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) @@ -782,91 +768,6 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke } /* - * this is used by the defrag code to go through all the - * leaves pointed to by a node and reallocate them so that - * disk order is close to key order - */ -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, u64 *last_ret, - struct btrfs_key *progress) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_buffer *cur; - u64 blocknr; - u64 search_start = *last_ret; - u64 last_block = 0; - u64 other; - u32 parent_nritems; - int end_slot; - int i; - int err = 0; - u32 blocksize; - int progress_passed = 0; - struct btrfs_disk_key disk_key; - - WARN_ON(trans->transaction != fs_info->running_transaction); - WARN_ON(trans->transid != fs_info->generation); - - parent_nritems = btrfs_header_nritems(parent); - blocksize = fs_info->nodesize; - end_slot = parent_nritems - 1; - - if (parent_nritems <= 1) - return 0; - - for (i = start_slot; i <= end_slot; i++) { - int close = 1; - - btrfs_node_key(parent, &disk_key, i); - if (!progress_passed && comp_keys(&disk_key, progress) < 0) - continue; - - progress_passed = 1; - blocknr = btrfs_node_blockptr(parent, i); - if (last_block == 0) - last_block = blocknr; - - if (i > 0) { - other = btrfs_node_blockptr(parent, i - 1); - close = close_blocks(blocknr, other, blocksize); - } - if (!close && i < end_slot) { - other = btrfs_node_blockptr(parent, i + 1); - close = close_blocks(blocknr, other, blocksize); - } - if (close) { - last_block = blocknr; - continue; - } - - cur = btrfs_read_node_slot(parent, i); - if (IS_ERR(cur)) - return PTR_ERR(cur); - if (search_start == 0) - search_start = last_block; - - btrfs_tree_lock(cur); - err = __btrfs_cow_block(trans, root, cur, parent, i, - &cur, search_start, - min(16 * blocksize, - (end_slot - i) * blocksize), - BTRFS_NESTING_COW); - if (err) { - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - break; - } - search_start = cur->start; - last_block = cur->start; - *last_ret = search_start; - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - } - return err; -} - -/* * Search for a key in the given extent_buffer. * * The lower boundary for the search is specified by the slot number @first_slot. 
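
The comp_keys() helpers removed above are not gone: later in this diff they reappear in ctree.h as inline btrfs_comp_keys(). The trick they encode is sketched standalone below with local stand-in key types: on a little-endian host the on-disk key layout already matches the CPU key, so a cast replaces the per-field byte swap; the big-endian branch shows the conversion that is avoided. The main() values assume a little-endian box.

#include <stdint.h>
#include <stdio.h>

struct disk_key {		/* little-endian on disk */
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
} __attribute__((packed));

struct cpu_key {		/* native byte order, same layout */
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
} __attribute__((packed));

static int comp_cpu_keys(const struct cpu_key *k1, const struct cpu_key *k2)
{
	if (k1->objectid != k2->objectid)
		return k1->objectid < k2->objectid ? -1 : 1;
	if (k1->type != k2->type)
		return k1->type < k2->type ? -1 : 1;
	if (k1->offset != k2->offset)
		return k1->offset < k2->offset ? -1 : 1;
	return 0;
}

static int comp_keys(const struct disk_key *disk, const struct cpu_key *k2)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	/* Disk order equals CPU order: reinterpret instead of converting. */
	return comp_cpu_keys((const struct cpu_key *)disk, k2);
#else
	struct cpu_key k1 = {
		.objectid = __builtin_bswap64(disk->objectid),
		.type = disk->type,
		.offset = __builtin_bswap64(disk->offset),
	};

	return comp_cpu_keys(&k1, k2);
#endif
}

int main(void)
{
	struct disk_key d = { .objectid = 5, .type = 1, .offset = 64 };
	struct cpu_key c = { .objectid = 5, .type = 1, .offset = 128 };

	printf("%d\n", comp_keys(&d, &c));	/* prints -1 */
	return 0;
}
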
@@ -932,7 +833,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, tmp = &unaligned; } - ret = comp_keys(tmp, key); + ret = btrfs_comp_keys(tmp, key); if (ret < 0) low = mid + 1; @@ -947,19 +848,19 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, return 1; } -static void root_add_used(struct btrfs_root *root, u32 size) +static void root_add_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) + size); + btrfs_root_used(&root->root_item) + root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } -static void root_sub_used(struct btrfs_root *root, u32 size) +static void root_sub_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) - size); + btrfs_root_used(&root->root_item) - root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } @@ -1075,7 +976,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* once for the path */ free_extent_buffer(mid); - root_sub_used(root, mid->len); + root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); @@ -1145,7 +1046,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = NULL; goto out; } - root_sub_used(root, right->len); + root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), right, 0, 1); free_extent_buffer_stale(right); @@ -1160,7 +1061,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } btrfs_set_node_key(parent, &right_key, pslot + 1); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); } } if (btrfs_header_nritems(mid) == 1) { @@ -1203,7 +1104,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = NULL; goto out; } - root_sub_used(root, mid->len); + root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; @@ -1218,7 +1119,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } btrfs_set_node_key(parent, &mid_key, pslot); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); } /* update the path */ @@ -1325,7 +1226,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return ret; } btrfs_set_node_key(parent, &disk_key, pslot); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (btrfs_header_nritems(left) > orig_slot) { path->nodes[level] = left; path->slots[level + 1] -= 1; @@ -1385,7 +1286,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return ret; } btrfs_set_node_key(parent, &disk_key, pslot + 1); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (btrfs_header_nritems(mid) <= orig_slot) { path->nodes[level] = right; @@ -1969,7 +1870,7 @@ static int search_leaf(struct btrfs_trans_handle *trans, * the extent buffer's header and we have recently accessed * the header's level field. */ - ret = comp_keys(&first_key, key); + ret = btrfs_comp_keys(&first_key, key); if (ret < 0) { /* * The first key is smaller than the key we want @@ -2054,8 +1955,8 @@ static int search_leaf(struct btrfs_trans_handle *trans, } /* - * btrfs_search_slot - look for a key in a tree and perform necessary - * modifications to preserve tree invariants. 
+ * Look for a key in a tree and perform necessary modifications to preserve + * tree invariants. * * @trans: Handle of transaction, used when modifying the tree * @p: Holds all btree nodes along the search path @@ -2478,7 +2379,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) */ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); - ret = comp_keys(&found_key, &orig_key); + ret = btrfs_comp_keys(&found_key, &orig_key); if (ret == 0) { if (path->slots[0] > 0) { path->slots[0]--; @@ -2493,7 +2394,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) } btrfs_item_key(path->nodes[0], &found_key, 0); - ret = comp_keys(&found_key, &key); + ret = btrfs_comp_keys(&found_key, &key); /* * We might have had an item with the previous key in the tree right * before we released our path. And after we released our path, that @@ -2641,7 +2542,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, * higher levels * */ -static void fixup_low_keys(struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -2658,7 +2560,7 @@ static void fixup_low_keys(struct btrfs_path *path, BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); - btrfs_mark_buffer_dirty(path->nodes[i]); + btrfs_mark_buffer_dirty(trans, path->nodes[i]); if (tslot != 0) break; } @@ -2670,10 +2572,11 @@ static void fixup_low_keys(struct btrfs_path *path, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *new_key) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *eb; int slot; @@ -2682,7 +2585,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", @@ -2696,7 +2599,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", @@ -2711,9 +2614,9 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(eb, &disk_key, slot); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); if (slot == 0) - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } /* @@ -2844,8 +2747,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, } btrfs_set_header_nritems(src, src_nritems - push_items); btrfs_set_header_nritems(dst, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src); - btrfs_mark_buffer_dirty(dst); + btrfs_mark_buffer_dirty(trans, src); + btrfs_mark_buffer_dirty(trans, dst); return ret; } @@ -2920,8 +2823,8 @@ static int balance_node_right(struct btrfs_trans_handle 
*trans, btrfs_set_header_nritems(src, src_nritems - push_items); btrfs_set_header_nritems(dst, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src); - btrfs_mark_buffer_dirty(dst); + btrfs_mark_buffer_dirty(trans, src); + btrfs_mark_buffer_dirty(trans, dst); return ret; } @@ -2937,7 +2840,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 lower_gen; struct extent_buffer *lower; struct extent_buffer *c; @@ -2956,11 +2858,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, &lower_key, level, root->node->start, 0, - BTRFS_NESTING_NEW_ROOT); + 0, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); btrfs_set_header_nritems(c, 1); btrfs_set_node_key(c, &lower_key, 0); @@ -2970,7 +2872,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(c, 0, lower_gen); - btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(trans, c); old = root->node; ret = btrfs_tree_mod_log_insert_root(root->node, c, false); @@ -3042,7 +2944,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); - btrfs_mark_buffer_dirty(lower); + btrfs_mark_buffer_dirty(trans, lower); return 0; } @@ -3100,11 +3002,11 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, &disk_key, level, c->start, 0, - BTRFS_NESTING_SPLIT); + 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); ASSERT(btrfs_header_level(c) == level); ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); @@ -3121,8 +3023,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); - btrfs_mark_buffer_dirty(c); - btrfs_mark_buffer_dirty(split); + btrfs_mark_buffer_dirty(trans, c); + btrfs_mark_buffer_dirty(trans, split); ret = insert_ptr(trans, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); @@ -3288,15 +3190,15 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(left, left_nritems); if (left_nritems) - btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(trans, left); else btrfs_clear_buffer_dirty(trans, left); - btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); btrfs_set_node_key(upper, &disk_key, slot + 1); - btrfs_mark_buffer_dirty(upper); + btrfs_mark_buffer_dirty(trans, upper); /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { @@ -3508,14 +3410,14 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_token_item_offset(&token, i, push_space); } - btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(trans, left); if (right_nritems) - btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(trans, right); else btrfs_clear_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); /* then fixup the leaf pointer 
in the path */ if (path->slots[0] < push_items) { @@ -3646,8 +3548,8 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - btrfs_mark_buffer_dirty(right); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, right); + btrfs_mark_buffer_dirty(trans, l); BUG_ON(path->slots[0] != slot); if (mid <= slot) { @@ -3851,13 +3753,13 @@ again: * use BTRFS_NESTING_NEW_ROOT. */ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0, + &disk_key, 0, l->start, 0, 0, num_doubles ? BTRFS_NESTING_NEW_ROOT : BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); if (split == 0) { if (mid <= slot) { @@ -3888,7 +3790,7 @@ again: path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } /* * We create a new leaf 'right' for the required ins_len and @@ -3987,7 +3889,8 @@ err: return ret; } -static noinline int split_item(struct btrfs_path *path, +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, const struct btrfs_key *new_key, unsigned long split_offset) { @@ -4046,7 +3949,7 @@ static noinline int split_item(struct btrfs_path *path, write_extent_buffer(leaf, buf + split_offset, btrfs_item_ptr_offset(leaf, slot), item_size - split_offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); BUG_ON(btrfs_leaf_free_space(leaf) < 0); kfree(buf); @@ -4080,7 +3983,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = split_item(path, new_key, split_offset); + ret = split_item(trans, path, new_key, split_offset); return ret; } @@ -4090,7 +3993,8 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4166,11 +4070,11 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } btrfs_set_item_size(leaf, slot, new_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); @@ -4181,7 +4085,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) /* * make the item pointed to by the path bigger, data_size is the added size. */ -void btrfs_extend_item(struct btrfs_path *path, u32 data_size) +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; @@ -4231,7 +4136,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) data_end = old_data; old_size = btrfs_item_size(leaf, slot); btrfs_set_item_size(leaf, slot, old_size + data_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); @@ -4242,6 +4147,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) /* * Make space in the node before inserting one or more items. 
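
The bulk of the ctree.c changes in this diff thread a struct btrfs_trans_handle through helpers such as fixup_low_keys() and btrfs_mark_buffer_dirty(). A toy model of why that matters: with the handle available at the call site, the helper can verify the caller really is inside the running transaction rather than assuming it. All types below are illustrative stand-ins, not the kernel's:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct toy_fs { uint64_t generation; };
struct toy_trans { struct toy_fs *fs; uint64_t transid; };
struct toy_buf { int dirty; };

static void mark_buffer_dirty(struct toy_trans *trans, struct toy_buf *buf)
{
	/* A stale or foreign handle is caught where the damage would occur. */
	assert(trans->transid == trans->fs->generation);
	buf->dirty = 1;
}

int main(void)
{
	struct toy_fs fs = { .generation = 42 };
	struct toy_trans trans = { .fs = &fs, .transid = 42 };
	struct toy_buf buf = { 0 };

	mark_buffer_dirty(&trans, &buf);
	printf("dirty=%d\n", buf.dirty);
	return 0;
}
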
* + * @trans: transaction handle * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items * @batch: information about the batch of items to insert @@ -4249,7 +4155,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) * Main purpose is to save stack depth by doing the bulk of the work in a * function that doesn't call btrfs_search_slot */ -static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, +static void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4269,7 +4176,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p */ if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -4328,7 +4235,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p } btrfs_set_header_nritems(leaf, nritems + batch->nr); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); @@ -4339,12 +4246,14 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p /* * Insert a new item into a leaf. * + * @trans: Transaction handle. * @root: The root of the btree. * @path: A path pointing to the target leaf and slot. * @key: The key of the new item. * @data_size: The size of the data associated with the new key. */ -void btrfs_setup_item_for_insert(struct btrfs_root *root, +void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size) @@ -4356,7 +4265,7 @@ void btrfs_setup_item_for_insert(struct btrfs_root *root, batch.total_data_size = data_size; batch.nr = 1; - setup_items_for_insert(root, path, &batch); + setup_items_for_insert(trans, root, path, &batch); } /* @@ -4382,7 +4291,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, batch); + setup_items_for_insert(trans, root, path, batch); return 0; } @@ -4407,7 +4316,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, leaf = path->nodes[0]; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, data, ptr, data_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } btrfs_free_path(path); return ret; @@ -4438,7 +4347,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - btrfs_setup_item_for_insert(root, path, new_key, item_size); + btrfs_setup_item_for_insert(trans, root, path, new_key, item_size); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4496,9 +4405,9 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(path, &disk_key, level + 1); + fixup_low_keys(trans, path, &disk_key, level + 1); } - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); return 0; } @@ -4530,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - root_sub_used(root, leaf->len); + 
root_sub_used_bytes(root); atomic_inc(&leaf->refs); btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); @@ -4595,7 +4504,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } /* @@ -4660,11 +4569,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * dirtied this buffer */ if (path->nodes[0] == leaf) - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); free_extent_buffer(leaf); } } else { - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } } return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9419f4e37a58..196c005c31f6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,37 +6,10 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include <linux/mm.h> -#include <linux/sched/signal.h> -#include <linux/highmem.h> -#include <linux/fs.h> -#include <linux/rwsem.h> -#include <linux/semaphore.h> -#include <linux/completion.h> -#include <linux/backing-dev.h> -#include <linux/wait.h> -#include <linux/slab.h> -#include <trace/events/btrfs.h> -#include <asm/unaligned.h> #include <linux/pagemap.h> -#include <linux/btrfs.h> -#include <linux/btrfs_tree.h> -#include <linux/workqueue.h> -#include <linux/security.h> -#include <linux/sizes.h> -#include <linux/dynamic_debug.h> -#include <linux/refcount.h> -#include <linux/crc32c.h> -#include <linux/iomap.h> -#include <linux/fscrypt.h> -#include "extent-io-tree.h" -#include "extent_io.h" -#include "extent_map.h" -#include "async-thread.h" -#include "block-rsv.h" #include "locking.h" -#include "misc.h" #include "fs.h" +#include "accessors.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -218,10 +191,22 @@ struct btrfs_root { atomic_t log_commit[2]; /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; + /* + * Protected by the 'log_mutex' lock but can be read without holding + * that lock to avoid unnecessary lock contention, in which case it + * should be read using btrfs_get_root_log_transid() except if it's a + * log tree in which case it can be directly accessed. Updates to this + * field should always use btrfs_set_root_log_transid(), except for log + * trees where the field can be updated directly. + */ int log_transid; /* No matter the commit succeeds or not*/ int log_transid_committed; - /* Just be updated when the commit succeeds. */ + /* + * Just be updated when the commit succeeds. Use + * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit() + * to access this field. + */ int last_log_commit; pid_t log_start_pid; @@ -326,6 +311,9 @@ struct btrfs_root { /* Used only by log trees, when logging csum items */ struct extent_io_tree log_csum_range; + /* Used in simple quotas, track root during relocation. 
*/ + u64 relocation_src_root; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -352,6 +340,26 @@ static inline u64 btrfs_root_id(const struct btrfs_root *root) return root->root_key.objectid; } +static inline int btrfs_get_root_log_transid(const struct btrfs_root *root) +{ + return READ_ONCE(root->log_transid); +} + +static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid) +{ + WRITE_ONCE(root->log_transid, log_transid); +} + +static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root) +{ + return READ_ONCE(root->last_log_commit); +} + +static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id) +{ + WRITE_ONCE(root->last_log_commit, commit_id); +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. @@ -470,30 +478,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) -{ - return crc32c(crc, address, length); -} - -static inline void btrfs_crc32c_final(u32 crc, u8 *result) -{ - put_unaligned_le32(~crc, result); -} - -static inline u64 btrfs_name_hash(const char *name, int len) -{ - return crc32c((u32)~1, name, len); -} - -/* - * Figure the key offset of an extended inode ref - */ -static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, - int len) -{ - return (u64) crc32c(parent_objectid, name, len); -} - static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); @@ -513,12 +497,42 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); + +#ifdef __LITTLE_ENDIAN + +/* + * Compare two keys, on little-endian the disk order is same as CPU order and + * we can avoid the conversion. + */ +static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key, + const struct btrfs_key *k2) +{ + const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; + + return btrfs_comp_cpu_keys(k1, k2); +} + +#else + +/* Compare two keys in a memcmp fashion. 
*/ +static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk, + const struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + return btrfs_comp_cpu_keys(&k1, k2); +} + +#endif + int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); -void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); @@ -536,16 +550,26 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, enum btrfs_lock_nesting nest); +int btrfs_force_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); -int btrfs_block_can_be_shared(struct btrfs_root *root, +int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -void btrfs_extend_item(struct btrfs_path *path, u32 data_size); -void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 data_size); +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -566,10 +590,6 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int find_higher, int return_any); -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, u64 *last_ret, - struct btrfs_key *progress); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); @@ -609,7 +629,8 @@ struct btrfs_item_batch { int nr; }; -void btrfs_setup_item_for_insert(struct btrfs_root *root, +void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index f2ff4cbe8656..5244561e2016 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -338,13 +338,118 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) } /* + * Check if two blocks addresses are close, used by defrag. + */ +static bool close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < SZ_32K) + return true; + if (blocknr > other && blocknr - (other + blocksize) < SZ_32K) + return true; + return false; +} + +/* + * Go through all the leaves pointed to by a node and reallocate them so that + * disk order is close to key order. 
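
close_blocks(), added above, is the whole of the defrag proximity heuristic: two tree blocks count as close when the gap between them is under 32KiB. Here is the same logic dropped into a small userspace harness with sample values (the 16KiB nodesize is just a typical default):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_32K 0x8000u

static bool close_blocks(uint64_t blocknr, uint64_t other, uint32_t blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
		return true;
	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
		return true;
	return false;
}

int main(void)
{
	const uint32_t nodesize = 16384;

	/* Adjacent blocks: gap of 0, so they are close. */
	printf("%d\n", close_blocks(0, 16384, nodesize));	/* 1 */
	/* 1MiB apart: not close, hence a relocation candidate. */
	printf("%d\n", close_blocks(0, 1024 * 1024, nodesize));	/* 0 */
	return 0;
}
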
+ */ +static int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *parent, + int start_slot, u64 *last_ret, + struct btrfs_key *progress) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + const u32 blocksize = fs_info->nodesize; + const int end_slot = btrfs_header_nritems(parent) - 1; + u64 search_start = *last_ret; + u64 last_block = 0; + int ret = 0; + bool progress_passed = false; + + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu", + parent->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } + + if (btrfs_header_nritems(parent) <= 1) + return 0; + + for (int i = start_slot; i <= end_slot; i++) { + struct extent_buffer *cur; + struct btrfs_disk_key disk_key; + u64 blocknr; + u64 other; + bool close = true; + + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = true; + blocknr = btrfs_node_blockptr(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + + cur = btrfs_read_node_slot(parent, i); + if (IS_ERR(cur)) + return PTR_ERR(cur); + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + ret = btrfs_force_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + return ret; +} + +/* * Defrag all the leaves in a given btree. * Read all the leaves and try to get key order to * better reflect disk order */ -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_path *path = NULL; struct btrfs_key key; @@ -461,6 +566,45 @@ done: } /* + * Defrag a given btree. Every leaf in the btree is read and defragmented. 
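
btrfs_defrag_root(), which follows this comment, drives the work in bounded chunks: one transaction per pass, looping while the leaf walker reports -EAGAIN, so every iteration releases the transaction and lets writeback throttling run. A plain sketch of that loop shape, with do_chunk() as a stand-in for btrfs_defrag_leaves():

#include <errno.h>
#include <stdio.h>

static int do_chunk(int *remaining)
{
	if (*remaining > 0) {
		(*remaining)--;
		return -EAGAIN;	/* more work left, come back */
	}
	return 0;
}

int main(void)
{
	int remaining = 3;
	int ret;

	do {
		/* start_transaction() would go here */
		ret = do_chunk(&remaining);
		/* end_transaction() and writeback throttling here */
		printf("chunk done, ret=%d\n", ret);
	} while (ret == -EAGAIN);
	return 0;
}
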
+ */ +int btrfs_defrag_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) + return 0; + + while (1) { + struct btrfs_trans_handle *trans; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + + ret = btrfs_defrag_leaves(trans, root); + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + cond_resched(); + + if (btrfs_fs_closing(fs_info) || ret != -EAGAIN) + break; + + if (btrfs_defrag_cancelled(fs_info)) { + btrfs_debug(fs_info, "defrag_root cancelled"); + ret = -EAGAIN; + break; + } + } + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); + return ret; +} + +/* * Defrag specific helper to get an extent map. * * Differences between this and btrfs_get_extent() are: @@ -891,8 +1035,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * very likely resulting in a larger extent after writeback is * triggered (except in a case of free space fragmentation). */ - if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC, 0, NULL)) + if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC)) goto next; /* diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 5305f2283b5e..5a62763528d1 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -12,7 +12,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_defrag_root(struct btrfs_root *root); static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 427abaf608b8..2833e8ef4c09 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -199,7 +199,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, start = round_down(start, fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(fs_info, len); - btrfs_qgroup_free_data(inode, reserved, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len, NULL); } /* @@ -322,9 +322,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, } else { if (current->journal_info) flush = BTRFS_RESERVE_FLUSH_LIMIT; - - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); } num_bytes = ALIGN(num_bytes, fs_info->sectorsize); @@ -346,7 +343,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, noflush); if (ret) return ret; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + meta_reserve, flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6d51db066503..7381241334e8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -313,7 +313,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, { struct btrfs_delayed_item *item; - item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); + item = kmalloc(struct_size(item, data, data_len), GFP_NOFS); if (item) { item->data_len = data_len; item->type = type; @@ -328,7 +328,8 @@ static struct btrfs_delayed_item 
*btrfs_alloc_delayed_item(u16 data_len, } /* - * __btrfs_lookup_delayed_item - look up the delayed item by key + * Look up the delayed item by key. + * * @delayed_node: pointer to the delayed node * @index: the dir index value to lookup (offset of a dir index key) * @@ -412,6 +413,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root) static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { + struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node; struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; @@ -419,18 +421,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) if (RB_EMPTY_NODE(&delayed_item->rb_node)) return; - delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; + /* If it's in a rbtree, then we need to have delayed node locked. */ + lockdep_assert_held(&delayed_node->mutex); + + delayed_root = delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) - root = &delayed_item->delayed_node->ins_root; + root = &delayed_node->ins_root; else - root = &delayed_item->delayed_node->del_root; + root = &delayed_node->del_root; rb_erase_cached(&delayed_item->rb_node, root); RB_CLEAR_NODE(&delayed_item->rb_node); - delayed_item->delayed_node->count--; + delayed_node->count--; finish_one_item(delayed_root); } @@ -513,7 +518,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, /* * For insertions we track reserved metadata space by accounting * for the number of leaves that will be used, based on the delayed - * node's index_items_size field. + * node's curr_index_batch_size and index_item_leaves fields. */ if (item->type == BTRFS_DELAYED_DELETION_ITEM) item->bytes_reserved = num_bytes; @@ -1026,7 +1031,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; @@ -1153,20 +1158,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); if (ret) { - btrfs_release_delayed_node(curr_node); - curr_node = NULL; btrfs_abort_transaction(trans, ret); break; } prev_node = curr_node; curr_node = btrfs_next_delayed_node(curr_node); + /* + * See the comment below about releasing path before releasing + * node. If the commit of delayed items was successful the path + * should always be released, but in case of an error, it may + * point to locked extent buffers (a leaf at the very least). + */ + ASSERT(path->nodes[0] == NULL); btrfs_release_delayed_node(prev_node); } + /* + * Release the path to avoid a potential deadlock and lockdep splat when + * releasing the delayed node, as that requires taking the delayed node's + * mutex. If another task starts running delayed items before we take + * the mutex, it will first lock the mutex and then it may try to lock + * the same btree path (leaf). 
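The deadlock the comment above describes is a plain ABBA inversion. A generic pthread sketch of the resulting ordering rule (the mutex names are stand-ins, not btrfs types):

#include <pthread.h>

static pthread_mutex_t node_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t leaf_lock = PTHREAD_MUTEX_INITIALIZER;

/* Tasks running delayed items take the node mutex first, then the leaf. */
static void run_delayed_items_model(void)
{
	pthread_mutex_lock(&node_mutex);
	pthread_mutex_lock(&leaf_lock);
	/* ... insert or delete delayed items ... */
	pthread_mutex_unlock(&leaf_lock);
	pthread_mutex_unlock(&node_mutex);
}

/*
 * If this task still held leaf_lock while taking node_mutex, the order
 * would be inverted and the two tasks could deadlock; releasing the
 * "path" first is what moving btrfs_free_path() earlier achieves.
 */
static void release_node_model(void)
{
	pthread_mutex_lock(&node_mutex);
	/* ... drop the delayed node's reference ... */
	pthread_mutex_unlock(&node_mutex);
}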
+ */ + btrfs_free_path(path); + if (curr_node) btrfs_release_delayed_node(curr_node); - btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; @@ -1361,8 +1379,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL, - NULL); + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL); async_work->nr = nr; btrfs_queue_work(fs_info->delayed_workers, &async_work->work); @@ -1413,7 +1430,29 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); } -/* Will return 0 or -ENOMEM */ +static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). + */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; +} + +/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, @@ -1455,6 +1494,27 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); + /* + * First attempt to insert the delayed item. This is to make the error + * handling path simpler in case we fail (-EEXIST). There's no risk of + * any other task coming in and running the delayed item before we do + * the metadata space reservation below, because we are holding the + * delayed node's mutex and that mutex must also be locked before the + * node's delayed items can be run. + */ + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); + if (unlikely(ret)) { + btrfs_err(trans->fs_info, +"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d", + name_len, name, index, btrfs_root_id(delayed_node->root), + delayed_node->inode_id, dir->index_cnt, + delayed_node->index_cnt, ret); + btrfs_release_delayed_item(delayed_item); + btrfs_release_dir_index_item_space(trans); + mutex_unlock(&delayed_node->mutex); + goto release_node; + } + if (delayed_node->index_item_leaves == 0 || delayed_node->curr_index_batch_size + data_len > leaf_data_size) { delayed_node->curr_index_batch_size = data_len; @@ -1472,36 +1532,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, * impossible. 
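A toy model of the "fallible insert first" ordering used in btrfs_insert_delayed_dir_index() above: because the duplicate check runs before any space accounting is adjusted, the -EEXIST path only has to release the reservation and free the item, with no tree state to unwind. All names and the reservation counter here are illustrative:

#include <errno.h>
#include <stdbool.h>

static bool present[16];    /* toy duplicate-detecting tree */
static long units_held = 1; /* one unit reserved at transaction start */

static int toy_insert(int key)
{
	if (present[key])
		return -EEXIST;
	present[key] = true;
	return 0;
}

static int add_delayed_item_model(int key)
{
	int ret = toy_insert(key);  /* fallible step first */

	if (ret) {
		units_held--;   /* release the unit; nothing else to undo */
		return ret;
	}
	/* On success, batching decides whether the unit is kept (elided). */
	return 0;
}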
*/ if (WARN_ON(ret)) { - mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_item(delayed_item); + mutex_unlock(&delayed_node->mutex); goto release_node; } delayed_node->index_item_leaves++; - } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { - const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); - - /* - * Adding the new dir index item does not require touching another - * leaf, so we can release 1 unit of metadata that was previously - * reserved when starting the transaction. This applies only to - * the case where we had a transaction start and excludes the - * transaction join case (when replaying log trees). - */ - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, bytes, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); - ASSERT(trans->bytes_reserved >= bytes); - trans->bytes_reserved -= bytes; - } - - ret = __btrfs_add_delayed_item(delayed_node, delayed_item); - if (unlikely(ret)) { - btrfs_err(trans->fs_info, - "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - name_len, name, delayed_node->root->root_key.objectid, - delayed_node->inode_id, ret); - BUG(); + } else { + btrfs_release_dir_index_item_space(trans); } mutex_unlock(&delayed_node->mutex); @@ -1722,8 +1760,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, } /* - * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree - * + * Read dir info stored in the delayed tree. */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list) @@ -1736,9 +1773,6 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, int over = 0; unsigned char d_type; - if (list_empty(ins_list)) - return 0; - /* * Changing the data of the delayed item is impossible. So * we needn't lock them. 
And we have held i_mutex of the @@ -1799,24 +1833,22 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_stack_timespec_sec(&inode_item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime_nsec(inode)); - btrfs_set_stack_timespec_sec(&inode_item->otime, - BTRFS_I(inode)->i_otime.tv_sec); - btrfs_set_stack_timespec_nsec(&inode_item->otime, - BTRFS_I(inode)->i_otime.tv_nsec); + btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec); } int btrfs_fill_inode(struct inode *inode, u32 *rdev) @@ -1856,19 +1888,17 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); - inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); - inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); + inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime), + btrfs_stack_timespec_nsec(&inode_item->atime)); - inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); - inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); + inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime), + btrfs_stack_timespec_nsec(&inode_item->mtime)); - inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); + inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + btrfs_stack_timespec_nsec(&inode_item->ctime)); - BTRFS_I(inode)->i_otime.tv_sec = - btrfs_stack_timespec_sec(&inode_item->otime); - BTRFS_I(inode)->i_otime.tv_nsec = - btrfs_stack_timespec_nsec(&inode_item->otime); + BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); + BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); inode->i_generation = BTRFS_I(inode)->generation; BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1879,9 +1909,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) } int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_node *delayed_node; int ret = 0; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index dc1085b2a397..5cceb31bbd16 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -95,7 +95,7 @@ struct btrfs_delayed_item { bool logged; /* The maximum leaf size is 64K, so u16 is more than enough. 
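The member declarations that follow are the point of this hunk: tying the flexible array to data_len with __counted_by() lets fortified accesses bounds-check it at runtime. A compilable userspace model (the no-op fallback macro and the struct are illustrative; struct_size() from the allocation site is open-coded):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#ifndef __counted_by
#define __counted_by(m) /* no-op where the attribute is unsupported */
#endif

struct delayed_item_model {
	uint16_t data_len;
	char data[] __counted_by(data_len);
};

static struct delayed_item_model *alloc_item(const char *src, uint16_t len)
{
	/* Mirrors kmalloc(struct_size(item, data, data_len), GFP_NOFS). */
	struct delayed_item_model *it =
		malloc(sizeof(*it) + (size_t)len * sizeof(it->data[0]));

	if (!it)
		return NULL;
	it->data_len = len;
	memcpy(it->data, src, len);
	return it;
}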
*/ u16 data_len; - char data[]; + char data[] __counted_by(data_len); }; static inline void btrfs_init_delayed_root( @@ -135,7 +135,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6a13cf00218b..891ea2fa263c 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -57,16 +57,20 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) * Release a ref head's reservation. * * @fs_info: the filesystem - * @nr: number of items to drop + * @nr_refs: number of delayed refs to drop + * @nr_csums: number of csum items to drop * * Drops the delayed ref head's count from the delayed refs rsv and free any * excess reservation we had. */ -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr); - u64 released = 0; + u64 num_bytes; + u64 released; + + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs); + num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) @@ -77,50 +81,135 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) /* * Adjust the size of the delayed refs rsv. * - * This is to be called anytime we may have adjusted trans->delayed_ref_updates, - * it'll calculate the additional size and add it to the delayed_refs_rsv. + * This is to be called anytime we may have adjusted trans->delayed_ref_updates + * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and + * add it to the delayed_refs_rsv. */ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv; u64 num_bytes; + u64 reserved_bytes; + + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); + num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, + trans->delayed_ref_csum_deletions); - if (!trans->delayed_ref_updates) + if (num_bytes == 0) return; - num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, - trans->delayed_ref_updates); + /* + * Try to take num_bytes from the transaction's local delayed reserve. + * If not possible, try to take as much as it's available. If the local + * reserve doesn't have enough reserved space, the delayed refs reserve + * will be refilled next time btrfs_delayed_refs_rsv_refill() is called + * by someone or if a transaction commit is triggered before that, the + * global block reserve will be used. We want to minimize using the + * global block reserve for cases we can account for in advance, to + * avoid exhausting it and reach -ENOSPC during a transaction commit. 
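In plain integers, the handoff that comment describes looks roughly like the sketch below, under the assumption that size tracks what ought to be reserved and reserved what is actually backed by space (spinlocks and the btrfs structs are modeled away):

#include <stdint.h>

struct rsv_model {
	uint64_t size;     /* bytes that should be reserved */
	uint64_t reserved; /* bytes actually backed by space */
};

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

static void track_refs_model(struct rsv_model *local,
			     struct rsv_model *delayed, uint64_t num_bytes)
{
	uint64_t take = min_u64(num_bytes, local->reserved);

	local->reserved -= take;     /* consume the transaction's local rsv */
	delayed->size += num_bytes;  /* the full amount is now owed... */
	delayed->reserved += take;   /* ...but only this much is backed */
	/* Any shortfall is refilled later or falls back to the global rsv. */
}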
+ */ + spin_lock(&local_rsv->lock); + reserved_bytes = min(num_bytes, local_rsv->reserved); + local_rsv->reserved -= reserved_bytes; + local_rsv->full = (local_rsv->reserved >= local_rsv->size); + spin_unlock(&local_rsv->lock); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; - delayed_rsv->full = false; + delayed_rsv->reserved += reserved_bytes; + delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size); spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; + trans->delayed_ref_csum_deletions = 0; +} + +/* + * Adjust the size of the delayed refs block reserve for 1 block group item + * insertion, used after allocating a block group. + */ +void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + + spin_lock(&delayed_rsv->lock); + /* + * Inserting a block group item does not require changing the free space + * tree, only the extent tree or the block group tree, so this is all we + * need. + */ + delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1); + delayed_rsv->full = false; + spin_unlock(&delayed_rsv->lock); +} + +/* + * Adjust the size of the delayed refs block reserve to release space for 1 + * block group item insertion. + */ +void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 released; + + released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); +} + +/* + * Adjust the size of the delayed refs block reserve for 1 block group item + * update. + */ +void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + + spin_lock(&delayed_rsv->lock); + /* + * Updating a block group item does not result in new nodes/leaves and + * does not require changing the free space tree, only the extent tree + * or the block group tree, so this is all we need. + */ + delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1); + delayed_rsv->full = false; + spin_unlock(&delayed_rsv->lock); +} + +/* + * Adjust the size of the delayed refs block reserve to release space for 1 + * block group item update. + */ +void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1); + u64 released; + + released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); } /* * Transfer bytes to our delayed refs rsv. * * @fs_info: the filesystem - * @src: source block rsv to transfer from * @num_bytes: number of bytes to transfer * - * This transfers up to the num_bytes amount from the src rsv to the + * This transfers up to the num_bytes amount, previously reserved, to the * delayed_refs_rsv. Any extra bytes are returned to the space info. 
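The transfer direction works the same way in miniature: keep only what the delayed refs reserve is short of and hand the excess back, as this sketch shows (struct and names are illustrative; the return value stands in for the bytes given back to the space info):

#include <stdint.h>

struct drr_model {
	uint64_t size;
	uint64_t reserved;
};

/* Returns the excess the caller must hand back to the space info. */
static uint64_t migrate_model(struct drr_model *delayed, uint64_t num_bytes)
{
	uint64_t to_free = num_bytes;

	if (delayed->size > delayed->reserved) {
		uint64_t delta = delayed->size - delayed->reserved;
		uint64_t keep = num_bytes < delta ? num_bytes : delta;

		delayed->reserved += keep; /* fill only the deficit */
		to_free -= keep;
	}
	return to_free;
}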
*/ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, u64 num_bytes) { struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; u64 to_free = 0; - spin_lock(&src->lock); - src->reserved -= num_bytes; - src->size -= num_bytes; - spin_unlock(&src->lock); - spin_lock(&delayed_refs_rsv->lock); if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { u64 delta = delayed_refs_rsv->size - @@ -161,8 +250,11 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_space_info *space_info = block_rsv->space_info; u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; + u64 refilled_bytes; + u64 to_free; int ret = -ENOSPC; spin_lock(&block_rsv->lock); @@ -175,12 +267,40 @@ if (!num_bytes) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); if (ret) return ret; - btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); + + /* + * We may have raced with someone else, so check again if the block + * reserve is still not full and release any excess space. + */ + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + u64 needed = block_rsv->size - block_rsv->reserved; + + if (num_bytes >= needed) { + block_rsv->reserved += needed; + block_rsv->full = true; + to_free = num_bytes - needed; + refilled_bytes = needed; + } else { + block_rsv->reserved += num_bytes; + to_free = 0; + refilled_bytes = num_bytes; + } + } else { + to_free = num_bytes; + refilled_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (to_free > 0) + btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); + + if (refilled_bytes > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, + refilled_bytes, 1); return 0; } @@ -398,7 +518,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -409,9 +530,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, list_del(&ref->add_list); btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } -static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -440,10 +563,10 @@ mod = -next->ref_mod; } - drop_delayed_ref(delayed_refs, head, next); + drop_delayed_ref(fs_info, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(delayed_refs, head, ref); + drop_delayed_ref(fs_info, delayed_refs, head, ref); done = true; } else { /* @@ -481,7 +604,7 @@ again: ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(delayed_refs, head, ref, seq)) + if
(merge_ref(fs_info, delayed_refs, head, ref, seq)) goto again; } } @@ -560,10 +683,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return true if the ref was merged into an existing one (and therefore can be * freed by the caller). */ -static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, +static bool insert_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { + struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs; struct btrfs_delayed_ref_node *exist; int mod; @@ -574,6 +698,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); spin_unlock(&href->lock); + trans->delayed_ref_updates++; return false; } @@ -602,7 +727,7 @@ /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(root, href, exist); + drop_delayed_ref(trans->fs_info, root, href, exist); spin_unlock(&href->lock); return true; } @@ -623,6 +748,15 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, BUG_ON(existing->is_data != update->is_data); spin_lock(&existing->lock); + + /* + * When freeing an extent, we may not know the owning root when we + * first create the head_ref. However, some deref before the last deref + * will know it, so we just need to update the head_ref accordingly. + */ + if (!existing->owning_root) + existing->owning_root = update->owning_root; + if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref @@ -632,6 +766,7 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, * Set it again here */ existing->must_insert_reserved = update->must_insert_reserved; + existing->owning_root = update->owning_root; /* * update the num_bytes so we make sure the accounting @@ -671,6 +806,8 @@ /* * If we are going from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. + * We reserve bytes for csum deletion when adding or updating a ref head, + * see add_delayed_ref_head() for more details.
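The sign-change rule reads cleanly as a small state machine: csum bytes are pending only while the head's total ref mod is negative, so each zero crossing adjusts the total exactly once. A self-contained model (variable names are illustrative, not the btrfs fields):

#include <stdint.h>

static int64_t pending_csum_bytes;

static void apply_ref_mod_model(int64_t *total_ref_mod, int64_t delta,
				uint64_t num_bytes)
{
	int64_t old = *total_ref_mod;

	*total_ref_mod += delta;
	if (old < 0 && *total_ref_mod >= 0)
		pending_csum_bytes -= num_bytes; /* free no longer pending */
	else if (old >= 0 && *total_ref_mod < 0)
		pending_csum_bytes += num_bytes; /* extent will be freed */
}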
*/ if (existing->is_data) { u64 csum_leaves = @@ -679,11 +816,11 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; - btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); + btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves); } if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; - trans->delayed_ref_updates += csum_leaves; + trans->delayed_ref_csum_deletions += csum_leaves; } } @@ -694,7 +831,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, int action, bool is_data, - bool is_system) + bool is_system, u64 owning_root) { int count_mod = 1; bool must_insert_reserved = false; @@ -734,7 +871,9 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->bytenr = bytenr; head_ref->num_bytes = num_bytes; head_ref->ref_mod = count_mod; + head_ref->reserved_bytes = reserved; head_ref->must_insert_reserved = must_insert_reserved; + head_ref->owning_root = owning_root; head_ref->is_data = is_data; head_ref->is_system = is_system; head_ref->ref_tree = RB_ROOT_CACHED; @@ -795,16 +934,21 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + /* + * We reserve the amount of bytes needed to delete csums when + * adding the ref head and not when adding individual drop refs + * since the csum items are deleted only after running the last + * delayed drop ref (the data extent's ref count drops to 0). + */ if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; - trans->delayed_ref_updates += + trans->delayed_ref_csum_deletions += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; @@ -813,8 +957,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } /* - * init_delayed_ref_common - Initialize the structure which represents a - * modification to a an extent. + * Initialize the structure which represents a modification to an extent. * * @fs_info: Internal to the mounted filesystem mount structure.
* @@ -885,7 +1028,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 parent = generic_ref->parent; u8 ref_type; - is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); + is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); @@ -898,8 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); @@ -914,15 +1056,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref_type = BTRFS_TREE_BLOCK_REF_KEY; init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - generic_ref->tree_ref.owning_root, action, + generic_ref->tree_ref.ref_root, action, ref_type); - ref->root = generic_ref->tree_ref.owning_root; + ref->root = generic_ref->tree_ref.ref_root; ref->parent = parent; ref->level = level; init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - generic_ref->tree_ref.owning_root, 0, action, - false, is_system); + generic_ref->tree_ref.ref_root, 0, action, + false, is_system, generic_ref->owning_root); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -935,7 +1077,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(trans, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -974,7 +1116,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; - u64 ref_root = generic_ref->data_ref.owning_root; + u64 ref_root = generic_ref->data_ref.ref_root; u64 owner = generic_ref->data_ref.ino; u64 offset = generic_ref->data_ref.offset; u8 ref_type; @@ -1002,8 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); @@ -1014,7 +1155,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, } init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, - reserved, action, true, false); + reserved, action, true, false, generic_ref->owning_root); head_ref->extent_op = NULL; delayed_refs = &trans->transaction->delayed_refs; @@ -1027,7 +1168,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(trans, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1060,7 +1201,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return -ENOMEM; init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, false, false); + BTRFS_UPDATE_DELAYED_HEAD, false, 
false, 0); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index b8e14b0ba5f1..62d679d40f4f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -9,10 +9,16 @@ #include <linux/refcount.h> /* these are the possible values of struct btrfs_delayed_ref_node->action */ -#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ -#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ -#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ -#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ +enum btrfs_delayed_ref_action { + /* Add one backref to the tree */ + BTRFS_ADD_DELAYED_REF = 1, + /* Delete one backref from the tree */ + BTRFS_DROP_DELAYED_REF, + /* Record a full extent allocation */ + BTRFS_ADD_DELAYED_EXTENT, + /* Not changing ref count on head ref */ + BTRFS_UPDATE_DELAYED_HEAD, +} __packed; struct btrfs_delayed_ref_node { struct rb_node ref_node; @@ -105,6 +111,18 @@ struct btrfs_delayed_ref_head { int ref_mod; /* + * The root that triggered the allocation when must_insert_reserved is + * set to true. + */ + u64 owning_root; + + /* + * Track reserved bytes when setting must_insert_reserved. On success + * or cleanup, we will need to free the reservation. + */ + u64 reserved_bytes; + + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is @@ -117,6 +135,7 @@ struct btrfs_delayed_ref_head { * the free has happened. */ bool must_insert_reserved; + bool is_data; bool is_system; bool processing; @@ -183,13 +202,13 @@ enum btrfs_ref_type { BTRFS_REF_DATA, BTRFS_REF_METADATA, BTRFS_REF_LAST, -}; +} __packed; struct btrfs_data_ref { /* For EXTENT_DATA_REF */ - /* Original root this data extent belongs to */ - u64 owning_root; + /* Root which owns this data reference. */ + u64 ref_root; /* Inode which refers to this data extent */ u64 ino; @@ -212,18 +231,18 @@ struct btrfs_tree_ref { int level; /* - * Root which owns this tree block. + * Root which owns this tree block reference. * * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) */ - u64 owning_root; + u64 ref_root; /* For non-skinny metadata, no special member needed */ }; struct btrfs_ref { enum btrfs_ref_type type; - int action; + enum btrfs_delayed_ref_action action; /* * Whether this extent should go through qgroup record. @@ -239,6 +258,7 @@ struct btrfs_ref { #endif u64 bytenr; u64 len; + u64 owning_root; /* Bytenr of the parent tree block */ u64 parent; @@ -277,24 +297,37 @@ static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_in return num_bytes; } +static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info, + int num_csum_items) +{ + /* + * Deleting csum items does not result in new nodes/leaves and does not + * require changing the free space tree, only the csum tree, so this is + * all we need. 
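As a back-of-envelope model of the two sizing helpers this relies on: an item update is budgeted as one full tree path (nodesize * BTRFS_MAX_LEVEL) and an insert as two, to cover splits. Treat the exact multipliers as assumptions about the fs.h helpers at the time of this series:

#include <stdint.h>

#define BTRFS_MAX_LEVEL 8

/* Assumed shape of btrfs_calc_metadata_size(): one path per item. */
static uint64_t calc_metadata_size(uint32_t nodesize, unsigned int num_items)
{
	return (uint64_t)nodesize * BTRFS_MAX_LEVEL * num_items;
}

/* Assumed shape of btrfs_calc_insert_metadata_size(): two paths per item. */
static uint64_t calc_insert_metadata_size(uint32_t nodesize,
					  unsigned int num_items)
{
	return (uint64_t)nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}
/* e.g. with 16K nodes, deleting csum items spanning N leaves reserves
 * calc_metadata_size(16384, N) bytes, and nothing more. */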
+ */ + return btrfs_calc_metadata_size(fs_info, num_csum_items); +} + static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, - int action, u64 bytenr, u64 len, u64 parent) + int action, u64 bytenr, u64 len, + u64 parent, u64 owning_root) { generic_ref->action = action; generic_ref->bytenr = bytenr; generic_ref->len = len; generic_ref->parent = parent; + generic_ref->owning_root = owning_root; } -static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, - int level, u64 root, u64 mod_root, bool skip_qgroup) +static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, + u64 root, u64 mod_root, bool skip_qgroup) { #ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: root; #endif generic_ref->tree_ref.level = level; - generic_ref->tree_ref.owning_root = root; + generic_ref->tree_ref.ref_root = root; generic_ref->type = BTRFS_REF_METADATA; if (skip_qgroup || !(is_fstree(root) && (!mod_root || is_fstree(mod_root)))) @@ -312,7 +345,7 @@ static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: ref_root; #endif - generic_ref->data_ref.owning_root = ref_root; + generic_ref->data_ref.ref_root = ref_root; generic_ref->data_ref.ino = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; @@ -338,7 +371,6 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { - WARN_ON(refcount_read(&ref->refs) == 0); if (refcount_dec_and_test(&ref->refs)) { WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); switch (ref->type) { @@ -402,12 +434,15 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums); void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); +void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); +void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); +void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, u64 num_bytes); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5f10965fd72b..f9544fda38e9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -17,7 +17,6 @@ #include "print-tree.h" #include "volumes.h" #include "async-thread.h" -#include "check-integrity.h" #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" @@ -247,6 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + struct bdev_handle *bdev_handle; struct block_device *bdev; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -257,12 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, - 
fs_info->bdev_holder, NULL); - if (IS_ERR(bdev)) { + bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); + if (IS_ERR(bdev_handle)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev); + return PTR_ERR(bdev_handle); } + bdev = bdev_handle->bdev; if (!btrfs_check_device_zone_type(fs_info, bdev)) { btrfs_err(fs_info, @@ -313,9 +314,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; + device->bdev_handle = bdev_handle; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_devices; @@ -334,7 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - blkdev_put(bdev, fs_info->bdev_holder); + bdev_release(bdev_handle); return ret; } @@ -442,7 +443,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); out: btrfs_free_path(path); @@ -792,9 +793,9 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, lockdep_assert_held(&srcdev->fs_info->chunk_mutex); - while (!find_first_extent_bit(&srcdev->alloc_state, start, - &found_start, &found_end, - CHUNK_ALLOCATED, &cached_state)) { + while (find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { ret = set_extent_bit(&tgtdev->alloc_state, found_start, found_end, CHUNK_ALLOCATED, NULL); if (ret) diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 082eb0e19598..9c07d5c3e5ad 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -38,7 +38,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - btrfs_extend_item(path, data_size); + btrfs_extend_item(trans, path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); @@ -93,7 +93,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); return ret; } @@ -153,7 +153,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ @@ -439,7 +439,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - btrfs_truncate_item(path, item_len - sub_item_len, 1); + btrfs_truncate_item(trans, path, item_len - sub_item_len, 1); } return ret; } diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index aab4b7cc7fa0..e40a226373d7 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -3,6 +3,10 @@ #ifndef BTRFS_DIR_ITEM_H #define BTRFS_DIR_ITEM_H +#include <linux/crc32c.h> + +struct 
fscrypt_str; + int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, @@ -39,4 +43,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, const char *name, int name_len); +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return crc32c((u32)~1, name, len); +} + #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9a2c5446c18..62cb97f7c94f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" @@ -245,6 +244,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start = btrfs_header_bytenr(eb); + u64 last_trans; u8 result[BTRFS_CSUM_SIZE]; int ret; @@ -282,12 +282,12 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) * Also check the generation, the eb reached here must be newer than * last committed. Or something seriously wrong happened. */ - if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + last_trans = btrfs_get_last_trans_committed(fs_info); + if (unlikely(btrfs_header_generation(eb) <= last_trans)) { ret = -EUCLEAN; btrfs_err(fs_info, "block=%llu bad generation, have %llu expect > %llu", - eb->start, btrfs_header_generation(eb), - fs_info->last_trans_committed); + eb->start, btrfs_header_generation(eb), last_trans); goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); @@ -313,21 +313,17 @@ static bool check_tree_block_fsid(struct extent_buffer *eb) struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; - u8 *metadata_uuid; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); + /* - * Checking the incompat flag is only valid for the current fs. For - * seed devices it's forbidden to have their uuid changed so reading - * ->fsid in this case is fine + * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid. + * This is then overwritten by metadata_uuid, if present, in + * device_list_add(). The same is true for a seed device as well. So use of + * fs_devices::metadata_uuid is appropriate here.
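The invariant that comment relies on can be modeled in a few lines: metadata_uuid always holds the uuid to compare against, because it is copied from fsid whenever no separate metadata uuid exists, which is why a single memcmp suffices below. A userspace sketch (types and sizes simplified; not the btrfs structures):

#include <stdbool.h>
#include <string.h>

#define FSID_SIZE 16

struct devs_model {
	unsigned char fsid[FSID_SIZE];
	unsigned char metadata_uuid[FSID_SIZE];
};

static void init_devs(struct devs_model *d, const unsigned char *fsid,
		      const unsigned char *meta /* NULL if none */)
{
	memcpy(d->fsid, fsid, FSID_SIZE);
	/* metadata_uuid defaults to fsid when no separate uuid exists. */
	memcpy(d->metadata_uuid, meta ? meta : fsid, FSID_SIZE);
}

static bool fsid_matches(const struct devs_model *d,
			 const unsigned char *eb_fsid)
{
	/* One comparison covers both the plain and METADATA_UUID cases. */
	return memcmp(eb_fsid, d->metadata_uuid, FSID_SIZE) == 0;
}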
*/ - if (btrfs_fs_incompat(fs_info, METADATA_UUID)) - metadata_uuid = fs_devices->metadata_uuid; - else - metadata_uuid = fs_devices->fsid; - - if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) + if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0) return false; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) @@ -525,6 +521,7 @@ static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; @@ -538,18 +535,19 @@ static bool btree_dirty_folio(struct address_space *mapping, btrfs_assert_tree_write_locked(eb); return filemap_dirty_folio(mapping, folio); } + + ASSERT(spi); subpage = folio_get_private(folio); - ASSERT(subpage->dirty_bitmap); - while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { + for (cur_bit = spi->dirty_offset; + cur_bit < spi->dirty_offset + spi->bitmap_nr_bits; + cur_bit++) { unsigned long flags; u64 cur; - u16 tmp = (1 << cur_bit); spin_lock_irqsave(&subpage->lock, flags); - if (!(tmp & subpage->dirty_bitmap)) { + if (!test_bit(cur_bit, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - cur_bit++; continue; } spin_unlock_irqrestore(&subpage->lock, flags); @@ -562,7 +560,7 @@ static bool btree_dirty_folio(struct address_space *mapping, btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); - cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); + cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1; } return filemap_dirty_folio(mapping, folio); } @@ -678,9 +676,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); - root->log_transid = 0; + btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - root->last_log_commit = 0; + btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, @@ -862,7 +860,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.offset = 0; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, - BTRFS_NESTING_NORMAL); + 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -870,7 +868,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, } root->node = leaf; - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); root->commit_root = btrfs_root_node(root); set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); @@ -939,13 +937,13 @@ int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, - NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); + NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) return PTR_ERR(leaf); root->node = leaf; - btrfs_mark_buffer_dirty(root->node); + btrfs_mark_buffer_dirty(trans, root->node); btrfs_tree_unlock(root->node); return 0; @@ -1007,9 +1005,9 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; - root->log_transid = 0; + btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - root->last_log_commit = 0; + btrfs_set_root_last_log_commit(root, 0); return 0; } @@ -1182,6 +1180,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info 
*fs_info, return btrfs_grab_root(fs_info->block_group_root); case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); + case BTRFS_RAID_STRIPE_TREE_OBJECTID: + return btrfs_grab_root(fs_info->stripe_root); default: return NULL; } @@ -1262,6 +1262,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); + btrfs_put_root(fs_info->stripe_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1405,7 +1406,8 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, } /* - * btrfs_get_fs_root_commit_root - return a root for the given objectid + * Return a root for the given objectid. + * * @fs_info: the fs_info * @objectid: the objectid we need to lookup * @@ -1552,7 +1554,7 @@ static int transaction_kthread(void *arg) delta = ktime_get_seconds() - cur->start_time; if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && - cur->state < TRANS_STATE_COMMIT_START && + cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay -= msecs_to_jiffies((delta - 1) * 1000); @@ -1702,11 +1704,11 @@ static void backup_super_roots(struct btrfs_fs_info *info) } /* - * read_backup_root - Reads a backup root based on the passed priority. Prio 0 - * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots + * Reads a backup root based on the passed priority. Prio 0 is the newest, prio + * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots * - * fs_info - filesystem whose backup roots need to be read - * priority - priority of backup root required + * @fs_info: filesystem whose backup roots need to be read + * @priority: priority of backup root required * * Returns backup root index on success and -EINVAL otherwise. 
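The priority-to-slot mapping is ring-buffer arithmetic over BTRFS_NUM_BACKUP_ROOTS entries. The sketch below is an assumed model of that selection (the kernel derives the newest slot from root generations, which is elided here):

#include <stdio.h>

#define NUM_BACKUP_ROOTS 4

static int backup_slot(int newest, int priority)
{
	if (priority < 0 || priority >= NUM_BACKUP_ROOTS)
		return -1; /* mirrors the -EINVAL case */
	return (newest + NUM_BACKUP_ROOTS - priority) % NUM_BACKUP_ROOTS;
}

int main(void)
{
	/* With slot 2 newest: prio 0 -> 2, prio 1 -> 1, prio 3 -> 3. */
	for (int p = 0; p < NUM_BACKUP_ROOTS; p++)
		printf("prio %d -> slot %d\n", p, backup_slot(2, p));
	return 0;
}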
*/ @@ -1806,6 +1808,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); + free_root_extent_buffers(info->stripe_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2265,7 +2268,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(root)) { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); fs_info->quota_root = root; } @@ -2282,6 +2284,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->uuid_root = root; } + if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { + location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->stripe_root = root; + } + } + return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", @@ -2384,21 +2400,19 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, - BTRFS_FSID_SIZE)) { + if (!fs_info->fs_devices->temp_fsid && + memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", - fs_info->super_copy->fsid, fs_info->fs_devices->fsid); + sb->fsid, fs_info->fs_devices->fsid); ret = -EINVAL; } - if (btrfs_fs_incompat(fs_info, METADATA_UUID) && - memcmp(fs_info->fs_devices->metadata_uuid, - fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) { + if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), + BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", - fs_info->super_copy->metadata_uuid, - fs_info->fs_devices->metadata_uuid); + btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid); ret = -EINVAL; } @@ -2640,7 +2654,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* All successful */ fs_info->generation = btrfs_header_generation(tree_root->node); - fs_info->last_trans_committed = fs_info->generation; + btrfs_set_last_trans_committed(fs_info, fs_info->generation); fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ @@ -2690,8 +2704,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered); btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); - btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start, - BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked, BTRFS_LOCKDEP_TRANS_UNBLOCKED); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed, @@ -2741,9 +2755,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->ordered_root_lock); btrfs_init_scrub(fs_info); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - fs_info->check_integrity_print_mask = 0; -#endif btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); @@ -2869,6 +2880,56 @@ static 
int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } +static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; + int err = 0; + unsigned int ret = 0; + + while (1) { + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + spin_unlock(&fs_info->fs_roots_radix_lock); + break; + } + root_objectid = gang[ret - 1]->root_key.objectid + 1; + + for (i = 0; i < ret; i++) { + /* Avoid grabbing roots in dead_roots. */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* Grab all the search results for later use. */ + gang[i] = btrfs_grab_root(gang[i]); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) + goto out; + btrfs_put_root(gang[i]); + } + root_objectid++; + } +out: + /* Release the uncleaned roots due to error. */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_root(gang[i]); + } + return err; +} + /* * Some options only have meaning at mount time and shouldn't persist across * remounts, or be displayed. Clear these at the end of mount and remount @@ -3113,7 +3174,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device u32 nodesize; u32 stripesize; u64 generation; - u64 features; u16 csum_type; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -3153,6 +3213,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } + btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid); /* * Verify the type first, if that or the checksum value are * corrupted, we'll find out @@ -3195,15 +3256,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device disk_super = fs_info->super_copy; - - features = btrfs_super_flags(disk_super); - if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { - features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; - btrfs_set_super_flags(disk_super, features); - btrfs_info(fs_info, - "found metadata UUID change in progress flag, clearing"); - } - memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_for_commit)); @@ -3222,7 +3274,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + WRITE_ONCE(fs_info->fs_error, -EUCLEAN); /* * In the long term, we'll store the compression type in the super @@ -3417,6 +3469,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_free_zone_cache(fs_info); + btrfs_check_active_zone_reservation(fs_info); + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, @@ -3463,18 +3517,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device "auto enabling async discard"); } -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { - ret = btrfsic_mount(fs_info, fs_devices, - btrfs_test_opt(fs_info, - CHECK_INTEGRITY_DATA) ?
1 : 0, - fs_info->check_integrity_print_mask); - if (ret) - btrfs_warn(fs_info, - "failed to initialize integrity check module: %d", - ret); - } -#endif ret = btrfs_read_qgroup_config(fs_info); if (ret) goto fail_trans_kthread; @@ -3774,8 +3816,6 @@ static int write_dev_supers(struct btrfs_device *device, */ if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; - - btrfsic_check_bio(bio); submit_bio(bio); if (btrfs_advance_sb_log(device, i)) @@ -3871,28 +3911,11 @@ static void write_dev_flush(struct btrfs_device *device) device->last_flush_error = BLK_STS_OK; -#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY - /* - * When a disk has write caching disabled, we skip submission of a bio - * with flush and sync requests before writing the superblock, since - * it's not needed. However when the integrity checker is enabled, this - * results in reports that there are metadata blocks referred by a - * superblock that were not properly flushed. So don't skip the bio - * submission only when the integrity checker is enabled for the sake - * of simplicity, since this is a debug tool and not meant for use in - * non-debug builds. - */ - if (!bdev_write_cache(device->bdev)) - return; -#endif - bio_init(bio, device->bdev, NULL, 0, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - - btrfsic_check_bio(bio); submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } @@ -4136,56 +4159,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, btrfs_put_root(root); } -int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) -{ - u64 root_objectid = 0; - struct btrfs_root *gang[8]; - int i = 0; - int err = 0; - unsigned int ret = 0; - - while (1) { - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang)); - if (!ret) { - spin_unlock(&fs_info->fs_roots_radix_lock); - break; - } - root_objectid = gang[ret - 1]->root_key.objectid + 1; - - for (i = 0; i < ret; i++) { - /* Avoid to grab roots in dead_roots */ - if (btrfs_root_refs(&gang[i]->root_item) == 0) { - gang[i] = NULL; - continue; - } - /* grab all the search result for later use */ - gang[i] = btrfs_grab_root(gang[i]); - } - spin_unlock(&fs_info->fs_roots_radix_lock); - - for (i = 0; i < ret; i++) { - if (!gang[i]) - continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); - if (err) - goto out; - btrfs_put_root(gang[i]); - } - root_objectid++; - } -out: - /* release the uncleaned roots due to error */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); - } - return err; -} - int btrfs_commit_super(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->tree_root; @@ -4228,7 +4201,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) u64 found_end; found = true; - while (!find_first_extent_bit(&trans->dirty_pages, cur, + while (find_first_extent_bit(&trans->dirty_pages, cur, &found_start, &found_end, EXTENT_DIRTY, &cached)) { dirty_bytes += found_end + 1 - found_start; cur = found_end + 1; @@ -4418,16 +4391,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) iput(fs_info->btree_inode); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) - btrfsic_unmount(fs_info->fs_devices); -#endif - btrfs_mapping_tree_free(&fs_info->mapping_tree); 
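
The btrfs_cleanup_fs_roots() hunks above (moved into disk-io.c as a static helper) walk the fs_roots radix tree in fixed-size batches, dropping the spinlock between batches and resuming the lookup from the last objectid seen plus one. A minimal user-space sketch of that resume-from-last-key pattern; gang_lookup() here is a stand-in over a sorted array, not the kernel radix tree API:

#include <stddef.h>
#include <stdio.h>

/* stand-in for radix_tree_gang_lookup(): copy up to max ids >= from
 * out of a sorted table, returning how many were found */
static size_t gang_lookup(const unsigned long *table, size_t n,
			  unsigned long from, unsigned long *gang, size_t max)
{
	size_t found = 0;

	for (size_t i = 0; i < n && found < max; i++)
		if (table[i] >= from)
			gang[found++] = table[i];
	return found;
}

int main(void)
{
	const unsigned long roots[] = { 5, 256, 257, 258, 300, 301, 302,
					303, 304, 310 };
	unsigned long gang[8];
	unsigned long next = 0;
	size_t ret;

	while ((ret = gang_lookup(roots, 10, next, gang, 8)) != 0) {
		/* resume after the last id seen, as the kernel loop does */
		next = gang[ret - 1] + 1;
		for (size_t i = 0; i < ret; i++)
			printf("processing root %lu\n", gang[i]);
	}
	return 0;
}

The point of the batching is lock fairness: the spinlock is released after each batch of eight, so orphan cleanup on one root cannot hold the radix lock for the whole walk.
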
btrfs_close_devices(fs_info->fs_devices); } -void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf) { struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); @@ -4441,21 +4410,16 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif + /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */ + ASSERT(trans->transid == fs_info->generation); btrfs_assert_tree_write_locked(buf); - if (transid != fs_info->generation) - WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", - buf->start, transid, fs_info->generation); - set_extent_buffer_dirty(buf); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - /* - * btrfs_check_leaf() won't check item data if we don't have WRITTEN - * set, so this will only validate the basic structure of the items. - */ - if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) { - btrfs_print_leaf(buf); - ASSERT(0); + if (unlikely(transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu", + buf->start, transid, fs_info->generation); } -#endif + set_extent_buffer_dirty(buf); } static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info, @@ -4552,9 +4516,7 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root) static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); @@ -4617,6 +4579,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, list_del(&ref->add_list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } if (head->must_insert_reserved) pin_bytes = true; @@ -4660,9 +4623,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); @@ -4695,9 +4656,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); @@ -4716,21 +4675,16 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->delalloc_root_lock); } -static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages, - int mark) +static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages, + int mark) { - int ret; struct extent_buffer *eb; u64 start = 0; u64 end; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark, NULL); - if (ret) - break; - + while (find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL)) { clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { 
eb = find_extent_buffer(fs_info, start); @@ -4746,16 +4700,13 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, free_extent_buffer_stale(eb); } } - - return ret; } -static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *unpin) +static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, + struct extent_io_tree *unpin) { u64 start; u64 end; - int ret; while (1) { struct extent_state *cached_state = NULL; @@ -4767,9 +4718,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state); - if (ret) { + if (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } @@ -4780,8 +4730,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } - - return 0; } static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) @@ -4829,7 +4777,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4851,6 +4799,32 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, } } +static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *gang[8]; + int i; + int ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + struct btrfs_root *root = gang[i]; + + btrfs_qgroup_free_meta_all_pertrans(root); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + } + } + spin_unlock(&fs_info->fs_roots_radix_lock); +} + void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_fs_info *fs_info) { @@ -4879,6 +4853,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); + btrfs_free_all_qgroup_pertrans(fs_info); + cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } @@ -4893,7 +4869,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) while (!list_empty(&fs_info->trans_list)) { t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); - if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state >= TRANS_STATE_COMMIT_PREP) { refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b03767f4d7ed..50dab8f639dc 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -77,7 +77,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); -int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info 
*fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, @@ -105,7 +104,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) } void btrfs_put_root(struct btrfs_root *root); -void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index a2315a4b8b75..ea149be28dff 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -105,32 +105,40 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, lockdep_set_class(&tree->lock, &file_extent_tree_class); } +/* + * Empty an io tree, removing and freeing every extent state record from the + * tree. This should be called once we are sure no other task can access the + * tree anymore, so no tree updates happen after we empty the tree and there + * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never + * set on any extent state when calling this function). + */ void extent_io_tree_release(struct extent_io_tree *tree) { + struct rb_root root; + struct extent_state *state; + struct extent_state *tmp; + spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once extent_io_tree_release is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); + root = tree->state; + tree->state = RB_ROOT; + rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { + /* Clear node to keep free_extent_state() happy. */ RB_CLEAR_NODE(&state->rb_node); + ASSERT(!(state->state & EXTENT_LOCKED)); /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. + * No need for a memory barrier here, as we are holding the tree + * lock and we only change the waitqueue while holding that lock + * (see wait_extent_bit()). */ ASSERT(!waitqueue_active(&state->wq)); free_extent_state(state); - cond_resched_lock(&tree->lock); } + /* + * Should still be empty even after a reschedule, no other task should + * be accessing the tree anymore. 
+ */ + ASSERT(RB_EMPTY_ROOT(&tree->state)); spin_unlock(&tree->lock); } @@ -327,6 +335,36 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) "locking error: extent tree was modified by another thread while locked"); } +static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *prev; + + prev = prev_state(state); + if (prev && prev->end == state->start - 1 && prev->state == state->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, prev); + state->start = prev->start; + rb_erase(&prev->rb_node, &tree->state); + RB_CLEAR_NODE(&prev->rb_node); + free_extent_state(prev); + } +} + +static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *next; + + next = next_state(state); + if (next && next->start == state->end + 1 && next->state == state->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, next); + state->end = next->end; + rb_erase(&next->rb_node, &tree->state); + RB_CLEAR_NODE(&next->rb_node); + free_extent_state(next); + } +} + /* * Utility function to look for merge candidates inside a given range. Any * extents with matching state are merged together into a single extent in the @@ -338,31 +376,11 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) */ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) { - struct extent_state *other; - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) return; - other = prev_state(state); - if (other && other->end == state->start - 1 && - other->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, other); - state->start = other->start; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - other = next_state(state); - if (other && other->start == state->end + 1 && - other->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, other); - state->end = other->end; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } + merge_prev_state(tree, state); + merge_next_state(tree, state); } static void set_state_bits(struct extent_io_tree *tree, @@ -384,19 +402,27 @@ static void set_state_bits(struct extent_io_tree *tree, * Insert an extent_state struct into the tree. 'bits' are set on the * struct before it is inserted. * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. + * Returns a pointer to the struct extent_state record containing the range + * requested for insertion, which may be the same as the given struct or it + * may be an existing record in the tree that was expanded to accommodate the + * requested range. In case of an extent_state different from the one that was + * given, the later can be freed or reused by the caller. + * + * On error it returns an error pointer. * * The tree lock is not taken internally. This is a utility function and * probably isn't what you want to call (see set/clear_extent_bit). 
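
The new merge_prev_state() and merge_next_state() helpers coalesce an extent state with a neighbor that carries identical flags and touches it exactly (prev->end == state->start - 1, or next->start == state->end + 1). The adjacency test in isolation, as a runnable sketch over flat [start, end] records:

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; uint32_t state; };

/* true when a ends exactly one byte before b starts and both carry the
 * same bits -- the condition the kernel merge helpers test */
static int mergeable(const struct range *a, const struct range *b)
{
	return a->end == b->start - 1 && a->state == b->state;
}

int main(void)
{
	struct range prev = { 0, 4095, 0x1 };
	struct range cur  = { 4096, 8191, 0x1 };

	if (mergeable(&prev, &cur)) {
		cur.start = prev.start;	/* absorb the previous range */
		printf("merged: [%llu, %llu]\n",
		       (unsigned long long)cur.start,
		       (unsigned long long)cur.end);
	}
	return 0;
}

Splitting merge_state() into the two directional helpers lets insert_state() merge in exactly one direction when it grows an existing neighbor, instead of always probing both sides.
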
*/ -static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, struct extent_changeset *changeset) +static struct extent_state *insert_state(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, + struct extent_changeset *changeset) { struct rb_node **node; struct rb_node *parent = NULL; - const u64 end = state->end; + const u64 start = state->start - 1; + const u64 end = state->end + 1; + const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)); set_state_bits(tree, state, bits, changeset); @@ -407,23 +433,42 @@ static int insert_state(struct extent_io_tree *tree, parent = *node; entry = rb_entry(parent, struct extent_state, rb_node); - if (end < entry->start) { + if (state->end < entry->start) { + if (try_merge && end == entry->start && + state->state == entry->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); + entry->start = state->start; + merge_prev_state(tree, entry); + state->state = 0; + return entry; + } node = &(*node)->rb_left; - } else if (end > entry->end) { + } else if (state->end > entry->end) { + if (try_merge && entry->end == start && + state->state == entry->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); + entry->end = state->end; + merge_next_state(tree, entry); + state->state = 0; + return entry; + } node = &(*node)->rb_right; } else { btrfs_err(tree->fs_info, "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, state->start, end); - return -EEXIST; + entry->start, entry->end, state->start, state->end); + return ERR_PTR(-EEXIST); } } rb_link_node(&state->rb_node, parent, node); rb_insert_color(&state->rb_node, &tree->state); - merge_state(tree, state); - return 0; + return state; } /* @@ -708,26 +753,13 @@ out: } -static void wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) - __releases(tree->lock) - __acquires(tree->lock) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&tree->lock); - schedule(); - spin_lock(&tree->lock); - finish_wait(&state->wq, &wait); -} - /* * Wait for one or more bits to clear on a range in the state tree. * The range [start, end] is inclusive. * The tree lock is taken by this function */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state) +static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state) { struct extent_state *state; @@ -758,9 +790,15 @@ process_node: goto out; if (state->state & bits) { + DEFINE_WAIT(wait); + start = state->start; refcount_inc(&state->refs); - wait_on_state(tree, state); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); free_extent_state(state); goto again; } @@ -831,15 +869,15 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t * * Note: If there are multiple bits set in @bits, any of them will match. * - * Return 0 if we find something, and update @start_ret and @end_ret. - * Return 1 if we found nothing. + * Return true if we find something, and update @start_ret and @end_ret. + * Return false if we found nothing. 
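
insert_state() now returns the extent_state that ends up holding the range, which may be an existing neighbor it merged into rather than the caller's preallocated record, or an encoded error pointer. Callers therefore compare the result against prealloc to decide whether prealloc was consumed. A self-contained sketch of that ERR_PTR-style convention, with the encode/decode helpers written out since this runs in user space (in the kernel these are the real ERR_PTR()/IS_ERR()/PTR_ERR()):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* user-space stand-ins for ERR_PTR()/IS_ERR()/PTR_ERR() */
static void *err_ptr(long err) { return (void *)(intptr_t)err; }
static int is_err(const void *p) { return (uintptr_t)p >= (uintptr_t)-4095; }
static long ptr_err(const void *p) { return (long)(intptr_t)p; }

struct node { int key; };

static struct node existing = { .key = 42 };

/* return the record that holds @key: the caller's @prealloc if it was
 * inserted, an already-present node if we merged, or an error pointer */
static struct node *insert(struct node *prealloc, int key)
{
	if (key == existing.key)
		return &existing;	/* merged into an existing node */
	if (key < 0)
		return err_ptr(-EEXIST);
	prealloc->key = key;
	return prealloc;
}

int main(void)
{
	struct node prealloc;
	struct node *n = insert(&prealloc, 42);

	if (is_err(n))
		printf("error %ld\n", ptr_err(n));
	else if (n != &prealloc)
		printf("reused existing node, prealloc still free\n");
	return 0;
}

This is exactly why the set/convert callers below switch from "prealloc = NULL" to "if (inserted_state == prealloc) prealloc = NULL": when the range merged into an existing record, the preallocation survives for the next iteration.
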
*/ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) +bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; - int ret = 1; + bool ret = false; spin_lock(&tree->lock); if (cached_state && *cached_state) { @@ -847,10 +885,19 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, if (state->end == start - 1 && extent_state_in_tree(state)) { while ((state = next_state(state)) != NULL) { if (state->state & bits) - goto got_it; + break; } + /* + * If we found the next extent state, clear cached_state + * so that we can cache the next extent state below and + * avoid future calls going over the same extent state + * again. If we haven't found any, clear as well since + * it's now useless. + */ free_extent_state(*cached_state); *cached_state = NULL; + if (state) + goto got_it; goto out; } free_extent_state(*cached_state); @@ -863,7 +910,7 @@ got_it: cache_state_if_flags(state, cached_state, 0); *start_ret = state->start; *end_ret = state->end; - ret = 0; + ret = true; } out: spin_unlock(&tree->lock); @@ -1133,6 +1180,8 @@ hit_next: */ if (state->start > start) { u64 this_end; + struct extent_state *inserted_state; + if (end < last_start) this_end = end; else @@ -1148,12 +1197,15 @@ hit_next: */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, changeset); - if (err) + inserted_state = insert_state(tree, prealloc, bits, changeset); + if (IS_ERR(inserted_state)) { + err = PTR_ERR(inserted_state); extent_io_tree_panic(tree, err); + } - cache_state(prealloc, cached_state); - prealloc = NULL; + cache_state(inserted_state, cached_state); + if (inserted_state == prealloc) + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -1356,6 +1408,8 @@ hit_next: */ if (state->start > start) { u64 this_end; + struct extent_state *inserted_state; + if (end < last_start) this_end = end; else @@ -1373,11 +1427,14 @@ hit_next: */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, NULL); - if (err) + inserted_state = insert_state(tree, prealloc, bits, NULL); + if (IS_ERR(inserted_state)) { + err = PTR_ERR(inserted_state); extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; + } + cache_state(inserted_state, cached_state); + if (inserted_state == prealloc) + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -1640,15 +1697,46 @@ search: } /* - * Search a range in the state tree for a given mask. If 'filled' == 1, this - * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 - * is returned if any bit in the range is found set. + * Check if the single @bit exists in the given range. 
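
The cached_state fast path in find_first_extent_bit() only fires when the cached record ends exactly at start - 1; it then walks forward from the cache instead of searching from the tree root, and in all cases drops the cache so a stale pointer is never consulted twice. The resume-from-cursor idea in miniature, with a sorted array standing in for the rb-tree:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct state { uint64_t start, end; uint32_t bits; };

static const struct state tree[] = {
	{ 0,    4095,  0x1 },
	{ 4096, 8191,  0x2 },
	{ 8192, 12287, 0x1 },
};
#define NR (sizeof(tree) / sizeof(tree[0]))

/* find the first record with any of @bits at or after @start; *cache,
 * if it ends just before @start, lets us skip the full search */
static const struct state *first_bit(uint64_t start, uint32_t bits,
				     const struct state **cache)
{
	size_t i = 0;

	if (*cache && (*cache)->end == start - 1)
		i = (size_t)(*cache - tree) + 1;	/* resume after cache */
	*cache = NULL;					/* one-shot cache */
	for (; i < NR; i++)
		if (tree[i].end >= start && (tree[i].bits & bits))
			return &tree[i];
	return NULL;
}

int main(void)
{
	const struct state *cache = &tree[0];
	const struct state *s = first_bit(4096, 0x1, &cache);

	if (s)
		printf("found [%llu, %llu]\n",
		       (unsigned long long)s->start,
		       (unsigned long long)s->end);
	return 0;
}

The bool return type replacing the old 0/1 int also inverts the sense: true now means "found", which is what the converted callers in disk-io.c and extent-tree.c above rely on.
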
*/ -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached) +bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) { struct extent_state *state = NULL; - int bitset = 0; + bool bitset = false; + + ASSERT(is_power_of_2(bit)); + + spin_lock(&tree->lock); + state = tree_search(tree, start); + while (state && start <= end) { + if (state->start > end) + break; + + if (state->state & bit) { + bitset = true; + break; + } + + /* If state->end is (u64)-1, start will overflow to 0 */ + start = state->end + 1; + if (start > end || start == 0) + break; + state = next_state(state); + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * Check if the whole range [@start,@end) contains the single @bit set. + */ +bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached) +{ + struct extent_state *state = NULL; + bool bitset = true; + + ASSERT(is_power_of_2(bit)); spin_lock(&tree->lock); if (cached && extent_state_in_tree(cached) && cached->start <= start && @@ -1657,35 +1745,35 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, else state = tree_search(tree, start); while (state && start <= end) { - if (filled && state->start > start) { - bitset = 0; + if (state->start > start) { + bitset = false; break; } if (state->start > end) break; - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; + if ((state->state & bit) == 0) { + bitset = false; break; } if (state->end == (u64)-1) break; + /* + * Last entry (if state->end is (u64)-1 and overflow happens), + * or next entry starts after the range. + */ start = state->end + 1; - if (start > end) + if (start > end || start == 0) break; state = next_state(state); } /* We ran out of states and were still inside of our range. 
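
Both new helpers advance their cursor with start = state->end + 1 and bail out when start wraps to 0: a record ending at (u64)-1 would otherwise send the cursor back to the beginning of the tree and loop forever. The wraparound in two lines:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t end = UINT64_MAX;	/* a state covering up to (u64)-1 */
	uint64_t start = end + 1;	/* unsigned overflow wraps to 0 */

	/* without the "start == 0" test the walk would restart at the
	 * first record instead of terminating */
	printf("cursor after last record: %llu\n",
	       (unsigned long long)start);
	return 0;
}

Note also the ASSERT(is_power_of_2(bit)) in both helpers: the split into test_range_bit_exists() ("any part of the range has the bit") and test_range_bit() ("the whole range has the bit") only makes sense for a single bit, so multi-bit masks are now rejected outright.
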
*/ - if (filled && !state) - bitset = 0; + if (!state) + bitset = false; spin_unlock(&tree->lock); return bitset; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index fbd3b275ab1c..5602b0137fcd 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -131,8 +131,9 @@ u64 count_range_bits(struct extent_io_tree *tree, struct extent_state **cached_state); void free_extent_state(struct extent_state *state); -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached_state); +bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached_state); +bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -182,9 +183,9 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u32 clear_bits, struct extent_state **cached_state); -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state); +bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state); void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, @@ -192,7 +193,5 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state); #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396a9afa403..01423670bc8a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -42,14 +42,16 @@ #include "file-item.h" #include "orphan.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" #undef SCRAMBLE_DELAYED_REFS static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, + u64 owner_offset, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -57,7 +59,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod); + struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); @@ -69,27 +71,6 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) return (cache->flags & bits) == bits; } -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes) -{ - u64 end = start + num_bytes - 1; - set_extent_bit(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE, NULL); - return 0; -} - -void 
btrfs_free_excluded_extents(struct btrfs_block_group *cache) -{ - struct btrfs_fs_info *fs_info = cache->fs_info; - u64 start, end; - - start = cache->start; - end = start + cache->length - 1; - - clear_extent_bits(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE); -} - /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -121,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owning_root) { struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; @@ -133,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u32 item_size; u64 num_refs; u64 extent_flags; + u64 owner = 0; int ret; /* @@ -186,9 +169,13 @@ search_again: struct btrfs_extent_item); num_refs = btrfs_extent_refs(leaf, ei); extent_flags = btrfs_extent_flags(leaf, ei); + owner = btrfs_get_extent_owner_root(fs_info, leaf, + path->slots[0]); } else { - ret = -EINVAL; - btrfs_print_v0_err(fs_info); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); if (trans) btrfs_abort_transaction(trans, ret); else @@ -243,6 +230,8 @@ out: *refs = num_refs; if (flags) *flags = extent_flags; + if (owning_root) + *owning_root = owner; out_free: btrfs_free_path(path); return ret; @@ -363,9 +352,15 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { + struct btrfs_fs_info *fs_info = eb->fs_info; int type = btrfs_extent_inline_ref_type(eb, iref); u64 offset = btrfs_extent_inline_ref_offset(eb, iref); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + return type; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY || type == BTRFS_SHARED_DATA_REF_KEY || @@ -374,26 +369,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_TREE_BLOCK_REF_KEY) return type; if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ - if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + if (offset && IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { if (type == BTRFS_EXTENT_DATA_REF_KEY) return type; if (type == BTRFS_SHARED_DATA_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. 
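
hash_extent_data_ref() above feeds the little-endian encodings of root, owner and offset through crc32c (now called directly instead of through the btrfs_crc32c wrapper), with root in a "high" crc and owner/offset chained into a "low" crc, then combines them as ((u64)high << 31) ^ (u64)low. A sketch of just that combining shape; mix32() is a deliberately simple placeholder, since a standalone crc32c table would dwarf the example, and the real kernel hash uses crc32c over the le64 byte encodings:

#include <stdint.h>
#include <stdio.h>

/* placeholder 32-bit mixer standing in for crc32c() */
static uint32_t mix32(uint32_t seed, uint64_t val)
{
	seed ^= (uint32_t)val ^ (uint32_t)(val >> 32);
	seed *= 0x9e3779b1u;	/* illustrative multiplier, not crc32c */
	return seed;
}

/* shape of hash_extent_data_ref(): root feeds the high half, owner and
 * offset the low half, combined with a 31-bit shift and xor */
static uint64_t hash_ref(uint64_t root, uint64_t owner, uint64_t offset)
{
	uint32_t high_crc = mix32(~0u, root);
	uint32_t low_crc = mix32(~0u, owner);

	low_crc = mix32(low_crc, offset);
	return ((uint64_t)high_crc << 31) ^ (uint64_t)low_crc;
}

int main(void)
{
	printf("hash: %llx\n",
	       (unsigned long long)hash_ref(5, 257, 1 << 20));
	return 0;
}

The resulting 64-bit value becomes the offset field of EXTENT_DATA_REF keys, which is why the lookup code compares hashes before comparing the full (root, owner, offset) triple.
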
*/ if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else { @@ -402,11 +396,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } } + WARN_ON(1); btrfs_print_leaf(eb); - btrfs_err(eb->fs_info, + btrfs_err(fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); - WARN_ON(1); return BTRFS_REF_TYPE_INVALID; } @@ -418,11 +412,11 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -594,7 +588,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; fail: btrfs_release_path(path); @@ -624,12 +618,12 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - btrfs_print_v0_err(trans->fs_info); - btrfs_abort_transaction(trans, -EINVAL); - return -EINVAL; } else { - BUG(); + btrfs_err(trans->fs_info, + "unrecognized backref key (%llu %u %llu)", + key.objectid, key.type, key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + return -EUCLEAN; } BUG_ON(num_refs < refs_to_drop); @@ -642,7 +636,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } return ret; } @@ -660,7 +654,6 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); if (iref) { /* * If type is invalid, we should have bailed out earlier than @@ -809,7 +802,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, int type; int want; int ret; - int err = 0; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); int needed; @@ -836,10 +828,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } /* * We may be a newly converted file system which still has the old fat @@ -866,19 +856,26 @@ again: } if (ret && !insert) { - err = -ENOENT; + ret = -ENOENT; goto out; } else if (WARN_ON(ret)) { - err = -EIO; + btrfs_print_leaf(path->nodes[0]); + btrfs_err(fs_info, +"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", + bytenr, num_bytes, parent, root_objectid, owner, + offset); + ret = -EUCLEAN; goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < 
sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_abort_transaction(trans, err); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %llu expect >= %zu", + item_size, sizeof(*ei)); + btrfs_abort_transaction(trans, ret); goto out; } @@ -898,22 +895,17 @@ again: else needed = BTRFS_REF_TYPE_BLOCK; - err = -ENOENT; - while (1) { - if (ptr >= end) { - if (ptr > end) { - err = -EUCLEAN; - btrfs_print_leaf(path->nodes[0]); - btrfs_crit(fs_info, -"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", - path->slots[0], root_objectid, owner, offset, parent); - } - break; - } + ret = -ENOENT; + while (ptr < end) { iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ptr += btrfs_extent_inline_ref_size(type); + continue; + } if (type == BTRFS_REF_TYPE_INVALID) { - err = -EUCLEAN; + ret = -EUCLEAN; goto out; } @@ -929,7 +921,7 @@ again: dref = (struct btrfs_extent_data_ref *)(&iref->offset); if (match_extent_data_ref(leaf, dref, root_objectid, owner, offset)) { - err = 0; + ret = 0; break; } if (hash_extent_data_ref_item(leaf, dref) < @@ -940,14 +932,14 @@ again: ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); if (parent > 0) { if (parent == ref_offset) { - err = 0; + ret = 0; break; } if (ref_offset < parent) break; } else { if (root_objectid == ref_offset) { - err = 0; + ret = 0; break; } if (ref_offset < root_objectid) @@ -956,10 +948,20 @@ again: } ptr += btrfs_extent_inline_ref_size(type); } - if (err == -ENOENT && insert) { + + if (unlikely(ptr > end)) { + ret = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + goto out; + } + + if (ret == -ENOENT && insert) { if (item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } /* @@ -971,7 +973,7 @@ again: if (find_next_key(path, 0, &key) == 0 && key.objectid == bytenr && key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } } @@ -982,14 +984,14 @@ out: path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } - return err; + return ret; } /* * helper to add new inline back ref */ static noinline_for_stack -void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, +void setup_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, u64 parent, u64 root_objectid, @@ -1012,7 +1014,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - btrfs_extend_item(path, size); + btrfs_extend_item(trans, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1046,7 +1048,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1079,13 +1081,15 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, /* * helper to update/remove inline back ref */ -static 
noinline_for_stack -void update_inline_extent_backref(struct btrfs_path *path, +static noinline_for_stack int update_inline_extent_backref( + struct btrfs_trans_handle *trans, + struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; @@ -1098,18 +1102,33 @@ void update_inline_extent_backref(struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); - WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", + key.objectid, extent_size, refs_to_mod, refs); + return -EUCLEAN; + } refs += refs_to_mod; btrfs_set_extent_refs(leaf, ei, refs); if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); /* - * If type is invalid, we should have bailed out after - * lookup_inline_extent_backref(). + * Function btrfs_get_extent_inline_ref_type() has already printed + * error messages. */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); - ASSERT(type != BTRFS_REF_TYPE_INVALID); + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) + return -EUCLEAN; if (type == BTRFS_EXTENT_DATA_REF_KEY) { dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -1119,10 +1138,43 @@ void update_inline_extent_backref(struct btrfs_path *path, refs = btrfs_shared_data_ref_count(leaf, sref); } else { refs = 1; - BUG_ON(refs_to_mod != -1); + /* + * For tree blocks we can only drop one ref for it, and tree + * blocks should not have refs > 1. + * + * Furthermore if we're inserting a new inline backref, we + * won't reach this path either. That would be + * setup_inline_extent_backref(). 
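
The update_inline_extent_backref() rewrite is part of a broader move in this series: on-disk inconsistencies that used to hit WARN_ON()/BUG_ON() now print the offending leaf and return -EUCLEAN so the caller can abort the transaction instead of crashing the machine. The shape of that pattern, with a hypothetical validator (names are illustrative, not kernel API):

#include <errno.h>
#include <stdio.h>

/* hypothetical check in the style of update_inline_extent_backref():
 * report the corruption and hand back -EUCLEAN rather than asserting */
static int mod_refs(long long refs, int refs_to_mod)
{
	if (refs_to_mod < 0 && refs + refs_to_mod <= 0) {
		fprintf(stderr,
			"invalid refs_to_mod, has %d expect >= -%lld\n",
			refs_to_mod, refs);
		return -EUCLEAN;	/* Linux "structure needs cleaning" */
	}
	return 0;
}

int main(void)
{
	int ret = mod_refs(1, -2);

	if (ret)
		printf("caller aborts transaction with %d\n", ret);
	return 0;
}

Because the helper can now fail, its return type changes from void to int, and both call sites (insert_inline_extent_backref() and remove_extent_backref()) are updated to propagate the error.
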
+ */ + if (unlikely(refs_to_mod != -1)) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for tree block %llu, has %d expect -1", + key.objectid, refs_to_mod); + return -EUCLEAN; + } } - BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, +"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", + (unsigned long)iref, key.objectid, extent_size, + refs_to_mod, refs); + return -EUCLEAN; + } refs += refs_to_mod; if (refs > 0) { @@ -1139,9 +1191,10 @@ void update_inline_extent_backref(struct btrfs_path *path, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - btrfs_truncate_item(path, item_size, 1); + btrfs_truncate_item(trans, path, item_size, 1); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); + return 0; } static noinline_for_stack @@ -1170,9 +1223,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } - update_inline_extent_backref(path, iref, refs_to_add, extent_op); + ret = update_inline_extent_backref(trans, path, iref, + refs_to_add, extent_op); } else if (ret == -ENOENT) { - setup_inline_extent_backref(trans->fs_info, path, iref, parent, + setup_inline_extent_backref(trans, path, iref, parent, root_objectid, owner, offset, refs_to_add, extent_op); ret = 0; @@ -1190,7 +1244,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (iref) - update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + ret = update_inline_extent_backref(trans, path, iref, + -refs_to_drop, NULL); else if (is_data) ret = remove_extent_data_ref(trans, root, path, refs_to_drop); else @@ -1386,7 +1441,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -1399,7 +1454,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, } /* - * __btrfs_inc_extent_ref - insert backreference for a given extent + * Insert backreference for a given extent. * * The counterpart is in __btrfs_free_extent(), with examples and more details * how it works. @@ -1429,8 +1484,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * always passed as 0. For data extents it is the fileoffset * this extent belongs to. 
* - * @refs_to_add Number of references to add - * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * @@ -1438,7 +1491,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, + u64 owner, u64 offset, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_path *path; @@ -1448,6 +1501,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; u64 refs; + int refs_to_add = node->ref_mod; int ret; path = btrfs_alloc_path(); @@ -1474,19 +1528,18 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* now insert the actual backref */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); + if (owner < BTRFS_FIRST_FREE_OBJECTID) ret = insert_tree_block_ref(trans, path, bytenr, parent, root_objectid); - } else { + else ret = insert_extent_data_ref(trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); - } + if (ret) btrfs_abort_transaction(trans, ret); out: @@ -1494,45 +1547,72 @@ out: return ret; } +static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *href) +{ + u64 root = href->owning_root; + + /* + * Don't check must_insert_reserved, as this is called from contexts + * where it has already been unset. + */ + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || + !href->is_data || !is_fstree(root)) + return; + + btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, + BTRFS_QGROUP_RSV_DATA); +} + static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; struct btrfs_delayed_data_ref *ref; - struct btrfs_key ins; u64 parent = 0; - u64 ref_root = 0; u64 flags = 0; - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - ref = btrfs_delayed_node_to_data_ref(node); trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; - ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + struct btrfs_key key; + struct btrfs_squota_delta delta = { + .root = href->owning_root, + .num_bytes = node->num_bytes, + .is_data = true, + .is_inc = true, + .generation = trans->transid, + }; + if (extent_op) flags |= extent_op->flags_to_set; - ret = alloc_reserved_file_extent(trans, parent, ref_root, + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + ret = alloc_reserved_file_extent(trans, parent, ref->root, flags, ref->objectid, - ref->offset, &ins, - node->ref_mod); + ref->offset, &key, + node->ref_mod, href->owning_root); + free_head_ref_squota_rsv(trans->fs_info, href); + if (!ret) + ret = btrfs_record_squota_delta(trans->fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, + ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root, ref->objectid, ref->offset, - node->ref_mod, 
extent_op); + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, node, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); + ret = __btrfs_free_extent(trans, href, node, parent, + ref->root, ref->objectid, + ref->offset, extent_op); } else { BUG(); } @@ -1569,7 +1649,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 item_size; int ret; - int err = 0; int metadata = 1; if (TRANS_ABORTED(trans)) @@ -1596,10 +1675,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - err = ret; goto out; - } - if (ret > 0) { + } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { path->slots[0]--; @@ -1620,7 +1697,10 @@ again: goto again; } } else { - err = -EIO; + ret = -EUCLEAN; + btrfs_err(fs_info, + "missing extent item for extent %llu num_bytes %llu level %d", + head->bytenr, head->num_bytes, extent_op->level); goto out; } } @@ -1629,27 +1709,31 @@ again: item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_abort_transaction(trans, err); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_abort_transaction(trans, ret); goto out; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); - return err; + return ret; } static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_tree_ref *ref; u64 parent = 0; u64 ref_root = 0; @@ -1661,22 +1745,32 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent = ref->parent; ref_root = ref->root; - if (node->ref_mod != 1) { + if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, - "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", + "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", node->bytenr, node->ref_mod, node->action, ref_root, parent); - return -EIO; + return -EUCLEAN; } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + struct btrfs_squota_delta delta = { + .root = href->owning_root, + .num_bytes = fs_info->nodesize, + .is_data = false, + .is_inc = true, + .generation = trans->transid, + }; + BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, node, extent_op); + if (!ret) + btrfs_record_squota_delta(fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, node, parent, ref_root, - ref->level, 0, 1, extent_op); + ret = __btrfs_free_extent(trans, href, node, parent, ref_root, + ref->level, 0, extent_op); } else { BUG(); } @@ -1685,6 +1779,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, /* helper function to actually process a single delayed ref entry */ static int 
run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) @@ -1692,19 +1787,23 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, int ret = 0; if (TRANS_ABORTED(trans)) { - if (insert_reserved) + if (insert_reserved) { btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + free_head_ref_squota_rsv(trans->fs_info, href); + } return 0; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = run_delayed_tree_ref(trans, node, extent_op, + ret = run_delayed_tree_ref(trans, href, node, extent_op, insert_reserved); else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) - ret = run_delayed_data_ref(trans, node, extent_op, + ret = run_delayed_data_ref(trans, href, node, extent_op, insert_reserved); + else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) + ret = 0; else BUG(); if (ret && insert_reserved) @@ -1783,28 +1882,38 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, +u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - int nr_items = 1; /* Dropping this ref head update. */ + u64 ret = 0; /* * We had csum deletions accounted for in our delayed refs rsv, we need * to drop the csum leaves for this update from our delayed_refs_rsv. */ if (head->total_ref_mod < 0 && head->is_data) { + int nr_csums; + spin_lock(&delayed_refs->lock); delayed_refs->pending_csums -= head->num_bytes; spin_unlock(&delayed_refs->lock); - nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + + btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums); + + ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); } + /* must_insert_reserved can be set only if we didn't run the head ref. */ + if (head->must_insert_reserved) + free_head_ref_squota_rsv(fs_info, head); - btrfs_delayed_refs_rsv_release(fs_info, nr_items); + return ret; } static int cleanup_ref_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) + struct btrfs_delayed_ref_head *head, + u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1849,7 +1958,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); @@ -1891,7 +2000,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( } static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *locked_ref) + struct btrfs_delayed_ref_head *locked_ref, + u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; @@ -1939,14 +2049,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; + /* + * Unsetting this on the head ref relinquishes ownership of + * the rsv_bytes, so it is critical that every possible code + * path from here forward frees all reserves including qgroup + * reserve. 
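
The comment added above pins down an ownership rule: clearing must_insert_reserved under the head's lock transfers responsibility for every reservation (now including the simple-quota data rsv freed by free_head_ref_squota_rsv()) to the code path that cleared it, so every failure branch after that point must release them. A sketch of the claim-under-lock idiom with pthreads, under the assumption that a plain mutex stands in for the head's spinlock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct head {
	pthread_mutex_t lock;
	bool must_insert_reserved;
	long reserved_bytes;
};

/* atomically claim the reservation: whoever flips the flag to false
 * owns reserved_bytes and must free it on every path from here on */
static bool claim_reservation(struct head *h, long *bytes)
{
	bool claimed;

	pthread_mutex_lock(&h->lock);
	claimed = h->must_insert_reserved;
	h->must_insert_reserved = false;
	*bytes = claimed ? h->reserved_bytes : 0;
	pthread_mutex_unlock(&h->lock);
	return claimed;
}

int main(void)
{
	struct head h = { PTHREAD_MUTEX_INITIALIZER, true, 4096 };
	long bytes;

	if (claim_reservation(&h, &bytes))
		printf("now responsible for freeing %ld bytes\n", bytes);
	return 0;
}

This is why run_one_delayed_ref() now takes the head as well as the node: the aborted-transaction path and the must_insert_reserved path both need the head to find and free the per-head reservation.
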
+ */ locked_ref->must_insert_reserved = false; extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); - ret = run_one_delayed_ref(trans, ref, extent_op, + ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op, must_insert_reserved); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); + *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1); btrfs_free_delayed_extent_op(extent_op); if (ret) { @@ -1970,15 +2088,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long nr) + u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *locked_ref = NULL; int ret; unsigned long count = 0; + unsigned long max_count = 0; + u64 bytes_processed = 0; delayed_refs = &trans->transaction->delayed_refs; + if (min_bytes == 0) { + max_count = delayed_refs->num_heads_ready; + min_bytes = U64_MAX; + } + do { if (!locked_ref) { locked_ref = btrfs_obtain_ref_head(trans); @@ -2006,7 +2131,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed); if (ret < 0 && ret != -EAGAIN) { /* * Error, btrfs_run_delayed_refs_for_head already @@ -2018,7 +2143,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * Success, perform the usual cleanup of a processed * head */ - ret = cleanup_ref_head(trans, locked_ref); + ret = cleanup_ref_head(trans, locked_ref, &bytes_processed); if (ret > 0 ) { /* We dropped our lock, we need to loop. */ ret = 0; @@ -2035,7 +2160,9 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, locked_ref = NULL; cond_resched(); - } while ((nr != -1 && count < nr) || locked_ref); + } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) || + (max_count > 0 && count < max_count) || + locked_ref); return 0; } @@ -2084,24 +2211,25 @@ static u64 find_middle(struct rb_root *root) #endif /* - * this starts processing the delayed reference count updates and - * extent insertions we have queued up so far. count can be - * 0, which means to process everything in the tree at the start - * of the run (but not newly added entries), or it can be some target - * number you'd like to process. + * Start processing the delayed reference count updates and extent insertions + * we have queued up so far. + * + * @trans: Transaction handle. + * @min_bytes: How many bytes of delayed references to process. After this + * many bytes we stop processing delayed references if there are + * any more. If 0 it means to run all existing delayed references, + * but not new ones added after running all existing ones. + * Use (u64)-1 (U64_MAX) to run all existing delayed references + * plus any new ones that are added. 
* * Returns 0 on success or if called with an aborted transaction * Returns <0 on error and aborts the transaction */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long count) +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_head *head; int ret; - int run_all = count == (unsigned long)-1; /* We'll clean this up in btrfs_cleanup_transaction */ if (TRANS_ABORTED(trans)) @@ -2111,42 +2239,30 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, return 0; delayed_refs = &trans->transaction->delayed_refs; - if (count == 0) - count = delayed_refs->num_heads_ready; - again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - ret = __btrfs_run_delayed_refs(trans, count); + ret = __btrfs_run_delayed_refs(trans, min_bytes); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; } - if (run_all) { + if (min_bytes == U64_MAX) { btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - node = rb_first_cached(&delayed_refs->href_root); - if (!node) { + if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { spin_unlock(&delayed_refs->lock); - goto out; + return 0; } - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); - /* Mutex was contended, block until it's released and retry. */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - - btrfs_put_delayed_ref_head(head); cond_resched(); goto again; } -out: + return 0; } @@ -2271,6 +2387,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_extent_item *ei; struct btrfs_key key; u32 item_size; + u32 expected_size; int type; int ret; @@ -2297,10 +2414,22 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); + + /* No inline refs; we need to bail before checking for owner ref. */ + if (item_size == sizeof(*ei)) + goto out; + + /* Check for an owner ref; skip over it to the real inline refs. 
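+	 * With simple quotas, a non-shared data extent is thus expected to
+	 * carry exactly two inline items, which is what expected_size works
+	 * out to below:
+	 *   sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY)
+	 *              + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)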
*/ + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { + expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + iref = (struct btrfs_extent_inline_ref *)(iref + 1); + } /* If extent item has more than 1 inline ref then it's shared */ - if (item_size != sizeof(*ei) + - btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + if (item_size != expected_size) goto out; /* @@ -2312,8 +2441,6 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_root_last_snapshot(&root->root_item))) goto out; - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - /* If this extent has SHARED_DATA_REF then it's shared */ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) @@ -2410,7 +2537,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); key.offset -= btrfs_file_extent_offset(buf, fi); btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent); + num_bytes, parent, ref_root); btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, key.offset, root->root_key.objectid, for_reloc); @@ -2423,8 +2550,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = fs_info->nodesize; + /* We don't know the owning_root, use 0. */ btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent); + num_bytes, parent, 0); btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, root->root_key.objectid, for_reloc); if (inc) @@ -2525,16 +2653,13 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, return 0; } -/* - * this function must be called within transaction - */ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes) + const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret; - cache = btrfs_lookup_block_group(trans->fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) return -EINVAL; @@ -2546,10 +2671,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, if (ret) goto out; - pin_down_extent(trans, cache, bytenr, num_bytes, 0); + pin_down_extent(trans, cache, eb->start, eb->len, 0); /* remove us from the free space cache (if we're there at all) */ - ret = btrfs_remove_free_space(cache, bytenr, num_bytes); + ret = btrfs_remove_free_space(cache, eb->start, eb->len); out: btrfs_put_block_group(cache); return ret; @@ -2751,9 +2876,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state); - if (ret) { + if (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } @@ -2805,12 +2929,61 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +/* + * Parse an extent item's inline extents looking for a simple quotas owner ref. 
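+ *
+ * Roughly, the item being parsed looks like this on disk (sketch for an
+ * extent created while simple quotas were enabled):
+ *
+ *   [ btrfs_extent_item | owner ref inline item | other inline ref items ... ]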
+ * + * @fs_info: the btrfs_fs_info for this mount + * @leaf: a leaf in the extent tree containing the extent item + * @slot: the slot in the leaf where the extent item is found + * + * Returns the objectid of the root that originally allocated the extent item + * if the inline owner ref is expected and present, otherwise 0. + * + * If an extent item has an owner ref item, it will be the first inline ref + * item. Therefore the logic is to check whether there are any inline ref + * items, then check the type of the first one. + */ +u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_owner_ref *oref; + unsigned long ptr; + unsigned long end; + int type; + + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) + return 0; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + btrfs_item_size(leaf, slot); + + /* No inline ref items of any kind, can't check type. */ + if (ptr == end) + return 0; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); + + /* We found an owner ref, get the root out of it. */ + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + return btrfs_extent_owner_ref_root_id(leaf, oref); + } + + /* We have inline refs, but not an owner ref. */ + return 0; +} + static int do_free_extent_accounting(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, bool is_data) + u64 bytenr, struct btrfs_squota_delta *delta) { int ret; + u64 num_bytes = delta->num_bytes; - if (is_data) { + if (delta->is_data) { struct btrfs_root *csum_root; csum_root = btrfs_csum_root(trans->fs_info, bytenr); @@ -2819,6 +2992,18 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } + + ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = btrfs_record_squota_delta(trans->fs_info, delta); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; } ret = add_to_free_space_tree(trans, bytenr, num_bytes); @@ -2901,9 +3086,10 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. 
*/ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, + u64 owner_offset, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; @@ -2918,11 +3104,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int refs_to_drop = node->ref_mod; u32 item_size; u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); + u64 delayed_ref_root = href->owning_root; extent_root = btrfs_extent_root(info, bytenr); ASSERT(extent_root); @@ -3059,8 +3247,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { - ret = -EINVAL; - btrfs_print_v0_err(info); + ret = -EUCLEAN; + btrfs_err(trans->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); goto out; } @@ -3110,7 +3300,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } else { btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, @@ -3121,6 +3311,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } } else { + struct btrfs_squota_delta delta = { + .root = delayed_ref_root, + .num_bytes = num_bytes, + .is_data = is_data, + .is_inc = false, + .generation = btrfs_extent_generation(leaf, ei), + }; + /* In this branch refs == 1 */ if (found_extent) { if (is_data && refs_to_drop != @@ -3159,6 +3357,16 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, num_to_del = 2; } } + /* + * We can't infer the data owner from the delayed ref, so we need + * to try to get it from the owning ref item. + * + * If it is not present, then that extent was not written under + * simple quotas mode, so we don't need to account for its deletion. + */ + if (is_data) + delta.root = btrfs_get_extent_owner_root(trans->fs_info, + leaf, extent_slot); ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); @@ -3168,7 +3376,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); + ret = do_free_extent_accounting(trans, bytenr, &delta); } btrfs_release_path(path); @@ -3242,7 +3450,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent); + buf->start, buf->len, parent, btrfs_header_owner(buf)); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), root_id, 0, false); @@ -3329,10 +3537,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree, just update pinning info and exit early. 
*/ if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { - /* unlocks the pinned mutex */ + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { @@ -3342,20 +3549,47 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) } if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); return ret; } enum btrfs_loop_type { + /* + * Start caching block groups but do not wait for progress or for them + * to be done. + */ LOOP_CACHING_NOWAIT, + + /* + * Wait for the block group free_space >= the space we're waiting for if + * the block group isn't cached. + */ LOOP_CACHING_WAIT, + + /* + * Allow allocations to happen from block groups that do not yet have a + * size classification. + */ LOOP_UNSET_SIZE_CLASS, + + /* + * Allocate a chunk and then retry the allocation. + */ LOOP_ALLOC_CHUNK, + + /* + * Ignore the size class restrictions for this allocation. + */ LOOP_WRONG_SIZE_CLASS, + + /* + * Ignore the empty size, only try to allocate the number of bytes + * needed for this allocation. + */ LOOP_NO_EMPTY_SIZE, }; @@ -3427,7 +3661,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, * Helper function for find_free_extent(). * * Return -ENOENT to inform caller that we need fallback to unclustered mode. - * Return -EAGAIN to inform caller that we need to re-search this block group * Return >0 to inform caller that we find nothing * Return 0 means we have found a location and set ffe_ctl->found_offset. */ @@ -3508,14 +3741,6 @@ refill_cluster: trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } - } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && - !ffe_ctl->retry_clustered) { - spin_unlock(&last_ptr->refill_lock); - - ffe_ctl->retry_clustered = true; - btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_cluster + ffe_ctl->empty_size); - return -EAGAIN; } /* * At this point we either didn't find a cluster or we weren't able to @@ -3530,7 +3755,6 @@ refill_cluster: /* * Return >0 to inform caller that we find nothing * Return 0 when we found an free extent and set ffe_ctrl->found_offset - * Return -EAGAIN to inform caller that we need to re-search this block group */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl) @@ -3568,25 +3792,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, ffe_ctl->num_bytes, ffe_ctl->empty_size, &ffe_ctl->max_extent_size); - - /* - * If we didn't find a chunk, and we haven't failed on this block group - * before, and this block group is in the middle of caching and we are - * ok with waiting, then go ahead and wait for progress to be made, and - * set @retry_unclustered to true. - * - * If @retry_unclustered is true then we've already waited on this - * block group once and should move on to the next block group. 
- */
-	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
-	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
-		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
-						      ffe_ctl->empty_size);
-		ffe_ctl->retry_unclustered = true;
-		return -EAGAIN;
-	} else if (!offset) {
+	if (!offset)
 		return 1;
-	}
 	ffe_ctl->found_offset = offset;
 	return 0;
 }
@@ -3600,7 +3807,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
 	/* We want to try and use the cluster allocator, so lets look there */
 	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
 		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
-		if (ret >= 0 || ret == -EAGAIN)
+		if (ret >= 0)
 			return ret;
 		/* ret == -ENOENT case falls through */
 	}
@@ -3685,7 +3892,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 	}
 	spin_unlock(&block_group->lock);
 
-	if (!ret && !btrfs_zone_activate(block_group)) {
+	/* Metadata block group is activated at write time. */
+	if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
+	    !btrfs_zone_activate(block_group)) {
 		ret = 1;
 		/*
 		 * May need to clear fs_info->{treelog,data_reloc}_bg.
@@ -3709,7 +3918,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 		      fs_info->data_reloc_bg == 0);
 
 	if (block_group->ro ||
-	    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
+	    (!ffe_ctl->for_data_reloc &&
+	     test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
 		ret = 1;
 		goto out;
 	}
@@ -3752,8 +3962,26 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
 		fs_info->treelog_bg = block_group->start;
 
-	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
-		fs_info->data_reloc_bg = block_group->start;
+	if (ffe_ctl->for_data_reloc) {
+		if (!fs_info->data_reloc_bg)
+			fs_info->data_reloc_bg = block_group->start;
+		/*
+		 * Do not allow allocations from this block group, unless it is
+		 * for data relocation. Compared to increasing the ->ro, setting
+		 * the ->zoned_data_reloc_ongoing flag still allows nocow
+		 * writers to come in. See btrfs_inc_nocow_writers().
+		 *
+		 * We need to disable allocations to avoid allocating a regular
+		 * (non-relocation data) extent. With a mix of relocation
+		 * extents and regular extents, we can dispatch WRITE commands
+		 * (for relocation extents) and ZONE APPEND commands (for
+		 * regular extents) at the same time to the same zone, which
+		 * easily breaks the write pointer.
+		 *
+		 * Also, this flag prevents this block group from being zone
+		 * finished.
+		 */
+		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
+	}
 
 	ffe_ctl->found_offset = start + block_group->alloc_offset;
 	block_group->alloc_offset += num_bytes;
@@ -3771,24 +3999,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 out:
 	if (ret && ffe_ctl->for_treelog)
 		fs_info->treelog_bg = 0;
-	if (ret && ffe_ctl->for_data_reloc &&
-	    fs_info->data_reloc_bg == block_group->start) {
-		/*
-		 * Do not allow further allocations from this block group.
-		 * Compared to increasing the ->ro, setting the
-		 * ->zoned_data_reloc_ongoing flag still allows nocow
-		 * writers to come in. See btrfs_inc_nocow_writers().
-		 *
-		 * We need to disable an allocation to avoid an allocation of
-		 * regular (non-relocation data) extent.
With mix of relocation - * extents and regular extents, we can dispatch WRITE commands - * (for relocation extents) and ZONE APPEND commands (for - * regular extents) at the same time to the same zone, which - * easily break the write pointer. - */ - set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + if (ret && ffe_ctl->for_data_reloc) fs_info->data_reloc_bg = 0; - } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -3816,8 +4028,7 @@ static void release_block_group(struct btrfs_block_group *block_group, { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: - ffe_ctl->retry_clustered = false; - ffe_ctl->retry_unclustered = false; + ffe_ctl->retry_uncached = false; break; case BTRFS_EXTENT_ALLOC_ZONED: /* Nothing to do */ @@ -3861,6 +4072,10 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { + /* Block group's activeness is not a requirement for METADATA block groups. */ + if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + /* If we can activate new zone, just allocate a chunk and use it */ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) return 0; @@ -3949,15 +4164,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) return 1; - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_UNSET_SIZE_CLASS, allow unset size class - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ + /* See the comments for btrfs_loop_type for an explanation of the phases. */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; /* @@ -4168,9 +4375,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->orig_have_caching_bg = false; ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); ffe_ctl->loop = 0; - /* For clustered allocation */ - ffe_ctl->retry_clustered = false; - ffe_ctl->retry_unclustered = false; + ffe_ctl->retry_uncached = false; ffe_ctl->cached = 0; ffe_ctl->max_extent_size = 0; ffe_ctl->total_free_space = 0; @@ -4321,16 +4526,12 @@ have_block_group: bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); - if (ret == 0) { - if (bg_ret && bg_ret != block_group) { - btrfs_release_block_group(block_group, - ffe_ctl->delalloc); - block_group = bg_ret; - } - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { + if (ret > 0) goto loop; + + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + block_group = bg_ret; } /* Checks */ @@ -4371,6 +4572,15 @@ have_block_group: btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: + if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_uncached) { + ffe_ctl->retry_uncached = true; + btrfs_wait_block_group_cache_progress(block_group, + ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + + ffe_ctl->empty_size); + goto have_block_group; + } release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } @@ -4398,8 +4608,8 @@ loop: } /* - * btrfs_reserve_extent - entry point to the extent allocator. 
Tries to find a - * hole that is at least as big as @num_bytes. + * Entry point to the extent allocator. Tries to find a hole that is at least + * as big as @num_bytes. * * @root - The root that will contain this extent * @@ -4518,20 +4728,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, - u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, + const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret = 0; - cache = btrfs_lookup_block_group(trans->fs_info, start); + cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) { btrfs_err(trans->fs_info, "unable to find block group for %llu", - start); + eb->start); return -ENOSPC; } - ret = pin_down_extent(trans, cache, start, len, 1); + ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); btrfs_put_block_group(cache); return ret; } @@ -4561,24 +4771,29 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod) + struct btrfs_key *ins, int ref_mod, u64 oref_root) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; + struct btrfs_extent_owner_ref *oref; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; int type; u32 size; + const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); if (parent > 0) type = BTRFS_SHARED_DATA_REF_KEY; else type = BTRFS_EXTENT_DATA_REF_KEY; - size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + size = sizeof(*extent_item); + if (simple_quota) + size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + size += btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); if (!path) @@ -4600,7 +4815,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, flags | BTRFS_EXTENT_FLAG_DATA); iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + if (simple_quota) { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY); + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root); + iref = (struct btrfs_extent_inline_ref *)(oref + 1); + } btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { struct btrfs_shared_data_ref *ref; ref = (struct btrfs_shared_data_ref *)(iref + 1); @@ -4615,7 +4837,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); @@ -4690,7 +4912,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); @@ -4702,12 +4924,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins) { struct btrfs_ref generic_ref = { 0 }; + u64 root_objectid = root->root_key.objectid; + u64 owning_root = root_objectid; + + BUG_ON(root_objectid == 
BTRFS_TREE_LOG_OBJECTID); - BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) + owning_root = root->relocation_src_root; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins->objectid, ins->offset, 0); - btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, + ins->objectid, ins->offset, 0, owning_root); + btrfs_init_data_ref(&generic_ref, root_objectid, owner, offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); @@ -4727,6 +4954,13 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, int ret; struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; + struct btrfs_squota_delta delta = { + .root = root_objectid, + .num_bytes = ins->offset, + .generation = trans->transid, + .is_data = true, + .is_inc = true, + }; /* * Mixed block groups will exclude before processing the log so we only @@ -4752,13 +4986,36 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, spin_unlock(&space_info->lock); ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, - offset, ins, 1); + offset, ins, 1, root_objectid); if (ret) btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); + ret = btrfs_record_squota_delta(fs_info, &delta); btrfs_put_block_group(block_group); return ret; } +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extra safety check in case the extent tree is corrupted and extent allocator + * chooses to use a tree block which is already used and locked. + */ +static bool check_eb_lock_owner(const struct extent_buffer *eb) +{ + if (eb->lock_owner == current->pid) { + btrfs_err_rl(eb->fs_info, +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", + eb->start, btrfs_header_owner(eb), current->pid); + return true; + } + return false; +} +#else +static bool check_eb_lock_owner(struct extent_buffer *eb) +{ + return false; +} +#endif + static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, int level, u64 owner, @@ -4772,15 +5029,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; - /* - * Extra safety check in case the extent tree is corrupted and extent - * allocator chooses to use a tree block which is already used and - * locked. 
- */ - if (buf->lock_owner == current->pid) { - btrfs_err_rl(fs_info, -"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", - buf->start, btrfs_header_owner(buf), current->pid); + if (check_eb_lock_owner(buf)) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } @@ -4857,6 +5106,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, + u64 reloc_src_root, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4869,6 +5119,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, int ret; u32 blocksize = fs_info->nodesize; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + u64 owning_root; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { @@ -4895,11 +5146,13 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ret = PTR_ERR(buf); goto out_free_reserved; } + owning_root = btrfs_header_owner(buf); if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) parent = ins.objectid; flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + owning_root = reloc_src_root; } else BUG_ON(parent > 0); @@ -4919,7 +5172,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->level = level; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins.objectid, ins.offset, parent); + ins.objectid, ins.offset, parent, owning_root); btrfs_init_tree_ref(&generic_ref, level, root_objectid, root->root_key.objectid, false); btrfs_ref_tree_mod(fs_info, &generic_ref); @@ -5007,7 +5260,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, - &flags); + &flags, NULL); /* We don't care about errors in readahead. 
*/ if (ret < 0) continue; @@ -5074,7 +5327,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); BUG_ON(ret == -ENOMEM); if (ret) return ret; @@ -5164,6 +5418,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + u64 owner_root = 0; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; struct btrfs_ref ref = { 0 }; @@ -5207,7 +5462,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], - &wc->flags[level - 1]); + &wc->flags[level - 1], + &owner_root); if (ret < 0) goto out_unlock; @@ -5340,7 +5596,7 @@ skip: find_next_key(path, level, &wc->drop_progress); btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent); + fs_info->nodesize, parent, owner_root); btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, 0, false); ret = btrfs_free_extent(trans, &ref); @@ -5407,7 +5663,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; @@ -5652,7 +5909,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], NULL); if (ret < 0) { err = ret; goto out_end_trans; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 429d5c570061..2e066035ccee 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -7,6 +7,7 @@ #include "block-group.h" struct btrfs_free_cluster; +struct btrfs_delayed_ref_head; enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, @@ -48,16 +49,11 @@ struct find_free_extent_ctl { int loop; /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. + * Set to true if we're retrying the allocation on this block group + * after waiting for caching progress, this is so that we retry only + * once before moving on to another block group. */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. 
- */ - bool retry_unclustered; + bool retry_uncached; /* If current block group is cached */ int cached; @@ -96,21 +92,19 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes); -void btrfs_free_excluded_extents(struct btrfs_block_group *cache); -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes); +u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owner_root); int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes); + const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr, bool strict, @@ -121,6 +115,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, + u64 reloc_src_root, enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 root_id, @@ -144,12 +139,15 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); +u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, + const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 90ad3006ef3a..8f724c54fc8e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -21,7 +21,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "bio.h" -#include "check-integrity.h" #include "locking.h" #include "rcu-string.h" #include "backref.h" @@ -181,34 +180,9 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) } } -void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct folio *folio; - - while (index <= end_index) { - folio = 
filemap_get_folio(mapping, index); - filemap_dirty_folio(mapping, folio); - folio_account_redirty(folio); - index += folio_nr_pages(folio); - folio_put(folio); - } -} - -/* - * Process one page for __process_pages_contig(). - * - * Return >0 if we hit @page == @locked_page. - * Return 0 if we updated the page status. - * Return -EGAIN if the we need to try again. - * (For PAGE_LOCK case but got dirty page or page not belong to mapping) - */ -static int process_one_page(struct btrfs_fs_info *fs_info, - struct address_space *mapping, - struct page *page, struct page *locked_page, - unsigned long page_ops, u64 start, u64 end) +static void process_one_page(struct btrfs_fs_info *fs_info, + struct page *page, struct page *locked_page, + unsigned long page_ops, u64 start, u64 end) { u32 len; @@ -224,94 +198,36 @@ static int process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_END_WRITEBACK) btrfs_page_clamp_clear_writeback(fs_info, page, start, len); - if (page == locked_page) - return 1; - - if (page_ops & PAGE_LOCK) { - int ret; - - ret = btrfs_page_start_writer_lock(fs_info, page, start, len); - if (ret) - return ret; - if (!PageDirty(page) || page->mapping != mapping) { - btrfs_page_end_writer_lock(fs_info, page, start, len); - return -EAGAIN; - } - } - if (page_ops & PAGE_UNLOCK) + if (page != locked_page && (page_ops & PAGE_UNLOCK)) btrfs_page_end_writer_lock(fs_info, page, start, len); - return 0; } -static int __process_pages_contig(struct address_space *mapping, - struct page *locked_page, - u64 start, u64 end, unsigned long page_ops, - u64 *processed_end) +static void __process_pages_contig(struct address_space *mapping, + struct page *locked_page, u64 start, u64 end, + unsigned long page_ops) { struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long pages_processed = 0; struct folio_batch fbatch; - int err = 0; int i; - if (page_ops & PAGE_LOCK) { - ASSERT(page_ops == PAGE_LOCK); - ASSERT(processed_end && *processed_end == start); - } - folio_batch_init(&fbatch); while (index <= end_index) { int found_folios; found_folios = filemap_get_folios_contig(mapping, &index, end_index, &fbatch); - - if (found_folios == 0) { - /* - * Only if we're going to lock these pages, we can find - * nothing at @index. - */ - ASSERT(page_ops & PAGE_LOCK); - err = -EAGAIN; - goto out; - } - for (i = 0; i < found_folios; i++) { - int process_ret; struct folio *folio = fbatch.folios[i]; - process_ret = process_one_page(fs_info, mapping, - &folio->page, locked_page, page_ops, - start, end); - if (process_ret < 0) { - err = -EAGAIN; - folio_batch_release(&fbatch); - goto out; - } - pages_processed += folio_nr_pages(folio); + + process_one_page(fs_info, &folio->page, locked_page, + page_ops, start, end); } folio_batch_release(&fbatch); cond_resched(); } -out: - if (err && processed_end) { - /* - * Update @processed_end. I know this is awful since it has - * two different return value patterns (inclusive vs exclusive). - * - * But the exclusive pattern is necessary if @start is 0, or we - * underflow and check against processed_end won't work as - * expected. 
- */ - if (pages_processed) - *processed_end = min(end, - ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1); - else - *processed_end = start; - } - return err; } static noinline void __unlock_for_delalloc(struct inode *inode, @@ -326,29 +242,63 @@ static noinline void __unlock_for_delalloc(struct inode *inode, return; __process_pages_contig(inode->i_mapping, locked_page, start, end, - PAGE_UNLOCK, NULL); + PAGE_UNLOCK); } static noinline int lock_delalloc_pages(struct inode *inode, struct page *locked_page, - u64 delalloc_start, - u64 delalloc_end) + u64 start, + u64 end) { - unsigned long index = delalloc_start >> PAGE_SHIFT; - unsigned long end_index = delalloc_end >> PAGE_SHIFT; - u64 processed_end = delalloc_start; - int ret; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + pgoff_t index = start_index; + u64 processed_end = start; + struct folio_batch fbatch; - ASSERT(locked_page); if (index == locked_page->index && index == end_index) return 0; - ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, - delalloc_end, PAGE_LOCK, &processed_end); - if (ret == -EAGAIN && processed_end > delalloc_start) - __unlock_for_delalloc(inode, locked_page, delalloc_start, - processed_end); - return ret; + folio_batch_init(&fbatch); + while (index <= end_index) { + unsigned int found_folios, i; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); + if (found_folios == 0) + goto out; + + for (i = 0; i < found_folios; i++) { + struct page *page = &fbatch.folios[i]->page; + u32 len = end + 1 - start; + + if (page == locked_page) + continue; + + if (btrfs_page_start_writer_lock(fs_info, page, start, + len)) + goto out; + + if (!PageDirty(page) || page->mapping != mapping) { + btrfs_page_end_writer_lock(fs_info, page, start, + len); + goto out; + } + + processed_end = page_offset(page) + PAGE_SIZE - 1; + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return 0; +out: + folio_batch_release(&fbatch); + if (processed_end > start) + __unlock_for_delalloc(inode, locked_page, start, processed_end); + return -EAGAIN; } /* @@ -444,7 +394,7 @@ again: /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, 1, cached_state); + EXTENT_DELALLOC, cached_state); if (!ret) { unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); @@ -467,7 +417,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start, end, page_ops, NULL); + start, end, page_ops); } static bool btrfs_verify_page(struct page *page, u64 start) @@ -497,31 +447,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -/* lots and lots of room for performance fixes in the end_bio funcs */ - -void end_extent_writepage(struct page *page, int err, u64 start, u64 end) -{ - struct btrfs_inode *inode; - const bool uptodate = (err == 0); - int ret = 0; - - ASSERT(page && page->mapping); - inode = BTRFS_I(page->mapping->host); - btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); - - if (!uptodate) { - const struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 len; - - ASSERT(end + 1 - start <= 
U32_MAX);
-		len = end + 1 - start;
-
-		btrfs_page_clear_uptodate(fs_info, page, start, len);
-		ret = err < 0 ? err : -EIO;
-		mapping_set_error(page->mapping, ret);
-	}
-}
-
 /*
  * after a writepage IO is done, we need to:
  * clear the uptodate bits on error
@@ -558,10 +483,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)
 				   bvec->bv_offset, bvec->bv_len);
 
 		btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
-		if (error) {
-			btrfs_page_clear_uptodate(fs_info, page, start, len);
+		if (error)
 			mapping_set_error(page->mapping, error);
-		}
 		btrfs_page_clear_writeback(fs_info, page, start, len);
 	}
 
@@ -751,8 +674,8 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
  *   the array will be skipped
  *
  * Return: 0        if all pages were able to be allocated;
- *         -ENOMEM  otherwise, and the caller is responsible for freeing all
- *                  non-null page pointers in the array.
+ *         -ENOMEM  otherwise, the partially allocated pages are freed and
+ *                  the array slots zeroed
  */
 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
 {
@@ -771,8 +694,13 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
 		 * though alloc_pages_bulk_array() falls back to alloc_page()
 		 * if it could not bulk-allocate. So we must be out of memory.
 		 */
-		if (allocated == last)
+		if (allocated == last) {
+			for (int i = 0; i < allocated; i++) {
+				__free_page(page_array[i]);
+				page_array[i] = NULL;
+			}
 			return -ENOMEM;
+		}
 
 		memalloc_retry_wait(GFP_NOFS);
 	}
@@ -1243,38 +1171,45 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 		struct page *page, struct writeback_control *wbc)
 {
-	const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
-	u64 delalloc_start = page_offset(page);
+	const u64 page_start = page_offset(page);
+	const u64 page_end = page_start + PAGE_SIZE - 1;
+	u64 delalloc_start = page_start;
+	u64 delalloc_end = page_end;
 	u64 delalloc_to_write = 0;
-	/* How many pages are started by btrfs_run_delalloc_range() */
-	unsigned long nr_written = 0;
-	int ret;
-	int page_started = 0;
+	int ret = 0;
 
 	while (delalloc_start < page_end) {
-		u64 delalloc_end = page_end;
-		bool found;
-
-		found = find_lock_delalloc_range(&inode->vfs_inode, page,
-					       &delalloc_start,
-					       &delalloc_end);
-		if (!found) {
+		delalloc_end = page_end;
+		if (!find_lock_delalloc_range(&inode->vfs_inode, page,
+					      &delalloc_start, &delalloc_end)) {
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
+
 		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
-				delalloc_end, &page_started, &nr_written, wbc);
-		if (ret)
+					       delalloc_end, wbc);
+		if (ret < 0)
 			return ret;
-		/*
-		 * delalloc_end is already one less than the total length, so
-		 * we don't subtract one from PAGE_SIZE
-		 */
-		delalloc_to_write += (delalloc_end - delalloc_start +
-				      PAGE_SIZE) >> PAGE_SHIFT;
+
 		delalloc_start = delalloc_end + 1;
 	}
+
+	/*
+	 * delalloc_end is already one less than the total length, so
+	 * we don't subtract one from PAGE_SIZE
+	 */
+	delalloc_to_write +=
+		DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
+
+	/*
+	 * If btrfs_run_delalloc_range() already started I/O and unlocked
+	 * the pages, we just need to account for them here.
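+	 * A return value of 0 instead means the delalloc ranges were set up
+	 * (ordered extents inserted) but the page is still locked and dirty,
+	 * to be submitted by the regular __extent_writepage_io() path.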
+ */ + if (ret == 1) { + wbc->nr_to_write -= delalloc_to_write; + return 1; + } + if (wbc->nr_to_write < delalloc_to_write) { int thresh = 8192; @@ -1284,16 +1219,6 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, thresh); } - /* Did btrfs_run_dealloc_range() already unlock and start the IO? */ - if (page_started) { - /* - * We've unlocked the page, so we can't update the mapping's - * writeback index, just update nr_to_write. - */ - wbc->nr_to_write -= nr_written; - return 1; - } - return 0; } @@ -1382,6 +1307,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { + u32 len = end - cur + 1; u64 disk_bytenr; u64 em_end; u64 dirty_range_start = cur; @@ -1389,8 +1315,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(inode, page, cur, - end, true); + btrfs_mark_ordered_io_finished(inode, page, cur, len, + true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1399,7 +1325,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur); + btrfs_page_clear_dirty(fs_info, page, cur, len); break; } @@ -1410,7 +1336,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, continue; } - em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); + em = btrfs_get_extent(inode, NULL, 0, cur, len); if (IS_ERR(em)) { ret = PTR_ERR_OR_ZERO(em); goto out_error; @@ -1486,7 +1412,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; const u64 page_start = page_offset(page); - const u64 page_end = page_start + PAGE_SIZE - 1; int ret; int nr = 0; size_t pg_offset; @@ -1530,8 +1455,11 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (ret) - end_extent_writepage(page, ret, page_start, page_end); + if (ret) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, + PAGE_SIZE, !ret); + mapping_set_error(page->mapping, ret); + } unlock_page(page); ASSERT(ret <= 0); return ret; @@ -1696,8 +1624,6 @@ static void extent_buffer_write_end_io(struct btrfs_bio *bbio) struct page *page = bvec->bv_page; u32 len = bvec->bv_len; - if (!uptodate) - btrfs_page_clear_uptodate(fs_info, page, start, len); btrfs_page_clear_writeback(fs_info, page, start, len); bio_offset += len; } @@ -1877,11 +1803,10 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * previous call. * Return <0 for fatal error. 
*/ -static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct extent_buffer **eb_context) +static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) { + struct writeback_control *wbc = ctx->wbc; struct address_space *mapping = page->mapping; - struct btrfs_block_group *cache = NULL; struct extent_buffer *eb; int ret; @@ -1908,7 +1833,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; } - if (eb == *eb_context) { + if (eb == ctx->eb) { spin_unlock(&mapping->private_lock); return 0; } @@ -1917,34 +1842,25 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!ret) return 0; - if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) { - /* - * If for_sync, this hole will be filled with - * trasnsaction commit. - */ - if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) - ret = -EAGAIN; - else + ctx->eb = eb; + + ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); + if (ret) { + if (ret == -EBUSY) ret = 0; free_extent_buffer(eb); return ret; } - *eb_context = eb; - if (!lock_extent_buffer_for_io(eb, wbc)) { - btrfs_revert_meta_write_pointer(cache, eb); - if (cache) - btrfs_put_block_group(cache); free_extent_buffer(eb); return 0; } - if (cache) { - /* - * Implies write in zoned mode. Mark the last eb in a block group. - */ - btrfs_schedule_zone_finish_bg(cache, eb); - btrfs_put_block_group(cache); + /* Implies write in zoned mode. */ + if (ctx->zoned_bg) { + /* Mark the last eb in the block group. */ + btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); + ctx->zoned_bg->meta_write_pointer += eb->len; } write_one_eb(eb, wbc); free_extent_buffer(eb); @@ -1954,7 +1870,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_buffer *eb_context = NULL; + struct btrfs_eb_write_context ctx = { .wbc = wbc }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; int done = 0; @@ -1996,7 +1912,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, wbc, &eb_context); + ret = submit_eb_page(&folio->page, &ctx); if (ret == 0) continue; if (ret < 0) { @@ -2057,6 +1973,9 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; + + if (ctx.zoned_bg) + btrfs_put_block_group(ctx.zoned_bg); btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -2150,7 +2069,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - done_index = folio->index + folio_nr_pages(folio); + done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or @@ -2233,11 +2152,11 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. 
*/ -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc) +void extent_write_locked_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc, + bool pages_dirty) { bool found_error = false; - int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2256,18 +2175,16 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + u32 cur_len = cur_end + 1 - cur; struct page *page; int nr = 0; page = find_get_page(mapping, cur >> PAGE_SHIFT); - /* - * All pages in the range are locked since - * btrfs_run_delalloc_range(), thus there is no way to clear - * the page dirty flag. - */ ASSERT(PageLocked(page)); - ASSERT(PageDirty(page)); - clear_page_dirty_for_io(page); + if (pages_dirty && page != locked_page) { + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); + } ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, i_size, &nr); @@ -2279,23 +2196,20 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, set_page_writeback(page); end_page_writeback(page); } - if (ret) - end_extent_writepage(page, ret, cur, cur_end); - btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur); - if (ret < 0) { - found_error = true; - first_error = ret; + if (ret) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, + cur, cur_len, !ret); + mapping_set_error(page->mapping, ret); } + btrfs_page_unlock_writer(fs_info, page, cur, cur_len); + if (ret < 0) + found_error = true; next_page: put_page(page); cur = cur_end + 1; } submit_write_bio(&bio_ctrl, found_error ? 
ret : 0);
-
-	if (found_error)
-		return first_error;
-	return ret;
 }
 
 int extent_writepages(struct address_space *mapping,
@@ -2384,11 +2298,12 @@ static int try_release_extent_state(struct extent_io_tree *tree,
 	u64 end = start + PAGE_SIZE - 1;
 	int ret = 1;
 
-	if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
+	if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
 		ret = 0;
 	} else {
 		u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
-				   EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
+				   EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
+				   EXTENT_QGROUP_RESERVED);
 
 		/*
 		 * At this point we can safely clear everything except the
@@ -2443,9 +2358,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
 				free_extent_map(em);
 				break;
 			}
-			if (test_range_bit(tree, em->start,
-					   extent_map_end(em) - 1,
-					   EXTENT_LOCKED, 0, NULL))
+			if (test_range_bit_exists(tree, em->start,
+						  extent_map_end(em) - 1,
+						  EXTENT_LOCKED))
 				goto next;
 			/*
 			 * If it's not in the list of modified extents, used
@@ -3315,8 +3230,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 			return NULL;
 		}
 		WARN_ON(PageDirty(p));
-		copy_page(page_address(p), page_address(src->pages[i]));
 	}
+	copy_extent_buffer_full(new, src);
 	set_extent_buffer_uptodate(new);
 
 	return new;
@@ -3545,6 +3460,12 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
 			  start, fs_info->nodesize);
 		return -EINVAL;
 	}
+	if (!IS_ALIGNED(start, fs_info->nodesize) &&
+	    !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+		btrfs_warn(fs_info,
+"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
+			   start, fs_info->nodesize);
+	}
 	return 0;
 }
 
@@ -3559,6 +3480,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	struct extent_buffer *exists = NULL;
 	struct page *p;
 	struct address_space *mapping = fs_info->btree_inode->i_mapping;
+	struct btrfs_subpage *prealloc = NULL;
 	u64 lockdep_owner = owner_root;
 	int uptodate = 1;
 	int ret;
@@ -3595,36 +3517,30 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
 
 	num_pages = num_extent_pages(eb);
-	for (i = 0; i < num_pages; i++, index++) {
-		struct btrfs_subpage *prealloc = NULL;
+	/*
+	 * Preallocate page->private for subpage case, so that we won't
+	 * allocate memory while holding the private_lock or the page lock.
+	 *
+	 * The memory will be freed by attach_extent_buffer_page() or freed
+	 * manually if we exit earlier.
+	 */
+	if (fs_info->nodesize < PAGE_SIZE) {
+		prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+		if (IS_ERR(prealloc)) {
+			exists = ERR_CAST(prealloc);
+			goto free_eb;
+		}
+	}
+
+	for (i = 0; i < num_pages; i++, index++) {
 		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
 		if (!p) {
 			exists = ERR_PTR(-ENOMEM);
+			btrfs_free_subpage(prealloc);
 			goto free_eb;
 		}
 
-		/*
-		 * Preallocate page->private for subpage case, so that we won't
-		 * allocate memory with private_lock hold. The memory will be
-		 * freed by attach_extent_buffer_page() or freed manually if
-		 * we exit earlier.
-		 *
-		 * Although we have ensured one subpage eb can only have one
-		 * page, but it may change in the future for 16K page size
-		 * support, so we still preallocate the memory in the loop.
-		 */
-		if (fs_info->nodesize < PAGE_SIZE) {
-			prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
-			if (IS_ERR(prealloc)) {
-				ret = PTR_ERR(prealloc);
-				unlock_page(p);
-				put_page(p);
-				exists = ERR_PTR(ret);
-				goto free_eb;
-			}
-		}
-
 		spin_lock(&mapping->private_lock);
 		exists = grab_extent_buffer(fs_info, p);
 		if (exists) {
@@ -4090,8 +4006,14 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
 	char *dst = (char *)dstv;
 	unsigned long i = get_eb_page_index(start);
 
-	if (check_eb_range(eb, start, len))
+	if (check_eb_range(eb, start, len)) {
+		/*
+		 * Invalid range hit, reset the memory so callers won't get
+		 * random garbage in their uninitialized memory.
+		 */
+		memset(dstv, 0, len);
 		return;
+	}
 
 	offset = get_eb_offset_in_page(eb, start);
 
@@ -4210,30 +4132,9 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
 	}
 }
 
-void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
-		const void *srcv)
-{
-	char *kaddr;
-
-	assert_eb_page_uptodate(eb, eb->pages[0]);
-	kaddr = page_address(eb->pages[0]) +
-		get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
-						   chunk_tree_uuid));
-	memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
-}
-
-void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
-{
-	char *kaddr;
-
-	assert_eb_page_uptodate(eb, eb->pages[0]);
-	kaddr = page_address(eb->pages[0]) +
-		get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
-	memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
-}
-
-void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
-			 unsigned long start, unsigned long len)
+static void __write_extent_buffer(const struct extent_buffer *eb,
+				  const void *srcv, unsigned long start,
+				  unsigned long len, bool use_memmove)
 {
 	size_t cur;
 	size_t offset;
@@ -4241,6 +4142,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
 	char *kaddr;
 	char *src = (char *)srcv;
 	unsigned long i = get_eb_page_index(start);
+	/* For unmapped (dummy) ebs, no need to check their uptodate status.
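+	 * The use_memmove parameter is there, presumably, for callers whose
+	 * source can overlap the destination range (memmove is overlap-safe,
+	 * memcpy is not); plain write_extent_buffer() below always passes
+	 * false.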
*/ + const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); @@ -4251,11 +4154,15 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, while (len > 0) { page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); + if (check_uptodate) + assert_eb_page_uptodate(eb, page); cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); - memcpy(kaddr + offset, src, cur); + if (use_memmove) + memmove(kaddr + offset, src, cur); + else + memcpy(kaddr + offset, src, cur); src += cur; len -= cur; @@ -4264,55 +4171,54 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, } } -void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, - unsigned long len) +void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) { - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - unsigned long i = get_eb_page_index(start); + return __write_extent_buffer(eb, srcv, start, len, false); +} - if (check_eb_range(eb, start, len)) - return; +static void memset_extent_buffer(const struct extent_buffer *eb, int c, + unsigned long start, unsigned long len) +{ + unsigned long cur = start; - offset = get_eb_offset_in_page(eb, start); + while (cur < start + len) { + unsigned long index = get_eb_page_index(cur); + unsigned int offset = get_eb_offset_in_page(eb, cur); + unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset); + struct page *page = eb->pages[index]; - while (len > 0) { - page = eb->pages[i]; assert_eb_page_uptodate(eb, page); + memset(page_address(page) + offset, c, cur_len); - cur = min(len, PAGE_SIZE - offset); - kaddr = page_address(page); - memset(kaddr + offset, 0, cur); - - len -= cur; - offset = 0; - i++; + cur += cur_len; } } +void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + if (check_eb_range(eb, start, len)) + return; + return memset_extent_buffer(eb, 0, start, len); +} + void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { - int i; - int num_pages; + unsigned long cur = 0; ASSERT(dst->len == src->len); - if (dst->fs_info->nodesize >= PAGE_SIZE) { - num_pages = num_extent_pages(dst); - for (i = 0; i < num_pages; i++) - copy_page(page_address(dst->pages[i]), - page_address(src->pages[i])); - } else { - size_t src_offset = get_eb_offset_in_page(src, 0); - size_t dst_offset = get_eb_offset_in_page(dst, 0); + while (cur < src->len) { + unsigned long index = get_eb_page_index(cur); + unsigned long offset = get_eb_offset_in_page(src, cur); + unsigned long cur_len = min(src->len, PAGE_SIZE - offset); + void *addr = page_address(src->pages[index]) + offset; - ASSERT(src->fs_info->nodesize < PAGE_SIZE); - memcpy(page_address(dst->pages[0]) + dst_offset, - page_address(src->pages[0]) + src_offset, - src->len); + write_extent_buffer(dst, addr, cur, cur_len); + + cur += cur_len; } } @@ -4353,14 +4259,14 @@ void copy_extent_buffer(const struct extent_buffer *dst, } /* - * eb_bitmap_offset() - calculate the page and offset of the byte containing the - * given bit number - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @nr: bit number - * @page_index: return index of the page in the extent buffer that contains the - * given bit number - * @page_offset: return offset into the page given by page_index + * Calculate the page and offset of the byte 
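
memset_extent_buffer() above collapses the old index/offset bookkeeping into one loop that clamps each step to the end of the current page. The chunking skeleton, extracted (PG_SIZE and the page array are illustrative):

#include <stddef.h>
#include <string.h>

#define PG_SIZE 4096UL

static void paged_memset(unsigned char **pages, int c, size_t start, size_t len)
{
        size_t cur = start;

        while (cur < start + len) {
                size_t index = cur / PG_SIZE;
                size_t offset = cur % PG_SIZE;
                size_t cur_len = start + len - cur;

                if (cur_len > PG_SIZE - offset)
                        cur_len = PG_SIZE - offset;  /* clamp to page end */

                memset(pages[index] + offset, c, cur_len);
                cur += cur_len;
        }
}
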
containing the given bit number. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number + * @page_index: return index of the page in the extent buffer that contains + * the given bit number + * @page_offset: return offset into the page given by page_index * * This helper hides the ugliness of finding the byte in an extent buffer which * contains a given bit. @@ -4406,6 +4312,15 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } +static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) +{ + unsigned long index = get_eb_page_index(bytenr); + + if (check_eb_range(eb, bytenr, 1)) + return NULL; + return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr); +} + /* * Set an area of a bitmap to 1. * @@ -4417,35 +4332,28 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { + unsigned int first_byte = start + BIT_BYTE(pos); + unsigned int last_byte = start + BIT_BYTE(pos + len - 1); + const bool same_byte = (first_byte == last_byte); + u8 mask = BITMAP_FIRST_BYTE_MASK(pos); u8 *kaddr; - struct page *page; - unsigned long i; - size_t offset; - const unsigned int size = pos + len; - int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); - eb_bitmap_offset(eb, start, pos, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + if (same_byte) + mask &= BITMAP_LAST_BYTE_MASK(pos + len); - while (len >= bits_to_set) { - kaddr[offset] |= mask_to_set; - len -= bits_to_set; - bits_to_set = BITS_PER_BYTE; - mask_to_set = ~0; - if (++offset >= PAGE_SIZE && len > 0) { - offset = 0; - page = eb->pages[++i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); - } - } - if (len) { - mask_to_set &= BITMAP_LAST_BYTE_MASK(size); - kaddr[offset] |= mask_to_set; - } + /* Handle the first byte. */ + kaddr = extent_buffer_get_byte(eb, first_byte); + *kaddr |= mask; + if (same_byte) + return; + + /* Handle the byte aligned part. */ + ASSERT(first_byte + 1 <= last_byte); + memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1); + + /* Handle the last byte. 
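
extent_buffer_bitmap_set() and _clear() above reduce the old bit-by-bit walk to three steps: mask the first partial byte, memset() all whole bytes in the middle, mask the last partial byte. A self-contained sketch of the set case over a flat byte array; the byte masks follow the kernel's little-endian bitmap convention, and len is assumed non-zero:

#include <stdint.h>
#include <string.h>

#define BIT_BYTE(nr)              ((nr) / 8)
#define BITMAP_FIRST_BYTE_MASK(s) ((uint8_t)(0xff << ((s) & 7)))
#define BITMAP_LAST_BYTE_MASK(n)  ((uint8_t)(0xff >> (-(n) & 7)))

/* Set @len bits starting at bit @pos in a plain byte array. */
static void bitmap_set_range(uint8_t *map, unsigned long pos, unsigned long len)
{
        unsigned long first_byte = BIT_BYTE(pos);
        unsigned long last_byte = BIT_BYTE(pos + len - 1);
        uint8_t mask = BITMAP_FIRST_BYTE_MASK(pos);

        if (first_byte == last_byte) {
                map[first_byte] |= mask & BITMAP_LAST_BYTE_MASK(pos + len);
                return;
        }
        map[first_byte] |= mask;                             /* head bits   */
        memset(map + first_byte + 1, 0xff,
               last_byte - first_byte - 1);                  /* whole bytes */
        map[last_byte] |= BITMAP_LAST_BYTE_MASK(pos + len);  /* tail bits   */
}

The clear case is symmetric: &= ~mask on the two boundary bytes and memset(..., 0, ...) in between.
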
*/ + kaddr = extent_buffer_get_byte(eb, last_byte); + *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len); } @@ -4461,35 +4369,28 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { + unsigned int first_byte = start + BIT_BYTE(pos); + unsigned int last_byte = start + BIT_BYTE(pos + len - 1); + const bool same_byte = (first_byte == last_byte); + u8 mask = BITMAP_FIRST_BYTE_MASK(pos); u8 *kaddr; - struct page *page; - unsigned long i; - size_t offset; - const unsigned int size = pos + len; - int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); - eb_bitmap_offset(eb, start, pos, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + if (same_byte) + mask &= BITMAP_LAST_BYTE_MASK(pos + len); - while (len >= bits_to_clear) { - kaddr[offset] &= ~mask_to_clear; - len -= bits_to_clear; - bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~0; - if (++offset >= PAGE_SIZE && len > 0) { - offset = 0; - page = eb->pages[++i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); - } - } - if (len) { - mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); - kaddr[offset] &= ~mask_to_clear; - } + /* Handle the first byte. */ + kaddr = extent_buffer_get_byte(eb, first_byte); + *kaddr &= ~mask; + if (same_byte) + return; + + /* Handle the byte aligned part. */ + ASSERT(first_byte + 1 <= last_byte); + memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1); + + /* Handle the last byte. */ + kaddr = extent_buffer_get_byte(eb, last_byte); + *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len); } static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) @@ -4498,60 +4399,29 @@ static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned return distance < len; } -static void copy_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = page_address(dst_page); - char *src_kaddr; - int must_memmove = 0; - - if (dst_page != src_page) { - src_kaddr = page_address(src_page); - } else { - src_kaddr = dst_kaddr; - if (areas_overlap(src_off, dst_off, len)) - must_memmove = 1; - } - - if (must_memmove) - memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); - else - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); -} - void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; - unsigned long dst_i; - unsigned long src_i; + unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; - while (len > 0) { - dst_off_in_page = get_eb_offset_in_page(dst, dst_offset); - src_off_in_page = get_eb_offset_in_page(dst, src_offset); - - dst_i = get_eb_page_index(dst_offset); - src_i = get_eb_page_index(src_offset); - - cur = min(len, (unsigned long)(PAGE_SIZE - - src_off_in_page)); - cur = min_t(unsigned long, cur, - (unsigned long)(PAGE_SIZE - dst_off_in_page)); - - copy_pages(dst->pages[dst_i], dst->pages[src_i], - dst_off_in_page, src_off_in_page, cur); - - src_offset += cur; - dst_offset += cur; - len -= cur; + while (cur_off < len) { + unsigned long cur_src = cur_off + src_offset; + unsigned long pg_index = get_eb_page_index(cur_src); + unsigned long pg_off = get_eb_offset_in_page(dst, cur_src); + unsigned long 
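
areas_overlap() above is what lets memcpy_extent_buffer() choose between memcpy() and memmove() per chunk instead of going through the removed copy_pages() helper. The test and its use, isolated:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

static bool areas_overlap(size_t src, size_t dst, size_t len)
{
        size_t distance = (dst > src) ? dst - src : src - dst;

        return distance < len;
}

/* Copy within one buffer, paying for memmove() only when the ranges overlap. */
static void buf_copy(unsigned char *base, size_t dst, size_t src, size_t len)
{
        if (areas_overlap(src, dst, len))
                memmove(base + dst, base + src, len);
        else
                memcpy(base + dst, base + src, len);
}
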
cur_len = min(src_offset + len - cur_src, + PAGE_SIZE - pg_off); + void *src_addr = page_address(dst->pages[pg_index]) + pg_off; + const bool use_memmove = areas_overlap(src_offset + cur_off, + dst_offset + cur_off, cur_len); + + __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len, + use_memmove); + cur_off += cur_len; } } @@ -4559,23 +4429,26 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - unsigned long dst_i; - unsigned long src_i; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; + if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } + while (len > 0) { - dst_i = get_eb_page_index(dst_end); + unsigned long src_i; + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + void *src_addr; + bool use_memmove; + src_i = get_eb_page_index(src_end); dst_off_in_page = get_eb_offset_in_page(dst, dst_end); @@ -4583,9 +4456,14 @@ void memmove_extent_buffer(const struct extent_buffer *dst, cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - copy_pages(dst->pages[dst_i], dst->pages[src_i], - dst_off_in_page - cur + 1, - src_off_in_page - cur + 1, cur); + + src_addr = page_address(dst->pages[src_i]) + src_off_in_page - + cur + 1; + use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, + cur); + + __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur, + use_memmove); dst_end -= cur; src_end -= cur; @@ -4747,7 +4625,8 @@ int try_release_extent_buffer(struct page *page) } /* - * btrfs_readahead_tree_block - attempt to readahead a child block + * Attempt to readahead a child block. + * * @fs_info: the fs_info * @bytenr: bytenr to read * @owner_root: objectid of the root that owns this eb @@ -4786,7 +4665,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, } /* - * btrfs_readahead_node_child - readahead a node's child block + * Readahead a node's child block. + * * @node: parent node we're reading from * @slot: slot in the parent node for the child we want to read * diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c5fae3a7d911..2171057a4477 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -40,7 +40,6 @@ enum { ENUM_BIT(PAGE_START_WRITEBACK), ENUM_BIT(PAGE_END_WRITEBACK), ENUM_BIT(PAGE_SET_ORDERED), - ENUM_BIT(PAGE_LOCK), }; /* @@ -81,19 +80,26 @@ struct extent_buffer { spinlock_t refs_lock; atomic_t refs; int read_mirror; - struct rcu_head rcu_head; - pid_t lock_owner; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; + struct rcu_head rcu_head; struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; + pid_t lock_owner; #endif }; +struct btrfs_eb_write_context { + struct writeback_control *wbc; + struct extent_buffer *eb; + /* Block group @eb resides in. Only used for zoned mode. */ + struct btrfs_block_group *zoned_bg; +}; + /* * Get the correct offset inside the page of extent buffer. 
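
memmove_extent_buffer() above keeps the forward path for dst < src and otherwise walks the range from the end, so each chunk is copied before later (lower) iterations can read the bytes it overwrote. A sketch of that backwards chunked copy over one flat buffer, with CHUNK standing in for the per-page limit:

#include <stddef.h>
#include <string.h>

#define CHUNK 4096UL

static void chunked_move(unsigned char *base, size_t dst, size_t src, size_t len)
{
        if (dst < src) {
                /* Forward copy is safe: writes trail the reads. */
                while (len) {
                        size_t cur = len < CHUNK ? len : CHUNK;

                        memmove(base + dst, base + src, cur);
                        dst += cur; src += cur; len -= cur;
                }
                return;
        }
        /* dst > src: copy the highest chunk first. memmove() handles any
         * overlap inside a chunk; the ordering handles overlap across
         * chunks. */
        while (len) {
                size_t cur = len < CHUNK ? len : CHUNK;

                len -= cur;
                memmove(base + dst + len, base + src + len, cur);
        }
}
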
* @@ -178,8 +184,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc); +void extent_write_locked_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc, + bool pages_dirty); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, @@ -236,11 +243,24 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dst, int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dst, unsigned long start, unsigned long len); -void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src); -void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, - const void *src); void write_extent_buffer(const struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); + +static inline void write_extent_buffer_chunk_tree_uuid( + const struct extent_buffer *eb, const void *chunk_tree_uuid) +{ + write_extent_buffer(eb, chunk_tree_uuid, + offsetof(struct btrfs_header, chunk_tree_uuid), + BTRFS_FSID_SIZE); +} + +static inline void write_extent_buffer_fsid(const struct extent_buffer *eb, + const void *fsid) +{ + write_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE); +} + void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src); void copy_extent_buffer(const struct extent_buffer *dst, @@ -266,7 +286,6 @@ void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); -void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); @@ -277,8 +296,6 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); -void end_extent_writepage(struct page *page, int err, u64 start, u64 end); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 696bf695d8eb..45cae356e89b 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -194,7 +194,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -597,29 +597,37 @@ fail: * Each bit represents a sector. Thus caller should ensure @csum_buf passed * in is large enough to contain all csums. 
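
The two UUID writers above shrink to inline wrappers: offsetof() into the on-disk header plus the generic write_extent_buffer(). The shape of that refactor with a made-up header layout:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical on-disk header layout. */
struct disk_header {
        uint8_t csum[32];
        uint8_t fsid[16];
        uint64_t bytenr;
};

/* Generic raw writer, playing the role of write_extent_buffer(). */
static void raw_write(void *buf, const void *src, size_t start, size_t len)
{
        memcpy((uint8_t *)buf + start, src, len);
}

/* Field-specific helper reduced to offsetof() plus the generic writer. */
static inline void write_fsid(void *buf, const void *fsid)
{
        raw_write(buf, fsid, offsetof(struct disk_header, fsid),
                  sizeof(((struct disk_header *)0)->fsid));
}
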
*/ -int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap, - bool search_commit) +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 end, u8 *csum_buf, + unsigned long *csum_bitmap) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; - struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_csum_item *item; const u64 orig_start = start; + bool free_path = false; int ret; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + free_path = true; + } - if (search_commit) { - path->skip_locking = 1; - path->reada = READA_FORWARD; - path->search_commit_root = 1; + /* Check if we can reuse the previous path. */ + if (path->nodes[0]) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY && + key.offset <= start) + goto search_forward; + btrfs_release_path(path); } key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -656,6 +664,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, } } +search_forward: while (start <= end) { u64 csum_end; @@ -712,7 +721,8 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, } ret = 0; fail: - btrfs_free_path(path); + if (free_path) + btrfs_free_path(path); return ret; } @@ -801,11 +811,12 @@ blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) * This calls btrfs_truncate_item with the correct args based on the overlap, * and fixes up the key as required. */ -static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, +static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_key *key, u64 bytenr, u64 len) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *leaf; const u32 csum_size = fs_info->csum_size; u64 csum_end; @@ -826,7 +837,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(path, new_size, 1); + btrfs_truncate_item(trans, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -838,10 +849,10 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(path, new_size, 0); + btrfs_truncate_item(trans, path, new_size, 0); key->offset = end_byte; - btrfs_set_item_key_safe(fs_info, path, key); + btrfs_set_item_key_safe(trans, path, key); } else { BUG(); } @@ -984,7 +995,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; } else { - truncate_one_csum(fs_info, path, &key, bytenr, len); + truncate_one_csum(trans, path, &key, bytenr, len); if (key.offset < bytenr) break; } @@ -1192,7 +1203,7 @@ extend_csum: diff /= csum_size; diff *= csum_size; - btrfs_extend_item(path, diff); + btrfs_extend_item(trans, path, diff); ret = 0; goto csum; } @@ -1239,7 +1250,7 @@ found: ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); cond_resched(); diff 
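
btrfs_lookup_csums_bitmap() above now takes an optional caller-supplied path so a scrub loop can resume from the previous leaf position instead of re-descending the tree each time; a locally allocated path is freed before returning. The borrow-or-own resource pattern in isolation (all names hypothetical):

#include <stdlib.h>

struct path { int slot; };

/* If the caller passes a path it is reused and kept; otherwise one is
 * allocated locally and freed before returning. */
static int lookup(struct path *path /* may be NULL */)
{
        int free_path = 0;
        int ret = 0;

        if (!path) {
                path = calloc(1, sizeof(*path));
                if (!path)
                        return -1;
                free_path = 1;
        }

        /* ... search using @path, possibly resuming from its position ... */

        if (free_path)
                free(path);
        return ret;
}
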
--git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 4ec669b69008..04bd2d34efb1 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -57,9 +57,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); -int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap, - bool search_commit); +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 end, u8 *csum_buf, + unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fd03e689a6be..32611a4edd6b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -17,6 +17,7 @@ #include <linux/uio.h> #include <linux/iversion.h> #include <linux/fsverity.h> +#include <linux/iomap.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -368,12 +369,13 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - disk_bytenr, num_bytes, 0); + disk_bytenr, num_bytes, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, @@ -405,13 +407,13 @@ next_slot: memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = args->end; - btrfs_set_item_key_safe(fs_info, path, &new_key); + btrfs_set_item_key_safe(trans, path, &new_key); extent_offset += args->end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; @@ -431,7 +433,7 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) @@ -463,7 +465,8 @@ delete_extent_item: } else if (update_refs && disk_bytenr > 0) { btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - disk_bytenr, num_bytes, 0); + disk_bytenr, num_bytes, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, key.objectid, @@ -536,7 +539,8 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size); + btrfs_setup_item_for_insert(trans, root, path, &key, + args->extent_item_size); args->extent_inserted = true; } @@ -593,7 +597,6 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_path *path; @@ -664,7 +667,7 @@ again: ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(fs_info, path, &new_key); + btrfs_set_item_key_safe(trans, path, &new_key); fi = btrfs_item_ptr(leaf, 
path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -679,7 +682,7 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -698,7 +701,7 @@ again: trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(fs_info, path, &new_key); + btrfs_set_item_key_safe(trans, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -708,7 +711,7 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -742,10 +745,10 @@ again: btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, - num_bytes, 0); + num_bytes, 0, root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); @@ -771,7 +774,7 @@ again: other_start = end; other_end = 0; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, 0); + num_bytes, 0, root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, @@ -814,7 +817,7 @@ again: btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); @@ -823,7 +826,7 @@ again: btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { @@ -876,9 +879,9 @@ static int prepare_uptodate_page(struct inode *inode, return 0; } -static unsigned int get_prepare_fgp_flags(bool nowait) +static fgf_t get_prepare_fgp_flags(bool nowait) { - unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; if (nowait) fgp_flags |= FGP_NOWAIT; @@ -910,7 +913,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, int i; unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - unsigned int fgp_flags = get_prepare_fgp_flags(nowait); + fgf_t fgp_flags = get_prepare_fgp_flags(nowait); int err = 0; int faili; @@ -1108,17 +1111,19 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) static void update_time_for_write(struct inode *inode) { - struct timespec64 now; + struct timespec64 now, ts; if (IS_NOCMTIME(inode)) return; now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) - inode->i_mtime = now; + ts = inode_get_mtime(inode); + if (!timespec64_equal(&ts, &now)) + inode_set_mtime_to_ts(inode, now); - if (!timespec64_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; + ts = inode_get_ctime(inode); + if (!timespec64_equal(&ts, &now)) + inode_set_ctime_to_ts(inode, now); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); @@ -1466,8 +1471,13 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) 
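
update_time_for_write() above is converted to the inode_get_mtime()/inode_set_mtime_to_ts() accessor style while keeping the compare-before-store, so timestamps that already match the current time are not rewritten. A rough userspace analogue of that check:

#include <stdbool.h>
#include <time.h>

struct vnode { struct timespec mtime, ctime; };

static bool ts_equal(const struct timespec *a, const struct timespec *b)
{
        return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

/* Read through accessors, write only when the stamp actually changes. */
static void touch_for_write(struct vnode *n)
{
        struct timespec now;

        clock_gettime(CLOCK_REALTIME, &now);
        if (!ts_equal(&n->mtime, &now))
                n->mtime = now;
        if (!ts_equal(&n->ctime, &now))
                n->ctime = now;
}
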
if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; - /* If the write DIO is within EOF, use a shared lock */ - if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode)) + /* + * If the write DIO is within EOF, use a shared lock and also only if + * security bits will likely not be dropped by file_remove_privs() called + * from btrfs_write_check(). Either will need to be rechecked after the + * lock was acquired. + */ + if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) ilock_flags |= BTRFS_ILOCK_SHARED; relock: @@ -1475,6 +1485,13 @@ relock: if (err < 0) return err; + /* Shared lock cannot be used with security bits set. */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } + err = generic_write_checks(iocb, from); if (err <= 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); @@ -1733,7 +1750,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) struct btrfs_inode *inode = BTRFS_I(ctx->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (btrfs_inode_in_log(inode, fs_info->generation) && + if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) && list_empty(&ctx->ordered_extents)) return true; @@ -1744,7 +1761,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) * and for a fast fsync we don't wait for that, we only wait for the * writeback to complete. */ - if (inode->last_trans <= fs_info->last_trans_committed && + if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) && (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || list_empty(&ctx->ordered_extents))) return true; @@ -1873,7 +1890,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); - smp_mb(); if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were @@ -2091,7 +2107,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } @@ -2099,7 +2115,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, u64 num_bytes; key.offset = offset; - btrfs_set_item_key_safe(fs_info, path, &key); + btrfs_set_item_key_safe(trans, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -2108,7 +2124,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } btrfs_release_path(path); @@ -2260,7 +2276,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, @@ -2290,7 +2306,8 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, 
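
The btrfs_direct_write() hunk above takes the inode lock shared only when the fast-path conditions (write within EOF, IS_NOSEC()) look true, then re-evaluates them under the lock and falls back to the exclusive lock if they no longer hold. A compressed sketch of that optimistic pattern with a pthread rwlock; fast_path_ok() is a stand-in for those checks:

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t ilock = PTHREAD_RWLOCK_INITIALIZER;

/* Stand-in for "write within EOF && IS_NOSEC(inode)". */
static bool fast_path_ok(void)
{
        return true;
}

static void lock_inode_for_write(bool *shared)
{
        *shared = fast_path_ok();        /* unlocked guess */
again:
        if (*shared)
                pthread_rwlock_rdlock(&ilock);
        else
                pthread_rwlock_wrlock(&ilock);

        /* The world may have changed while we waited for the lock. */
        if (*shared && !fast_path_ok()) {
                pthread_rwlock_unlock(&ilock);
                *shared = false;
                goto again;
        }
}
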
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, extent_info->disk_offset, - extent_info->disk_len, 0); + extent_info->disk_len, 0, + root->root_key.objectid); ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, btrfs_ino(inode), ref_offset, 0, false); @@ -2459,12 +2476,11 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, */ inode_inc_iversion(&inode->vfs_inode); - if (!extent_info || extent_info->update_times) { - inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); - inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; - } + if (!extent_info || extent_info->update_times) + inode_set_mtime_to_ts(&inode->vfs_inode, + inode_set_ctime_current(&inode->vfs_inode)); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) break; @@ -2703,9 +2719,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -2721,18 +2736,17 @@ out_only_mutex: * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ - struct timespec64 now = current_time(inode); + struct timespec64 now = inode_set_ctime_current(inode); inode_inc_iversion(inode); - inode->i_mtime = now; - inode->i_ctime = now; + inode_set_mtime_to_ts(inode, now); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); } else { int ret2; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); if (!ret) ret = ret2; @@ -2796,10 +2810,10 @@ static int btrfs_fallocate_update_isize(struct inode *inode, if (IS_ERR(trans)) return PTR_ERR(trans); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); i_size_write(inode, end); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); return ret ? 
ret : ret2; @@ -3018,7 +3032,7 @@ static long btrfs_fallocate(struct file *file, int mode, struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; - struct list_head reserve_list; + LIST_HEAD(reserve_list); u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -3110,7 +3124,6 @@ static long btrfs_fallocate(struct file *file, int mode, btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); /* First, check if we exceed the qgroup limit */ - INIT_LIST_HEAD(&reserve_list); while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset); @@ -3179,7 +3192,7 @@ static long btrfs_fallocate(struct file *file, int mode, qgroup_reserved -= range->len; } else if (qgroup_reserved > 0) { btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, - range->start, range->len); + range->start, range->len, NULL); qgroup_reserved -= range->len; } list_del(&range->list); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 880800418075..6f93c9a2c3e3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -57,6 +57,11 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, bool update_stats); +static void btrfs_crc32c_final(u32 crc, u8 *result) +{ + put_unaligned_le32(~crc, result); +} + static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; @@ -195,7 +200,7 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -213,7 +218,7 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -354,7 +359,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, if (ret) goto fail; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); fail: if (locked) @@ -540,7 +545,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); @@ -562,7 +567,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) val = *tmp; io_ctl_map_page(io_ctl, 0); - crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, @@ -1185,7 +1190,7 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -1219,10 
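
The btrfs_crc32c_final() helper added above finishes a checksum by inverting the running CRC and storing it little-endian at a possibly unaligned address. The same two steps in portable C:

#include <stdint.h>
#include <string.h>

/* Store a 32-bit value little-endian at an arbitrary alignment. */
static void put_unaligned_le32(uint32_t val, void *p)
{
        uint8_t b[4] = {
                (uint8_t)val, (uint8_t)(val >> 8),
                (uint8_t)(val >> 16), (uint8_t)(val >> 24),
        };

        memcpy(p, b, sizeof(b));
}

/* CRC32C finalization: the on-disk form is the bitwise complement of the
 * running value, stored little-endian. */
static void crc32c_final(uint32_t crc, uint8_t *result)
{
        put_unaligned_le32(~crc, result);
}
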
+1224,9 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; while (start < block_group->start + block_group->length) { - ret = find_first_extent_bit(unpin, start, - &extent_start, &extent_end, - EXTENT_DIRTY, NULL); - if (ret) + if (!find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of our range */ @@ -1322,7 +1326,7 @@ out: "failed to write free space cache for block group %llu error %d", block_group->start, ret); } - btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_update_inode(trans, BTRFS_I(inode)); if (block_group) { /* the dirty list is protected by the dirty_bgs_lock */ @@ -1363,7 +1367,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, /* * Write out cached info to an inode. * - * @root: root the inode belongs to * @inode: freespace inode we are writing out * @ctl: free space cache we are going to write out * @block_group: block_group for this cache if it belongs to a block_group @@ -1374,7 +1377,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ -static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, +static int __btrfs_write_out_cache(struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, @@ -1507,7 +1510,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } - btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_update_inode(trans, BTRFS_I(inode)); if (must_iput) iput(inode); return ret; @@ -1533,8 +1536,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, - block_group, &block_group->io_ctl, trans); + ret = __btrfs_write_out_cache(inode, ctl, block_group, + &block_group->io_ctl, trans); if (ret) { btrfs_debug(fs_info, "failed to write free space cache for block group %llu error %d", @@ -2705,13 +2708,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); spin_lock(&ctl->tree_lock); - /* Count initial region as zone_unusable until it gets activated. */ if (!used) to_free = size; - else if (initial && - test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) && - (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) - to_free = 0; else if (initial) to_free = block_group->zone_capacity; else if (offset >= block_group->alloc_offset) @@ -2739,8 +2737,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, reclaimable_unusable = block_group->zone_unusable - (block_group->length - block_group->zone_capacity); /* All the region is now unusable. Mark it as unused and reclaim */ - if (block_group->zone_unusable == block_group->length && - block_group->alloc_offset) { + if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && reclaimable_unusable >= @@ -2944,7 +2941,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, btrfs_info(fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? 
"no" : "yes"); btrfs_info(fs_info, - "%d blocks of free space at or bigger than bytes is", count); + "%d free space entries at or bigger than %llu bytes", + count, bytes); } void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index f169378e2ca6..7b598b070700 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -89,7 +89,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -287,7 +287,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, flags |= BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); if (extent_count != expected_extent_count) { @@ -324,7 +324,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, bitmap_cursor, ptr, data_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); i += extent_size; @@ -430,7 +430,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; @@ -495,7 +495,7 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, extent_count += new_extents; btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && @@ -533,7 +533,8 @@ int free_space_test_bit(struct btrfs_block_group *block_group, return !!extent_buffer_test_bit(leaf, ptr, i); } -static void free_space_set_bits(struct btrfs_block_group *block_group, +static void free_space_set_bits(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 *start, u64 *size, int bit) { @@ -563,7 +564,7 @@ static void free_space_set_bits(struct btrfs_block_group *block_group, extent_buffer_bitmap_set(leaf, ptr, first, last - first); else extent_buffer_bitmap_clear(leaf, ptr, first, last - first); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); *size -= end - *start; *start = end; @@ -656,7 +657,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, cur_start = start; cur_size = size; while (1) { - free_space_set_bits(block_group, path, &cur_start, &cur_size, + free_space_set_bits(trans, block_group, path, &cur_start, &cur_size, !remove); if (cur_size == 0) break; @@ -1517,8 +1518,10 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } else if (prev_bit == 1 && bit == 0) { u64 space_added; - ret = add_new_free_space(block_group, extent_start, - offset, &space_added); + ret = btrfs_add_new_free_space(block_group, + extent_start, + offset, + &space_added); if (ret) goto out; 
total_found += space_added; @@ -1533,7 +1536,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } } if (prev_bit == 1) { - ret = add_new_free_space(block_group, extent_start, end, NULL); + ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL); if (ret) goto out; extent_count++; @@ -1590,8 +1593,9 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - ret = add_new_free_space(block_group, key.objectid, - key.objectid + key.offset, &space_added); + ret = btrfs_add_new_free_space(block_group, key.objectid, + key.objectid + key.offset, + &space_added); if (ret) goto out; total_found += space_added; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 203d2a267828..318df6f9d9cb 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -46,8 +46,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); * Runtime (in-memory) states of filesystem */ enum { - /* Global indicator of serious filesystem errors */ - BTRFS_FS_STATE_ERROR, /* * Filesystem is being remounted, allow to skip some operations, like * defrag @@ -141,6 +139,12 @@ enum { */ BTRFS_FS_FEATURE_CHANGED, + /* + * Indicate that we have found a tree block which is only aligned to + * sectorsize, but not to nodesize. This should be rare nowadays. + */ + BTRFS_FS_UNALIGNED_TREE_BLOCK, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -173,19 +177,17 @@ enum { BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), - BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), - BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), - BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), - BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), - BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), - BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), - BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), - BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), - BTRFS_MOUNT_REF_VERIFY = (1UL << 27), - BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), - BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), - BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), - BTRFS_MOUNT_NODISCARD = (1UL << 31), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 19), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 20), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 21), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 22), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 23), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 24), + BTRFS_MOUNT_REF_VERIFY = (1UL << 25), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 26), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28), + BTRFS_MOUNT_NODISCARD = (1UL << 29), }; /* @@ -218,7 +220,8 @@ enum { BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) #ifdef CONFIG_BTRFS_DEBUG /* @@ -227,6 +230,7 @@ enum { */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) #else @@ -371,6 +375,7 @@ struct btrfs_fs_info { struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; struct btrfs_root *block_group_root; + struct btrfs_root *stripe_root; /* The log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -411,7 
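
The load_free_space_bitmaps() hunks above recover free-space extents from a bitmap by watching 0-to-1 and 1-to-0 transitions, with a final flush when the map ends inside a run. The run-extraction loop on a plain byte array:

#include <stdint.h>
#include <stdio.h>

static int test_bit(const uint8_t *map, unsigned long nr)
{
        return (map[nr / 8] >> (nr % 8)) & 1;
}

/* Walk the bitmap and report every run of set bits as [start, end). */
static void for_each_set_run(const uint8_t *map, unsigned long nbits)
{
        int prev_bit = 0;
        unsigned long run_start = 0;

        for (unsigned long i = 0; i < nbits; i++) {
                int bit = test_bit(map, i);

                if (prev_bit == 0 && bit == 1)
                        run_start = i;                       /* run begins */
                else if (prev_bit == 1 && bit == 0)
                        printf("run [%lu, %lu)\n", run_start, i);
                prev_bit = bit;
        }
        if (prev_bit == 1)                                   /* final flush */
                printf("run [%lu, %lu)\n", run_start, nbits);
}
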
+416,17 @@ struct btrfs_fs_info { struct btrfs_block_rsv empty_block_rsv; + /* + * Updated while holding the lock 'trans_lock'. Due to the life cycle of + * a transaction, it can be directly read while holding a transaction + * handle, everywhere else must be read with btrfs_get_fs_generation(). + * Should always be updated using btrfs_set_fs_generation(). + */ u64 generation; + /* + * Always use btrfs_get_last_trans_committed() and + * btrfs_set_last_trans_committed() to read and update this field. + */ u64 last_trans_committed; /* * Generation of the last transaction used for block group relocation @@ -647,9 +662,6 @@ struct btrfs_fs_info { struct btrfs_discard_ctl discard_ctl; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif /* Is qgroup tracking in a consistent state? */ u64 qgroup_flags; @@ -685,6 +697,13 @@ struct btrfs_fs_info { /* Protected by qgroup_rescan_lock */ bool qgroup_rescan_running; u8 qgroup_drop_subtree_thres; + u64 qgroup_enable_gen; + + /* + * If this is not 0, then it indicates a serious filesystem error has + * happened and it contains that error (negative errno value). + */ + int fs_error; /* Filesystem state */ unsigned long fs_state; @@ -766,6 +785,9 @@ struct btrfs_fs_info { u64 data_reloc_bg; struct mutex zoned_data_reloc_io_lock; + struct btrfs_block_group *active_meta_bg; + struct btrfs_block_group *active_system_bg; + u64 nr_global_roots; spinlock_t zone_active_bgs_lock; @@ -805,6 +827,26 @@ struct btrfs_fs_info { #endif }; +static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->generation); +} + +static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen) +{ + WRITE_ONCE(fs_info->generation, gen); +} + +static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_trans_committed); +} + +static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen) +{ + WRITE_ONCE(fs_info->last_trans_committed, gen); +} + static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, u64 gen) { @@ -962,8 +1004,8 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); } -#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ - &(fs_info)->fs_state))) +#define BTRFS_FS_ERROR(fs_info) (READ_ONCE((fs_info)->fs_error)) + #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 4c322b720a80..7d734830e514 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -167,7 +167,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + del_len, item_size - (ptr + del_len - item_start)); - btrfs_truncate_item(path, item_size - del_len, 1); + btrfs_truncate_item(trans, path, item_size - del_len, 1); out: btrfs_free_path(path); @@ -229,7 +229,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(path, item_size - sub_item_len, 1); + btrfs_truncate_item(trans, path, item_size - sub_item_len, 1); out: btrfs_free_path(path); @@ -247,7 +247,7 @@ out: } /* - * btrfs_insert_inode_extref() - 
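
The btrfs_get_fs_generation()/btrfs_set_fs_generation() helpers above wrap the field in READ_ONCE()/WRITE_ONCE() so readers outside trans_lock get a tear-free value. Roughly the same contract expressed with C11 relaxed atomics:

#include <stdatomic.h>
#include <stdint.h>

struct fs_info { _Atomic uint64_t generation; };

/* Lockless readers must go through the accessor; writers hold the
 * transaction lock but still publish through an atomic store so the
 * load/store pair cannot tear. */
static inline uint64_t get_generation(const struct fs_info *fs)
{
        return atomic_load_explicit((_Atomic uint64_t *)&fs->generation,
                                    memory_order_relaxed);
}

static inline void set_generation(struct fs_info *fs, uint64_t gen)
{
        atomic_store_explicit(&fs->generation, gen, memory_order_relaxed);
}
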
Inserts an extended inode ref into a tree. + * Insert an extended inode ref into a tree. * * The caller must have checked against BTRFS_LINK_MAX already. */ @@ -282,7 +282,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, name)) goto out; - btrfs_extend_item(path, ins_len); + btrfs_extend_item(trans, path, ins_len); ret = 0; } if (ret < 0) @@ -299,7 +299,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); @@ -338,7 +338,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size(path->nodes[0], path->slots[0]); - btrfs_extend_item(path, ins_len); + btrfs_extend_item(trans, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); @@ -364,7 +364,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); } write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); @@ -591,7 +591,7 @@ search_again: num_dec = (orig_num_bytes - extent_num_bytes); if (extent_start != 0) control->sub_bytes += num_dec; - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } else { extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); @@ -617,7 +617,7 @@ search_again: btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(path, size, 1); + btrfs_truncate_item(trans, path, size, 1); } else if (!del_item) { /* * We have to bail so the last_size is set to @@ -676,7 +676,8 @@ delete: bytes_deleted += extent_num_bytes; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - extent_start, extent_num_bytes, 0); + extent_start, extent_num_bytes, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), control->ino, extent_offset, root->root_key.objectid, false); diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index ede43b6c6559..4337bb26f419 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -4,6 +4,7 @@ #define BTRFS_INODE_ITEM_H #include <linux/types.h> +#include <linux/crc32c.h> struct btrfs_trans_handle; struct btrfs_root; @@ -12,6 +13,7 @@ struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; struct extent_buffer; +struct fscrypt_str; /* * Return this if we need to call truncate_block for the last bit of the @@ -76,6 +78,12 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, *ro_flags = (u32)(inode_item_flags >> 32); } +/* Figure the key offset of an extended inode ref. 
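
btrfs_insert_inode_extref() above grows an existing item with btrfs_extend_item() and writes the name directly behind the fixed fields, i.e. an extref is a small header followed by a variable-length name. A userspace sketch of building such a record with a flexible array member (field widths are illustrative):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct inode_extref {
        uint64_t parent_objectid;
        uint64_t index;
        uint16_t name_len;
        uint8_t name[];          /* variable-length tail */
};

/* Build an extref record in one allocation: fixed header plus name. */
static struct inode_extref *make_extref(uint64_t parent, uint64_t index,
                                        const char *name, uint16_t len)
{
        struct inode_extref *ref = malloc(sizeof(*ref) + len);

        if (!ref)
                return NULL;
        ref->parent_objectid = parent;
        ref->index = index;
        ref->name_len = len;
        memcpy(ref->name, name, len);
        return ref;
}
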
*/ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) +{ + return (u64)crc32c(parent_objectid, name, len); +} + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aa090b0b5d29..fb3c3f43c3fa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,6 +71,7 @@ #include "super.h" #include "orphan.h" #include "backref.h" +#include "raid-stripe-tree.h" struct btrfs_iget_args { u64 ino; @@ -124,11 +125,11 @@ static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); -static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock, - u64 *done_offset); + +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -348,7 +349,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, } /* - * btrfs_inode_lock - lock inode i_rwsem based on arguments passed + * Lock inode i_rwsem based on arguments passed. * * ilock_flags can have the following bit set: * @@ -382,7 +383,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) } /* - * btrfs_inode_unlock - unock inode i_rwsem + * Unlock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. @@ -423,11 +424,10 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, while (index <= end_index) { /* - * For locked page, we will call end_extent_writepage() on it - * in run_delalloc_range() for the error handling. That - * end_extent_writepage() function will call - * btrfs_mark_ordered_io_finished() to clear page Ordered and - * run the ordered extent accounting. + * For locked page, we will call btrfs_mark_ordered_io_finished() + * on it in run_delalloc_range() for the error handling, which + * will clear page Ordered and run the ordered extent + * accounting. * * Here we can't just clear the Ordered bit, or * btrfs_mark_ordered_io_finished() would skip the accounting @@ -574,7 +574,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, kunmap_local(kaddr); put_page(page); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -671,7 +671,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, } btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -688,7 +688,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here.
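
btrfs_extref_hash(), defined at the top of this hunk, keys an extended ref by crc32c over the name seeded with the parent inode number, so the key offset can be recomputed from (parent, name) alone. A bit-at-a-time CRC32C sketch of that computation; the kernel of course uses its optimized crc32c() library routine:

#include <stddef.h>
#include <stdint.h>

/* Bitwise CRC32C (Castagnoli, reflected polynomial 0x82f63b78). */
static uint32_t crc32c(uint32_t crc, const void *data, size_t len)
{
        const uint8_t *p = data;

        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78u : 0);
        }
        return crc;
}

/* Key offset of an extended inode ref: hash of the name, seeded with the
 * parent inode number so equal names under different parents land on
 * different offsets. */
static uint64_t extref_hash(uint64_t parent_objectid, const char *name, int len)
{
        return (uint64_t)crc32c((uint32_t)parent_objectid, name, (size_t)len);
}
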
*/ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -815,24 +815,22 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, } /* - * we create compressed extents in two phases. The first - * phase compresses a range of pages that have already been - * locked (both pages and state bits are locked). + * Work queue callback to start compression on a file and pages. * - * This is done inside an ordered work queue, and the compression - * is spread across many cpus. The actual IO submission is step - * two, and the ordered work queue takes care of making sure that - * happens in the same order things were put onto the queue by - * writepages and friends. + * This is done inside an ordered work queue, and the compression is spread + * across many cpus. The actual IO submission is step two, and the ordered work + * queue takes care of making sure that happens in the same order things were + * put onto the queue by writepages and friends. * - * If this code finds it can't get good compression, it puts an - * entry onto the work queue to write the uncompressed bytes. This - * makes sure that both compressed inodes and uncompressed inodes - * are written in the same order that the flusher thread sent them - * down. + * If this code finds it can't get good compression, it puts an entry onto the + * work queue to write the uncompressed bytes. This makes sure that both + * compressed inodes and uncompressed inodes are written in the same order that + * the flusher thread sent them down. */ -static noinline int compress_file_range(struct async_chunk *async_chunk) +static void compress_file_range(struct btrfs_work *work) { + struct async_chunk *async_chunk = + container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -842,19 +840,24 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages = NULL; + struct page **pages; unsigned long nr_pages; unsigned long total_compressed = 0; unsigned long total_in = 0; + unsigned int poff; int i; - int will_compress; int compress_type = fs_info->compress_type; - int compressed_extents = 0; - int redirty = 0; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* + * We need to call clear_page_dirty_for_io on each page in the range. + * Otherwise applications with the file mmap'd can wander in and change + * the page contents while we are compressing them. + */ + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + + /* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size @@ -868,7 +871,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - will_compress = 0; + pages = NULL; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); @@ -912,78 +915,57 @@ again: ret = 0; /* - * we do compression for mount -o compress and when the - * inode has not been flagged as nocompress. This flag can - * change at any time if we discover bad compression ratios.
+ * We do compression for mount -o compress and when the inode has not + * been flagged as NOCOMPRESS. This flag can change at any time if we + * discover bad compression ratios. */ - if (inode_need_compress(inode, start, end)) { - WARN_ON(pages); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { - /* just bail out to the uncompressed code */ - nr_pages = 0; - goto cont; - } - - if (inode->defrag_compress) - compress_type = inode->defrag_compress; - else if (inode->prop_compress) - compress_type = inode->prop_compress; + if (!inode_need_compress(inode, start, end)) + goto cleanup_and_bail_uncompressed; + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) { /* - * we need to call clear_page_dirty_for_io on each - * page in the range. Otherwise applications with the file - * mmap'd can wander in and change the page contents while - * we are compressing them. - * - * If the compression fails for any reason, we set the pages - * dirty again later on. - * - * Note that the remaining part is redirtied, the start pointer - * has moved, the end is the original one. + * Memory allocation failure is not a fatal error, we can fall + * back to uncompressed code. */ - if (!redirty) { - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); - redirty = 1; - } + goto cleanup_and_bail_uncompressed; + } - /* Compression level is applied here and only here */ - ret = btrfs_compress_pages( - compress_type | (fs_info->compress_level << 4), - mapping, start, - pages, - &nr_pages, - &total_in, - &total_compressed); + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; + + /* Compression level is applied here. */ + ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), + mapping, start, pages, &nr_pages, &total_in, + &total_compressed); + if (ret) + goto mark_incompressible; - if (!ret) { - unsigned long offset = offset_in_page(total_compressed); - struct page *page = pages[nr_pages - 1]; + /* + * Zero the tail end of the last page, as we might be sending it down + * to disk. + */ + poff = offset_in_page(total_compressed); + if (poff) + memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); - /* zero the tail end of the last page, we might be - * sending it down to disk - */ - if (offset) - memzero_page(page, offset, PAGE_SIZE - offset); - will_compress = 1; - } - } -cont: /* + * Try to create an inline extent. + * + * If we didn't compress the entire range, try to create an uncompressed + * inline extent, else a compressed one. + * * Check cow_file_range() for why we don't even try to create inline - * extent for subpage case. + * extent for the subpage case. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { - /* lets try to make an inline extent */ - if (ret || total_in < actual_end) { - /* we didn't compress the entire range, try - * to make an uncompressed inline extent. 
- */ - ret = cow_file_range_inline(inode, actual_end, - 0, BTRFS_COMPRESS_NONE, - NULL, false); + if (total_in < actual_end) { + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, + false); } else { - /* try making a compressed inline extent */ ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, @@ -1013,99 +995,52 @@ cont: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - - /* - * Ensure we only free the compressed pages if we have - * them allocated, as we can still reach here with - * inode_need_compress() == false. - */ - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); - } - return 0; + goto free_pages; } } - if (will_compress) { - /* - * we aren't doing an inline extent round the compressed size - * up to a block size boundary so the allocator does sane - * things - */ - total_compressed = ALIGN(total_compressed, blocksize); + /* + * We aren't doing an inline extent. Round the compressed size up to a + * block size boundary so the allocator does sane things. + */ + total_compressed = ALIGN(total_compressed, blocksize); - /* - * one last check to make sure the compression is really a - * win, compare the page count read with the blocks on disk, - * compression must free at least one sector size - */ - total_in = round_up(total_in, fs_info->sectorsize); - if (total_compressed + blocksize <= total_in) { - compressed_extents++; + /* + * One last check to make sure the compression is really a win, compare + * the page count read with the blocks on disk, compression must free at + * least one sector. + */ + total_in = round_up(total_in, fs_info->sectorsize); + if (total_compressed + blocksize > total_in) + goto mark_incompressible; - /* - * The async work queues will take care of doing actual - * allocation on disk for these compressed pages, and - * will submit them to the elevator. - */ - add_async_extent(async_chunk, start, total_in, - total_compressed, pages, nr_pages, - compress_type); - - if (start + total_in < end) { - start += total_in; - pages = NULL; - cond_resched(); - goto again; - } - return compressed_extents; - } + /* + * The async work queues will take care of doing actual allocation on + * disk for these compressed pages, and will submit the bios. + */ + add_async_extent(async_chunk, start, total_in, total_compressed, pages, + nr_pages, compress_type); + if (start + total_in < end) { + start += total_in; + cond_resched(); + goto again; } + return; + +mark_incompressible: + if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) + inode->flags |= BTRFS_INODE_NOCOMPRESS; +cleanup_and_bail_uncompressed: + add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); +free_pages: if (pages) { - /* - * the compression code ran but failed to make things smaller, - * free any pages it allocated and our page pointer array - */ for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } kfree(pages); - pages = NULL; - total_compressed = 0; - nr_pages = 0; - - /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(inode->prop_compress)) { - inode->flags |= BTRFS_INODE_NOCOMPRESS; - } - } -cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in the file - * we've been given so far. 
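
The restructured flow above keeps the old acceptance test: after rounding both sizes to block granularity, the compressed copy must save at least one full block, otherwise the range goes down uncompressed and the inode may be flagged NOCOMPRESS. A small model of that arithmetic, assuming block size equals sector size as it does for btrfs data:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

/* Compression is only a win if it frees at least one whole block. */
static bool compression_wins(uint64_t total_in, uint64_t total_compressed,
                             uint32_t blocksize)
{
        total_compressed = ALIGN_UP(total_compressed, blocksize);
        total_in = ALIGN_UP(total_in, blocksize);
        return total_compressed + blocksize <= total_in;
}

int main(void)
{
        /* 8 KiB shrunk to 5 KiB still rounds to 8 KiB on disk: nothing saved. */
        printf("%d\n", compression_wins(8192, 5120, 4096));    /* 0 */
        /* 8 KiB shrunk to 3 KiB occupies one 4 KiB block: one block saved. */
        printf("%d\n", compression_wins(8192, 3072, 4096));    /* 1 */
        return 0;
}
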
redirty the locked page if it corresponds - * to our extent and set things up for the async work queue to run - * cow_file_range to do the normal delalloc dance. - */ - if (async_chunk->locked_page && - (page_offset(async_chunk->locked_page) >= start && - page_offset(async_chunk->locked_page)) <= end) { - __set_page_dirty_nobuffers(async_chunk->locked_page); - /* unlocked later on in the async handlers */ } - - if (redirty) - extent_range_redirty_for_io(&inode->vfs_inode, start, end); - add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); - compressed_extents++; - - return compressed_extents; } static void free_async_extent_pages(struct async_extent *async_extent) @@ -1124,14 +1059,12 @@ static void free_async_extent_pages(struct async_extent *async_extent) async_extent->pages = NULL; } -static int submit_uncompressed_range(struct btrfs_inode *inode, - struct async_extent *async_extent, - struct page *locked_page) +static void submit_uncompressed_range(struct btrfs_inode *inode, + struct async_extent *async_extent, + struct page *locked_page) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; - unsigned long nr_written = 0; - int page_started = 0; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -1140,45 +1073,30 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, .no_cgroup_owner = 1, }; - /* - * Call cow_file_range() to run the delalloc range directly, since we - * won't go to NOCOW or async path again. - * - * Also we call cow_file_range() with @unlock_page == 0, so that we - * can directly submit them without interruption. - */ - ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0, NULL); - /* Inline extent inserted, page gets unlocked and everything is done */ - if (page_started) - return 0; - + wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); + ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + wbc_detach_inode(&wbc); if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); if (locked_page) { const u64 page_start = page_offset(locked_page); - const u64 page_end = page_start + PAGE_SIZE - 1; set_page_writeback(locked_page); end_page_writeback(locked_page); - end_extent_writepage(locked_page, ret, page_start, page_end); + btrfs_mark_ordered_io_finished(inode, locked_page, + page_start, PAGE_SIZE, + !ret); + mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } - return ret; } - - /* All pages will be unlocked, including @locked_page */ - wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); - wbc_detach_inode(&wbc); - return ret; } -static int submit_one_async_extent(struct btrfs_inode *inode, - struct async_chunk *async_chunk, - struct async_extent *async_extent, - u64 *alloc_hint) +static void submit_one_async_extent(struct async_chunk *async_chunk, + struct async_extent *async_extent, + u64 *alloc_hint) { + struct btrfs_inode *inode = async_chunk->inode; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1206,9 +1124,8 @@ static int submit_one_async_extent(struct btrfs_inode *inode, } lock_extent(io_tree, start, end, NULL); - /* We have fall back to uncompressed write */ - if (!async_extent->pages) { - ret = submit_uncompressed_range(inode, async_extent, locked_page); + if 
(async_extent->compress_type == BTRFS_COMPRESS_NONE) { + submit_uncompressed_range(inode, async_extent, locked_page); goto done; } @@ -1217,7 +1134,6 @@ ret = btrfs_reserve_extent(root, async_extent->compressed_size, async_extent->compressed_size, 0, *alloc_hint, &ins, 1, 1); if (ret) { - free_async_extent_pages(async_extent); /* * Here we used to try again by going back to non-compressed * path for ENOSPC. But we can't reserve space even for @@ -1272,7 +1188,7 @@ done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); kfree(async_extent); - return ret; + return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1286,39 +1202,13 @@ out_free: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); - goto done; -} - -/* - * Phase two of compressed writeback. This is the ordered portion of the code, - * which only gets called in the order the work was queued. We walk all the - * async extents created by compress_file_range and send them down to the disk. - */ -static noinline void submit_compressed_extents(struct async_chunk *async_chunk) -{ - struct btrfs_inode *inode = async_chunk->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_extent *async_extent; - u64 alloc_hint = 0; - int ret = 0; - - while (!list_empty(&async_chunk->extents)) { - u64 extent_start; - u64 ram_size; - - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); - list_del(&async_extent->list); - extent_start = async_extent->start; - ram_size = async_extent->ram_size; - - ret = submit_one_async_extent(inode, async_chunk, async_extent, - &alloc_hint); - btrfs_debug(fs_info, + if (async_chunk->blkcg_css) + kthread_associate_blkcg(NULL); + btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", - inode->root->root_key.objectid, - btrfs_ino(inode), extent_start, ram_size, ret); - } + root->root_key.objectid, btrfs_ino(inode), start, + async_extent->ram_size, ret); + kfree(async_extent); } static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, @@ -1362,25 +1252,18 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * locked_page is the page that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * *page_started is set to one if we unlock locked_page and do everything - * required to start IO on it. It may be clean and already done with - * IO when we return. - * - * When unlock == 1, we unlock the pages in successfully allocated regions. - * When unlock == 0, we leave them locked for writing them out. + * When this function fails, it unlocks all pages except @locked_page. * - * However, we unlock all the pages except @locked_page in case of failure. + * When this function successfully creates an inline extent, it returns 1 and + * unlocks all pages including locked_page and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_page is + * the only page handled anyway). - * - * In summary, page locking state will be as follow: + * When this function succeeds and creates a normal extent, the page locking + * status depends on the passed-in flags: * - * - page_started == 1 (return value) - * - All the pages are unlocked. IO is started.
- * - Note that this can happen only on success - * - unlock == 1 - * - All the pages except @locked_page are unlocked in any case - * - unlock == 0 - * - On success, all the pages are locked for writing out them - * - On failure, all the pages except @locked_page are unlocked + * - If @keep_locked is set, all pages are kept locked. + * - Else all pages except for @locked_page are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1389,10 +1272,9 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock, - u64 *done_offset) + struct page *locked_page, u64 start, u64 end, + u64 *done_offset, + bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1431,7 +1313,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * This means we can trigger inline extent even if we didn't want to. * So here we skip inline extent creation completely. */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { + if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); @@ -1451,9 +1333,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - *nr_written = *nr_written + - (end - start + PAGE_SIZE) / PAGE_SIZE; - *page_started = 1; /* * locked_page is locked by the caller of * writepage_delalloc(), not locked by @@ -1463,11 +1342,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * as it doesn't have any subpage::writers recorded. * * Here we manually unlock the page, since the caller - * can't use page_started to determine if it's an - * inline extent or a compressed extent. + * can't determine if it's an inline extent or a + * compressed extent. */ unlock_page(locked_page); - goto out; + ret = 1; + goto done; } else if (ret < 0) { goto out_unlock; } @@ -1498,6 +1378,31 @@ static noinline int cow_file_range(struct btrfs_inode *inode, ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + /* + * btrfs_reserve_extent only returns -EAGAIN for zoned + * file systems, which is an indication that there are + * no active zones to allocate from at the moment. + * + * If this is the first loop iteration, wait for at + * least one zone to finish before retrying the + * allocation. Otherwise ask the caller to write out + * the already allocated blocks before coming back to + * us, or return -ENOSPC if it can't handle retries. + */ + ASSERT(btrfs_is_zoned(fs_info)); + if (start == orig_start) { + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + continue; + } + if (done_offset) { + *done_offset = start - 1; + return 0; + } + ret = -ENOSPC; + } if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; @@ -1558,7 +1463,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Do set the Ordered (Private2) bit so we know this page was * properly setup for writepage. */ - page_ops = unlock ? PAGE_UNLOCK : 0; + page_ops = (keep_locked ? 
0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, @@ -1581,7 +1486,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } -out: +done: + if (done_offset) + *done_offset = end; return ret; out_drop_extent_cache: @@ -1591,21 +1498,6 @@ out_reserve: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: /* - * If done_offset is non-NULL and ret == -EAGAIN, we expect the - * caller to write out the successfully allocated region and retry. - */ - if (done_offset && ret == -EAGAIN) { - if (orig_start < start) - *done_offset = start - 1; - else - *done_offset = start; - return ret; - } else if (ret == -EAGAIN) { - /* Convert to -ENOSPC since the caller cannot retry. */ - ret = -ENOSPC; - } - - /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| @@ -1627,10 +1519,10 @@ out_unlock: * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup * function. * - * However, in case of unlock == 0, we still need to unlock the pages + * However, in case of @keep_locked, we still need to unlock the pages * (except @locked_page) to ensure all the pages are unlocked. */ - if (!unlock && orig_start < start) { + if (keep_locked && orig_start < start) { if (!locked_page) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, @@ -1671,43 +1563,46 @@ out_unlock: } /* - * work queue call back to started compression on a file and pages - */ -static noinline void async_cow_start(struct btrfs_work *work) -{ - struct async_chunk *async_chunk; - int compressed_extents; - - async_chunk = container_of(work, struct async_chunk, work); - - compressed_extents = compress_file_range(async_chunk); - if (compressed_extents == 0) { - btrfs_add_delayed_iput(async_chunk->inode); - async_chunk->inode = NULL; - } -} - -/* - * work queue call back to submit previously compressed pages + * Phase two of compressed writeback. This is the ordered portion of the code, + * which only gets called in the order the work was queued. We walk all the + * async extents created by compress_file_range and send them down to the disk. + * + * If called with @do_free == true then it'll try to finish the work and free + * the work struct eventually. 
*/ -static noinline void async_cow_submit(struct btrfs_work *work) +static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); + struct async_extent *async_extent; unsigned long nr_pages; + u64 alloc_hint = 0; + + if (do_free) { + struct async_chunk *async_chunk; + struct async_cow *async_cow; + + async_chunk = container_of(work, struct async_chunk, work); + btrfs_add_delayed_iput(async_chunk->inode); + if (async_chunk->blkcg_css) + css_put(async_chunk->blkcg_css); + + async_cow = async_chunk->async_cow; + if (atomic_dec_and_test(&async_cow->num_chunks)) + kvfree(async_cow); + return; + } nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; - /* - * ->inode could be NULL if async_chunk_start has failed to compress, - * in which case we don't have anything to submit, yet we need to - * always adjust ->async_delalloc_pages as its paired with the init - * happening in run_delalloc_compressed - */ - if (async_chunk->inode) - submit_compressed_extents(async_chunk); + while (!list_empty(&async_chunk->extents)) { + async_extent = list_entry(async_chunk->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + submit_one_async_extent(async_chunk, async_extent, &alloc_hint); + } /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < @@ -1715,27 +1610,9 @@ static noinline void async_cow_submit(struct btrfs_work *work) cond_wake_up_nomb(&fs_info->async_submit_wait); } -static noinline void async_cow_free(struct btrfs_work *work) -{ - struct async_chunk *async_chunk; - struct async_cow *async_cow; - - async_chunk = container_of(work, struct async_chunk, work); - if (async_chunk->inode) - btrfs_add_delayed_iput(async_chunk->inode); - if (async_chunk->blkcg_css) - css_put(async_chunk->blkcg_css); - - async_cow = async_chunk->async_cow; - if (atomic_dec_and_test(&async_cow->num_chunks)) - kvfree(async_cow); -} - static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct writeback_control *wbc, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); @@ -1809,65 +1686,42 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, async_chunk[i].blkcg_css = NULL; } - btrfs_init_work(&async_chunk[i].work, async_cow_start, - async_cow_submit, async_cow_free); + btrfs_init_work(&async_chunk[i].work, compress_file_range, + submit_compressed_extents); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); - *nr_written += nr_pages; start = cur_end + 1; } - *page_started = 1; return true; } -static noinline int run_delalloc_zoned(struct btrfs_inode *inode, - struct page *locked_page, u64 start, - u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc) +/* + * Run the delalloc range from start to end, and write back any dirty pages + * covered by the range. 
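
The do_free teardown merged into submit_compressed_extents() above relies on a last-one-frees rule: all chunks of one write share a single struct async_cow, and whichever chunk finishes last releases it. A reduced C11 sketch of that lifetime rule (structure and function names are stand-ins):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct async_cow {
        atomic_int num_chunks;
        /* the chunk array follows this header in the real structure */
};

static void chunk_done(struct async_cow *cow)
{
        /* fetch_sub returns the old value; old == 1 means we were last. */
        if (atomic_fetch_sub(&cow->num_chunks, 1) == 1) {
                printf("last chunk done, freeing shared state\n");
                free(cow);
        }
}

int main(void)
{
        struct async_cow *cow = malloc(sizeof(*cow));

        atomic_init(&cow->num_chunks, 3);
        chunk_done(cow);
        chunk_done(cow);
        chunk_done(cow);        /* frees here */
        return 0;
}
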
+ */ +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty) { u64 done_offset = end; int ret; - bool locked_page_done = false; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0, &done_offset); - if (ret && ret != -EAGAIN) + ret = cow_file_range(inode, locked_page, start, end, &done_offset, + true, false); + if (ret) return ret; - - if (*page_started) { - ASSERT(ret == 0); - return 0; - } - - if (ret == 0) - done_offset = end; - - if (done_offset == start) { - wait_on_bit_io(&inode->root->fs_info->flags, - BTRFS_FS_NEED_ZONE_FINISH, - TASK_UNINTERRUPTIBLE); - continue; - } - - if (!locked_page_done) { - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - } - locked_page_done = true; - extent_write_locked_range(&inode->vfs_inode, start, done_offset, - wbc); + extent_write_locked_range(&inode->vfs_inode, locked_page, start, + done_offset, wbc, pages_dirty); start = done_offset + 1; } - *page_started = 1; - - return 0; + return 1; } static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, @@ -1894,8 +1748,7 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, } static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end, - int *page_started, unsigned long *nr_written) + const u64 start, const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); @@ -1903,6 +1756,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; u64 count; + int ret; /* * If EXTENT_NORESERVE is set it means that when the buffered write was @@ -1955,8 +1809,14 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, NULL); } - return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1, NULL); + /* + * Don't try to create inline extents, as a mix of inline extent that + * is written out and unlocked directly and a normal NOCOW extent + * doesn't work. + */ + ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ASSERT(ret != 1); + return ret; } struct can_nocow_file_extent_args { @@ -2105,9 +1965,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end, - int *page_started, - unsigned long *nr_written) + const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -2117,25 +1975,26 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); - struct btrfs_block_group *bg; - bool nocow = false; struct can_nocow_file_extent_args nocow_args = { 0 }; + /* + * Normally on a zoned device we're only doing COW writes, but relocation + * on a zoned filesystem serializes I/O so that we're only writing + * sequentially and can end up here as well.
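
The run_delalloc_cow() loop above makes forward progress in chunks: each cow_file_range() call reports how far it got through done_offset, that much gets written back, and the loop resumes one byte past it. A toy model of the loop shape, where alloc_range() is a stand-in allocator with an artificial 1 MiB cap per pass (roughly what a filling zone would cause):

#include <stdint.h>
#include <stdio.h>

/* Stand-in allocator: pretends at most 1 MiB can be allocated per call. */
static int alloc_range(uint64_t start, uint64_t end, uint64_t *done_offset)
{
        uint64_t len = end - start + 1;

        if (len > (1 << 20))
                len = 1 << 20;
        *done_offset = start + len - 1;
        return 0;
}

int main(void)
{
        uint64_t start = 0, end = (3 << 20) - 1, done_offset;

        while (start <= end) {
                if (alloc_range(start, end, &done_offset))
                        return 1;
                /* here the real code calls extent_write_locked_range() */
                printf("write back %llu-%llu\n",
                       (unsigned long long)start,
                       (unsigned long long)done_offset);
                start = done_offset + 1;
        }
        return 0;
}
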
+ */ + ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); + path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, start, end, locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, PAGE_UNLOCK | - PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); - return -ENOMEM; + ret = -ENOMEM; + goto error; } nocow_args.end = end; nocow_args.writeback_path = true; while (1) { + struct btrfs_block_group *nocow_bg = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; @@ -2146,8 +2005,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int extent_type; bool is_prealloc; - nocow = false; - ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) @@ -2172,11 +2029,8 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (ret < 0) goto error; - } if (ret > 0) break; leaf = path->nodes[0]; @@ -2209,7 +2063,7 @@ next_slot: if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; - goto out_check; + goto must_cow; } /* @@ -2239,24 +2093,22 @@ next_slot: nocow_args.start = cur_offset; ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (ret < 0) goto error; - } else if (ret == 0) { - goto out_check; - } + if (ret == 0) + goto must_cow; ret = 0; - bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); - if (bg) - nocow = true; -out_check: - /* - * If nocow is false then record the beginning of the range - * that needs to be COWed - */ - if (!nocow) { + nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (!nocow_bg) { +must_cow: + /* + * If we can't perform NOCOW writeback for the range, + * then record the beginning of the range that needs to + * be COWed. It will be written out before the next + * NOCOW range if we find one, or when exiting this + * loop. + */ if (cow_start == (u64)-1) cow_start = cur_offset; cur_offset = extent_end; @@ -2275,11 +2127,12 @@ out_check: */ if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1, - page_started, nr_written); - if (ret) - goto error; + cow_start, found_key.offset - 1); cow_start = (u64)-1; + if (ret) { + btrfs_dec_nocow_writers(nocow_bg); + goto error; + } } nocow_end = cur_offset + nocow_args.num_bytes - 1; @@ -2296,6 +2149,7 @@ out_check: ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { + btrfs_dec_nocow_writers(nocow_bg); ret = PTR_ERR(em); goto error; } @@ -2309,6 +2163,7 @@ out_check: ? 
(1 << BTRFS_ORDERED_PREALLOC) : (1 << BTRFS_ORDERED_NOCOW), BTRFS_COMPRESS_NONE); + btrfs_dec_nocow_writers(nocow_bg); if (IS_ERR(ordered)) { if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, @@ -2318,11 +2173,6 @@ out_check: goto error; } - if (nocow) { - btrfs_dec_nocow_writers(bg); - nocow = false; - } - if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent @@ -2357,17 +2207,24 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end, - page_started, nr_written); + ret = fallback_to_cow(inode, locked_page, cow_start, end); + cow_start = (u64)-1; if (ret) goto error; } -error: - if (nocow) - btrfs_dec_nocow_writers(bg); + btrfs_free_path(path); + return 0; - if (ret && cur_offset < end) +error: + /* + * If an error happened while a COW region is outstanding, cur_offset + * needs to be reset to cow_start to ensure the COW region is unlocked + * as well. + */ + if (cow_start != (u64)-1) + cur_offset = cow_start; + if (cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | @@ -2382,8 +2239,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && - test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, - 0, NULL)) + test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } @@ -2395,49 +2251,37 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * being touched for the first time. */ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, unsigned long *nr_written, - struct writeback_control *wbc) + u64 start, u64 end, struct writeback_control *wbc) { - int ret = 0; const bool zoned = btrfs_is_zoned(inode->root->fs_info); + int ret; /* - * The range must cover part of the @locked_page, or the returned - * @page_started can confuse the caller. + * The range must cover part of the @locked_page, or a return of 1 + * can confuse the caller. */ ASSERT(!(end <= page_offset(locked_page) || start >= page_offset(locked_page) + PAGE_SIZE)); if (should_nocow(inode, start, end)) { - /* - * Normally on a zoned device we're only doing COW writes, but - * in case of relocation on a zoned filesystem we have taken - * precaution, that we're only writing sequentially. It's safe - * to use run_delalloc_nocow() here, like for regular - * preallocated inodes. 
- */ - ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); - ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, nr_written); + ret = run_delalloc_nocow(inode, locked_page, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, wbc, locked_page, start, - end, page_started, nr_written)) - goto out; + run_delalloc_compressed(inode, locked_page, start, end, wbc)) + return 1; if (zoned) - ret = run_delalloc_zoned(inode, locked_page, start, end, - page_started, nr_written, wbc); + ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + true); else - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1, NULL); + ret = cow_file_range(inode, locked_page, start, end, NULL, + false, false); out: - ASSERT(ret <= 0); - if (ret) + if (ret < 0) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); return ret; @@ -2840,23 +2684,19 @@ struct btrfs_writepage_fixup { static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { - struct btrfs_writepage_fixup *fixup; + struct btrfs_writepage_fixup *fixup = + container_of(work, struct btrfs_writepage_fixup, work); struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page; - struct btrfs_inode *inode; - u64 page_start; - u64 page_end; + struct page *page = fixup->page; + struct btrfs_inode *inode = fixup->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 page_start = page_offset(page); + u64 page_end = page_offset(page) + PAGE_SIZE - 1; int ret = 0; bool free_delalloc_space = true; - fixup = container_of(work, struct btrfs_writepage_fixup, work); - page = fixup->page; - inode = fixup->inode; - page_start = page_offset(page); - page_end = page_offset(page) + PAGE_SIZE - 1; - /* * This is similar to page_mkwrite, we need to reserve the space before * we take the page lock. @@ -2949,10 +2789,11 @@ out_page: * to reflect the errors and clean the page. 
*/ mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); + btrfs_mark_ordered_io_finished(inode, page, page_start, + PAGE_SIZE, !ret); clear_page_dirty_for_io(page); } - btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@ -3009,7 +2850,7 @@ int btrfs_writepage_cow_fixup(struct page *page) ihold(inode); btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); get_page(page); - btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); fixup->page = page; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); @@ -3074,7 +2915,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -3232,7 +3073,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; @@ -3253,6 +3094,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; + ret = btrfs_insert_raid_extent(trans, ordered_extent); + if (ret) + goto out; + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { @@ -3298,7 +3143,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); - ret = btrfs_update_inode_fallback(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; @@ -3359,6 +3204,13 @@ out: btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, 1); + /* + * Actually free the qgroup rsv which was released when + * the ordered extent was created. + */ + btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + ordered_extent->qgroup_rsv, + BTRFS_QGROUP_RSV_DATA); } } @@ -3379,20 +3231,12 @@ out: int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && - !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && + list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); return btrfs_finish_one_ordered(ordered); } -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate) -{ - trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); -} - /* * Verify the checksum for a single sector without any extra actions that depend * on the type of I/O.
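
On the checksum comment that closes this hunk: btrfs stores one data checksum per sector, so verifying a multi-sector read is a walk over the buffer comparing each sector against its stored sum. A reduced sketch of that walk; csum_sector() is a trivial stand-in for the real algorithms (crc32c, xxhash, sha256, blake2b):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SECTORSIZE      4096u

/* Stand-in per-sector checksum, not one of the real algorithms. */
static uint32_t csum_sector(const uint8_t *data)
{
        uint32_t sum = 0;

        for (unsigned int i = 0; i < SECTORSIZE; i++)
                sum = sum * 31 + data[i];
        return sum;
}

/* One stored checksum per sector, so the verification walks sector by sector. */
static bool verify_range(const uint8_t *buf, unsigned int nr_sectors,
                         const uint32_t *expected)
{
        for (unsigned int i = 0; i < nr_sectors; i++) {
                if (csum_sector(buf + (size_t)i * SECTORSIZE) != expected[i])
                        return false;   /* the caller would report the offset */
        }
        return true;
}

int main(void)
{
        static uint8_t buf[2 * SECTORSIZE];
        uint32_t expected[2];

        memset(buf, 0xab, sizeof(buf));
        expected[0] = csum_sector(buf);
        expected[1] = csum_sector(buf + SECTORSIZE);
        printf("ok=%d\n", verify_range(buf, 2, expected));
        return 0;
}
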
@@ -3446,7 +3290,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, if (btrfs_is_data_reloc_root(inode->root) && test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, - 1, NULL)) { + NULL)) { /* Skip the range without csum for data reloc inode */ clear_extent_bits(&inode->io_tree, file_offset, end, EXTENT_NODATASUM); @@ -3470,7 +3314,7 @@ zeroit: } /* - * btrfs_add_delayed_iput - perform a delayed iput on @inode + * Perform a delayed iput on @inode. * * @inode: The inode we want to perform iput on * @@ -3662,9 +3506,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ if (found_key.offset == last_objectid) { + /* + * We found the same inode as before. This means we were + * not able to remove its items via eviction triggered + * by an iput(). A transaction abort may have happened, + * due to -ENOSPC for example, so try to grab the error + * that led to a transaction abort, if any. + */ btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup"); - ret = -EINVAL; + ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; goto out; } @@ -3911,19 +3762,17 @@ static int btrfs_read_locked_inode(struct inode *inode, btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, round_up(i_size_read(inode), fs_info->sectorsize)); - inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); + inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); + inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); + inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime.tv_sec = - btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime.tv_nsec = - btrfs_timespec_nsec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); @@ -3949,7 +3798,7 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in delayed_nodes_tree.
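
The new error fallback above, ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL, uses the GNU a ?: b extension: it evaluates to a when a is nonzero and to b otherwise, without evaluating a twice. With a side-effect-free operand it reduces to a plain conditional:

#include <stdio.h>

/* -EINVAL is -22 and -ENOSPC is -28 on Linux; plain ints keep this portable. */
static int pick_error(int fs_error)
{
        return fs_error ? fs_error : -22;
}

int main(void)
{
        printf("%d\n", pick_error(-28));        /* a prior abort wins: -28 */
        printf("%d\n", pick_error(0));          /* no recorded error: -22 */
        return 0;
}
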
*/ - if (BTRFS_I(inode)->last_trans == fs_info->generation) + if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -4079,24 +3928,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->otime, - BTRFS_I(inode)->i_otime.tv_sec); - btrfs_set_token_timespec_nsec(&token, &item->otime, - BTRFS_I(inode)->i_otime.tv_nsec); + btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); btrfs_set_token_inode_generation(&token, item, @@ -4114,8 +3961,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * copy everything in the in-memory inode into the btree. */ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode) + struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; struct btrfs_path *path; @@ -4126,7 +3972,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); + ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -4138,7 +3984,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: @@ -4149,10 +3995,10 @@ failed: /* * copy everything in the in-memory inode into the btree. 
*/ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode) +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -4168,23 +4014,23 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); - ret = btrfs_delayed_update_inode(trans, root, inode); + ret = btrfs_delayed_update_inode(trans, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); return ret; } - return btrfs_update_inode_item(trans, root, inode); + return btrfs_update_inode_item(trans, inode); } int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode) + struct btrfs_inode *inode) { int ret; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret == -ENOSPC) - return btrfs_update_inode_item(trans, root, inode); + return btrfs_update_inode_item(trans, inode); return ret; } @@ -4289,10 +4135,8 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; - dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; - ret = btrfs_update_inode(trans, root, dir); + inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); + ret = btrfs_update_inode(trans, dir); out: return ret; } @@ -4306,7 +4150,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, inode->root, inode); + ret = btrfs_update_inode(trans, inode); } return ret; } @@ -4464,9 +4308,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); inode_inc_iversion(&dir->vfs_inode); - dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode); - dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime; - ret = btrfs_update_inode_fallback(trans, root, dir); + inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); + ret = btrfs_update_inode_fallback(trans, dir); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -4800,7 +4643,8 @@ out_notrans: } /* - * btrfs_truncate_block - read, zero a chunk and write a block + * Read, zero a chunk and write a block. 
+ * * @inode - inode that we're zeroing * @from - the offset to start zeroing * @len - the length to zero, 0 to zero the entire range respective to the @@ -4950,9 +4794,9 @@ out: return ret; } -static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, - u64 offset, u64 len) +static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_drop_extents_args drop_args = { 0 }; @@ -4992,7 +4836,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, btrfs_abort_transaction(trans, ret); } else { btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); return ret; @@ -5048,8 +4892,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; - err = maybe_insert_hole(root, inode, cur_offset, - hole_size); + err = maybe_insert_hole(inode, cur_offset, hole_size); if (err) break; @@ -5075,7 +4918,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; hole_em->compress_type = BTRFS_COMPRESS_NONE; - hole_em->generation = fs_info->generation; + hole_em->generation = btrfs_get_fs_generation(fs_info); err = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); @@ -5115,8 +4958,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize != oldsize) { inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); } } @@ -5144,7 +4987,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, newsize); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { @@ -5289,7 +5132,7 @@ static void evict_inode_truncate_pages(struct inode *inode) */ if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, - end - start + 1); + end - start + 1, NULL); clear_extent_bit(io_tree, start, end, EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, @@ -5738,11 +5581,12 @@ struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root return btrfs_iget_path(s, ino, root, NULL); } -static struct inode *new_simple_dir(struct super_block *s, +static struct inode *new_simple_dir(struct inode *dir, struct btrfs_key *key, struct btrfs_root *root) { - struct inode *inode = new_inode(s); + struct timespec64 ts; + struct inode *inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -5760,10 +5604,15 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = current_time(inode); - inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + + ts = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, ts); + 
inode_set_atime_to_ts(inode, inode_get_atime(dir)); + BTRFS_I(inode)->i_otime_sec = ts.tv_sec; + BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + + inode->i_uid = dir->i_uid; + inode->i_gid = dir->i_gid; return inode; } @@ -5822,7 +5671,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, root); + inode = new_simple_dir(dir, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); btrfs_put_root(sub_root); @@ -5924,20 +5773,24 @@ out: static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) { - if (dir->index_cnt == (u64)-1) { - int ret; + int ret = 0; + btrfs_inode_lock(dir, 0); + if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) - return ret; + goto out; } } - *index = dir->index_cnt; + /* index_cnt is the index number of next new entry, so decrement it. */ + *index = dir->index_cnt - 1; +out: + btrfs_inode_unlock(dir, 0); - return 0; + return ret; } /* @@ -5972,6 +5825,19 @@ static int btrfs_opendir(struct inode *inode, struct file *file) return 0; } +static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct btrfs_file_private *private = file->private_data; + int ret; + + ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), + &private->last_index); + if (ret) + return ret; + + return generic_file_llseek(file, offset, whence); +} + struct dir_entry { u64 ino; u64 offset; @@ -6007,8 +5873,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_key found_key; struct btrfs_path *path; void *addr; - struct list_head ins_list; - struct list_head del_list; + LIST_HEAD(ins_list); + LIST_HEAD(del_list); int ret; char *name_ptr; int name_len; @@ -6027,8 +5893,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) addr = private->filldir_buf; path->reada = READA_FORWARD; - INIT_LIST_HEAD(&ins_list); - INIT_LIST_HEAD(&del_list); put = btrfs_readdir_get_delayed_items(inode, private->last_index, &ins_list, &del_list); @@ -6144,15 +6008,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); - if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { + ret = btrfs_update_inode(trans, inode); + if (ret == -ENOSPC || ret == -EDQUOT) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); if (inode->delayed_node) @@ -6165,23 +6029,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. 
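
These hunks are part of the tree-wide switch to VFS timestamp accessors: callers stop reading and writing i_atime/i_mtime/i_ctime fields directly and go through inode_set_*()/inode_get_*() helpers, which is also what lets btrfs split its otime into plain i_otime_sec/i_otime_nsec fields. A reduced model of the accessor idea (demo_inode and its helpers are stand-ins, not kernel API):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* The storage can change shape (here: split sec/nsec) behind the helpers. */
struct demo_inode {
        int64_t otime_sec;
        uint32_t otime_nsec;
};

static void inode_set_otime(struct demo_inode *inode, struct timespec ts)
{
        inode->otime_sec = ts.tv_sec;
        inode->otime_nsec = (uint32_t)ts.tv_nsec;
}

static struct timespec inode_get_otime(const struct demo_inode *inode)
{
        struct timespec ts = {
                .tv_sec = inode->otime_sec,
                .tv_nsec = inode->otime_nsec,
        };
        return ts;
}

int main(void)
{
        struct demo_inode inode;
        struct timespec now;

        clock_gettime(CLOCK_REALTIME, &now);
        inode_set_otime(&inode, now);
        now = inode_get_otime(&inode);
        printf("%lld.%09ld\n", (long long)now.tv_sec, now.tv_nsec);
        return 0;
}

Hiding the representation behind the helpers means callers elsewhere never need to change when the fields do.
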
*/ -static int btrfs_update_time(struct inode *inode, struct timespec64 *now, - int flags) +static int btrfs_update_time(struct inode *inode, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; - bool dirty = flags & ~S_VERSION; + bool dirty; if (btrfs_root_readonly(root)) return -EROFS; - if (flags & S_VERSION) - dirty |= inode_maybe_inc_iversion(inode, dirty); - if (flags & S_CTIME) - inode->i_ctime = *now; - if (flags & S_MTIME) - inode->i_mtime = *now; - if (flags & S_ATIME) - inode->i_atime = *now; + dirty = inode_update_timestamps(inode, flags); return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } @@ -6312,6 +6168,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { + struct timespec64 ts; struct inode *dir = args->dir; struct inode *inode = args->inode; const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; @@ -6429,10 +6286,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - inode->i_mtime = current_time(inode); - inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + ts = simple_inode_init_ts(inode); + BTRFS_I(inode)->i_otime_sec = ts.tv_sec; + BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; /* * We're going to fill the inode item now, so at this point the inode @@ -6463,7 +6319,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -6596,13 +6452,11 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, * log replay procedure is responsible for setting them to their correct * values (the ones it had when the fsync was done). 
*/ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { - struct timespec64 now = current_time(&parent_inode->vfs_inode); + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) + inode_set_mtime_to_ts(&parent_inode->vfs_inode, + inode_set_ctime_current(&parent_inode->vfs_inode)); - parent_inode->vfs_inode.i_mtime = now; - parent_inode->vfs_inode.i_ctime = now; - } - ret = btrfs_update_inode(trans, root, parent_inode); + ret = btrfs_update_inode(trans, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; @@ -6741,7 +6595,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); @@ -6753,7 +6607,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); + err = btrfs_update_inode(trans, BTRFS_I(inode)); if (err) goto fail; if (inode->i_nlink == 1) { @@ -7129,8 +6983,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); +again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + ASSERT(btrfs_is_zoned(fs_info)); + wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + goto again; + } if (ret) return ERR_PTR(ret); @@ -7258,8 +7119,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, range_end = round_up(offset + nocow_args.num_bytes, root->fs_info->sectorsize) - 1; - ret = test_range_bit(io_tree, offset, range_end, - EXTENT_DELALLOC, 0, NULL); + ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); if (ret) { ret = -EAGAIN; goto out; @@ -8160,11 +8020,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); - spin_lock_irq(&inode->ordered_tree.lock); + spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree.lock); + spin_unlock_irq(&inode->ordered_tree_lock); /* * If the ordered extent has finished, we're safe to delete all @@ -8199,7 +8059,7 @@ next: * reserved data space. * Since the IO will never happen for this page. 
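
Several conversions here collapse the old multi-line timestamp updates into inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)). The pattern works because the ctime setter returns the timespec it just stored, so mtime gets the identical instant without a second clock read. A reduced model of that return-the-stored-value idiom:

#include <stdio.h>
#include <time.h>

struct demo_inode { struct timespec ctime, mtime; };

static struct timespec set_ctime_current(struct demo_inode *inode)
{
        clock_gettime(CLOCK_REALTIME, &inode->ctime);
        return inode->ctime;    /* hand the stored value on for reuse */
}

static void set_mtime_to_ts(struct demo_inode *inode, struct timespec ts)
{
        inode->mtime = ts;
}

int main(void)
{
        struct demo_inode inode;

        set_mtime_to_ts(&inode, set_ctime_current(&inode));
        printf("equal=%d\n", inode.ctime.tv_sec == inode.mtime.tv_sec &&
                             inode.ctime.tv_nsec == inode.mtime.tv_nsec);
        return 0;
}
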
*/ - btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | @@ -8494,7 +8354,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) break; @@ -8547,7 +8407,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_update_inode(trans, inode); if (ret2 && !ret) ret = ret2; @@ -8636,8 +8496,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delayed_node = NULL; - ei->i_otime.tv_sec = 0; - ei->i_otime.tv_nsec = 0; + ei->i_otime_sec = 0; + ei->i_otime_nsec = 0; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); @@ -8646,7 +8506,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); mutex_init(&ei->log_mutex); - btrfs_ordered_inode_tree_init(&ei->ordered_tree); + spin_lock_init(&ei->ordered_tree_lock); + ei->ordered_tree = RB_ROOT; + ei->ordered_tree_last = NULL; INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); @@ -8789,8 +8651,8 @@ static int btrfs_getattr(struct mnt_idmap *idmap, u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; - stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; - stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; + stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec; + stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec; if (bi_flags & BTRFS_INODE_APPEND) stat->attributes |= STATX_ATTR_APPEND; if (bi_flags & BTRFS_INODE_COMPRESS) @@ -8807,7 +8669,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -8831,7 +8693,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; - struct timespec64 ctime = current_time(old_inode); struct btrfs_rename_ctx old_rename_ctx; struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); @@ -8962,12 +8823,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_mtime = ctime; - old_dir->i_ctime = ctime; - new_dir->i_mtime = ctime; - new_dir->i_ctime = ctime; - old_inode->i_ctime = ctime; - new_inode->i_ctime = ctime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -8984,7 +8840,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -8999,7 +8855,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, 
BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9231,11 +9087,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_mtime = current_time(old_dir); - old_dir->i_ctime = old_dir->i_mtime; - new_dir->i_mtime = old_dir->i_mtime; - new_dir->i_ctime = old_dir->i_mtime; - old_inode->i_ctime = old_dir->i_mtime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9248,7 +9100,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9257,7 +9109,6 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (new_inode) { inode_inc_iversion(new_inode); - new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); @@ -9374,7 +9225,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL); return work; } @@ -9390,14 +9241,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; - struct list_head works; - struct list_head splice; + LIST_HEAD(works); + LIST_HEAD(splice); int ret = 0; bool full_flush = wbc->nr_to_write == LONG_MAX; - INIT_LIST_HEAD(&works); - INIT_LIST_HEAD(&splice); - mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); @@ -9485,14 +9333,12 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, .range_end = LLONG_MAX, }; struct btrfs_root *root; - struct list_head splice; + LIST_HEAD(splice); int ret; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - INIT_LIST_HEAD(&splice); - mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); @@ -9617,7 +9463,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); @@ -9645,7 +9491,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; - int qgroup_released; + u64 qgroup_released = 0; int ret; memset(&stack_fi, 0, sizeof(stack_fi)); @@ -9658,9 +9504,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ - qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); - if (qgroup_released < 0) - return 
ERR_PTR(qgroup_released); + ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); + if (ret < 0) + return ERR_PTR(ret); if (trans) { ret = insert_reserved_file_extent(trans, inode, @@ -9797,7 +9643,7 @@ next: *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && @@ -9810,7 +9656,7 @@ next: btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -10555,7 +10401,7 @@ out_delalloc_release: btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); out_qgroup_free_data: if (ret < 0) - btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); out_free_data_space: /* * If btrfs_reserve_extent() succeeded, then we already decremented @@ -11052,7 +10898,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { }; static const struct file_operations btrfs_dir_file_operations = { - .llseek = generic_file_llseek, + .llseek = btrfs_dir_llseek, .read = generic_read_dir, .iterate_shared = btrfs_real_readdir, .open = btrfs_opendir, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a895d105464b..a1743904202b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -384,8 +384,8 @@ update_flags: binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inode_set_ctime_current(inode); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); out_end_trans: btrfs_end_transaction(trans); @@ -652,18 +652,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap, /* Tree log can't currently deal with an inode which is a new root. */ btrfs_set_log_full_commit(trans); - ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); + ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit); if (ret) goto out; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, - BTRFS_NESTING_NORMAL); + 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto out; } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); @@ -1290,6 +1290,15 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, * are limited to own subvolumes only */ ret = -EPERM; + } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) { + /* + * Snapshots must be made with the src_inode referring + * to the subvolume inode, otherwise the permission + * checking above is useless because we may have + * permission on a lower directory but not the subvol + * itself. 
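+ * For example: a caller may have write access to a directory deep
+ * inside the subvolume while the subvolume root itself is restricted,
+ * so passing that directory as the source must not permit snapshotting
+ * the whole subvolume.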
+ */ + ret = -EINVAL; } else { ret = btrfs_mksnapshot(&file->f_path, idmap, name, namelen, @@ -1528,7 +1537,7 @@ static noinline int key_in_sk(struct btrfs_key *key, static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, int *num_found) @@ -1660,7 +1669,7 @@ out: static noinline int search_ioctl(struct inode *inode, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf) { struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); @@ -1733,7 +1742,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; int ret; - size_t buf_size; + u64 buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1763,8 +1772,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; int ret; - size_t buf_size; - const size_t buf_limit = SZ_16M; + u64 buf_size; + const u64 buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1958,6 +1967,13 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } + /* + * We don't need the path anymore, so release it and + * avoid deadlocks and lockdep warnings in case + * btrfs_iget() needs to lookup the inode from its root + * btree and lock the same leaf. + */ + btrfs_release_path(path); temp_inode = btrfs_iget(sb, key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); @@ -1978,7 +1994,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } - btrfs_release_path(path); key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; @@ -2629,6 +2644,12 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) return -EINVAL; } + if (fs_info->fs_devices->temp_fsid) { + btrfs_err(fs_info, + "device add not supported on cloned temp-fsid mount"); + return -EINVAL; + } + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; @@ -2670,8 +2691,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; - struct block_device *bdev = NULL; - void *holder; + struct bdev_handle *bdev_handle = NULL; int ret; bool cancel = false; @@ -2708,7 +2728,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); + ret = btrfs_rm_device(fs_info, &args, &bdev_handle); btrfs_exclop_finish(fs_info); @@ -2722,8 +2742,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } err_drop: mnt_drop_write_file(file); - if (bdev) - blkdev_put(bdev, holder); + if (bdev_handle) + bdev_release(bdev_handle); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2736,8 +2756,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; - struct block_device *bdev = NULL; - void *holder; + struct bdev_handle *bdev_handle = NULL; int ret; bool cancel = false; 
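The bdev_handle conversion in the two hunks above replaces the separate block_device pointer and holder cookie with a single handle that carries both, so open and close pair up on one object. A minimal sketch of the pattern, assuming the bdev_open_by_path()/bdev_release() API of this kernel generation (the path and holder variables are illustrative):

	struct bdev_handle *handle;

	handle = bdev_open_by_path(device_path, BLK_OPEN_READ | BLK_OPEN_WRITE,
				   holder, NULL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... work with handle->bdev ... */

	bdev_release(handle);	/* replaces blkdev_put(bdev, holder) */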
@@ -2764,15 +2783,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); + ret = btrfs_rm_device(fs_info, &args, &bdev_handle); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); - if (bdev) - blkdev_put(bdev, holder); + if (bdev_handle) + bdev_release(bdev_handle); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2816,7 +2835,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { - fi_args->generation = fs_info->generation; + fi_args->generation = btrfs_get_fs_generation(fs_info); fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; } @@ -2941,7 +2960,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); @@ -2972,7 +2991,7 @@ static void get_block_group_info(struct list_head *groups_list, static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; @@ -3125,7 +3144,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, return PTR_ERR(trans); /* No running transaction, don't bother */ - transid = root->fs_info->last_trans_committed; + transid = btrfs_get_last_trans_committed(root->fs_info); goto out; } transid = trans->transid; @@ -3691,7 +3710,8 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: - ret = btrfs_quota_enable(fs_info); + case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA: + ret = btrfs_quota_enable(fs_info, sa); break; case BTRFS_QUOTA_CTL_DISABLE: ret = btrfs_quota_disable(fs_info); @@ -4332,7 +4352,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_send_args_32 args32; + struct btrfs_ioctl_send_args_32 args32 = { 0 }; ret = copy_from_user(&args32, argp, sizeof(args32)); if (ret) @@ -4345,6 +4365,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat arg->clone_sources = compat_ptr(args32.clone_sources); arg->parent_root = args32.parent_root; arg->flags = args32.flags; + arg->version = args32.version; memcpy(arg->reserved, args32.reserved, sizeof(args32.reserved)); #else diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 7979449a58d6..74d8e2003f58 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -8,6 +8,7 @@ #include <linux/spinlock.h> #include <linux/page-flags.h> #include <asm/bug.h> +#include <trace/events/btrfs.h> #include "misc.h" #include "ctree.h" #include "extent_io.h" @@ -73,6 +74,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, + { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, 
DEFINE_NAME("raid-stripe") }, { .id = 0, DEFINE_NAME("tree") }, }; @@ -102,6 +104,15 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff #endif +#ifdef CONFIG_BTRFS_DEBUG +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) +{ + eb->lock_owner = owner; +} +#else +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { } +#endif + /* * Extent buffer locking * ===================== @@ -164,7 +175,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_write_lock(struct extent_buffer *eb) { if (down_write_trylock(&eb->lock)) { - eb->lock_owner = current->pid; + btrfs_set_eb_lock_owner(eb, current->pid); trace_btrfs_try_tree_write_lock(eb); return 1; } @@ -181,7 +192,8 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) } /* - * __btrfs_tree_lock - lock eb for write + * Lock eb for write. + * * @eb: the eb to lock * @nest: the nesting to use for the lock * @@ -196,7 +208,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) start_ns = ktime_get_ns(); down_write_nested(&eb->lock, nest); - eb->lock_owner = current->pid; + btrfs_set_eb_lock_owner(eb, current->pid); trace_btrfs_tree_lock(eb, start_ns); } @@ -211,7 +223,7 @@ void btrfs_tree_lock(struct extent_buffer *eb) void btrfs_tree_unlock(struct extent_buffer *eb) { trace_btrfs_tree_unlock(eb); - eb->lock_owner = 0; + btrfs_set_eb_lock_owner(eb, 0); up_write(&eb->lock); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index edb9b4a0dba1..7d6ee1e609bf 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -79,7 +79,7 @@ enum btrfs_lock_nesting { }; enum btrfs_lockdep_trans_states { - BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP, BTRFS_LOCKDEP_TRANS_UNBLOCKED, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, BTRFS_LOCKDEP_TRANS_COMPLETED, diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 23fc11af498a..b8f9c9e56c8c 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -10,14 +10,13 @@ #ifdef CONFIG_PRINTK #define STATE_STRING_PREFACE ": state " -#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1) /* * Characters to print to indicate error conditions or uncommon filesystem state. * RO is not an error. */ static const char fs_state_chars[] = { - [BTRFS_FS_STATE_ERROR] = 'E', [BTRFS_FS_STATE_REMOUNTING] = 'M', [BTRFS_FS_STATE_RO] = 0, [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', @@ -37,6 +36,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); curr += sizeof(STATE_STRING_PREFACE) - 1; + if (BTRFS_FS_ERROR(info)) { + *curr++ = 'E'; + states_printed = true; + } + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { @@ -68,11 +72,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) * over the error. Each subsequent error that doesn't have any context * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. 
*/ -const char * __attribute_const__ btrfs_decode_error(int errno) +const char * __attribute_const__ btrfs_decode_error(int error) { char *errstr = "unknown"; - switch (errno) { + switch (error) { case -ENOENT: /* -2 */ errstr = "No such entry"; break; @@ -106,12 +110,12 @@ const char * __attribute_const__ btrfs_decode_error(int errno) } /* - * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the appropriate error response. + * Decodes expected errors from the caller and invokes the appropriate error + * response. */ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) + unsigned int line, int error, const char *fmt, ...) { struct super_block *sb = fs_info->sb; #ifdef CONFIG_PRINTK @@ -128,11 +132,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Special case: if the error is EROFS, and we're already under * SB_RDONLY, then it is safe here. */ - if (errno == -EROFS && sb_rdonly(sb)) + if (error == -EROFS && sb_rdonly(sb)) return; #ifdef CONFIG_PRINTK - errstr = btrfs_decode_error(errno); + errstr = btrfs_decode_error(error); btrfs_state_to_string(fs_info, statestr); if (fmt) { struct va_format vaf; @@ -143,11 +147,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function vaf.va = &args; pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, statestr, function, line, errno, errstr, &vaf); + sb->s_id, statestr, function, line, error, errstr, &vaf); va_end(args); } else { pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", - sb->s_id, statestr, function, line, errno, errstr); + sb->s_id, statestr, function, line, error, errstr); } #endif @@ -155,7 +159,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Today we only save the error info to memory. Long term we'll also * send it down to the disk. */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + WRITE_ONCE(fs_info->fs_error, error); /* Don't go through full error handling during mount. */ if (!(sb->s_flags & SB_BORN)) @@ -252,12 +256,6 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, } #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); -} - #if BITS_PER_LONG == 32 void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) { @@ -285,12 +283,12 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) #endif /* - * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an - * alert, and either panics or BUGs, depending on mount options. + * Decode unexpected, fatal errors from the caller, issue an alert, and either + * panic or BUG, depending on mount options. */ __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) + unsigned int line, int error, const char *fmt, ...) 
{ char *s_id = "<unknown>"; const char *errstr; @@ -303,13 +301,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, va_start(args, fmt); vaf.va = &args; - errstr = btrfs_decode_error(errno); + errstr = btrfs_decode_error(error); if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); + s_id, function, line, &vaf, error, errstr); btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", - function, line, &vaf, errno, errstr); + function, line, &vaf, error, errstr); va_end(args); /* Caller calls BUG() */ } diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index deedc1a168e2..4d04c1fa5899 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -181,30 +181,28 @@ do { \ #define ASSERT(expr) (void)(expr) #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); - __printf(5, 6) __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); + unsigned int line, int error, const char *fmt, ...); -const char * __attribute_const__ btrfs_decode_error(int errno); +const char * __attribute_const__ btrfs_decode_error(int error); -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ +#define btrfs_handle_fs_error(fs_info, error, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) + (error), fmt, ##args) __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); + unsigned int line, int error, const char *fmt, ...); /* * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic * will panic(). Otherwise we BUG() here. */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ +#define btrfs_panic(fs_info, error, fmt, args...) \ do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + __btrfs_panic(fs_info, __func__, __LINE__, error, fmt, ##args); \ BUG(); \ } while (0) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 005751a12911..40f2d9f1a17a 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -8,8 +8,6 @@ #include <linux/math64.h> #include <linux/rbtree.h> -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) - /* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. 
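 * For example, with the ENUM_BIT() macro this comment documents,
 * enum { ENUM_BIT(FIRST), ENUM_BIT(SECOND) } yields FIRST == (1U << 0)
 * and SECOND == (1U << 1) (FIRST and SECOND being illustrative names).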
*/ diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a629532283bc..a82e1417c4d2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -124,25 +124,24 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, * Look for the first ordered struct that has this offset, otherwise * the first one less than this offset */ -static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, - u64 file_offset) +static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode, + u64 file_offset) { - struct rb_root *root = &tree->tree; struct rb_node *prev = NULL; struct rb_node *ret; struct btrfs_ordered_extent *entry; - if (tree->last) { - entry = rb_entry(tree->last, struct btrfs_ordered_extent, + if (inode->ordered_tree_last) { + entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent, rb_node); if (in_range(file_offset, entry->file_offset, entry->num_bytes)) - return tree->last; + return inode->ordered_tree_last; } - ret = __tree_search(root, file_offset, &prev); + ret = __tree_search(&inode->ordered_tree, file_offset, &prev); if (!ret) ret = prev; if (ret) - tree->last = ret; + inode->ordered_tree_last = ret; return ret; } @@ -153,11 +152,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( { struct btrfs_ordered_extent *entry; int ret; + u64 qgroup_rsv = 0; if (flags & ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { /* For nocow write, we can release the qgroup rsv right now */ - ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) return ERR_PTR(ret); } else { @@ -165,7 +165,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( * The ordered extent has reserved qgroup space, release now * and pass the reserved number for qgroup_record to free. */ - ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) return ERR_PTR(ret); } @@ -183,7 +183,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; - entry->qgroup_rsv = ret; + entry->qgroup_rsv = qgroup_rsv; entry->flags = flags; refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -191,6 +191,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); + INIT_LIST_HEAD(&entry->bioc_list); init_completion(&entry->completion); /* @@ -208,7 +209,6 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( static void insert_ordered_extent(struct btrfs_ordered_extent *entry) { struct btrfs_inode *inode = BTRFS_I(entry->inode); - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -221,13 +221,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) /* One ref for the tree. 
*/ refcount_inc(&entry->refs); - spin_lock_irq(&tree->lock); - node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node); + spin_lock_irq(&inode->ordered_tree_lock); + node = tree_insert(&inode->ordered_tree, entry->file_offset, + &entry->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", entry->file_offset); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -287,12 +288,11 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_inode *inode = BTRFS_I(entry->inode); - tree = &BTRFS_I(entry->inode)->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irq(&inode->ordered_tree_lock); list_add_tail(&sum->list, &entry->list); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); } static void finish_ordered_fn(struct btrfs_work *work) @@ -310,7 +310,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - lockdep_assert_held(&inode->ordered_tree.lock); + lockdep_assert_held(&inode->ordered_tree_lock); if (page) { ASSERT(page->mapping); @@ -364,7 +364,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? fs_info->endio_freespace_worker : fs_info->endio_write_workers; - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL); btrfs_queue_work(wq, &ordered->work); } @@ -378,9 +378,9 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); - spin_lock_irqsave(&inode->ordered_tree.lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); - spin_unlock_irqrestore(&inode->ordered_tree.lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); if (ret) btrfs_queue_ordered_fn(ordered); @@ -404,19 +404,22 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, u64 num_bytes, bool uptodate) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; u64 cur = file_offset; - spin_lock_irqsave(&tree->lock, flags); + trace_btrfs_writepage_end_io_hook(inode, file_offset, + file_offset + num_bytes - 1, + uptodate); + + spin_lock_irqsave(&inode->ordered_tree_lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end; u64 end; u32 len; - node = tree_search(tree, cur); + node = ordered_tree_search(inode, cur); /* No ordered extents at all */ if (!node) break; @@ -463,13 +466,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, len = end + 1 - cur; if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); } cur += len; } - spin_unlock_irqrestore(&tree->lock, flags); + 
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); } /* @@ -493,19 +496,18 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; bool finished = false; - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); if (cached && *cached) { entry = *cached; goto have_entry; } - node = tree_search(tree, file_offset); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -536,7 +538,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return finished; } @@ -574,7 +576,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = btrfs_inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -599,22 +600,23 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, release = entry->disk_num_bytes; else release = entry->num_bytes; - btrfs_delalloc_release_metadata(btrfs_inode, release, false); + btrfs_delalloc_release_metadata(btrfs_inode, release, + test_bit(BTRFS_ORDERED_IOERR, + &entry->flags)); } percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); - tree = &btrfs_inode->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irq(&btrfs_inode->ordered_tree_lock); node = &entry->rb_node; - rb_erase(node, &tree->tree); + rb_erase(node, &btrfs_inode->ordered_tree); RB_CLEAR_NODE(node); - if (tree->last == node) - tree->last = NULL; + if (btrfs_inode->ordered_tree_last == node) + btrfs_inode->ordered_tree_last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&btrfs_inode->ordered_tree_lock); /* * The current running transaction is waiting on us, we need to let it @@ -635,7 +637,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); - ASSERT(trans); + ASSERT(trans || BTRFS_FS_ERROR(fs_info)); if (trans) { if (atomic_dec_and_test(&trans->pending_ordered)) wake_up(&trans->pending_wait); @@ -707,7 +709,7 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, - btrfs_run_ordered_extent_work, NULL, NULL); + btrfs_run_ordered_extent_work, NULL); list_add_tail(&ordered->work_list, &works); btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); @@ -736,11 +738,9 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_root *root; - struct list_head splice; + LIST_HEAD(splice); u64 done; - INIT_LIST_HEAD(&splice); - mutex_lock(&fs_info->ordered_operations_mutex); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); @@ -873,14 +873,12 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 
file_offset) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; - tree = &inode->ordered_tree; - spin_lock_irqsave(&tree->lock, flags); - node = tree_search(tree, file_offset); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -892,7 +890,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino trace_btrfs_ordered_extent_lookup(inode, entry); } out: - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return entry; } @@ -902,15 +900,13 @@ out: struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - node = tree_search(tree, file_offset); + spin_lock_irq(&inode->ordered_tree_lock); + node = ordered_tree_search(inode, file_offset); if (!node) { - node = tree_search(tree, file_offset + len); + node = ordered_tree_search(inode, file_offset + len); if (!node) goto out; } @@ -934,7 +930,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_range(inode, entry); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -945,13 +941,12 @@ out: void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *n; ASSERT(inode_is_locked(&inode->vfs_inode)); - spin_lock_irq(&tree->lock); - for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + spin_lock_irq(&inode->ordered_tree_lock); + for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { struct btrfs_ordered_extent *ordered; ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); @@ -964,7 +959,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, refcount_inc(&ordered->refs); trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); } /* @@ -974,13 +969,11 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - node = tree_search(tree, file_offset); + spin_lock_irq(&inode->ordered_tree_lock); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -988,7 +981,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_first(inode, entry); out: - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -1004,15 +997,14 @@ out: struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct rb_node *cur; struct rb_node *prev; struct rb_node *next; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&tree->lock); - node = tree->tree.rb_node; + spin_lock_irq(&inode->ordered_tree_lock); + node = inode->ordered_tree.rb_node; /* * Here we 
don't want to use ordered_tree_search() which will use ordered_tree_last * and screw up the search order. */ @@ -1066,7 +1058,7 @@ out: trace_btrfs_ordered_extent_lookup_first_range(inode, entry); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -1145,7 +1137,6 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_ordered_extent *ordered, u64 len) { struct btrfs_inode *inode = BTRFS_I(ordered->inode); - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 file_offset = ordered->file_offset; @@ -1185,13 +1176,13 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( refcount_inc(&new->refs); spin_lock_irq(&root->ordered_extent_lock); - spin_lock(&tree->lock); + spin_lock(&inode->ordered_tree_lock); /* Remove from tree once */ node = &ordered->rb_node; - rb_erase(node, &tree->tree); + rb_erase(node, &inode->ordered_tree); RB_CLEAR_NODE(node); - if (tree->last == node) - tree->last = NULL; + if (inode->ordered_tree_last == node) + inode->ordered_tree_last = NULL; ordered->file_offset += len; ordered->disk_bytenr += len; @@ -1222,18 +1213,19 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( } /* Re-insert the node */ - node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); + node = tree_insert(&inode->ordered_tree, ordered->file_offset, + &ordered->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "zoned: inconsistency in ordered tree at offset %llu", ordered->file_offset); - node = tree_insert(&tree->tree, new->file_offset, &new->rb_node); + node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "zoned: inconsistency in ordered tree at offset %llu", new->file_offset); - spin_unlock(&tree->lock); + spin_unlock(&inode->ordered_tree_lock); list_add_tail(&new->root_extent_list, &root->ordered_extents); root->nr_ordered_extents++; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 173bd5c5df26..567a6d3d4712 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -6,13 +6,6 @@ #ifndef BTRFS_ORDERED_DATA_H #define BTRFS_ORDERED_DATA_H -/* one of these per inode */ -struct btrfs_ordered_inode_tree { - spinlock_t lock; - struct rb_root tree; - struct rb_node *last; -}; - struct btrfs_ordered_sum { /* * Logical start address and length of the blocks covered by @@ -151,15 +144,9 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; -}; -static inline void -btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) -{ - spin_lock_init(&t->lock); - t->tree = RB_ROOT; - t->last = NULL; -} + struct list_head bioc_list; +}; int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent); int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index aa06d9ca911d..7e46aa8a0444 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -9,6 +9,8 @@ #include "print-tree.h" #include "accessors.h" #include "tree-checker.h" +#include "volumes.h" +#include "raid-stripe-tree.h" struct root_name_map { u64 id; @@ -28,6 +30,7 @@ static const struct root_name_map root_map[] = { { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, + { BTRFS_RAID_STRIPE_TREE_OBJECTID, 
"RAID_STRIPE_TREE" }, }; const char *btrfs_root_name(const struct btrfs_key *key, char *buf) @@ -80,12 +83,20 @@ static void print_extent_data_ref(const struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } +static void print_extent_owner_ref(const struct extent_buffer *eb, + const struct btrfs_extent_owner_ref *ref) +{ + ASSERT(btrfs_fs_incompat(eb->fs_info, SIMPLE_QUOTA)); + pr_cont("extent data owner root %llu\n", btrfs_extent_owner_ref_root_id(eb, ref)); +} + static void print_extent_item(const struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; + struct btrfs_extent_owner_ref *oref; struct btrfs_disk_key key; unsigned long end; unsigned long ptr; @@ -95,8 +106,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type int ref_index = 0; if (unlikely(item_size < sizeof(*ei))) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); + btrfs_err(eb->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL); } ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); @@ -159,6 +172,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type "\t\t\t(parent %llu not aligned to sectorsize %u)\n", offset, eb->fs_info->sectorsize); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + print_extent_owner_ref(eb, oref); + break; default: pr_cont("(extent %llu has INVALID ref type %d)\n", eb->start, type); @@ -187,6 +204,22 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, } } +static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size, + struct btrfs_stripe_extent *stripe) +{ + const int num_stripes = btrfs_num_raid_stripes(item_size); + const u8 encoding = btrfs_stripe_extent_encoding(eb, stripe); + + pr_info("\t\t\tencoding: %s\n", + (encoding && encoding < BTRFS_NR_RAID_TYPES) ? + btrfs_raid_array[encoding].raid_name : "unknown"); + + for (int i = 0; i < num_stripes; i++) + pr_info("\t\t\tstride %d devid %llu physical %llu\n", + i, btrfs_raid_stride_devid(eb, &stripe->strides[i]), + btrfs_raid_stride_physical(eb, &stripe->strides[i])); +} + /* * Helper to output refs and locking status of extent buffer. Useful to debug * race condition related problems. 
@@ -291,10 +324,6 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_file_extent_num_bytes(l, fi), btrfs_file_extent_ram_bytes(l, fi)); break; - case BTRFS_EXTENT_REF_V0_KEY: - btrfs_print_v0_err(fs_info); - btrfs_handle_fs_error(fs_info, -EINVAL, NULL); - break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); @@ -351,6 +380,10 @@ void btrfs_print_leaf(const struct extent_buffer *l) print_uuid_item(l, btrfs_item_ptr_offset(l, i), btrfs_item_size(l, i)); break; + case BTRFS_RAID_STRIPE_KEY: + print_raid_stripe_key(l, btrfs_item_size(l, i), + btrfs_item_ptr(l, i, struct btrfs_stripe_extent)); + break; } } } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 0755af0e53e3..f9bf591a0718 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -15,6 +15,7 @@ #include "fs.h" #include "accessors.h" #include "super.h" +#include "dir-item.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2637d6b157ff..e46774e8f49f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -30,6 +30,25 @@ #include "root-tree.h" #include "tree-checker.h" +enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info) +{ + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return BTRFS_QGROUP_MODE_DISABLED; + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) + return BTRFS_QGROUP_MODE_SIMPLE; + return BTRFS_QGROUP_MODE_FULL; +} + +bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info) +{ + return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED; +} + +bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info) +{ + return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL; +} + /* * Helpers to access qgroup reservation * @@ -146,16 +165,6 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; -static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg) -{ - return (u64)(uintptr_t)qg; -} - -static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n) -{ - return (struct btrfs_qgroup *)(uintptr_t)n->aux; -} - static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); @@ -180,34 +189,46 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, return NULL; } -/* must be called with qgroup_lock held */ +/* + * Add qgroup to the filesystem's qgroup tree. + * + * Must be called with qgroup_lock held and @prealloc preallocated. + * + * Ownership of @prealloc is transferred to this function, so the caller + * should no longer touch @prealloc. + */ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *prealloc, u64 qgroupid) { struct rb_node **p = &fs_info->qgroup_tree.rb_node; struct rb_node *parent = NULL; struct btrfs_qgroup *qgroup; + /* Caller must have pre-allocated @prealloc. 
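+ * (No allocation can happen here: qgroup_lock is a spinlock, which is why
+ * the old code had to use GFP_ATOMIC; callers now allocate @prealloc up
+ * front with GFP_KERNEL or GFP_NOFS instead.)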
*/ + ASSERT(prealloc); + while (*p) { parent = *p; qgroup = rb_entry(parent, struct btrfs_qgroup, node); - if (qgroup->qgroupid < qgroupid) + if (qgroup->qgroupid < qgroupid) { p = &(*p)->rb_left; - else if (qgroup->qgroupid > qgroupid) + } else if (qgroup->qgroupid > qgroupid) { p = &(*p)->rb_right; - else + } else { + kfree(prealloc); return qgroup; + } } - qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); - if (!qgroup) - return ERR_PTR(-ENOMEM); - + qgroup = prealloc; qgroup->qgroupid = qgroupid; INIT_LIST_HEAD(&qgroup->groups); INIT_LIST_HEAD(&qgroup->members); INIT_LIST_HEAD(&qgroup->dirty); + INIT_LIST_HEAD(&qgroup->iterator); + INIT_LIST_HEAD(&qgroup->nested_iterator); rb_link_node(&qgroup->node, parent, p); rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); @@ -254,27 +275,26 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) /* * Add relation specified by two qgroups. * - * Must be called with qgroup_lock held. + * Must be called with qgroup_lock held. Ownership of @prealloc is + * transferred to this function and the caller should not touch it anymore. * * Return: 0 on success * -ENOENT if one of the qgroups is NULL * <0 other errors */ -static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) +static int __add_relation_rb(struct btrfs_qgroup_list *prealloc, + struct btrfs_qgroup *member, + struct btrfs_qgroup *parent) { - struct btrfs_qgroup_list *list; - - if (!member || !parent) + if (!member || !parent) { + kfree(prealloc); return -ENOENT; + } - list = kzalloc(sizeof(*list), GFP_ATOMIC); - if (!list) - return -ENOMEM; - - list->group = parent; - list->member = member; - list_add_tail(&list->next_group, &member->groups); - list_add_tail(&list->next_member, &parent->members); + prealloc->group = parent; + prealloc->member = member; + list_add_tail(&prealloc->next_group, &member->groups); + list_add_tail(&prealloc->next_member, &parent->members); return 0; } @@ -288,7 +308,9 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p * -ENOENT if one of the ids does not exist * <0 other errors */ -static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) +static int add_relation_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_list *prealloc, + u64 memberid, u64 parentid) { struct btrfs_qgroup *member; struct btrfs_qgroup *parent; @@ -296,7 +318,7 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 pare member = find_qgroup_rb(fs_info, memberid); parent = find_qgroup_rb(fs_info, parentid); - return __add_relation_rb(member, parent); + return __add_relation_rb(prealloc, member, parent); } /* Must be called with qgroup_lock held */ @@ -340,11 +362,22 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) { + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return; fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); } +static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot, + struct btrfs_qgroup_status_item *ptr) +{ + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr)); + fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr); +} + /* * The full config is read in one go, only called from open_ctree(). * It doesn't use any locking, 
as at this point we're still single-threaded @@ -361,7 +394,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) u64 flags = 0; u64 rescan_progress = 0; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!fs_info->quota_root) return 0; fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); @@ -411,14 +444,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) "old qgroup version, quota disabled"); goto out; } - if (btrfs_qgroup_status_generation(l, ptr) != - fs_info->generation) { + fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { + qgroup_read_enable_gen(fs_info, l, slot, ptr); + } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); } - fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, - ptr); rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -434,11 +467,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup_mark_inconsistent(fs_info); } if (!qgroup) { - qgroup = add_qgroup_rb(fs_info, found_key.offset); - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); + struct btrfs_qgroup *prealloc; + + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); + if (!prealloc) { + ret = -ENOMEM; goto out; } + qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) @@ -489,6 +525,8 @@ next1: if (ret) goto out; while (1) { + struct btrfs_qgroup_list *list = NULL; + slot = path->slots[0]; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); @@ -502,8 +540,14 @@ next1: goto next2; } - ret = add_relation_rb(fs_info, found_key.objectid, + list = kzalloc(sizeof(*list), GFP_KERNEL); + if (!list) { + ret = -ENOMEM; + goto out; + } + ret = add_relation_rb(fs_info, list, found_key.objectid, found_key.offset); + list = NULL; if (ret == -ENOENT) { btrfs_warn(fs_info, "orphan qgroup relation 0x%llx->0x%llx", @@ -522,13 +566,12 @@ next2: out: btrfs_free_path(path); fs_info->qgroup_flags |= flags; - if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && - ret >= 0) - ret = qgroup_rescan_init(fs_info, rescan_progress, 0); - - if (ret < 0) { + if (ret >= 0) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON) + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); + } else { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; @@ -550,7 +593,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) struct rb_node *node; bool ret = false; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return ret; /* * Since we're unmounting, there is no race and no need to grab qgroup @@ -622,7 +665,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return ret; @@ -700,7 +743,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(leaf, 
qgroup_info, 0); btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -719,7 +762,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -808,7 +851,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); @@ -854,7 +897,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); @@ -896,7 +939,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); @@ -949,7 +992,8 @@ out: return ret; } -int btrfs_quota_enable(struct btrfs_fs_info *fs_info) +int btrfs_quota_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_quota_ctl_args *quota_ctl_args) { struct btrfs_root *quota_root; struct btrfs_root *tree_root = fs_info->tree_root; @@ -959,8 +1003,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_key key; struct btrfs_key found_key; struct btrfs_qgroup *qgroup = NULL; + struct btrfs_qgroup *prealloc = NULL; struct btrfs_trans_handle *trans = NULL; struct ulist *ulist = NULL; + const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); int ret = 0; int slot; @@ -1063,13 +1109,18 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); - fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; + if (simple) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); + } else { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); key.objectid = 0; key.type = BTRFS_ROOT_REF_KEY; @@ -1094,6 +1145,15 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) /* Release locks on tree_root before we access quota_root */ btrfs_release_path(path); + /* We should not have a stray @prealloc pointer. 
*/ + ASSERT(prealloc == NULL); + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + ret = add_qgroup_item(trans, quota_root, found_key.offset); if (ret) { @@ -1101,7 +1161,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) goto out_free_path; } - qgroup = add_qgroup_rb(fs_info, found_key.offset); + qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); + prealloc = NULL; if (IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); btrfs_abort_transaction(trans, ret); @@ -1144,18 +1205,22 @@ out_add_root: goto out_free_path; } - qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); - btrfs_abort_transaction(trans, ret); + ASSERT(prealloc == NULL); + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; goto out_free_path; } + qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); + prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } + fs_info->qgroup_enable_gen = trans->transid; + mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Commit the transaction while not holding qgroup_ioctl_lock, to avoid @@ -1180,8 +1245,14 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + if (simple) + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); + /* Skip rescan for simple qgroups. */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + goto out_free_path; + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -1222,6 +1293,39 @@ out: else if (trans) ret = btrfs_end_transaction(trans); ulist_free(ulist); + kfree(prealloc); + return ret; +} + +/* + * It is possible to have outstanding ordered extents which reserved bytes + * before we disabled. We need to fully flush delalloc and ordered extents, + * then commit, to ensure that we don't leak such reservations, only to have + * them come back if we re-enable. 
+ *
+ * - enable simple quotas
+ * - reserve space
+ * - release it, store rsv_bytes in OE
+ * - disable quotas
+ * - enable simple quotas (qgroup rsv are all 0)
+ * - OE finishes
+ * - run delayed refs
+ * - free rsv_bytes, resulting in miscounting or even underflow
+ */
+static int flush_reservations(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
+	if (ret)
+		return ret;
+	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+	trans = btrfs_join_transaction(fs_info->tree_root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+	ret = btrfs_commit_transaction(trans);
+
+	return ret;
+}
@@ -1269,6 +1373,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 	btrfs_qgroup_wait_for_completion(fs_info, false);

+	ret = flush_reservations(fs_info);
+	if (ret)
+		goto out_unlock_cleaner;
+
 	/*
 	 * 1 For the root item
 	 *
@@ -1295,6 +1403,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	quota_root = fs_info->quota_root;
 	fs_info->quota_root = NULL;
 	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
 	fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
 	spin_unlock(&fs_info->qgroup_lock);

@@ -1329,7 +1438,8 @@ out:
 	if (ret && trans)
 		btrfs_end_transaction(trans);
 	else if (trans)
-		ret = btrfs_end_transaction(trans);
+		ret = btrfs_commit_transaction(trans);
+out_unlock_cleaner:
 	mutex_unlock(&fs_info->cleaner_mutex);

 	return ret;
@@ -1342,6 +1452,24 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
 	list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
 }

+static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+	if (!list_empty(&qgroup->iterator))
+		return;
+
+	list_add_tail(&qgroup->iterator, head);
+}
+
+static void qgroup_iterator_clean(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct btrfs_qgroup *qgroup;
+
+		qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
+		list_del_init(&qgroup->iterator);
+	}
+}
+
 /*
  * The easy accounting, we're updating qgroup relationship whose child qgroup
  * only has exclusive extents.
  *
@@ -1356,14 +1484,12 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
  *
  * Caller should hold fs_info->qgroup_lock.
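+ *
+ * The qgroup_iterator_add()/qgroup_iterator_clean() helpers above replace
+ * the old ulist based walk with an allocation-free list walk. A sketch of
+ * the usage pattern (the same one used by the accounting helpers below):
+ *
+ *	LIST_HEAD(qgroup_list);
+ *
+ *	qgroup_iterator_add(&qgroup_list, qgroup);
+ *	list_for_each_entry(qgroup, &qgroup_list, iterator) {
+ *		struct btrfs_qgroup_list *glist;
+ *
+ *		... adjust the counters of @qgroup here ...
+ *
+ *		list_for_each_entry(glist, &qgroup->groups, next_group)
+ *			qgroup_iterator_add(&qgroup_list, glist->group);
+ *	}
+ *	qgroup_iterator_clean(&qgroup_list);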
*/ -static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, - struct ulist *tmp, u64 ref_root, +static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *src, int sign) { struct btrfs_qgroup *qgroup; - struct btrfs_qgroup_list *glist; - struct ulist_node *unode; - struct ulist_iterator uiter; + struct btrfs_qgroup *cur; + LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; int ret = 0; @@ -1371,53 +1497,30 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, if (!qgroup) goto out; - qgroup->rfer += sign * num_bytes; - qgroup->rfer_cmpr += sign * num_bytes; - - WARN_ON(sign < 0 && qgroup->excl < num_bytes); - qgroup->excl += sign * num_bytes; - qgroup->excl_cmpr += sign * num_bytes; - - if (sign > 0) - qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); - else - qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); - - qgroup_dirty(fs_info, qgroup); - - /* Get all of the parent groups that contain this qgroup */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(cur, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; - /* Iterate all of the parents and adjust their reference counts */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - qgroup = unode_aux_to_qgroup(unode); qgroup->rfer += sign * num_bytes; qgroup->rfer_cmpr += sign * num_bytes; + WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); else qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); - qgroup->excl_cmpr += sign * num_bytes; qgroup_dirty(fs_info, qgroup); - /* Add any parents of the parents */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + /* Append parent qgroups to @qgroup_list. */ + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); } ret = 0; out: + qgroup_iterator_clean(&qgroup_list); return ret; } @@ -1434,8 +1537,7 @@ out: * Return < 0 for other error. */ static int quick_update_accounting(struct btrfs_fs_info *fs_info, - struct ulist *tmp, u64 src, u64 dst, - int sign) + u64 src, u64 dst, int sign) { struct btrfs_qgroup *qgroup; int ret = 1; @@ -1446,8 +1548,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info, goto out; if (qgroup->excl == qgroup->rfer) { ret = 0; - err = __qgroup_excl_accounting(fs_info, tmp, dst, - qgroup, sign); + err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); if (err < 0) { ret = err; goto out; @@ -1459,28 +1560,19 @@ out: return ret; } -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, - u64 dst) +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; - struct ulist *tmp; - unsigned int nofs_flag; + struct btrfs_qgroup_list *prealloc = NULL; int ret = 0; /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; - /* We hold a transaction handle open, must do a NOFS allocation. 
 */
-	nofs_flag = memalloc_nofs_save();
-	tmp = ulist_alloc(GFP_KERNEL);
-	memalloc_nofs_restore(nofs_flag);
-	if (!tmp)
-		return -ENOMEM;
-
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (!fs_info->quota_root) {
 		ret = -ENOTCONN;
@@ -1501,6 +1593,11 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
 		}
 	}

+	prealloc = kzalloc(sizeof(*list), GFP_NOFS);
+	if (!prealloc) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	ret = add_qgroup_relation_item(trans, src, dst);
 	if (ret)
 		goto out;
@@ -1512,16 +1609,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
 	}

 	spin_lock(&fs_info->qgroup_lock);
-	ret = __add_relation_rb(member, parent);
+	ret = __add_relation_rb(prealloc, member, parent);
+	prealloc = NULL;
 	if (ret < 0) {
 		spin_unlock(&fs_info->qgroup_lock);
 		goto out;
 	}
-	ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
+	ret = quick_update_accounting(fs_info, src, dst, 1);
 	spin_unlock(&fs_info->qgroup_lock);
 out:
+	kfree(prealloc);
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
-	ulist_free(tmp);
 	return ret;
 }

@@ -1532,19 +1630,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
 	struct btrfs_qgroup *parent;
 	struct btrfs_qgroup *member;
 	struct btrfs_qgroup_list *list;
-	struct ulist *tmp;
 	bool found = false;
-	unsigned int nofs_flag;
 	int ret = 0;
 	int ret2;

-	/* We hold a transaction handle open, must do a NOFS allocation. */
-	nofs_flag = memalloc_nofs_save();
-	tmp = ulist_alloc(GFP_KERNEL);
-	memalloc_nofs_restore(nofs_flag);
-	if (!tmp)
-		return -ENOMEM;
-
 	if (!fs_info->quota_root) {
 		ret = -ENOTCONN;
 		goto out;
@@ -1582,11 +1671,10 @@ delete_item:
 	if (found) {
 		spin_lock(&fs_info->qgroup_lock);
 		del_relation_rb(fs_info, src, dst);
-		ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+		ret = quick_update_accounting(fs_info, src, dst, -1);
 		spin_unlock(&fs_info->qgroup_lock);
 	}
 out:
-	ulist_free(tmp);
 	return ret;
 }

@@ -1608,8 +1696,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *qgroup;
+	struct btrfs_qgroup *prealloc = NULL;
 	int ret = 0;

+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
+		return 0;
+
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (!fs_info->quota_root) {
 		ret = -ENOTCONN;
 		goto out;
@@ -1622,21 +1714,25 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 		goto out;
 	}

+	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+	if (!prealloc) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	ret = add_qgroup_item(trans, quota_root, qgroupid);
 	if (ret)
 		goto out;

 	spin_lock(&fs_info->qgroup_lock);
-	qgroup = add_qgroup_rb(fs_info, qgroupid);
+	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
 	spin_unlock(&fs_info->qgroup_lock);
+	prealloc = NULL;

-	if (IS_ERR(qgroup)) {
-		ret = PTR_ERR(qgroup);
-		goto out;
-	}
 	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
 out:
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	kfree(prealloc);
 	return ret;
 }

@@ -1771,6 +1867,17 @@ out:
 	return ret;
 }

+/*
+ * Inform qgroup to trace one dirty extent, its info is recorded in @record,
+ * so that qgroup can account for it at transaction commit time.
+ *
+ * No-lock version: the caller must hold the delayed ref lock and have
+ * allocated the memory, then call btrfs_qgroup_trace_extent_post() after
+ * exiting the lock context.
+ *
+ * Return 0 for a successful insertion.
+ * Return >0 for an existing record, the caller can free @record safely.
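+ * (either way, a record for the bytenr ends up in
+ * delayed_refs->dirty_extent_root, to be processed by
+ * btrfs_qgroup_account_extents() at transaction commit time)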
+ * Errors are not possible.
+ */
 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
 				struct btrfs_delayed_ref_root *delayed_refs,
 				struct btrfs_qgroup_extent_record *record)
@@ -1780,6 +1887,9 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
 	struct btrfs_qgroup_extent_record *entry;
 	u64 bytenr = record->bytenr;

+	if (!btrfs_qgroup_full_accounting(fs_info))
+		return 1;
+
 	lockdep_assert_held(&delayed_refs->lock);
 	trace_btrfs_qgroup_trace_extent(fs_info, record);

@@ -1806,12 +1916,35 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
 	return 0;
 }

+/*
+ * Post handler after qgroup_trace_extent_nolock().
+ *
+ * NOTE: Current qgroup does the expensive backref walk at transaction
+ * committing time with TRANS_STATE_COMMIT_DOING, which blocks incoming
+ * new transactions.
+ * This is designed to allow btrfs_find_all_roots() to get a correct new_roots
+ * result.
+ *
+ * However for old_roots there is no need to do the backref walk at that time,
+ * since we search commit roots for the backref walk, and the result will
+ * always be correct.
+ *
+ * Due to the nature of the no-lock version, we can't do the backref walk
+ * there, so we must call btrfs_qgroup_trace_extent_post() after exiting the
+ * spinlock context.
+ *
+ * TODO: If we can fix and prove btrfs_find_all_roots() can get a correct
+ * result using the current root, then we can move all the expensive backref
+ * walks out of transaction committing, but not now, as qgroup accounting
+ * would be wrong again.
+ */
 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
 				   struct btrfs_qgroup_extent_record *qrecord)
 {
 	struct btrfs_backref_walk_ctx ctx = { 0 };
 	int ret;

+	if (!btrfs_qgroup_full_accounting(trans->fs_info))
+		return 0;
 	/*
 	 * We are always called in a context where we are already holding a
 	 * transaction handle. Often we are called when adding a data delayed
@@ -1859,6 +1992,19 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
 	return 0;
 }

+/*
+ * Inform qgroup to trace one dirty extent, specified by @bytenr and
+ * @num_bytes, so qgroup can account it at transaction commit time.
+ *
+ * Better encapsulated version, with memory allocation and backref walk for
+ * commit roots, so this can sleep.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for error, like memory allocation failure or invalid parameter
+ * (NULL trans).
+ */
 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 			      u64 num_bytes)
 {
@@ -1867,8 +2013,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int ret;

-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
-	    || bytenr == 0 || num_bytes == 0)
+	if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0)
 		return 0;
 	record = kzalloc(sizeof(*record), GFP_NOFS);
 	if (!record)
@@ -1889,6 +2034,12 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 	return btrfs_qgroup_trace_extent_post(trans, record);
 }

+/*
+ * Inform qgroup to trace all data leaf items.
+ *
+ * Return 0 for success.
+ * Return <0 for error (ENOMEM).
+ */
 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
 				  struct extent_buffer *eb)
 {
@@ -1900,7 +2051,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
 	u64 bytenr, num_bytes;

 	/* We can be called directly from walk_up_proc() */
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_full_accounting(fs_info))
 		return 0;

 	for (i = 0; i < nr; i++) {
@@ -2276,7 +2427,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
 	int level;
 	int ret;

-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_full_accounting(fs_info))
 		return 0;

 	/* Wrong parameter order */
@@ -2319,6 +2470,16 @@ out:
 	return ret;
 }

+/*
+ * Inform qgroup to trace a whole subtree, including all its child tree
+ * blocks and data. The root tree block is specified by @root_eb.
+ *
+ * Normally used by relocation (tree block swap) and subvolume deletion.
+ *
+ * Return 0 for success.
+ * Return <0 for error (ENOMEM or tree search error).
+ */
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *root_eb,
 			       u64 root_gen, int root_level)
@@ -2333,7 +2494,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 	BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
 	BUG_ON(root_eb == NULL);

-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_full_accounting(fs_info))
 		return 0;

 	spin_lock(&fs_info->qgroup_lock);
@@ -2445,62 +2606,64 @@ out:
 	return ret;
 }

+static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+	if (!list_empty(&qgroup->nested_iterator))
+		return;
+
+	list_add_tail(&qgroup->nested_iterator, head);
+}
+
+static void qgroup_iterator_nested_clean(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct btrfs_qgroup *qgroup;
+
+		qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator);
+		list_del_init(&qgroup->nested_iterator);
+	}
+}
+
 #define UPDATE_NEW	0
 #define UPDATE_OLD	1
 /*
  * Walk all of the roots that point to the bytenr and adjust their refcnts.
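+ *
+ * Every qgroup visited is also collected on the caller-provided @qgroups
+ * list via qgroup_iterator_nested_add(), so qgroup_update_counters() can
+ * process the same set afterwards; the caller must release it with
+ * qgroup_iterator_nested_clean().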
*/ -static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - struct ulist *qgroups, u64 seq, int update_old) +static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct list_head *qgroups, + u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; struct btrfs_qgroup *qg; - int ret = 0; if (!roots) - return 0; + return; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { + LIST_HEAD(tmp); + qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; - ulist_reinit(tmp); - ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); - if (ret < 0) - return ret; - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + qgroup_iterator_nested_add(qgroups, qg); + qgroup_iterator_add(&tmp, qg); + list_for_each_entry(qg, &tmp, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(tmp_unode); if (update_old) btrfs_qgroup_update_old_refcnt(qg, seq, 1); else btrfs_qgroup_update_new_refcnt(qg, seq, 1); + list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(qgroups, glist->group->qgroupid, - qgroup_to_aux(glist->group), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), - GFP_ATOMIC); - if (ret < 0) - return ret; + qgroup_iterator_nested_add(qgroups, glist->group); + qgroup_iterator_add(&tmp, glist->group); } } + qgroup_iterator_clean(&tmp); } - return 0; } /* @@ -2539,22 +2702,16 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * But this time we don't need to consider other things, the codes and logic * is easy to understand now. */ -static int qgroup_update_counters(struct btrfs_fs_info *fs_info, - struct ulist *qgroups, - u64 nr_old_roots, - u64 nr_new_roots, - u64 num_bytes, u64 seq) +static void qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct list_head *qgroups, u64 nr_old_roots, + u64 nr_new_roots, u64 num_bytes, u64 seq) { - struct ulist_node *unode; - struct ulist_iterator uiter; struct btrfs_qgroup *qg; - u64 cur_new_count, cur_old_count; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(qgroups, &uiter))) { + list_for_each_entry(qg, qgroups, nested_iterator) { + u64 cur_new_count, cur_old_count; bool dirty = false; - qg = unode_aux_to_qgroup(unode); cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); @@ -2625,7 +2782,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, if (dirty) qgroup_dirty(fs_info, qg); } - return 0; } /* @@ -2662,8 +2818,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct ulist *new_roots) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct ulist *qgroups = NULL; - struct ulist *tmp = NULL; + LIST_HEAD(qgroups); u64 seq; u64 nr_new_roots = 0; u64 nr_old_roots = 0; @@ -2673,7 +2828,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. 
*/ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (!btrfs_qgroup_full_accounting(fs_info) || fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) goto out_free; @@ -2697,17 +2852,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) { - ret = -ENOMEM; - goto out_free; - } - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) { - ret = -ENOMEM; - goto out_free; - } - mutex_lock(&fs_info->qgroup_rescan_lock); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { @@ -2722,29 +2866,27 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, seq = fs_info->qgroup_seq; /* Update old refcnts using old_roots */ - ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, - UPDATE_OLD); - if (ret < 0) - goto out; + qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); /* Update new refcnts using new_roots */ - ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, - UPDATE_NEW); - if (ret < 0) - goto out; + qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); - qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, num_bytes, seq); /* + * We're done using the iterator, release all its qgroups while holding + * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() + * and trigger use-after-free accesses to qgroups. + */ + qgroup_iterator_nested_clean(&qgroups); + + /* * Bump qgroup_seq to avoid seq overlap */ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; -out: spin_unlock(&fs_info->qgroup_lock); out_free: - ulist_free(tmp); - ulist_free(qgroups); ulist_free(old_roots); ulist_free(new_roots); return ret; @@ -2761,6 +2903,9 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) u64 qgroup_to_skip; int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return 0; + delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; while ((node = rb_first(&delayed_refs->dirty_extent_root))) { @@ -2876,7 +3021,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) qgroup_mark_inconsistent(fs_info); spin_lock(&fs_info->qgroup_lock); } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_enabled(fs_info)) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; else fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; @@ -2889,6 +3034,47 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) return ret; } +static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, + u64 inode_rootid, + struct btrfs_qgroup_inherit **inherit) +{ + int i = 0; + u64 num_qgroups = 0; + struct btrfs_qgroup *inode_qg; + struct btrfs_qgroup_list *qg_list; + struct btrfs_qgroup_inherit *res; + size_t struct_sz; + u64 *qgids; + + if (*inherit) + return -EEXIST; + + inode_qg = find_qgroup_rb(fs_info, inode_rootid); + if (!inode_qg) + return -ENOENT; + + num_qgroups = list_count_nodes(&inode_qg->groups); + + if (!num_qgroups) + return 0; + + struct_sz = struct_size(res, qgroups, num_qgroups); + if (struct_sz == SIZE_MAX) + return -ERANGE; + + res = kzalloc(struct_sz, GFP_NOFS); + if (!res) + return -ENOMEM; + res->num_qgroups = num_qgroups; + qgids = res->qgroups; + + 
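/* Record every parent of the inode's qgroup as an auto-inherit target. */ +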
 	list_for_each_entry(qg_list, &inode_qg->groups, next_group)
+		qgids[i++] = qg_list->group->qgroupid;
+
+	*inherit = res;
+	return 0;
+}
+
 /*
  * Copy the accounting information between qgroups. This is necessary
  * when a snapshot or a subvolume is created. Throwing an error will
@@ -2896,7 +3082,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
  * when a readonly fs is a reasonable outcome.
  */
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
-			 u64 objectid, struct btrfs_qgroup_inherit *inherit)
+			 u64 objectid, u64 inode_rootid,
+			 struct btrfs_qgroup_inherit *inherit)
 {
 	int ret = 0;
 	int i;
@@ -2906,10 +3093,17 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *srcgroup;
 	struct btrfs_qgroup *dstgroup;
+	struct btrfs_qgroup *prealloc;
+	struct btrfs_qgroup_list **qlist_prealloc = NULL;
+	bool free_inherit = false;
 	bool need_rescan = false;
 	u32 level_size = 0;
 	u64 nums;

+	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+	if (!prealloc)
+		return -ENOMEM;
+
 	/*
 	 * There are only two callers of this function.
 	 *
@@ -2929,7 +3123,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 	if (!committing)
 		mutex_lock(&fs_info->qgroup_ioctl_lock);
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_enabled(fs_info))
 		goto out;

 	quota_root = fs_info->quota_root;
@@ -2938,6 +3132,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 		goto out;
 	}

+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) {
+		ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit);
+		if (ret)
+			goto out;
+		free_inherit = true;
+	}
+
 	if (inherit) {
 		i_qgroups = (u64 *)(inherit + 1);
 		nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
@@ -2982,16 +3183,28 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 			goto out;
 		}
 		ret = 0;
-	}
+		qlist_prealloc = kcalloc(inherit->num_qgroups,
+					 sizeof(struct btrfs_qgroup_list *),
+					 GFP_NOFS);
+		if (!qlist_prealloc) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		for (int i = 0; i < inherit->num_qgroups; i++) {
+			qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list),
+						    GFP_NOFS);
+			if (!qlist_prealloc[i]) {
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+	}

 	spin_lock(&fs_info->qgroup_lock);

-	dstgroup = add_qgroup_rb(fs_info, objectid);
-	if (IS_ERR(dstgroup)) {
-		ret = PTR_ERR(dstgroup);
-		goto unlock;
-	}
+	dstgroup = add_qgroup_rb(fs_info, prealloc, objectid);
+	prealloc = NULL;

 	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
 		dstgroup->lim_flags = inherit->lim.flags;
@@ -3003,7 +3216,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 		qgroup_dirty(fs_info, dstgroup);
 	}

-	if (srcid) {
+	if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) {
 		srcgroup = find_qgroup_rb(fs_info, srcid);
 		if (!srcgroup)
 			goto unlock;
@@ -3038,7 +3251,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 		i_qgroups = (u64 *)(inherit + 1);
 		for (i = 0; i < inherit->num_qgroups; ++i) {
 			if (*i_qgroups) {
-				ret = add_relation_rb(fs_info, objectid, *i_qgroups);
+				ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
+						      *i_qgroups);
+				qlist_prealloc[i] = NULL;
 				if (ret)
 					goto unlock;
 			}
@@ -3102,6 +3317,14 @@ out:
 		mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	if (need_rescan)
 		qgroup_mark_inconsistent(fs_info);
+	if (qlist_prealloc) {
+		for (int i = 0; i < inherit->num_qgroups; i++)
+			kfree(qlist_prealloc[i]);
+		kfree(qlist_prealloc);
+	}
+	if
(free_inherit) + kfree(inherit); + kfree(prealloc); return ret; } @@ -3125,8 +3348,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; - struct ulist_node *unode; - struct ulist_iterator uiter; + LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return 0; @@ -3146,49 +3368,28 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, if (!qgroup) goto out; - /* - * in a first step, we check all affected qgroups if any limits would - * be exceeded - */ - ulist_reinit(fs_info->qgroup_ulist); - ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - qgroup_to_aux(qgroup), GFP_ATOMIC); - if (ret < 0) - goto out; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(unode); - - if (enforce && !qgroup_check_limits(qg, num_bytes)) { + if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { ret = -EDQUOT; goto out; } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(fs_info->qgroup_ulist, - glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); } + ret = 0; /* * no limits exceeded, now record the reservation into all qgroups */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; - - qg = unode_aux_to_qgroup(unode); - - qgroup_rsv_add(fs_info, qg, num_bytes, type); - } + list_for_each_entry(qgroup, &qgroup_list, iterator) + qgroup_rsv_add(fs_info, qgroup, num_bytes, type); out: + qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); return ret; } @@ -3207,9 +3408,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, enum btrfs_qgroup_rsv_type type) { struct btrfs_qgroup *qgroup; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; + LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return; @@ -3237,30 +3436,17 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, */ num_bytes = qgroup->rsv.values[type]; - ulist_reinit(fs_info->qgroup_ulist); - ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - qgroup_to_aux(qgroup), GFP_ATOMIC); - if (ret < 0) - goto out; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(unode); - - qgroup_rsv_release(fs_info, qg, num_bytes, type); - - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(fs_info->qgroup_ulist, - glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; + qgroup_rsv_release(fs_info, qgroup, num_bytes, type); + list_for_each_entry(glist, &qgroup->groups, next_group) { + qgroup_iterator_add(&qgroup_list, glist->group); } } - out: + qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } @@ -3295,6 +3481,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, int slot; int ret; + if (!btrfs_qgroup_full_accounting(fs_info)) + return 1; + 
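+	/* Each call processes one extent tree leaf, starting at qgroup_rescan_progress. */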
mutex_lock(&fs_info->qgroup_rescan_lock); extent_root = btrfs_extent_root(fs_info, fs_info->qgroup_rescan_progress.objectid); @@ -3375,10 +3564,15 @@ out: static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { - return btrfs_fs_closing(fs_info) || - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || - !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; + if (btrfs_fs_closing(fs_info)) + return true; + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + return true; + if (!btrfs_qgroup_enabled(fs_info)) + return true; + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) + return true; + return false; } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3392,6 +3586,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) bool stopped = false; bool did_leaf_rescans = false; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return; + path = btrfs_alloc_path(); if (!path) goto out; @@ -3495,6 +3692,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, { int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { + btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode"); + return -EINVAL; + } + if (!init_flags) { /* we're resuming qgroup rescan at mount time */ if (!(fs_info->qgroup_flags & @@ -3525,7 +3727,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; - } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { /* Quota disable is in progress */ ret = -EBUSY; } @@ -3546,7 +3748,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, mutex_unlock(&fs_info->qgroup_rescan_lock); btrfs_init_work(&fs_info->qgroup_rescan_work, - btrfs_qgroup_rescan_worker, NULL, NULL); + btrfs_qgroup_rescan_worker, NULL); return 0; } @@ -3590,15 +3792,16 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) * going to clear all tracking information for a clean start. 
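+ *
+ * btrfs_attach_transaction_barrier() below returns -ENOENT when no
+ * transaction is currently running; in that case there is nothing to
+ * commit and the rescan can start right away.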
*/ - trans = btrfs_join_transaction(fs_info->fs_root); - if (IS_ERR(trans)) { + trans = btrfs_attach_transaction_barrier(fs_info->fs_root); + if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - if (ret) { - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - return ret; + } else if (trans != ERR_PTR(-ENOENT)) { + ret = btrfs_commit_transaction(trans); + if (ret) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return ret; + } } qgroup_rescan_zero_tracking(fs_info); @@ -3757,9 +3960,11 @@ static int try_flush_qgroup(struct btrfs_root *root) goto out; btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; goto out; } @@ -3781,7 +3986,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, u64 to_reserve; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || + if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid) || len == 0) return 0; @@ -3852,13 +4057,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed_ret) { struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; - int freed = 0; + u64 freed = 0; int ret; extent_changeset_init(&changeset); @@ -3899,7 +4105,9 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, } btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, BTRFS_QGROUP_RSV_DATA); - ret = freed; + if (freed_ret) + *freed_ret = freed; + ret = 0; out: extent_changeset_release(&changeset); return ret; @@ -3907,19 +4115,23 @@ out: static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, - int free) + u64 *released, int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) - return 0; + if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { + extent_changeset_init(&changeset); + return clear_record_extent_bits(&inode->io_tree, start, + start + len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + } /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) - return qgroup_free_reserved_data(inode, reserved, start, len); + return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); @@ -3934,7 +4146,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, btrfs_qgroup_free_refroot(inode->root->fs_info, inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); - ret = changeset.bytes_changed; + if (released) + *released = changeset.bytes_changed; out: extent_changeset_release(&changeset); return ret; @@ -3953,9 +4166,10 @@ out: * NOTE: This function may sleep for memory allocation. 
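+ *
+ * A sketch of a typical call, with @freed as an optional out parameter
+ * (callers that do not need the byte count may pass NULL):
+ *
+ *	u64 freed = 0;
+ *	int ret;
+ *
+ *	ret = btrfs_qgroup_free_data(inode, reserved, start, len, &freed);
+ *	if (!ret)
+ *		... @freed now holds the number of bytes actually freed ...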
*/ int btrfs_qgroup_free_data(struct btrfs_inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed) { - return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); } /* @@ -3973,9 +4187,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) { - return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, @@ -4024,7 +4238,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid) || num_bytes == 0) return 0; @@ -4061,11 +4275,15 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); } +/* + * Per-transaction meta reservation should be all freed at transaction commit + * time + */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; @@ -4081,7 +4299,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; @@ -4101,9 +4319,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, int num_bytes) { struct btrfs_qgroup *qgroup; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; + LIST_HEAD(qgroup_list); if (num_bytes == 0) return; @@ -4114,39 +4330,36 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; - ulist_reinit(fs_info->qgroup_ulist); - ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - qgroup_to_aux(qgroup), GFP_ATOMIC); - if (ret < 0) - goto out; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(unode); + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; - qgroup_rsv_release(fs_info, qg, num_bytes, + qgroup_rsv_release(fs_info, qgroup, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); - qgroup_rsv_add(fs_info, qg, num_bytes, - BTRFS_QGROUP_RSV_META_PERTRANS); - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(fs_info->qgroup_ulist, - glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + if (!sb_rdonly(fs_info->sb)) + qgroup_rsv_add(fs_info, qgroup, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS); + + list_for_each_entry(glist, &qgroup->groups, 
next_group) + qgroup_iterator_add(&qgroup_list, glist->group); } out: + qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } +/* + * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. + * + * This is called when preallocated meta reservation needs to be used. + * Normally after btrfs_join_transaction() call. + */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ @@ -4254,7 +4467,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int level = btrfs_header_level(subvol_parent) - 1; int ret = 0; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > @@ -4364,7 +4577,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, int ret = 0; int i; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (!is_fstree(root->root_key.objectid) || !root->reloc_root) return 0; @@ -4447,3 +4660,61 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) } *root = RB_ROOT; } + +void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) +{ + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + return; + + if (!is_fstree(root)) + return; + + btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); +} + +int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, + struct btrfs_squota_delta *delta) +{ + int ret; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *qg; + LIST_HEAD(qgroup_list); + u64 root = delta->root; + u64 num_bytes = delta->num_bytes; + const int sign = (delta->is_inc ? 1 : -1); + + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + return 0; + + if (!is_fstree(root)) + return 0; + + /* If the extent predates enabling quotas, don't count it. */ + if (delta->generation < fs_info->qgroup_enable_gen) + return 0; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, root); + if (!qgroup) { + ret = -ENOENT; + goto out; + } + + ret = 0; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qg, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; + + qg->excl += num_bytes * sign; + qg->rfer += num_bytes * sign; + qgroup_dirty(fs_info, qg); + + list_for_each_entry(glist, &qg->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); + } + qgroup_iterator_clean(&qgroup_list); + +out: + spin_unlock(&fs_info->qgroup_lock); + return ret; +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 7bffa10589d6..be18c862e64e 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -101,8 +101,15 @@ * subtree rescan for them. */ -#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3) -#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4) +/* + * These flags share the flags field of the btrfs_qgroup_status_item with the + * persisted flags defined in btrfs_tree.h. + * + * To minimize the chance of collision with new persisted status flags, these + * count backwards from the MSB. 
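+ * They must be masked out with BTRFS_QGROUP_STATUS_FLAGS_MASK whenever
+ * the status item is written back to disk.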
+ */ +#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) +#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) /* * Record a dirty extent, and info qgroup to update quota on it @@ -220,6 +227,33 @@ struct btrfs_qgroup { struct list_head groups; /* groups this group is member of */ struct list_head members; /* groups that are members of this group */ struct list_head dirty; /* dirty groups */ + + /* + * For qgroup iteration usage. + * + * The iteration list should always be empty until qgroup_iterator_add() + * is called. And should be reset to empty after the iteration is + * finished. + */ + struct list_head iterator; + + /* + * For nested iterator usage. + * + * Here we support at most one level of nested iterator calls like: + * + * LIST_HEAD(all_qgroups); + * { + * LIST_HEAD(local_qgroups); + * qgroup_iterator_add(local_qgroups, qg); + * qgroup_iterator_nested_add(all_qgroups, qg); + * do_some_work(local_qgroups); + * qgroup_iterator_clean(local_qgroups); + * } + * do_some_work(all_qgroups); + * qgroup_iterator_nested_clean(all_qgroups); + */ + struct list_head nested_iterator; struct rb_node node; /* tree of qgroups */ /* @@ -235,6 +269,19 @@ struct btrfs_qgroup { struct kobject kobj; }; +struct btrfs_squota_delta { + /* The fstree root this delta counts against. */ + u64 root; + /* The number of bytes in the extent being counted. */ + u64 num_bytes; + /* The generation the extent was created in. */ + u64 generation; + /* Whether we are using or freeing the extent. */ + bool is_inc; + /* Whether the extent is data or metadata. */ + bool is_data; +}; + static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) { return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); @@ -249,14 +296,23 @@ enum { ENUM_BIT(QGROUP_FREE), }; -int btrfs_quota_enable(struct btrfs_fs_info *fs_info); +enum btrfs_qgroup_mode { + BTRFS_QGROUP_MODE_DISABLED, + BTRFS_QGROUP_MODE_FULL, + BTRFS_QGROUP_MODE_SIMPLE +}; + +enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info); +int btrfs_quota_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_quota_ctl_args *quota_ctl_args); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, - u64 dst); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); @@ -267,80 +323,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -/* - * Inform qgroup to trace one dirty extent, its info is recorded in @record. - * So qgroup can account it at transaction committing time. - * - * No lock version, caller must acquire delayed ref lock and allocated memory, - * then call btrfs_qgroup_trace_extent_post() after exiting lock context. - * - * Return 0 for success insert - * Return >0 for existing record, caller can free @record safely. 
- * Error is not possible - */ int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record); - -/* - * Post handler after qgroup_trace_extent_nolock(). - * - * NOTE: Current qgroup does the expensive backref walk at transaction - * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming - * new transaction. - * This is designed to allow btrfs_find_all_roots() to get correct new_roots - * result. - * - * However for old_roots there is no need to do backref walk at that time, - * since we search commit roots to walk backref and result will always be - * correct. - * - * Due to the nature of no lock version, we can't do backref there. - * So we must call btrfs_qgroup_trace_extent_post() after exiting - * spinlock context. - * - * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result - * using current root, then we can move all expensive backref walk out of - * transaction committing, but not now as qgroup accounting will be wrong again. - */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord); - -/* - * Inform qgroup to trace one dirty extent, specified by @bytenr and - * @num_bytes. - * So qgroup can account it at commit trans time. - * - * Better encapsulated version, with memory allocation and backref walk for - * commit roots. - * So this can sleep. - * - * Return 0 if the operation is done. - * Return <0 for error, like memory allocation failure or invalid parameter - * (NULL trans) - */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); - -/* - * Inform qgroup to trace all leaf items of data - * - * Return 0 for success - * Return <0 for error(ENOMEM) - */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb); -/* - * Inform qgroup to trace a whole subtree, including all its child tree - * blocks and data. - * The root tree block is specified by @root_eb. - * - * Normally used by relocation(tree block swap) and subvolume deletion. 
- * - * Return 0 for success - * Return <0 for error(ENOMEM or tree search error) - */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level); @@ -350,7 +342,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, - u64 objectid, struct btrfs_qgroup_inherit *inherit); + u64 objectid, u64 inode_rootid, + struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type); @@ -363,10 +356,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, /* New io_tree based accurate qgroup reserve API */ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released); int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, - u64 len); + u64 len, u64 *freed); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, @@ -408,20 +401,8 @@ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, BTRFS_QGROUP_RSV_META_PREALLOC); } -/* - * Per-transaction meta reservation should be all freed at transaction commit - * time - */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); - -/* - * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. - * - * This is called when preallocated meta reservation needs to be used. - * Normally after btrfs_join_transaction() call. - */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); - void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode); /* btrfs_qgroup_swapped_blocks related functions */ @@ -439,5 +420,8 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info); +void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); +int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, + struct btrfs_squota_delta *delta); #endif diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c new file mode 100644 index 000000000000..9589362acfbf --- /dev/null +++ b/fs/btrfs/raid-stripe-tree.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Western Digital Corporation or its affiliates. 
+ */ + +#include <linux/btrfs_tree.h> +#include "ctree.h" +#include "fs.h" +#include "accessors.h" +#include "transaction.h" +#include "disk-io.h" +#include "raid-stripe-tree.h" +#include "volumes.h" +#include "misc.h" +#include "print-tree.h" + +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *stripe_root = fs_info->stripe_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + u64 found_start; + u64 found_end; + u64 end = start + length; + int slot; + int ret; + + if (!stripe_root) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + key.objectid = start; + key.type = BTRFS_RAID_STRIPE_KEY; + key.offset = length; + + ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + + /* That stripe ends before we start, we're done. */ + if (found_end <= start) + break; + + trace_btrfs_raid_extent_delete(fs_info, start, end, + found_start, found_end); + + ASSERT(found_start >= start && found_end <= end); + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + break; + + btrfs_release_path(path); + } + + btrfs_free_path(path); + return ret; +} + +static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key stripe_key; + struct btrfs_root *stripe_root = fs_info->stripe_root; + const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type); + u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type); + struct btrfs_stripe_extent *stripe_extent; + const size_t item_size = struct_size(stripe_extent, strides, num_stripes); + int ret; + + stripe_extent = kzalloc(item_size, GFP_NOFS); + if (!stripe_extent) { + btrfs_abort_transaction(trans, -ENOMEM); + btrfs_end_transaction(trans); + return -ENOMEM; + } + + trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size, + num_stripes); + btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding); + for (int i = 0; i < num_stripes; i++) { + u64 devid = bioc->stripes[i].dev->devid; + u64 physical = bioc->stripes[i].physical; + u64 length = bioc->stripes[i].length; + struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; + + if (length == 0) + length = bioc->size; + + btrfs_set_stack_raid_stride_devid(raid_stride, devid); + btrfs_set_stack_raid_stride_physical(raid_stride, physical); + } + + stripe_key.objectid = bioc->logical; + stripe_key.type = BTRFS_RAID_STRIPE_KEY; + stripe_key.offset = bioc->size; + + ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, + item_size); + if (ret) + btrfs_abort_transaction(trans, ret); + + kfree(stripe_extent); + + return ret; +} + +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_ordered_extent *ordered_extent) +{ + struct btrfs_io_context *bioc; + int ret; + + if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE)) + return 0; + + list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) { + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) + return ret; + } + + while (!list_empty(&ordered_extent->bioc_list)) { + bioc = 
list_first_entry(&ordered_extent->bioc_list,
+					typeof(*bioc), rst_ordered_entry);
+		list_del(&bioc->rst_ordered_entry);
+		btrfs_put_bioc(bioc);
+	}
+
+	return 0;
+}
+
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+				 u64 logical, u64 *length, u64 map_type,
+				 u32 stripe_index, struct btrfs_io_stripe *stripe)
+{
+	struct btrfs_root *stripe_root = fs_info->stripe_root;
+	struct btrfs_stripe_extent *stripe_extent;
+	struct btrfs_key stripe_key;
+	struct btrfs_key found_key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	const u64 end = logical + *length;
+	int num_stripes;
+	u8 encoding;
+	u64 offset;
+	u64 found_logical;
+	u64 found_length;
+	u64 found_end;
+	int slot;
+	int ret;
+
+	stripe_key.objectid = logical;
+	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+	stripe_key.offset = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	if (stripe->is_scrub) {
+		path->skip_locking = 1;
+		path->search_commit_root = 1;
+	}
+
+	ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
+	if (ret < 0)
+		goto free_path;
+	if (ret) {
+		if (path->slots[0] != 0)
+			path->slots[0]--;
+	}
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		found_logical = found_key.objectid;
+		found_length = found_key.offset;
+		found_end = found_logical + found_length;
+
+		if (found_logical > end) {
+			ret = -ENOENT;
+			goto out;
+		}
+
+		if (in_range(logical, found_logical, found_length))
+			break;
+
+		ret = btrfs_next_item(stripe_root, path);
+		if (ret)
+			goto out;
+	}
+
+	offset = logical - found_logical;
+
+	/*
+	 * If we have a logically contiguous, but physically non-contiguous
+	 * range, we need to split the bio. Record the length after which we
+	 * must split the bio.
+	 */
+	if (end > found_end)
+		*length -= end - found_end;
+
+	num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+	stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+	encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent);
+
+	if (encoding != btrfs_bg_flags_to_raid_index(map_type)) {
+		ret = -EUCLEAN;
+		btrfs_handle_fs_error(fs_info, ret,
+				      "on-disk stripe encoding %d doesn't match RAID index %d",
+				      encoding,
+				      btrfs_bg_flags_to_raid_index(map_type));
+		goto out;
+	}
+
+	for (int i = 0; i < num_stripes; i++) {
+		struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
+		u64 devid = btrfs_raid_stride_devid(leaf, stride);
+		u64 physical = btrfs_raid_stride_physical(leaf, stride);
+
+		if (devid != stripe->dev->devid)
+			continue;
+
+		if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i)
+			continue;
+
+		stripe->physical = physical + offset;
+
+		trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
+						   stripe->physical, devid);
+
+		ret = 0;
+		goto free_path;
+	}
+
+	/* If we're here, we haven't found the requested devid in the stripe.
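+	 * For DUP profiles the stripe index must match as well, as checked in
+	 * the loop above.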
*/ + ret = -ENOENT; +out: + if (ret > 0) + ret = -ENOENT; + if (ret && ret != -EIO && !stripe->is_scrub) { + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) + btrfs_print_tree(leaf, 1); + btrfs_err(fs_info, + "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", + logical, logical + *length, stripe->dev->devid, + btrfs_bg_type_to_raid_name(map_type)); + } +free_path: + btrfs_free_path(path); + + return ret; +} diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h new file mode 100644 index 000000000000..cdb58b38fcb5 --- /dev/null +++ b/fs/btrfs/raid-stripe-tree.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Western Digital Corporation or its affiliates. + */ + +#ifndef BTRFS_RAID_STRIPE_TREE_H +#define BTRFS_RAID_STRIPE_TREE_H + +#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID1_MASK | \ + BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID10) + +struct btrfs_io_context; +struct btrfs_io_stripe; +struct btrfs_ordered_extent; +struct btrfs_trans_handle; + +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length); +int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length, u64 map_type, + u32 stripe_index, struct btrfs_io_stripe *stripe); +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_ordered_extent *ordered_extent); + +static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, + u64 map_type) +{ + u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK; + u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) + return false; + + if (type != BTRFS_BLOCK_GROUP_DATA) + return false; + + if (profile & BTRFS_RST_SUPP_BLOCK_GROUP_MASK) + return true; + + return false; +} + +static inline int btrfs_num_raid_stripes(u32 item_size) +{ + return (item_size - offsetof(struct btrfs_stripe_extent, strides)) / + sizeof(struct btrfs_raid_stride); +} + +#endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0249ea52bb80..3e014b9370a3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -584,8 +584,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING || - last->operation == BTRFS_RBIO_READ_REBUILD) + if (last->operation == BTRFS_RBIO_READ_REBUILD) return 0; return 1; @@ -784,10 +783,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock(&rbio->bio_list_lock); spin_unlock(&h->lock); - if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, recover_rbio_work_locked); - else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { - steal_rbio(rbio, next); + if (next->operation == BTRFS_RBIO_READ_REBUILD) { start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); @@ -1517,11 +1513,11 @@ static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_read_end_io; - if (trace_raid56_scrub_read_recover_enabled()) { + if (trace_raid56_read_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + trace_raid56_read(rbio, bio, &trace_info); } submit_bio(bio); } @@ -1698,8 +1694,7 @@ static int verify_one_sector(struct btrfs_raid_bio 
*rbio, * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); @@ -1763,8 +1758,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); @@ -1897,8 +1891,7 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) goto out; } - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock(&rbio->bio_list_lock); @@ -2112,8 +2105,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) goto error; } - ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, - rbio->csum_buf, rbio->csum_bitmap, false); + ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, + rbio->csum_buf, rbio->csum_bitmap); if (ret < 0) goto error; if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) @@ -2198,11 +2191,11 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_write_end_io; - if (trace_raid56_write_stripe_enabled()) { + if (trace_raid56_write_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); + trace_raid56_write(rbio, bio, &trace_info); } submit_bio(bio); } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0e84c9c9293f..45e6ff78316f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -14,7 +14,6 @@ enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, BTRFS_RBIO_PARITY_SCRUB, - BTRFS_RBIO_REBUILD_MISSING, }; struct btrfs_raid_bio { diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 95d28497de7c..6486f0d7e993 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -485,6 +485,9 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, ret = add_shared_data_ref(fs_info, offset, count, key->objectid, key->offset); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + break; default: btrfs_err(fs_info, "invalid key type in iref"); ret = -EINVAL; @@ -652,7 +655,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info, } /* - * btrfs_ref_tree_mod: called when we modify a ref for a bytenr + * Called when we modify a ref for a bytenr. * * This will add an action item to the given bytenr and do sanity checks to make * sure we haven't messed something up. 
If we are making a new allocation and @@ -681,10 +684,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (generic_ref->type == BTRFS_REF_METADATA) { if (!parent) - ref_root = generic_ref->tree_ref.owning_root; + ref_root = generic_ref->tree_ref.ref_root; owner = generic_ref->tree_ref.level; } else if (!parent) { - ref_root = generic_ref->data_ref.owning_root; + ref_root = generic_ref->data_ref.ref_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; } @@ -791,6 +794,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); + kfree(re); goto out_unlock; } else if (be->num_refs == 0) { btrfs_err(fs_info, @@ -800,6 +804,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); + kfree(re); goto out_unlock; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 0474bbe39da7..f88b0c2ac3fe 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -25,13 +25,11 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, const u64 olen, int no_time_update) { - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; inode_inc_iversion(inode); if (!no_time_update) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } /* * We round up to the block size at eof when determining which @@ -44,7 +42,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 46c3c1d57266..f5d9e5f74a52 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -111,8 +111,8 @@ struct tree_block { }; /* Use rb_simple_node for search/insert */ u64 owner; struct btrfs_key key; - unsigned int level:8; - unsigned int key_ready:1; + u8 level; + bool key_ready; }; #define MAX_EXTENTS 128 @@ -122,6 +122,13 @@ struct file_extent_cluster { u64 end; u64 boundary[MAX_EXTENTS]; unsigned int nr; + u64 owning_root; +}; + +/* Stages of data relocation. 
*/ +enum reloc_stage { + MOVE_DATA_EXTENTS, + UPDATE_DATA_PTRS }; struct reloc_control { @@ -155,16 +162,12 @@ struct reloc_control { u64 search_start; u64 extents_found; - unsigned int stage:8; - unsigned int create_reloc_tree:1; - unsigned int merge_reloc_tree:1; - unsigned int found_file_extent:1; + enum reloc_stage stage; + bool create_reloc_tree; + bool merge_reloc_tree; + bool found_file_extent; }; -/* stages of data relocation */ -#define MOVE_DATA_EXTENTS 0 -#define UPDATE_DATA_PTRS 1 - static void mark_block_processed(struct reloc_control *rc, struct btrfs_backref_node *node) { @@ -180,13 +183,6 @@ static void mark_block_processed(struct reloc_control *rc, node->processed = 1; } - -static void mapping_tree_init(struct mapping_tree *tree) -{ - tree->rb_root = RB_ROOT; - spin_lock_init(&tree->lock); -} - /* * walk up backref nodes until reach node presents tree root */ @@ -299,7 +295,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans, return 1; } -static bool reloc_root_is_dead(struct btrfs_root *root) +static bool reloc_root_is_dead(const struct btrfs_root *root) { /* * Pair with set_bit/clear_bit in clean_dirty_subvols and @@ -320,7 +316,7 @@ static bool reloc_root_is_dead(struct btrfs_root *root) * from no reloc root. But btrfs_should_ignore_reloc_root() below is a * special case. */ -static bool have_reloc_root(struct btrfs_root *root) +static bool have_reloc_root(const struct btrfs_root *root) { if (reloc_root_is_dead(root)) return false; @@ -329,31 +325,30 @@ static bool have_reloc_root(struct btrfs_root *root) return true; } -int btrfs_should_ignore_reloc_root(struct btrfs_root *root) +bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root) { struct btrfs_root *reloc_root; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - return 0; + return false; /* This root has been merged with its reloc tree, we can ignore it */ if (reloc_root_is_dead(root)) - return 1; + return true; reloc_root = root->reloc_root; if (!reloc_root) - return 0; + return false; if (btrfs_header_generation(reloc_root->commit_root) == root->fs_info->running_transaction->transid) - return 0; + return false; /* - * if there is reloc tree and it was created in previous - * transaction backref lookup can find the reloc tree, - * so backref node for the fs tree root is useless for - * relocation. + * If there is a reloc tree and it was created in the previous + * transaction, backref lookup can find the reloc tree, so the backref + * node for the fs tree root is useless for relocation. */ - return 1; + return true; } /* @@ -466,6 +461,7 @@ static bool handle_useless_nodes(struct reloc_control *rc, * cached.
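Moving the stage from an int bitfield plus two #defines to enum reloc_stage (and the flag bitfields to bool) trades a few bytes of struct packing for type checking; a switch over the enum can also be checked for exhaustiveness by the compiler. A tiny standalone illustration of the pattern (the stage_to_string() later in this diff uses an if-chain; this sketch uses a switch):

#include <stdio.h>

enum reloc_stage {
	MOVE_DATA_EXTENTS,
	UPDATE_DATA_PTRS,
};

static const char *stage_to_string(enum reloc_stage stage)
{
	switch (stage) {
	case MOVE_DATA_EXTENTS:
		return "move data extents";
	case UPDATE_DATA_PTRS:
		return "update data pointers";
	}
	return "unknown";	/* -Wswitch would flag a missing case above */
}

int main(void)
{
	printf("%s\n", stage_to_string(UPDATE_DATA_PTRS));
	return 0;
}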
*/ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( + struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_key *node_key, int level, u64 bytenr) { @@ -499,8 +495,8 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( /* Breadth-first search to build backref cache */ do { - ret = btrfs_backref_add_tree_node(cache, path, iter, node_key, - cur); + ret = btrfs_backref_add_tree_node(trans, cache, path, iter, + node_key, cur); if (ret < 0) { err = ret; goto out; @@ -546,7 +542,7 @@ out: */ static int clone_backref_node(struct btrfs_trans_handle *trans, struct reloc_control *rc, - struct btrfs_root *src, + const struct btrfs_root *src, struct btrfs_root *dest) { struct btrfs_root *reloc_root = src->reloc_root; @@ -631,7 +627,7 @@ fail: /* * helper to add 'address of tree root -> reloc tree' mapping */ -static int __must_check __add_reloc_root(struct btrfs_root *root) +static int __add_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; @@ -1158,7 +1154,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(leaf, fi); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - num_bytes, parent); + num_bytes, parent, root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), key.objectid, key.offset, root->root_key.objectid, false); @@ -1169,7 +1165,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, parent); + num_bytes, parent, root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), key.objectid, key.offset, root->root_key.objectid, false); @@ -1180,15 +1176,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } } if (dirty) - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (inode) btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } -static noinline_for_stack -int memcmp_node_keys(struct extent_buffer *eb, int slot, - struct btrfs_path *path, int level) +static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb, + int slot, const struct btrfs_path *path, + int level) { struct btrfs_disk_key key1; struct btrfs_disk_key key2; @@ -1373,16 +1369,17 @@ again: */ btrfs_set_node_blockptr(parent, slot, new_bytenr); btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); btrfs_set_node_blockptr(path->nodes[level], path->slots[level], old_bytenr); btrfs_set_node_ptr_generation(path->nodes[level], path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(path->nodes[level]); + btrfs_mark_buffer_dirty(trans, path->nodes[level]); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, - blocksize, path->nodes[level]->start); + blocksize, path->nodes[level]->start, + src->root_key.objectid); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); @@ -1391,7 +1388,7 @@ again: break; } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - blocksize, 0); + blocksize, 0, dest->root_key.objectid); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); @@ -1400,8 +1397,9 @@ again: break; } + /* We don't know the real owning_root, use 0. 
*/ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, - blocksize, path->nodes[level]->start); + blocksize, path->nodes[level]->start, 0); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 0, true); ret = btrfs_free_extent(trans, &ref); @@ -1410,8 +1408,9 @@ again: break; } + /* We don't know the real owning_root, use 0. */ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, - blocksize, 0); + blocksize, 0, 0); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, true); ret = btrfs_free_extent(trans, &ref); @@ -1517,8 +1516,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, * [min_key, max_key) */ static int invalidate_extent_cache(struct btrfs_root *root, - struct btrfs_key *min_key, - struct btrfs_key *max_key) + const struct btrfs_key *min_key, + const struct btrfs_key *max_key) { struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = NULL; @@ -1896,7 +1895,7 @@ again: } } - rc->merge_reloc_tree = 1; + rc->merge_reloc_tree = true; while (!list_empty(&rc->reloc_roots)) { reloc_root = list_entry(rc->reloc_roots.next, @@ -2516,11 +2515,12 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start); btrfs_set_node_ptr_generation(upper->eb, slot, trans->transid); - btrfs_mark_buffer_dirty(upper->eb); + btrfs_mark_buffer_dirty(trans, upper->eb); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, node->eb->start, blocksize, - upper->eb->start); + upper->eb->start, + btrfs_header_owner(upper->eb)); btrfs_init_tree_ref(&ref, node->level, btrfs_header_owner(upper->eb), root->root_key.objectid, false); @@ -2632,7 +2632,7 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) u32 blocksize = rc->extent_root->fs_info->nodesize; if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) return 1; return 0; } @@ -2659,7 +2659,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, else btrfs_node_key_to_cpu(eb, &block->key, 0); free_extent_buffer(eb); - block->key_ready = 1; + block->key_ready = true; return 0; } @@ -2803,7 +2803,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Do tree relocation */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { - node = build_backref_tree(rc, &block->key, + node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { err = PTR_ERR(node); @@ -2829,7 +2829,7 @@ out_free_blocks: static noinline_for_stack int prealloc_file_extent_cluster( struct btrfs_inode *inode, - struct file_extent_cluster *cluster) + const struct file_extent_cluster *cluster) { u64 alloc_hint = 0; u64 start; @@ -2964,7 +2964,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod /* * Allow error injection to test balance/relocation cancellation */ -noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || atomic_read(&fs_info->reloc_cancel_req) || @@ -2972,7 +2972,7 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); -static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, +static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, int cluster_nr) { /* Last extent, use cluster end directly */ @@ -2984,7 
+2984,7 @@ static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, } static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, - struct file_extent_cluster *cluster, + const struct file_extent_cluster *cluster, int *cluster_nr, unsigned long page_index) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3006,9 +3006,6 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, if (!page) return -ENOMEM; } - ret = set_page_extent_mapped(page); - if (ret < 0) - goto release_page; if (PageReadahead(page)) page_cache_async_readahead(inode->i_mapping, ra, NULL, @@ -3024,6 +3021,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, } } + /* + * We could have lost page private when we dropped the lock to read the + * page above, make sure we set_page_extent_mapped here so we have any + * of the subpage blocksize stuff we need in place. + */ + ret = set_page_extent_mapped(page); + if (ret < 0) + goto release_page; + page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; @@ -3113,7 +3119,7 @@ release_page: } static int relocate_file_extent_cluster(struct inode *inode, - struct file_extent_cluster *cluster) + const struct file_extent_cluster *cluster) { u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; @@ -3151,11 +3157,12 @@ out: return ret; } -static noinline_for_stack -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, - struct file_extent_cluster *cluster) +static noinline_for_stack int relocate_data_extent(struct inode *inode, + const struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) { int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { ret = relocate_file_extent_cluster(inode, cluster); @@ -3164,8 +3171,38 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, cluster->nr = 0; } - if (!cluster->nr) + /* + * Under simple quotas, we set root->relocation_src_root when we find + * the extent. If adjacent extents have different owners, we can't merge + * them while relocating. Handle this by storing the owning root that + * started a cluster and if we see an extent from a different root break + * cluster formation (just like the above case of non-adjacent extents). + * + * Without simple quotas, relocation_src_root is always 0, so we should + * never see a mismatch, and it should have no effect on relocation + * clusters. + */ + if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { + u64 tmp = root->relocation_src_root; + + /* + * root->relocation_src_root is the state that actually affects + * the preallocation we do here, so set it to the root owning + * the cluster we need to relocate. + */ + root->relocation_src_root = cluster->owning_root; + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + /* And reset it back for the current extent's owning root. 
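relocate_data_extent() above now closes a cluster on either of two conditions: the next extent is not adjacent, or, under simple quotas, it is owned by a different root. Stripped of inode and transaction details that is a flush-on-discontinuity accumulator; a userspace sketch with invented types, where flush_cluster() stands in for relocate_file_extent_cluster():

#include <stdint.h>
#include <stdio.h>

#define MAX_EXTENTS 128

struct cluster {
	uint64_t start, end;
	uint64_t owner;
	unsigned int nr;
};

static void flush_cluster(struct cluster *c)
{
	if (c->nr)
		printf("relocate [%llu, %llu] owner %llu (%u extents)\n",
		       (unsigned long long)c->start, (unsigned long long)c->end,
		       (unsigned long long)c->owner, c->nr);
	c->nr = 0;
}

/* Queue one extent [objectid, objectid + size) owned by root @owner. */
static void add_extent(struct cluster *c, uint64_t objectid, uint64_t size,
		       uint64_t owner)
{
	/* Non-adjacent extent or a different owning root: close the cluster. */
	if (c->nr && (objectid != c->end + 1 || owner != c->owner))
		flush_cluster(c);

	if (!c->nr) {
		c->start = objectid;
		c->owner = owner;
	}
	c->end = objectid + size - 1;
	if (++c->nr >= MAX_EXTENTS)	/* the kernel asserts instead */
		flush_cluster(c);
}

int main(void)
{
	struct cluster c = { 0 };

	add_extent(&c, 4096, 4096, 5);	/* opens a cluster owned by root 5 */
	add_extent(&c, 8192, 4096, 5);	/* adjacent, same owner: merged */
	add_extent(&c, 12288, 4096, 7);	/* owner changed: flushes first */
	flush_cluster(&c);
	return 0;
}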
*/ + root->relocation_src_root = tmp; + } + + if (!cluster->nr) { cluster->start = extent_key->objectid; + cluster->owning_root = root->relocation_src_root; + } else BUG_ON(cluster->nr >= MAX_EXTENTS); cluster->end = extent_key->objectid + extent_key->offset - 1; @@ -3186,7 +3223,7 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, * the major work is getting the generation and level of the block */ static int add_tree_block(struct reloc_control *rc, - struct btrfs_key *extent_key, + const struct btrfs_key *extent_key, struct btrfs_path *path, struct rb_root *blocks) { @@ -3250,12 +3287,13 @@ static int add_tree_block(struct reloc_control *rc, if (type == BTRFS_TREE_BLOCK_REF_KEY) owner = btrfs_extent_inline_ref_offset(eb, iref); } - } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); - return -EINVAL; } else { - BUG(); + btrfs_print_leaf(eb); + btrfs_err(rc->block_group->fs_info, + "unrecognized tree backref at tree block %llu slot %u", + eb->start, path->slots[0]); + btrfs_release_path(path); + return -EUCLEAN; } btrfs_release_path(path); @@ -3270,7 +3308,7 @@ static int add_tree_block(struct reloc_control *rc, block->key.objectid = rc->extent_root->fs_info->nodesize; block->key.offset = generation; block->level = level; - block->key_ready = 0; + block->key_ready = false; block->owner = owner; rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); @@ -3436,11 +3474,10 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, /* * helper to find all tree blocks that reference a given data extent */ -static noinline_for_stack -int add_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct btrfs_path *path, - struct rb_root *blocks) +static noinline_for_stack int add_data_references(struct reloc_control *rc, + const struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) { struct btrfs_backref_walk_ctx ctx = { 0 }; struct ulist_iterator leaf_uiter; @@ -3498,6 +3535,8 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, last = rc->block_group->start + rc->block_group->length; while (1) { + bool block_found; + cond_resched(); if (rc->search_start >= last) { ret = 1; @@ -3548,11 +3587,11 @@ next: goto next; } - ret = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY, NULL); + block_found = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY, NULL); - if (ret == 0 && start <= key.objectid) { + if (block_found && start <= key.objectid) { btrfs_release_path(path); rc->search_start = end + 1; } else { @@ -3612,7 +3651,7 @@ int prepare_to_relocate(struct reloc_control *rc) if (ret) return ret; - rc->create_reloc_tree = 1; + rc->create_reloc_tree = true; set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); @@ -3692,6 +3731,21 @@ restart: struct btrfs_extent_item); flags = btrfs_extent_flags(path->nodes[0], ei); + /* + * If we are relocating a simple quota owned extent item, we + * need to note the owner on the reloc data root so that when + * we allocate the replacement item, we can attribute it to the + * correct eventual owner (rather than the reloc data root). 
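add_tree_block() above also downgrades what used to be a BUG() into a printed diagnostic plus -EUCLEAN, the errno btrfs uses for corrupt on-disk data: unexpected input read from disk is an error to report and propagate, not an assertion to crash on. The defensive shape in a userspace sketch (EUCLEAN is Linux-specific):

#include <errno.h>
#include <stdio.h>

enum backref_kind { TREE_BLOCK_REF, SHARED_BLOCK_REF };

/* Parse a backref kind; unknown values mean corruption, not a program bug. */
static int parse_backref(int raw_kind, enum backref_kind *out)
{
	switch (raw_kind) {
	case TREE_BLOCK_REF:
	case SHARED_BLOCK_REF:
		*out = (enum backref_kind)raw_kind;
		return 0;
	default:
		/* Corrupt input: report it and bail out, don't abort(). */
		fprintf(stderr, "unrecognized tree backref %d\n", raw_kind);
		return -EUCLEAN;
	}
}

int main(void)
{
	enum backref_kind kind;

	return parse_backref(42, &kind) == -EUCLEAN ? 0 : 1;
}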
+ */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { + struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; + u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, + path->nodes[0], + path->slots[0]); + + root->relocation_src_root = owning_root_id; + } + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); } else if (rc->stage == UPDATE_DATA_PTRS && @@ -3724,7 +3778,7 @@ restart: if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { - rc->found_file_extent = 1; + rc->found_file_extent = true; ret = relocate_data_extent(rc->data_inode, &key, &rc->cluster); if (ret < 0) { @@ -3761,7 +3815,7 @@ restart: err = ret; } - rc->create_reloc_tree = 0; + rc->create_reloc_tree = false; set_reloc_control(rc); btrfs_backref_release_cache(&rc->backref_cache); @@ -3779,7 +3833,7 @@ restart: merge_reloc_roots(rc); - rc->merge_reloc_tree = 0; + rc->merge_reloc_tree = false; unset_reloc_control(rc); btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); @@ -3825,7 +3879,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -3864,9 +3918,9 @@ out: * helper to create inode for data relocation. * the inode is in data relocation tree and its link count is 0 */ -static noinline_for_stack -struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *group) +static noinline_for_stack struct inode *create_reloc_inode( + struct btrfs_fs_info *fs_info, + const struct btrfs_block_group *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -3961,8 +4015,9 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->reloc_roots); INIT_LIST_HEAD(&rc->dirty_subvol_roots); - btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); - mapping_tree_init(&rc->reloc_root_tree); + btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); + rc->reloc_root_tree.rb_root = RB_ROOT; + spin_lock_init(&rc->reloc_root_tree.lock); extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } @@ -3994,7 +4049,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, block_group->start, buf); } -static const char *stage_to_string(int stage) +static const char *stage_to_string(enum reloc_stage stage) { if (stage == MOVE_DATA_EXTENTS) return "move data extents"; @@ -4110,7 +4165,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) WARN_ON(ret && ret != -EAGAIN); while (1) { - int finishes_stage; + enum reloc_stage finishes_stage; mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); @@ -4293,7 +4348,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) goto out_unset; } - rc->merge_reloc_tree = 1; + rc->merge_reloc_tree = true; while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, @@ -4412,7 +4467,8 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) } int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, + struct btrfs_root *root, + const struct extent_buffer *buf, struct extent_buffer *cow) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4551,7 +4607,7 @@ int btrfs_reloc_post_snapshot(struct 
btrfs_trans_handle *trans, * * Return U64_MAX if no running relocation. */ -u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info) +u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info) { u64 logical = U64_MAX; diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 77d69f6ae967..5fb60f2deb53 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -10,15 +10,16 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, + struct btrfs_root *root, + const struct extent_buffer *buf, struct extent_buffer *cow); void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); +int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info); struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); -int btrfs_should_ignore_reloc_root(struct btrfs_root *root); -u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info); +bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root); +u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 859874579456..603ad1459368 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -51,7 +51,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, } /* - * btrfs_find_root - lookup the root by the key. + * Lookup the root by the key. + * * root: the root of the root tree * search_key: the key to search * path: the path we search @@ -191,7 +192,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); return ret; @@ -438,7 +439,7 @@ again: btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); write_extent_buffer(leaf, name->name, ptr, name->len); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); @@ -485,7 +486,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, } /* - * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * Reserve space for subvolume operation. 
+ * * root: the root of the parent directory * rsv: block reservation * items: the number of items that we need do reservation @@ -508,7 +510,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + if (btrfs_qgroup_enabled(fs_info)) { /* One for parent inode, two for dir entries */ qgroup_num_bytes = 3 * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_prealloc(root, diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index cbbaca32126e..8b2c3859e464 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ROOT_TREE_H #define BTRFS_ROOT_TREE_H +struct fscrypt_str; + int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -18,10 +20,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_root_item *item); -int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_root_item *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_root_item *item); int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, struct btrfs_path *path, struct btrfs_root_item *root_item, struct btrfs_key *root_key); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7289f5bff397..f62a408671cb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -16,7 +16,6 @@ #include "backref.h" #include "extent_io.h" #include "dev-replace.h" -#include "check-integrity.h" #include "raid56.h" #include "block-group.h" #include "zoned.h" @@ -24,6 +23,7 @@ #include "accessors.h" #include "file-item.h" #include "scrub.h" +#include "raid-stripe-tree.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -43,9 +43,20 @@ struct scrub_ctx; /* * The following value only influences the performance. * - * This determines the batch size for stripe submitted in one go. + * This determines how many stripes would be submitted in one go, + * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). */ -#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */ +#define SCRUB_STRIPES_PER_GROUP 8 + +/* + * How many groups we have for each sctx. + * + * This would be 8M per device, the same value as the old scrub in-flight bios + * size limit.
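The regrouping above turns the flat 8-stripe array into 16 groups of 8 stripes: a group is the unit of submission (512KiB of IO at the 64KiB BTRFS_STRIPE_LEN), and a full context caps in-flight IO at 8MiB per device, the old limit the comment cites. The arithmetic, checked at compile time in standalone C (BTRFS_STRIPE_LEN is restated here as an assumption):

#include <assert.h>

#define BTRFS_STRIPE_LEN	(64 * 1024)	/* 64KiB, as in btrfs */
#define SCRUB_STRIPES_PER_GROUP	8
#define SCRUB_GROUPS_PER_SCTX	16
#define SCRUB_TOTAL_STRIPES	(SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)

/* One group submitted at once: 8 * 64KiB = 512KiB. */
static_assert(SCRUB_STRIPES_PER_GROUP * BTRFS_STRIPE_LEN == 512 * 1024,
	      "group submission size");
/* Whole context: 128 stripes, i.e. 16 * 512KiB = 8MiB in flight per device. */
static_assert(SCRUB_TOTAL_STRIPES == 128, "stripe slots per sctx");
static_assert(SCRUB_TOTAL_STRIPES * BTRFS_STRIPE_LEN == 8 * 1024 * 1024,
	      "in-flight IO cap");

int main(void) { return 0; }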
+ */ +#define SCRUB_GROUPS_PER_SCTX 16 + +#define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP) /* * The following value times PAGE_SIZE needs to be large enough to match the @@ -172,9 +183,11 @@ struct scrub_stripe { }; struct scrub_ctx { - struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX]; + struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES]; struct scrub_stripe *raid56_data_stripes; struct btrfs_fs_info *fs_info; + struct btrfs_path extent_path; + struct btrfs_path csum_path; int first_free; int cur_stripe; atomic_t cancel_req; @@ -315,10 +328,10 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) + for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) release_scrub_stripe(&sctx->stripes[i]); - kfree(sctx); + kvfree(sctx); } static void scrub_put_ctx(struct scrub_ctx *sctx) @@ -333,13 +346,20 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( struct scrub_ctx *sctx; int i; - sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); + /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use + * kvzalloc(). + */ + sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { + sctx->extent_path.search_commit_root = 1; + sctx->extent_path.skip_locking = 1; + sctx->csum_path.search_commit_root = 1; + sctx->csum_path.skip_locking = 1; + for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { int ret; ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); @@ -877,7 +897,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, ASSERT(stripe->mirror_num >= 1); ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, stripe->logical, &mapped_len, &bioc, - NULL, NULL, 1); + NULL, NULL); /* * If we failed, dev will be NULL, and later detailed reports * will just be skipped. @@ -970,6 +990,9 @@ skip: spin_unlock(&sctx->stat_lock); } +static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, + unsigned long write_bitmap, bool dev_replace); + /* * The main entrance for all read related scrub work, including: * @@ -978,13 +1001,16 @@ skip: * - Go through the remaining mirrors and try to read as large blocksize as * possible * - Go through all mirrors (including the failed mirror) sector-by-sector + * - Submit writeback for repaired sectors * - * Writeback does not happen here, it needs extra synchronization. + * Writeback for dev-replace does not happen here, it needs extra + * synchronization for zoned devices. */ static void scrub_stripe_read_repair_worker(struct work_struct *work) { struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct scrub_ctx *sctx = stripe->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); int mirror; @@ -1049,7 +1075,23 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) goto out; } out: - scrub_stripe_report_errors(stripe->sctx, stripe); + /* + * Submit the repaired sectors. For zoned case, we cannot do repair + * in-place, but queue the bg to be relocated. 
+ */ + if (btrfs_is_zoned(fs_info)) { + if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); + } else if (!sctx->readonly) { + unsigned long repaired; + + bitmap_andnot(&repaired, &stripe->init_error_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, repaired, false); + wait_scrub_stripe_io(stripe); + } + + scrub_stripe_report_errors(sctx, stripe); set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); wake_up(&stripe->repair_wait); } @@ -1262,7 +1304,6 @@ static int get_raid56_logic_offset(u64 physical, int num, /* Work out the disk rotation on this stripe-set */ rot = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; /* calculate which stripe this data locates */ rot += i; stripe_index = rot % map->num_stripes; @@ -1468,6 +1509,8 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) * Return <0 for error. */ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, + struct btrfs_path *extent_path, + struct btrfs_path *csum_path, struct btrfs_device *dev, u64 physical, int mirror_num, u64 logical_start, u32 logical_len, @@ -1477,7 +1520,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); const u64 logical_end = logical_start + logical_len; - struct btrfs_path path = { 0 }; u64 cur_logical = logical_start; u64 stripe_end; u64 extent_start; @@ -1493,14 +1535,13 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, /* The range must be inside the bg. */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); - path.search_commit_root = 1; - path.skip_locking = 1; - - ret = find_first_extent_item(extent_root, &path, logical_start, logical_len); + ret = find_first_extent_item(extent_root, extent_path, logical_start, + logical_len); /* Either error or not found. */ if (ret) goto out; - get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen); + get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, + &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) stripe->nr_meta_extents++; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) @@ -1528,7 +1569,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, /* Fill the extent info for the remaining sectors. 
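The repair worker above computes the set of sectors to write back as "bad at the start and not bad any more", which is what bitmap_andnot(repaired, init_error_bitmap, error_bitmap) evaluates. The same computation on a single-word bitmap in plain C:

#include <stdio.h>

int main(void)
{
	/* Bit i set = sector i failed its read or checksum verification. */
	unsigned long init_errors  = 0x2d;	/* 0b101101, before repair */
	unsigned long still_errors = 0x04;	/* 0b000100, after all mirrors */

	/* Repaired sectors: initially bad AND no longer bad. */
	unsigned long repaired = init_errors & ~still_errors;

	printf("repaired bitmap: %#lx\n", repaired);	/* prints 0x29 */
	return 0;
}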
*/ while (cur_logical <= stripe_end) { - ret = find_first_extent_item(extent_root, &path, cur_logical, + ret = find_first_extent_item(extent_root, extent_path, cur_logical, stripe_end - cur_logical + 1); if (ret < 0) goto out; @@ -1536,7 +1577,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, ret = 0; break; } - get_extent_info(&path, &extent_start, &extent_len, + get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) stripe->nr_meta_extents++; @@ -1561,9 +1602,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, */ ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); - ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical, - stripe_end, stripe->csums, - &csum_bitmap, true); + ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, + stripe->logical, stripe_end, + stripe->csums, &csum_bitmap); if (ret < 0) goto out; if (ret > 0) @@ -1576,7 +1617,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, } set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); out: - btrfs_release_path(&path); return ret; } @@ -1595,6 +1635,71 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) } } +static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, + struct scrub_stripe *stripe) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct btrfs_bio *bbio = NULL; + u64 stripe_len = BTRFS_STRIPE_LEN; + int mirror = stripe->mirror_num; + int i; + + atomic_inc(&stripe->pending_io); + + for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + struct page *page = scrub_stripe_get_page(stripe, i); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + + /* The current sector cannot be merged, submit the bio. 
*/ + if (bbio && + ((i > 0 && + !test_bit(i - 1, &stripe->extent_sector_bitmap)) || + bbio->bio.bi_iter.bi_size >= stripe_len)) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + bbio = NULL; + } + + if (!bbio) { + struct btrfs_io_stripe io_stripe = {}; + struct btrfs_io_context *bioc = NULL; + const u64 logical = stripe->logical + + (i << fs_info->sectorsize_bits); + int err; + + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; + + io_stripe.is_scrub = true; + err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &stripe_len, &bioc, &io_stripe, + &mirror); + btrfs_put_bioc(bioc); + if (err) { + btrfs_bio_end_io(bbio, + errno_to_blk_status(err)); + return; + } + } + + __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + } + + if (bbio) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + } + + if (atomic_dec_and_test(&stripe->pending_io)) { + wake_up(&stripe->io_wait); + INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); + queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); + } +} + static void scrub_submit_initial_read(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { @@ -1606,6 +1711,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, ASSERT(stripe->mirror_num > 0); ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { + scrub_submit_extent_sector_read(sctx, stripe); + return; + } + bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); @@ -1654,6 +1764,28 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe) return false; } +static void submit_initial_group_read(struct scrub_ctx *sctx, + unsigned int first_slot, + unsigned int nr_stripes) +{ + struct blk_plug plug; + + ASSERT(first_slot < SCRUB_TOTAL_STRIPES); + ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES); + + scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, + btrfs_stripe_nr_to_offset(nr_stripes)); + blk_start_plug(&plug); + for (int i = 0; i < nr_stripes; i++) { + struct scrub_stripe *stripe = &sctx->stripes[first_slot + i]; + + /* Those stripes should be initialized. */ + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + scrub_submit_initial_read(sctx, stripe); + } + blk_finish_plug(&plug); +} + static int flush_scrub_stripes(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->fs_info; @@ -1666,11 +1798,11 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); - scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, - btrfs_stripe_nr_to_offset(nr_stripes)); - for (int i = 0; i < nr_stripes; i++) { - stripe = &sctx->stripes[i]; - scrub_submit_initial_read(sctx, stripe); + /* Submit the stripes which are populated but not submitted. */ + if (nr_stripes % SCRUB_STRIPES_PER_GROUP) { + const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP); + + submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot); } for (int i = 0; i < nr_stripes; i++) { @@ -1680,32 +1812,6 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); } - /* - * Submit the repaired sectors. For zoned case, we cannot do repair - * in-place, but queue the bg to be relocated. 
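scrub_submit_extent_sector_read() above grows one bio while sectors stay contiguous in extent_sector_bitmap and submits early at a gap or once the bio reaches BTRFS_STRIPE_LEN. Without the block-layer machinery, that is run-length batching over a bitmap; a sketch with illustrative sizes:

#include <stdio.h>

#define SECTOR_SIZE	4096UL
#define BATCH_LIMIT	(64 * 1024UL)	/* stands in for BTRFS_STRIPE_LEN */
#define NR_SECTORS	16

static void submit(unsigned long first, unsigned long bytes)
{
	printf("submit sectors [%lu, %lu): %lu bytes\n",
	       first, first + bytes / SECTOR_SIZE, bytes);
}

int main(void)
{
	unsigned long bitmap = 0x0ff3;	/* sectors 0-1 and 4-11 have extents */
	unsigned long batch_first = 0, batch_bytes = 0;

	for (unsigned long i = 0; i < NR_SECTORS; i++) {
		if (!(bitmap & (1UL << i)))
			continue;

		/* A gap right before this sector, or a full batch: flush. */
		if (batch_bytes &&
		    ((i > 0 && !(bitmap & (1UL << (i - 1)))) ||
		     batch_bytes >= BATCH_LIMIT)) {
			submit(batch_first, batch_bytes);
			batch_bytes = 0;
		}
		if (!batch_bytes)
			batch_first = i;
		batch_bytes += SECTOR_SIZE;
	}
	if (batch_bytes)
		submit(batch_first, batch_bytes);	/* final partial batch */
	return 0;
}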
- */ - if (btrfs_is_zoned(fs_info)) { - for (int i = 0; i < nr_stripes; i++) { - stripe = &sctx->stripes[i]; - - if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) { - btrfs_repair_one_zone(fs_info, - sctx->stripes[0].bg->start); - break; - } - } - } else if (!sctx->readonly) { - for (int i = 0; i < nr_stripes; i++) { - unsigned long repaired; - - stripe = &sctx->stripes[i]; - - bitmap_andnot(&repaired, &stripe->init_error_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); - scrub_write_sectors(sctx, stripe, repaired, false); - } - } - /* Submit for dev-replace. */ if (sctx->is_dev_replace) { /* @@ -1750,28 +1856,42 @@ static void raid56_scrub_wait_endio(struct bio *bio) static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct btrfs_device *dev, int mirror_num, - u64 logical, u32 length, u64 physical) + u64 logical, u32 length, u64 physical, + u64 *found_logical_ret) { struct scrub_stripe *stripe; int ret; - /* No available slot, submit all stripes and wait for them. */ - if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) { - ret = flush_scrub_stripes(sctx); - if (ret < 0) - return ret; - } + /* + * There should always be one slot left, as caller filling the last + * slot should flush them all. + */ + ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); - stripe = &sctx->stripes[sctx->cur_stripe]; + /* @found_logical_ret must be specified. */ + ASSERT(found_logical_ret); - /* We can queue one stripe using the remaining slot. */ + stripe = &sctx->stripes[sctx->cur_stripe]; scrub_reset_stripe(stripe); - ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num, - logical, length, stripe); + ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, + &sctx->csum_path, dev, physical, + mirror_num, logical, length, stripe); /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; + *found_logical_ret = stripe->logical; sctx->cur_stripe++; + + /* We filled one group, submit it. */ + if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) { + const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP; + + submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP); + } + + /* Last slot used, flush them all. */ + if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES) + return flush_scrub_stripes(sctx); return 0; } @@ -1785,6 +1905,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_io_context *bioc = NULL; + struct btrfs_path extent_path = { 0 }; + struct btrfs_path csum_path = { 0 }; struct bio *bio; struct scrub_stripe *stripe; bool all_empty = true; @@ -1795,6 +1917,16 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); + /* + * For data stripe search, we cannot re-use the same extent/csum paths, + * as the data stripe bytenr may be smaller than previous extent. Thus + * we have to use our own extent/csum paths. 
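queue_scrub_stripe() above now fills one slot per call, submits a read for every completed group of SCRUB_STRIPES_PER_GROUP slots, and flushes everything once the last slot is used, so flush_scrub_stripes() only has to submit the partial tail group (hence the round_down()). A counter-only model of that schedule, with printing in place of IO:

#include <stdio.h>

#define STRIPES_PER_GROUP	8
#define TOTAL_STRIPES		(16 * STRIPES_PER_GROUP)

static int cur;

static void submit_group(int first_slot, int nr)
{
	printf("submit slots [%d, %d)\n", first_slot, first_slot + nr);
}

/* Called once per queued stripe. */
static void queue_one(void)
{
	cur++;
	if (cur % STRIPES_PER_GROUP == 0)	/* a group just completed */
		submit_group(cur - STRIPES_PER_GROUP, STRIPES_PER_GROUP);
	if (cur == TOTAL_STRIPES)	/* ring full: all groups are in flight */
		cur = 0;		/* the kernel waits, then reuses slots */
}

/* Submit the partial tail group, if any (round down to a group boundary). */
static void flush_tail(void)
{
	int first = cur / STRIPES_PER_GROUP * STRIPES_PER_GROUP;

	if (cur % STRIPES_PER_GROUP)
		submit_group(first, cur - first);
	cur = 0;
}

int main(void)
{
	for (int i = 0; i < 19; i++)	/* two full groups plus three stripes */
		queue_one();
	flush_tail();			/* prints "submit slots [16, 19)" */
	return 0;
}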
+ */ + extent_path.search_commit_root = 1; + extent_path.skip_locking = 1; + csum_path.search_commit_root = 1; + csum_path.skip_locking = 1; + for (int i = 0; i < data_stripes; i++) { int stripe_index; int rot; @@ -1809,7 +1941,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, scrub_reset_stripe(stripe); set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); - ret = scrub_find_fill_first_stripe(bg, + ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, map->stripes[stripe_index].dev, physical, 1, full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); @@ -1854,24 +1986,6 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, /* For now, no zoned support for RAID56. */ ASSERT(!btrfs_is_zoned(sctx->fs_info)); - /* Writeback for the repaired sectors. */ - for (int i = 0; i < data_stripes; i++) { - unsigned long repaired; - - stripe = &sctx->raid56_data_stripes[i]; - - bitmap_andnot(&repaired, &stripe->init_error_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); - scrub_write_sectors(sctx, stripe, repaired, false); - } - - /* Wait for the above writebacks to finish. */ - for (int i = 0; i < data_stripes; i++) { - stripe = &sctx->raid56_data_stripes[i]; - - wait_scrub_stripe_io(stripe); - } - /* * Now all data stripes are properly verified. Check if we have any * unrepaired, if so abort immediately or we could further corrupt the @@ -1910,7 +2024,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc, NULL, NULL, 1); + &length, &bioc, NULL, NULL); if (ret < 0) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -1937,6 +2051,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bio_put(bio); btrfs_bio_counter_dec(fs_info); + btrfs_release_path(&extent_path); + btrfs_release_path(&csum_path); out: return ret; } @@ -1958,18 +2074,15 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; const u64 logical_end = logical_start + logical_length; - /* An artificial limit, inherit from old scrub behavior */ - struct btrfs_path path = { 0 }; u64 cur_logical = logical_start; int ret; /* The range must be inside the bg */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); - path.search_commit_root = 1; - path.skip_locking = 1; /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { + u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ @@ -1994,7 +2107,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, ret = queue_scrub_stripe(sctx, bg, device, mirror_num, cur_logical, logical_end - cur_logical, - cur_physical); + cur_physical, &found_logical); if (ret > 0) { /* No more extent, just update the accounting */ sctx->stat.last_physical = physical + logical_length; @@ -2004,14 +2117,13 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, if (ret < 0) break; - ASSERT(sctx->cur_stripe > 0); - cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical - + BTRFS_STRIPE_LEN; + /* queue_scrub_stripe() returned 0, @found_logical must be updated. 
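scrub_simple_mirror() above no longer peeks into sctx->stripes[] to learn where the queued stripe began; queue_scrub_stripe() reports that through the @found_logical_ret out-parameter, and the caller primes the variable with U64_MAX so a missed update trips the assertion. The out-parameter-plus-sentinel idiom in miniature:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* >0: nothing left, 0: found (*found_ret is set), <0: error. */
static int find_next(uint64_t from, uint64_t *found_ret)
{
	assert(found_ret);		/* the out-param must be supplied */
	if (from >= 1000)
		return 1;		/* no more extents */
	*found_ret = from + 42;		/* pretend we located one here */
	return 0;
}

int main(void)
{
	uint64_t found = UINT64_MAX;	/* sentinel: "not written yet" */

	if (find_next(100, &found) == 0) {
		assert(found != UINT64_MAX);	/* callee must have set it */
		printf("advance cursor to %llu\n",
		       (unsigned long long)(found + 64 * 1024));
	}
	return 0;
}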
*/ + ASSERT(found_logical != U64_MAX); + cur_logical = found_logical + BTRFS_STRIPE_LEN; /* Don't hold CPU for too long time */ cond_resched(); } - btrfs_release_path(&path); return ret; } @@ -2109,6 +2221,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 stripe_logical; int stop_loop = 0; + /* Extent_path should be released by now. */ + ASSERT(sctx->extent_path.nodes[0] == NULL); + scrub_blocked_if_needed(fs_info); if (sctx->is_dev_replace && @@ -2227,6 +2342,9 @@ out: ret2 = flush_scrub_stripes(sctx); if (!ret) ret = ret2; + btrfs_release_path(&sctx->extent_path); + btrfs_release_path(&sctx->csum_path); + if (sctx->raid56_data_stripes) { for (int i = 0; i < nr_data_stripes(map); i++) release_scrub_stripe(&sctx->raid56_data_stripes[i]); @@ -2673,7 +2791,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (scrub_dev->fs_devices != fs_info->fs_devices) gen = scrub_dev->generation; else - gen = fs_info->last_trans_committed; + gen = btrfs_get_last_trans_committed(fs_info); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); @@ -2711,8 +2829,7 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ -static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, - int is_dev_replace) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) { struct workqueue_struct *scrub_workers = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; @@ -2722,10 +2839,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - if (is_dev_replace) - scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags); - else - scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); + scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) return -ENOMEM; @@ -2777,7 +2891,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (IS_ERR(sctx)) return PTR_ERR(sctx); - ret = scrub_workers_get(fs_info, is_dev_replace); + ret = scrub_workers_get(fs_info); if (ret) goto out_free_ctx; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8bfd44750efe..4e36550618e5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -796,7 +796,7 @@ static int send_cmd(struct send_ctx *sctx) put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); put_unaligned_le32(0, &hdr->crc); - crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -3685,7 +3685,7 @@ static void tail_append_pending_moves(struct send_ctx *sctx, static int apply_children_dir_moves(struct send_ctx *sctx) { struct pending_dir_move *pm; - struct list_head stack; + LIST_HEAD(stack); u64 parent_ino = sctx->cur_ino; int ret = 0; @@ -3693,7 +3693,6 @@ static int apply_children_dir_moves(struct send_ctx *sctx) if (!pm) return 0; - INIT_LIST_HEAD(&stack); tail_append_pending_moves(sctx, pm, &stack); while (!list_empty(&stack)) { @@ -4165,7 +4164,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int ret = 0; struct recorded_ref *cur; struct recorded_ref *cur2; - struct list_head check_dirs; + LIST_HEAD(check_dirs); struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; @@ -4184,7 +4183,6 @@ 
static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * which is always '..' */ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); - INIT_LIST_HEAD(&check_dirs); valid_path = fs_path_alloc(); if (!valid_path) { @@ -5671,8 +5669,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, hdr = (struct btrfs_cmd_header *)sctx->send_buf; hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); hdr->crc = 0; - crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); - crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); + crc = crc32c(0, sctx->send_buf, sctx->send_size); + crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -8160,7 +8158,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } sctx->send_filp = fget(arg->send_fd); - if (!sctx->send_filp) { + if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 75e7fa337e66..571bb13587d5 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -345,8 +345,10 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { + struct btrfs_space_info *data_sinfo; u64 profile; u64 avail; + u64 data_chunk_size; int factor; if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) @@ -364,6 +366,36 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, */ factor = btrfs_bg_type_to_factor(profile); avail = div_u64(avail, factor); + if (avail == 0) + return 0; + + /* + * Calculate the data_chunk_size, space_info->chunk_size is the + * "optimal" chunk size based on the fs size. However when we actually + * allocate the chunk we will strip this down further, making it no more + * than 10% of the disk or 1G, whichever is smaller. + */ + data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + + /* + * Since data allocations immediately use block groups as part of the + * reservation, because we assume that data reservations will == actual + * usage, we could potentially overcommit and then immediately have that + * available space used by a data allocation, which could put us in a + * bind when we get close to filling the file system. + * + * To handle this simply remove the data_chunk_size from the available + * space. If we are relatively empty this won't affect our ability to + * overcommit much, and if we're very close to full it'll keep us from + * getting into a position where we've given ourselves very little + * metadata wiggle room. 
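The carve-out above keeps metadata overcommit from promising away the space the next data chunk allocation will claim: the prospective chunk is the smallest of the space-info chunk size, 10% of the writable device bytes, and 1GiB, and it is subtracted from (or zeroes out) the available figure. The arithmetic as a standalone function; the names are mine and mult_perc() is simplified (overflow aside, the kernel multiplies before dividing):

#include <stdint.h>
#include <stdio.h>

#define SZ_1G	(1024ULL * 1024 * 1024)

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/* percent% of num; ignores the u64 overflow a kernel version must avoid. */
static uint64_t mult_perc(uint64_t num, uint32_t percent)
{
	return num * percent / 100;
}

static uint64_t avail_for_overcommit(uint64_t avail, uint64_t sinfo_chunk_size,
				     uint64_t total_rw_bytes)
{
	uint64_t data_chunk_size;

	if (avail == 0)
		return 0;

	/* No more than 10% of the disk or 1GiB, whichever is smaller. */
	data_chunk_size = min_u64(sinfo_chunk_size,
				  mult_perc(total_rw_bytes, 10));
	data_chunk_size = min_u64(data_chunk_size, SZ_1G);

	/*
	 * Keep room for one data chunk so a data allocation right after us
	 * cannot consume space we just promised to metadata reservations.
	 */
	if (avail <= data_chunk_size)
		return 0;
	return avail - data_chunk_size;
}

int main(void)
{
	/* 100GiB writable, 10GiB chunk-size hint, 3GiB currently available. */
	uint64_t r = avail_for_overcommit(3 * SZ_1G, 10 * SZ_1G, 100 * SZ_1G);

	printf("%llu bytes usable for overcommit\n", (unsigned long long)r);
	return 0;
}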
+ */ + if (avail <= data_chunk_size) + return 0; + avail -= data_chunk_size; /* * If we aren't flushing all things, let us overcommit up to @@ -389,11 +421,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) && - (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) - avail = 0; - else - avail = calc_available_free_space(fs_info, space_info, flush); + avail = calc_available_free_space(fs_info, space_info, flush); if (used + bytes < space_info->total_bytes + avail) return 1; @@ -510,6 +538,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, int dump_block_groups) { struct btrfs_block_group *cache; + u64 total_avail = 0; int index = 0; spin_lock(&info->lock); @@ -523,18 +552,27 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, down_read(&info->groups_sem); again: list_for_each_entry(cache, &info->block_groups[index], list) { + u64 avail; + spin_lock(&cache->lock); + avail = cache->length - cache->used - cache->pinned - + cache->reserved - cache->delalloc_bytes - + cache->bytes_super - cache->zone_unusable; btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s", - cache->start, cache->length, cache->used, cache->pinned, - cache->reserved, cache->zone_unusable, - cache->ro ? "[readonly]" : ""); +"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", + cache->start, cache->length, cache->used, cache->pinned, + cache->reserved, cache->delalloc_bytes, + cache->bytes_super, cache->zone_unusable, + avail, cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); + total_avail += avail; } if (++index < BTRFS_NR_RAID_TYPES) goto again; up_read(&info->groups_sem); + + btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail); } static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, @@ -550,18 +588,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, return nr; } -static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, - u64 to_reclaim) -{ - const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); - u64 nr; - - nr = div64_u64(to_reclaim, bytes); - if (!nr) - nr = 1; - return nr; -} - #define EXTENT_SIZE_PER_ITEM SZ_256K /* @@ -715,9 +741,11 @@ static void flush_space(struct btrfs_fs_info *fs_info, else nr = -1; - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } ret = btrfs_run_delayed_items_nr(trans, nr); @@ -733,32 +761,21 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELAYED_REFS_NR: case FLUSH_DELAYED_REFS: - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_delayed_refs_nr(fs_info, num_bytes); + btrfs_run_delayed_refs(trans, num_bytes); else - nr = 0; - btrfs_run_delayed_refs(trans, nr); + btrfs_run_delayed_refs(trans, 0); btrfs_end_transaction(trans); break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: - /* - * For metadata space on zoned filesystem, reaching here means we - * don't have enough space left in active_total_bytes. 
Try to - * activate a block group first, because we may have inactive - * block group already allocated. - */ - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); - if (ret < 0) - break; - else if (ret == 1) - break; - trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -770,22 +787,6 @@ static void flush_space(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); - /* - * For metadata space on zoned filesystem, allocating a new chunk - * is not enough. We still need to activate the block * group. - * Active the newly allocated block group by (maybe) finishing - * a block group. - */ - if (ret == 1) { - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); - /* - * Revert to the original ret regardless we could finish - * one block group or not. - */ - if (ret >= 0) - ret = 1; - } - if (ret > 0 || ret == -ENOSPC) ret = 0; break; @@ -800,9 +801,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case COMMIT_TRANS: ASSERT(current->journal_info == NULL); - trans = btrfs_join_transaction(root); + /* + * We don't want to start a new transaction, just attach to the + * current one or wait it fully commits in case its commit is + * happening at the moment. Note: we don't use a nostart join + * because that does not wait for a transaction to fully commit + * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). + */ + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } ret = btrfs_commit_transaction(trans); @@ -987,7 +997,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, } /* - * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets + * We've exhausted our flushing, start failing tickets. + * * @fs_info - fs_info for this fs * @space_info - the space info we were flushing * @@ -1408,8 +1419,18 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } } - /* Attempt to steal from the global rsv if we can. */ - if (!steal_from_global_rsv(fs_info, space_info, ticket)) { + /* + * Attempt to steal from the global rsv if we can, except if the fs was + * turned into error mode due to a transaction abort when flushing space + * above, in that case fail with the abort error instead of returning + * success to the caller if we can steal from the global rsv - this is + * just to have caller fail immeditelly instead of later when trying to + * modify the fs, making it easier to debug -ENOSPC problems. + */ + if (BTRFS_FS_ERROR(fs_info)) { + ticket->error = BTRFS_FS_ERROR(fs_info); + remove_ticket(space_info, ticket); + } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { ticket->error = -ENOSPC; remove_ticket(space_info, ticket); } @@ -1741,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * Try to reserve metadata bytes from the block_rsv's space. * * @fs_info: the filesystem - * @block_rsv: block_rsv we're allocating for + * @space_info: the space_info we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation * @@ -1753,21 +1774,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * space already. 
*/ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, + struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; - ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); + ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", - block_rsv->space_info->flags, - orig_bytes, 1); + space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, block_rsv->space_info, - orig_bytes, 0); + btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0); } return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 0bb9d14e60a8..92c595fed1b0 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -3,6 +3,7 @@ #ifndef BTRFS_SPACE_INFO_H #define BTRFS_SPACE_INFO_H +#include <trace/events/btrfs.h> #include "volumes.h" /* @@ -212,7 +213,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, + struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f1dd172d8d5b..ef256b944c72 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,6 +26,7 @@ #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/btrfs.h> +#include <linux/security.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -79,7 +80,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data); static void btrfs_put_super(struct super_block *sb) { - close_ctree(btrfs_sb(sb)); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_info(fs_info, "last unmount of filesystem %pU", fs_info->fs_devices->fsid); + close_ctree(fs_info); } enum { @@ -129,9 +133,6 @@ enum { Opt_inode_cache, Opt_noinode_cache, /* Debugging options */ - Opt_check_integrity, - Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, Opt_enospc_debug, Opt_noenospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, @@ -200,9 +201,6 @@ static const match_table_t tokens = { {Opt_recovery, "recovery"}, /* Debugging options */ - {Opt_check_integrity, "check_int"}, - {Opt_check_integrity_including_extent_data, "check_int_data"}, - {Opt_check_integrity_print_mask, "check_int_print_mask=%u"}, {Opt_enospc_debug, "enospc_debug"}, {Opt_noenospc_debug, "noenospc_debug"}, #ifdef CONFIG_BTRFS_DEBUG @@ -707,38 +705,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_skip_balance: btrfs_set_opt(info->mount_opt, SKIP_BALANCE); break; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - case Opt_check_integrity_including_extent_data: - btrfs_info(info, - "enabling check integrity including extent data"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity: - btrfs_info(info, "enabling check integrity"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity_print_mask: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, - "unrecognized check_integrity_print_mask value %s", - args[0].from); - goto out; - } - 
info->check_integrity_print_mask = intarg; - btrfs_info(info, "check_integrity_print_mask 0x%x", - info->check_integrity_print_mask); - break; -#else - case Opt_check_integrity_including_extent_data: - case Opt_check_integrity: - case Opt_check_integrity_print_mask: - btrfs_err(info, - "support for check_integrity* not compiled in!"); - ret = -EINVAL; - goto out; -#endif case Opt_fatal_errors: if (strcmp(args[0].from, "panic") == 0) { btrfs_set_opt(info->mount_opt, @@ -883,7 +849,7 @@ static int btrfs_parse_device_options(const char *options, blk_mode_t flags) error = -ENOMEM; goto out; } - device = btrfs_scan_one_device(device_name, flags); + device = btrfs_scan_one_device(device_name, flags, false); kfree(device_name); if (IS_ERR(device)) { error = PTR_ERR(device); @@ -1299,15 +1265,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA)) - seq_puts(seq, ",check_int_data"); - else if (btrfs_test_opt(info, CHECK_INTEGRITY)) - seq_puts(seq, ",check_int"); - if (info->check_integrity_print_mask) - seq_printf(seq, ",check_int_print_mask=%d", - info->check_integrity_print_mask); -#endif if (info->metadata_ratio) seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio); if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) @@ -1478,7 +1435,12 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, goto error_fs_info; } - device = btrfs_scan_one_device(device_name, mode); + /* + * With 'true' passed to btrfs_scan_one_device() (mount time) we expect + * either a valid device or an error. + */ + device = btrfs_scan_one_device(device_name, mode, true); + ASSERT(device != NULL); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); error = PTR_ERR(device); @@ -1513,7 +1475,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, + shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name, s->s_id); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data); @@ -2111,7 +2073,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * calculated f_bavail. */ if (!mixed && block_rsv->space_info->full && - total_free_meta - thresh < block_rsv->size) + (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size)) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; @@ -2190,7 +2152,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case BTRFS_IOC_SCAN_DEV: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); + /* + * Scanning outside of mount can return NULL which would turn + * into 0 error code. + */ + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; @@ -2204,8 +2170,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); - if (IS_ERR(device)) { + /* + * Scanning outside of mount can return NULL which would turn + * into 0 error code. 
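+ * (Editorial note: this is why the check below is IS_ERR_OR_NULL() rather than plain IS_ERR() - a NULL device is a legitimate result when scanning outside of mount and must not be dereferenced.)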
+ */ + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); + if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); ret = PTR_ERR(device); break; @@ -2250,6 +2220,7 @@ static int check_dev_super(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_super_block *sb; + u64 last_trans; u16 csum_type; int ret = 0; @@ -2285,10 +2256,10 @@ static int check_dev_super(struct btrfs_device *dev) if (ret < 0) goto out; - if (btrfs_super_generation(sb) != fs_info->last_trans_committed) { + last_trans = btrfs_get_last_trans_committed(fs_info); + if (btrfs_super_generation(sb) != last_trans) { btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", - btrfs_super_generation(sb), - fs_info->last_trans_committed); + btrfs_super_generation(sb), last_trans); ret = -EUCLEAN; goto out; } @@ -2398,9 +2369,6 @@ static int __init btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_ASSERT ", assert=on" #endif -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - ", integrity-checker=on" -#endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY ", ref-verify=on" #endif diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 25294e624851..e6b51fb3ddc1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -291,12 +291,15 @@ BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); +BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif #ifdef CONFIG_BTRFS_DEBUG /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); +/* Remove once support for raid stripe tree is feature complete. */ +BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -322,11 +325,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), BTRFS_FEAT_ATTR_PTR(block_group_tree), + BTRFS_FEAT_ATTR_PTR(simple_quota), #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(extent_tree_v2), + BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), @@ -414,6 +419,19 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, BTRFS_ATTR(static_feature, supported_sectorsizes, supported_sectorsizes_show); +static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); +} +BTRFS_ATTR(static_feature, acl, acl_show); + +static ssize_t temp_fsid_supported_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + return sysfs_emit(buf, "0\n"); +} +BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show); + /* * Features which only depend on kernel version. * @@ -421,11 +439,13 @@ BTRFS_ATTR(static_feature, supported_sectorsizes, * btrfs_supported_feature_attrs. 
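 * * (Editorial example: with the attributes added below, user space can probe build-time features, e.g. 'cat /sys/fs/btrfs/features/acl' prints 0 on a kernel built without CONFIG_BTRFS_FS_POSIX_ACL, and the new temp_fsid file currently always reads 0.)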
*/ static struct attribute *btrfs_supported_static_feature_attrs[] = { + BTRFS_ATTR_PTR(static_feature, acl), BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), BTRFS_ATTR_PTR(static_feature, supported_rescue_options), BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), + BTRFS_ATTR_PTR(static_feature, temp_fsid), NULL }; @@ -1189,10 +1209,19 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%llu\n", fs_info->generation); + return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info)); } BTRFS_ATTR(, generation, btrfs_generation_show); +static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid); +} +BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); + static const char * const btrfs_read_policy_name[] = { "pid" }; static ssize_t btrfs_read_policy_show(struct kobject *kobj, @@ -1295,6 +1324,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), + BTRFS_ATTR_PTR(, temp_fsid), NULL, }; @@ -2079,6 +2109,33 @@ static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, } BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); +static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + ssize_t ret = 0; + + spin_lock(&fs_info->qgroup_lock); + ASSERT(btrfs_qgroup_enabled(fs_info)); + switch (btrfs_qgroup_mode(fs_info)) { + case BTRFS_QGROUP_MODE_FULL: + ret = sysfs_emit(buf, "qgroup\n"); + break; + case BTRFS_QGROUP_MODE_SIMPLE: + ret = sysfs_emit(buf, "squota\n"); + break; + default: + btrfs_warn(fs_info, "unexpected qgroup mode %d\n", + btrfs_qgroup_mode(fs_info)); + break; + } + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} +BTRFS_ATTR(qgroups, mode, qgroup_mode_show); + static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) @@ -2141,6 +2198,7 @@ static struct attribute *qgroups_attrs[] = { BTRFS_ATTR_PTR(qgroups, enabled), BTRFS_ATTR_PTR(qgroups, inconsistent), BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), + BTRFS_ATTR_PTR(qgroups, mode), NULL }; ATTRIBUTE_GROUPS(qgroups); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 5ef0b90e25c3..6a43a64ba55a 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -61,7 +61,11 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - btrfs_setup_item_for_insert(root, path, &key, value_len); + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). 
+ */ + btrfs_setup_item_for_insert(NULL, root, path, &key, value_len); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index f6bc6d738555..1cc86af97dc6 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -319,86 +319,139 @@ out: return ret; } -static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb, - unsigned long len) +static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) { unsigned long i; - for (i = 0; i < len * BITS_PER_BYTE; i++) { + for (i = 0; i < eb->len * BITS_PER_BYTE; i++) { int bit, bit1; bit = !!test_bit(i, bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { - test_err("bits do not match"); + u8 has; + u8 expect; + + read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1); + expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE)); + + test_err( + "bits do not match, start byte 0 bit %lu, byte %lu has 0x%02x expect 0x%02x", + i, i / BITS_PER_BYTE, has, expect); return -EINVAL; } bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, i % BITS_PER_BYTE); if (bit1 != bit) { - test_err("offset bits do not match"); + u8 has; + u8 expect; + + read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1); + expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE)); + + test_err( + "bits do not match, start byte %lu bit %lu, byte %lu has 0x%02x expect 0x%02x", + i / BITS_PER_BYTE, i % BITS_PER_BYTE, + i / BITS_PER_BYTE, has, expect); return -EINVAL; } } return 0; } -static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, - unsigned long len) +static int test_bitmap_set(const char *name, unsigned long *bitmap, + struct extent_buffer *eb, + unsigned long byte_start, unsigned long bit_start, + unsigned long bit_len) +{ + int ret; + + bitmap_set(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len); + extent_buffer_bitmap_set(eb, byte_start, bit_start, bit_len); + ret = check_eb_bitmap(bitmap, eb); + if (ret < 0) + test_err("%s test failed", name); + return ret; +} + +static int test_bitmap_clear(const char *name, unsigned long *bitmap, + struct extent_buffer *eb, + unsigned long byte_start, unsigned long bit_start, + unsigned long bit_len) +{ + int ret; + + bitmap_clear(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len); + extent_buffer_bitmap_clear(eb, byte_start, bit_start, bit_len); + ret = check_eb_bitmap(bitmap, eb); + if (ret < 0) + test_err("%s test failed", name); + return ret; +} +static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb) { unsigned long i, j; + unsigned long byte_len = eb->len; u32 x; int ret; - memset(bitmap, 0, len); - memzero_extent_buffer(eb, 0, len); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_err("bitmap was not zeroed"); - return -EINVAL; - } + ret = test_bitmap_clear("clear all run 1", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("setting all bits failed"); + ret = test_bitmap_set("set all", bitmap, eb, 0, 0, byte_len * BITS_PER_BYTE); + if (ret < 0) return ret; - } - bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("clearing all bits failed"); + ret = test_bitmap_clear("clear 
all run 2", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_set("same byte set", bitmap, eb, 0, 2, 4); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("same byte partial clear", bitmap, eb, 0, 4, 1); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross byte set", bitmap, eb, 2, 4, 8); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross multi byte set", bitmap, eb, 4, 4, 24); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross byte clear", bitmap, eb, 2, 6, 4); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross multi byte clear", bitmap, eb, 4, 6, 20); + if (ret < 0) return ret; - } /* Straddling pages test */ - if (len > PAGE_SIZE) { - bitmap_set(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("setting straddling pages failed"); + if (byte_len > PAGE_SIZE) { + ret = test_bitmap_set("cross page set", bitmap, eb, + PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross page set all", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) return ret; - } - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - bitmap_clear(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, + ret = test_bitmap_clear("cross page clear", bitmap, eb, + PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("clearing straddling pages failed"); + if (ret < 0) return ret; - } } /* @@ -406,9 +459,12 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, * something repetitive that could miss some hypothetical off-by-n bug. 
*/ x = 0; - bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - for (i = 0; i < len * BITS_PER_BYTE / 32; i++) { + ret = test_bitmap_clear("clear all run 3", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + for (i = 0; i < byte_len * BITS_PER_BYTE / 32; i++) { x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffU; for (j = 0; j < 32; j++) { if (x & (1U << j)) { @@ -418,7 +474,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, } } - ret = check_eb_bitmap(bitmap, eb, len); + ret = check_eb_bitmap(bitmap, eb); if (ret) { test_err("random bit pattern failed"); return ret; @@ -456,7 +512,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - ret = __test_eb_bitmaps(bitmap, eb, nodesize); + ret = __test_eb_bitmaps(bitmap, eb); if (ret) goto out; @@ -473,7 +529,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - ret = __test_eb_bitmaps(bitmap, eb, nodesize); + ret = __test_eb_bitmaps(bitmap, eb); out: free_extent_buffer(eb); kfree(bitmap); @@ -592,6 +648,146 @@ out: return ret; } +static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory, + const char *test_name) +{ + for (int i = 0; i < eb->len; i++) { + struct page *page = eb->pages[i >> PAGE_SHIFT]; + void *addr = page_address(page) + offset_in_page(i); + + if (memcmp(addr, memory + i, 1) != 0) { + test_err("%s failed", test_name); + test_err("eb and memory diffs at byte %u, eb has 0x%02x memory has 0x%02x", + i, *(u8 *)addr, *(u8 *)(memory + i)); + return; + } + } +} + +static int verify_eb_and_memory(struct extent_buffer *eb, void *memory, + const char *test_name) +{ + for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) { + void *eb_addr = page_address(eb->pages[i]); + + if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) { + dump_eb_and_memory_contents(eb, memory, test_name); + return -EUCLEAN; + } + } + return 0; +} + +/* + * Init both memory and extent buffer contents to the same randomly generated + * contents. 
+ */ +static void init_eb_and_memory(struct extent_buffer *eb, void *memory) +{ + get_random_bytes(memory, eb->len); + write_extent_buffer(eb, memory, 0, eb->len); +} + +static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct extent_buffer *eb = NULL; + void *memory = NULL; + int ret; + + test_msg("running extent buffer memory operation tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + memory = kvzalloc(nodesize, GFP_KERNEL); + if (!memory) { + test_err("failed to allocate memory"); + ret = -ENOMEM; + goto out; + } + + eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + if (!eb) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = -ENOMEM; + goto out; + } + + init_eb_and_memory(eb, memory); + ret = verify_eb_and_memory(eb, memory, "full eb write"); + if (ret < 0) + goto out; + + memcpy(memory, memory + 16, 16); + memcpy_extent_buffer(eb, 0, 16, 16); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 1"); + if (ret < 0) + goto out; + + memcpy(memory, memory + 2048, 16); + memcpy_extent_buffer(eb, 0, 2048, 16); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 2"); + if (ret < 0) + goto out; + memcpy(memory, memory + 2048, 2048); + memcpy_extent_buffer(eb, 0, 2048, 2048); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 3"); + if (ret < 0) + goto out; + + memmove(memory + 512, memory + 256, 512); + memmove_extent_buffer(eb, 512, 256, 512); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 1"); + if (ret < 0) + goto out; + + memmove(memory + 2048, memory + 512, 2048); + memmove_extent_buffer(eb, 2048, 512, 2048); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 2"); + if (ret < 0) + goto out; + memmove(memory + 512, memory + 2048, 2048); + memmove_extent_buffer(eb, 512, 2048, 2048); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 3"); + if (ret < 0) + goto out; + + if (nodesize > PAGE_SIZE) { + memcpy(memory, memory + 4096 - 128, 256); + memcpy_extent_buffer(eb, 0, 4096 - 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 1"); + if (ret < 0) + goto out; + + memcpy(memory + 4096 - 128, memory + 4096 + 128, 256); + memcpy_extent_buffer(eb, 4096 - 128, 4096 + 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 2"); + if (ret < 0) + goto out; + + memmove(memory + 4096 - 128, memory + 4096 - 64, 256); + memmove_extent_buffer(eb, 4096 - 128, 4096 - 64, 256); + ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 1"); + if (ret < 0) + goto out; + + memmove(memory + 4096 - 64, memory + 4096 - 128, 256); + memmove_extent_buffer(eb, 4096 - 64, 4096 - 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 2"); + if (ret < 0) + goto out; + } +out: + free_extent_buffer(eb); + kvfree(memory); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; @@ -607,6 +803,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) goto out; ret = test_eb_bitmaps(sectorsize, nodesize); + if (ret) + goto out; + + ret = test_eb_mem_ops(sectorsize, nodesize); out: return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index ed0f36ae5346..29bdd08b241f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ 
b/fs/btrfs/tests/extent-map-tests.c @@ -6,6 +6,7 @@ #include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../btrfs_inode.h" #include "../volumes.h" #include "../disk-io.h" #include "../block-group.h" @@ -442,6 +443,406 @@ static int test_case_4(struct btrfs_fs_info *fs_info, return ret; } +static int add_compressed_extent(struct extent_map_tree *em_tree, + u64 start, u64 len, u64 block_start) +{ + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + em->start = start; + em->len = len; + em->block_start = block_start; + em->block_len = SZ_4K; + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("cannot add extent map [%llu, %llu)", start, start + len); + return ret; + } + + return 0; +} + +struct extent_range { + u64 start; + u64 len; +}; + +/* The valid states of the tree after every drop, as described below. */ +struct extent_range valid_ranges[][7] = { + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 3, .len = SZ_4K * 3}, /* [12k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */ + { .start = SZ_32K, .len = SZ_4K}, /* [32k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K}, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */ + } +}; + +static int validate_range(struct extent_map_tree *em_tree, int index) +{ + struct rb_node *n; + int i; + + for (i = 0, n = rb_first_cached(&em_tree->map); + valid_ranges[index][i].len && n; + i++, n = rb_next(n)) { + struct extent_map *entry = rb_entry(n, struct extent_map, rb_node); + + if (entry->start != valid_ranges[index][i].start) { + test_err("mapping has start %llu expected %llu", + entry->start, valid_ranges[index][i].start); + return -EINVAL; + } + + if (entry->len != valid_ranges[index][i].len) { + test_err("mapping has len %llu expected %llu", + entry->len, valid_ranges[index][i].len); + return -EINVAL; + } + } + + /* + * We exited because we don't have any more entries in the extent_map + * but we still expect more valid entries. + */ + if (valid_ranges[index][i].len) { + test_err("missing an entry"); + return -EINVAL; + } + + /* We exited the loop but still have entries in the extent map. 
*/ + if (n) { + test_err("we have a left over entry in the extent map we didn't expect"); + return -EINVAL; + } + + return 0; +} + +/* + * Test scenario: + * + * Test the various edge cases of btrfs_drop_extent_map_range, creating the + * following ranges: + * + * [0, 12k)[12k, 24k)[24k, 36k)[36k, 40k)[40k, 64k) + * + * And then we'll drop: + * + * [8k, 12k) - test the single front split + * [12k, 20k) - test the single back split + * [28k, 32k) - test the double split + * [32k, 64k) - test whole em dropping + * + * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from + * merging the em's. + */ +static int test_case_5(void) +{ + struct extent_map_tree *em_tree; + struct inode *inode; + u64 start, end; + int ret; + + test_msg("Running btrfs_drop_extent_map_range tests"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + + em_tree = &BTRFS_I(inode)->extent_tree; + + /* [0, 12k) */ + ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0); + if (ret) { + test_err("cannot add extent range [0, 12K)"); + goto out; + } + + /* [12k, 24k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [24k, 36k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); + if (ret) { + test_err("cannot add extent range [24k, 36k)"); + goto out; + } + + /* [36k, 40k) */ + ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); + if (ret) { + test_err("cannot add extent range [36k, 40k)"); + goto out; + } + + /* [40k, 64k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); + if (ret) { + test_err("cannot add extent range [40k, 64k)"); + goto out; + } + + /* Drop [8k, 12k) */ + start = SZ_8K; + end = (3 * SZ_4K) - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 0); + if (ret) + goto out; + + /* Drop [12k, 20k) */ + start = SZ_4K * 3; + end = SZ_16K + SZ_4K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 1); + if (ret) + goto out; + + /* Drop [28k, 32k) */ + start = SZ_32K - SZ_4K; + end = SZ_32K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 2); + if (ret) + goto out; + + /* Drop [32k, 64k) */ + start = SZ_32K; + end = SZ_64K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 3); + if (ret) + goto out; +out: + iput(inode); + return ret; +} + +/* + * Test the btrfs_add_extent_mapping helper which will attempt to create an em + * for areas between two existing ems. Validate it doesn't do this when there + * are two unmerged em's side by side.
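+ * + * (Editorial note: since [0, 8K) is already fully covered by the two mappings inserted below, btrfs_add_extent_mapping() is expected to hand back the existing mapping at [0, 4K) instead of inserting the requested one, which is what the em->start and em->len checks verify.)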
+ */ +static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) +{ + struct extent_map *em = NULL; + int ret; + + ret = add_compressed_extent(em_tree, 0, SZ_4K, 0); + if (ret) + goto out; + + ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0); + if (ret) + goto out; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_16K; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K); + write_unlock(&em_tree->lock); + + if (ret != 0) { + test_err("got an error when adding our em: %d", ret); + goto out; + } + + ret = -EINVAL; + if (em->start != 0) { + test_err("unexpected em->start at %llu, wanted 0", em->start); + goto out; + } + if (em->len != SZ_4K) { + test_err("unexpected em->len %llu, expected 4K", em->len); + goto out; + } + ret = 0; +out: + free_extent_map(em); + free_extent_map_tree(em_tree); + return ret; +} + +/* + * Regression test for btrfs_drop_extent_map_range. Calling with skip_pinned == + * true would mess up the start/end calculations and subsequent splits would be + * incorrect. + */ +static int test_case_7(void) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct inode *inode; + int ret; + + test_msg("Running btrfs_drop_extent_cache with pinned"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + + em_tree = &BTRFS_I(inode)->extent_tree; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* [0, 16K), pinned */ + em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_4K; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("couldn't add extent map"); + goto out; + } + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* [32K, 48K), not pinned */ + em->start = SZ_32K; + em->len = SZ_16K; + em->block_start = SZ_32K; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("couldn't add extent map"); + goto out; + } + free_extent_map(em); + + /* + * Drop [0, 36K) This should skip the [0, 4K) extent and then split the + * [32K, 48K) extent. + */ + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true); + + /* Make sure our extent maps look sane. 
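+ * Expected layout after the drop (editorial summary): the pinned extent still covers [0, 16K), the range [16K, 36K) is empty, and the tail of the split [32K, 48K) extent now covers [36K, 48K).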
*/ + ret = -EINVAL; + + em = lookup_extent_mapping(em_tree, 0, SZ_16K); + if (!em) { + test_err("didn't find an em at 0 as expected"); + goto out; + } + + if (em->start != 0) { + test_err("em->start is %llu, expected 0", em->start); + goto out; + } + + if (em->len != SZ_16K) { + test_err("em->len is %llu, expected 16K", em->len); + goto out; + } + + free_extent_map(em); + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); + read_unlock(&em_tree->lock); + if (em) { + test_err("found an em when we weren't expecting one"); + goto out; + } + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); + read_unlock(&em_tree->lock); + if (!em) { + test_err("didn't find an em at 32K as expected"); + goto out; + } + + if (em->start != (36 * SZ_1K)) { + test_err("em->start is %llu, expected 36K", em->start); + goto out; + } + + if (em->len != (12 * SZ_1K)) { + test_err("em->len is %llu, expected 12K", em->len); + goto out; + } + + free_extent_map(em); + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); + read_unlock(&em_tree->lock); + if (em) { + test_err("found an unexpected em above 48K"); + goto out; + } + + ret = 0; +out: + free_extent_map(em); + iput(inode); + return ret; +} + struct rmap_test_vector { u64 raid_type; u64 physical_start; @@ -619,6 +1020,17 @@ int btrfs_test_extent_map(void) if (ret) goto out; ret = test_case_4(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_5(); + if (ret) + goto out; + ret = test_case_6(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_7(); + if (ret) + goto out; test_msg("running rmap tests"); for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) { diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 05b03f5eab83..492d69d2fa73 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -34,7 +34,11 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - btrfs_setup_item_for_insert(root, &path, &key, value_len); + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). + */ + btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -64,7 +68,11 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - btrfs_setup_item_for_insert(root, &path, &key, value_len); + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). 
+ */ + btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 91b6c2fdc420..5b3333ceef04 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -37,8 +37,6 @@ static struct kmem_cache *btrfs_trans_handle_cachep; -#define BTRFS_ROOT_TRANS_TAG 0 - /* * Transaction states and transitions * @@ -56,12 +54,17 @@ static struct kmem_cache *btrfs_trans_handle_cachep; * | Call btrfs_commit_transaction() on any trans handle attached to * | transaction N * V - * Transaction N [[TRANS_STATE_COMMIT_START]] + * Transaction N [[TRANS_STATE_COMMIT_PREP]] + * | + * | If there are simultaneous calls to btrfs_commit_transaction() one will win + * | the race and the rest will wait for the winner to commit the transaction. * | - * | Will wait for previous running transaction to completely finish if there - * | is one + * | The winner will wait for previous running transaction to completely finish + * | if there is one. * | - * | Then one of the following happes: + * Transaction N [[TRANS_STATE_COMMIT_START]] + * | + * | Then one of the following happens: * | - Wait for all other trans handle holders to release. * | The btrfs_commit_transaction() caller will do the commit work. * | - Wait for current transaction to be committed by others. @@ -112,6 +115,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep; */ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_COMMIT_PREP] = 0U, [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | @@ -292,10 +296,11 @@ loop: spin_unlock(&fs_info->trans_lock); /* - * If we are ATTACH, we just want to catch the current transaction, - * and commit it. If there is no transaction, just return ENOENT. + * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the + * current transaction, and commit it. If there is no transaction, just + * return ENOENT. */ - if (type == TRANS_ATTACH) + if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART) return -ENOENT; /* @@ -379,7 +384,7 @@ loop: IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, IO_TREE_FS_PINNED_EXTENTS); - fs_info->generation++; + btrfs_set_fs_generation(fs_info, fs_info->generation + 1); cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; cur_trans->aborted = 0; @@ -554,6 +559,69 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) return true; } +static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush, + u64 num_bytes, + u64 *delayed_refs_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info; + u64 extra_delayed_refs_bytes = 0; + u64 bytes; + int ret; + + /* + * If there's a gap between the size of the delayed refs reserve and + * its reserved space, then some tasks have added delayed refs or bumped + * its size otherwise (due to block group creation or removal, or block + * group item update). Also try to allocate that gap in order to prevent + * using (and possibly abusing) the global reserve when committing the + * transaction.
+ */ + if (flush == BTRFS_RESERVE_FLUSH_ALL && + !btrfs_block_rsv_full(delayed_refs_rsv)) { + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) + extra_delayed_refs_bytes = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + } + + bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes; + + /* + * We want to reserve all the bytes we may need all at once, so we only + * do 1 enospc flushing cycle per transaction start. + */ + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + if (ret == 0) { + if (extra_delayed_refs_bytes > 0) + btrfs_migrate_to_delayed_refs_rsv(fs_info, + extra_delayed_refs_bytes); + return 0; + } + + if (extra_delayed_refs_bytes > 0) { + bytes -= extra_delayed_refs_bytes; + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + if (ret == 0) + return 0; + } + + /* + * If we are an emergency flush, which can steal from the global block + * reserve, then attempt to not reserve space for the delayed refs, as + * we will consume space for them from the global block reserve. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + bytes -= *delayed_refs_bytes; + *delayed_refs_bytes = 0; + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + } + + return ret; +} + static struct btrfs_trans_handle * start_transaction(struct btrfs_root *root, unsigned int num_items, unsigned int type, enum btrfs_reserve_flush_enum flush, @@ -561,10 +629,12 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; u64 qgroup_reserved = 0; + u64 delayed_refs_bytes = 0; bool reloc_reserved = false; bool do_chunk_alloc = false; int ret; @@ -587,29 +657,27 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * the appropriate flushing if need be. */ if (num_items && root != fs_info->chunk_root) { - struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; - u64 delayed_refs_bytes = 0; - qgroup_reserved = num_items * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, - enforce_qgroups); + /* + * Use prealloc for now, as there might be a currently running + * transaction that could free this reserved space prematurely + * by committing. + */ + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved, + enforce_qgroups, false); if (ret) return ERR_PTR(ret); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); /* - * We want to reserve all the bytes we may need all at once, so - * we only do 1 enospc flushing cycle per transaction start. We - * accomplish this by simply assuming we'll do num_items worth - * of delayed refs updates in this trans handle, and refill that - * amount for whatever is missing in the reserve. + * If we plan to insert/update/delete "num_items" from a btree, + * we will also generate delayed refs for extent buffers in the + * respective btree paths, so reserve space for the delayed refs + * that will be generated by the caller as it modifies btrees. + * Try to reserve them to avoid excessive use of the global + * block reserve. 
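+ * + * (Editorial worked example, assuming the usual btrfs sizing helpers: with num_items = 1 and a 16K nodesize, btrfs_calc_insert_metadata_size() asks for 2 * 16K * BTRFS_MAX_LEVEL = 256K, and a comparable amount is requested here for the delayed refs the modification will generate.)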
*/ - num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); - if (flush == BTRFS_RESERVE_FLUSH_ALL && - !btrfs_block_rsv_full(delayed_refs_rsv)) { - delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, - num_items); - num_bytes += delayed_refs_bytes; - } + delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items); /* * Do the reservation for the relocation root creation @@ -619,16 +687,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush); + ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes, + &delayed_refs_bytes); if (ret) goto reserve_fail; - if (delayed_refs_bytes) { - btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, - delayed_refs_bytes); - num_bytes -= delayed_refs_bytes; - } - if (rsv->space_info->force_alloc) + btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true); + + if (trans_rsv->space_info->force_alloc) do_chunk_alloc = true; } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && !btrfs_block_rsv_full(delayed_refs_rsv)) { @@ -688,6 +754,7 @@ again: h->type = type; INIT_LIST_HEAD(&h->new_bgs); + btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS); smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START && @@ -700,11 +767,28 @@ again: if (num_bytes) { trace_btrfs_space_reservation(fs_info, "transaction", h->transid, num_bytes, 1); - h->block_rsv = &fs_info->trans_block_rsv; + h->block_rsv = trans_rsv; h->bytes_reserved = num_bytes; + if (delayed_refs_bytes > 0) { + trace_btrfs_space_reservation(fs_info, + "local_delayed_refs_rsv", + h->transid, + delayed_refs_bytes, 1); + h->delayed_refs_bytes_reserved = delayed_refs_bytes; + btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true); + delayed_refs_bytes = 0; + } h->reloc_reserved = reloc_reserved; } + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + got_it: if (!current->journal_info) current->journal_info = h; @@ -749,10 +833,12 @@ join_fail: kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: if (num_bytes) - btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, - num_bytes, NULL); + btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); + if (delayed_refs_bytes) + btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, + delayed_refs_bytes); reserve_fail: - btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved); + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); } @@ -785,7 +871,10 @@ struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root * /* * Similar to regular join but it never starts a transaction when none is - * running or after waiting for the current one to finish. + * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED. + * This is similar to btrfs_attach_transaction() but it allows the join to + * happen if the transaction commit already started but it's not yet in the + * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING). 
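+ * + * (Editorial cross-reference: callers treat -ENOENT from this helper as 'no transaction running, nothing to do' - see the flush_space() hunks in space-info.c earlier in this diff, which map -ENOENT to 0.)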
*/ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root) { @@ -794,7 +883,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *roo } /* - * btrfs_attach_transaction() - catch the running transaction + * Catch the running transaction. * * It is used when we want to commit the current transaction, but * don't want to start a new one. @@ -813,7 +902,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) } /* - * btrfs_attach_transaction_barrier() - catch the running transaction + * Catch the running transaction. * * It is similar to the above function; the difference is that this one * will wait for all the inactive transactions until they fully @@ -889,7 +978,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) int ret = 0; if (transid) { - if (transid <= fs_info->last_trans_committed) + if (transid <= btrfs_get_last_trans_committed(fs_info)) goto out; /* find specified transaction */ @@ -913,7 +1002,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) * raced with btrfs_commit_transaction */ if (!cur_trans) { - if (transid > fs_info->last_trans_committed) + if (transid > btrfs_get_last_trans_committed(fs_info)) ret = -EINVAL; goto out; } @@ -968,11 +1057,14 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) if (!trans->block_rsv) { ASSERT(!trans->bytes_reserved); + ASSERT(!trans->delayed_refs_bytes_reserved); return; } - if (!trans->bytes_reserved) + if (!trans->bytes_reserved) { + ASSERT(!trans->delayed_refs_bytes_reserved); return; + } ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); trace_btrfs_space_reservation(fs_info, "transaction", @@ -980,6 +1072,16 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) btrfs_block_rsv_release(fs_info, trans->block_rsv, trans->bytes_reserved, NULL); trans->bytes_reserved = 0; + + if (!trans->delayed_refs_bytes_reserved) + return; + + trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv", + trans->transid, + trans->delayed_refs_bytes_reserved, 0); + btrfs_block_rsv_release(fs_info, &trans->delayed_rsv, + trans->delayed_refs_bytes_reserved, NULL); + trans->delayed_refs_bytes_reserved = 0; } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -1060,8 +1162,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (!find_first_extent_bit(dirty_pages, start, &start, &end, - mark, &cached_state)) { + while (find_first_extent_bit(dirty_pages, start, &start, &end, + mark, &cached_state)) { bool wait_writeback = false; err = convert_extent_bit(dirty_pages, start, end, @@ -1114,8 +1216,8 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (!find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_NEED_WAIT, &cached_state)) { + while (find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT, &cached_state)) { /* * Ignore -ENOMEM errors returned by clear_extent_bit(). * When committing the transaction, we'll remove any entries @@ -1311,7 +1413,7 @@ again: } /* Now flush any delayed refs generated by updating all of the roots */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; @@ -1326,7 +1428,7 @@ again: * so we want to keep this flushing in this loop to make sure * everything gets run.
*/ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; } @@ -1461,45 +1563,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) } /* - * defrag a given btree. - * Every leaf in the btree is read and defragged. - */ -int btrfs_defrag_root(struct btrfs_root *root) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_trans_handle *trans; - int ret; - - if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) - return 0; - - while (1) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - - ret = btrfs_defrag_leaves(trans, root); - - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(info); - cond_resched(); - - if (btrfs_fs_closing(info) || ret != -EAGAIN) - break; - - if (btrfs_defrag_cancelled(info)) { - btrfs_debug(info, "defrag_root cancelled"); - ret = -EAGAIN; - break; - } - } - clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); - return ret; -} - -/* * Do all special snapshot related qgroup dirty hack. * * Will do all needed qgroup inherit and dirty hack like switch commit @@ -1516,11 +1579,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, int ret; /* - * Save some performance in the case that qgroups are not - * enabled. If this check races with the ioctl, rescan will - * kick in anyway. + * Save some performance in the case that qgroups are not enabled. If + * this check races with the ioctl, rescan will kick in anyway. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* @@ -1544,7 +1606,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * for now flush the delayed refs to narrow the race window where the * qgroup counters could end up wrong. */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -1559,7 +1621,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, /* Now qgroup are all updated, we can inherit it to new qgroups */ ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid, - inherit); + parent->root_key.objectid, inherit); if (ret < 0) goto out; @@ -1709,6 +1771,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_release_path(path); + ret = btrfs_create_qgroup(trans, objectid); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + /* * pull in the delayed directory update * and the delayed inode item @@ -1820,8 +1888,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * To co-operate with that hack, we do hack again. 
* Or snapshot will be greatly slowed down by a subtree qgroup rescan */ - ret = qgroup_account_snapshot(trans, root, parent_root, - pending->inherit, objectid); + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) + ret = qgroup_account_snapshot(trans, root, parent_root, + pending->inherit, objectid); + else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid, + parent_root->root_key.objectid, pending->inherit); if (ret < 0) goto fail; @@ -1837,9 +1909,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + fname.disk_name.len * 2); - parent_inode->i_mtime = current_time(parent_inode); - parent_inode->i_ctime = parent_inode->i_mtime; - ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); + inode_set_mtime_to_ts(parent_inode, + inode_set_ctime_current(parent_inode)); + ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1966,7 +2038,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) * Wait for the current transaction commit to start and block * subsequent transaction joins */ - btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); @@ -2062,7 +2134,7 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) struct btrfs_block_group *block_group, *tmp; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); } } @@ -2113,7 +2185,7 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) return; lockdep_assert_held(&trans->fs_info->trans_lock); - ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } @@ -2137,7 +2209,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); - btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); @@ -2197,7 +2269,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } spin_lock(&fs_info->trans_lock); - if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; add_pending_snapshot(trans); @@ -2209,7 +2281,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) want_state = TRANS_STATE_SUPER_COMMITTED; btrfs_trans_state_lockdep_release(fs_info, - BTRFS_LOCKDEP_TRANS_COMMIT_START); + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans, want_state); @@ -2221,9 +2293,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } - cur_trans->state = TRANS_STATE_COMMIT_START; + cur_trans->state = TRANS_STATE_COMMIT_PREP; wake_up(&fs_info->transaction_blocked_wait); - btrfs_trans_state_lockdep_release(fs_info, 
BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); if (cur_trans->list.prev != &fs_info->trans_list) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; @@ -2244,11 +2316,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_put_transaction(prev_trans); if (ret) goto lockdep_release; - } else { - spin_unlock(&fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); } } else { - spin_unlock(&fs_info->trans_lock); /* * The previous transaction was aborted and was already removed * from the list of transactions at fs_info->trans_list. So we * abort to prevent writing a new superblock that reflects a * corrupt state (pointing to trees with unwritten nodes/leafs). */ if (BTRFS_FS_ERROR(fs_info)) { + spin_unlock(&fs_info->trans_lock); ret = -EROFS; goto lockdep_release; } } + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&fs_info->transaction_blocked_wait); + spin_unlock(&fs_info->trans_lock); + /* * Get the time spent on the work done by the commit thread and not * the time spent waiting on a previous commit @@ -2378,7 +2453,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto unlock_reloc; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) goto unlock_reloc; @@ -2511,7 +2586,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); - fs_info->last_trans_committed = cur_trans->transid; + btrfs_set_last_trans_committed(fs_info, cur_trans->transid); /* * We needn't acquire the lock here because there is no other task * which can change it. @@ -2570,7 +2645,7 @@ lockdep_release: goto cleanup_transaction; lockdep_trans_commit_start_release: - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_end_transaction(trans); return ret; } @@ -2629,18 +2704,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) */ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno, bool first_hit) + unsigned int line, int error, bool first_hit) { struct btrfs_fs_info *fs_info = trans->fs_info; - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) + WRITE_ONCE(trans->aborted, error); + WRITE_ONCE(trans->transaction->aborted, error); + if (first_hit && error == -ENOSPC) btrfs_dump_space_info_for_trans_abort(fs_info); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); + __btrfs_handle_fs_error(fs_info, function, line, error, NULL); } int __init btrfs_transaction_init(void) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 8e9fa23bd7fe..2bf8bbdfd0b3 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -12,8 +12,12 @@ #include "ctree.h" #include "misc.h" +/* Radix-tree tag for roots that are part of the transaction.
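+ * + * Roots modified in the current transaction are tagged with this value in + * fs_info->fs_roots_radix, so that commit_fs_roots() can find exactly the + * dirty roots via radix_tree_gang_lookup_tag().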
*/ +#define BTRFS_ROOT_TRANS_TAG 0 + enum btrfs_trans_state { TRANS_STATE_RUNNING, + TRANS_STATE_COMMIT_PREP, TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, @@ -117,8 +121,10 @@ enum { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 delayed_refs_bytes_reserved; u64 chunk_bytes_reserved; unsigned long delayed_ref_updates; + unsigned long delayed_ref_csum_deletions; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; @@ -138,6 +144,7 @@ struct btrfs_trans_handle { bool in_fsync; struct btrfs_fs_info *fs_info; struct list_head new_bgs; + struct btrfs_block_rsv delayed_rsv; }; /* @@ -171,7 +178,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, { spin_lock(&inode->lock); inode->last_trans = trans->transaction->transid; - inode->last_sub_trans = inode->root->log_transid; + inode->last_sub_trans = btrfs_get_root_log_transid(inode->root); inode->last_log_commit = inode->last_sub_trans - 1; spin_unlock(&inode->lock); } @@ -199,32 +206,32 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } -bool __cold abort_should_print_stack(int errno); +bool __cold abort_should_print_stack(int error); /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact stack trace is reported for some errors. */ -#define btrfs_abort_transaction(trans, errno) \ +#define btrfs_abort_transaction(trans, error) \ do { \ bool first = false; \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ first = true; \ - if (WARN(abort_should_print_stack(errno), \ + if (WARN(abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ + (error))) { \ /* Stack trace printed. 
*/ \ } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (error)); \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ + __LINE__, (error), first); \ } while (0) int btrfs_end_transaction(struct btrfs_trans_handle *trans); @@ -242,7 +249,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_defrag_root(struct btrfs_root *root); void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); @@ -263,7 +269,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno, bool first_hit); + unsigned int line, int error, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ab08a0b01311..50fdc69fdddf 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -29,6 +29,9 @@ #include "accessors.h" #include "file-item.h" #include "inode-item.h" +#include "dir-item.h" +#include "raid-stripe-tree.h" +#include "extent-tree.h" /* * Error message should follow the following format: @@ -1274,6 +1277,8 @@ static int check_extent_item(struct extent_buffer *leaf, unsigned long ptr; /* Current pointer inside inline refs */ unsigned long end; /* Extent item end */ const u32 item_size = btrfs_item_size(leaf, slot); + u8 last_type = 0; + u64 last_seq = U64_MAX; u64 flags; u64 generation; u64 total_refs; /* Total refs in btrfs_extent_item */ @@ -1320,6 +1325,18 @@ static int check_extent_item(struct extent_buffer *leaf, * 2.2) Ref type specific data * Either using btrfs_extent_inline_ref::offset, or specific * data structure. + * + * All above inline items should follow the order: + * + * - All btrfs_extent_inline_ref::type should be in an ascending + * order + * + * - Within the same type, the items should follow a descending + * order by their sequence number. 
The sequence number is + * determined by: + * * btrfs_extent_inline_ref::offset for all types other than + * EXTENT_DATA_REF + * * hash_extent_data_ref() for EXTENT_DATA_REF */ if (unlikely(item_size < sizeof(*ei))) { extent_err(leaf, slot, @@ -1401,6 +1418,7 @@ static int check_extent_item(struct extent_buffer *leaf, struct btrfs_extent_inline_ref *iref; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; + u64 seq; u64 dref_offset; u64 inline_offset; u8 inline_type; @@ -1414,6 +1432,7 @@ static int check_extent_item(struct extent_buffer *leaf, iref = (struct btrfs_extent_inline_ref *)ptr; inline_type = btrfs_extent_inline_ref_type(leaf, iref); inline_offset = btrfs_extent_inline_ref_offset(leaf, iref); + seq = inline_offset; if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", @@ -1444,6 +1463,10 @@ static int check_extent_item(struct extent_buffer *leaf, case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); dref_offset = btrfs_extent_data_ref_offset(leaf, dref); + seq = hash_extent_data_ref( + btrfs_extent_data_ref_root(leaf, dref), + btrfs_extent_data_ref_objectid(leaf, dref), + btrfs_extent_data_ref_offset(leaf, dref)); if (unlikely(!IS_ALIGNED(dref_offset, fs_info->sectorsize))) { extent_err(leaf, slot, @@ -1465,11 +1488,32 @@ static int check_extent_item(struct extent_buffer *leaf, } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + break; default: extent_err(leaf, slot, "unknown inline ref type: %u", inline_type); return -EUCLEAN; } + if (inline_type < last_type) { + extent_err(leaf, slot, + "inline ref out-of-order: has type %u, prev type %u", + inline_type, last_type); + return -EUCLEAN; + } + /* Type changed, allow the sequence starts from U64_MAX again. 
*/ + if (inline_type > last_type) + last_seq = U64_MAX; + if (seq > last_seq) { + extent_err(leaf, slot, +"inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx", + inline_type, inline_offset, seq, + last_type, last_seq); + return -EUCLEAN; + } + last_type = inline_type; + last_seq = seq; ptr += btrfs_extent_inline_ref_size(inline_type); } /* No padding is allowed */ @@ -1631,6 +1675,44 @@ static int check_inode_ref(struct extent_buffer *leaf, return 0; } +static int check_raid_stripe_extent(const struct extent_buffer *leaf, + const struct btrfs_key *key, int slot) +{ + struct btrfs_stripe_extent *stripe_extent = + btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { + generic_err(leaf, slot, +"invalid key objectid for raid stripe extent, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + + if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) { + generic_err(leaf, slot, + "RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset"); + return -EUCLEAN; + } + + switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) { + case BTRFS_STRIPE_RAID0: + case BTRFS_STRIPE_RAID1: + case BTRFS_STRIPE_DUP: + case BTRFS_STRIPE_RAID10: + case BTRFS_STRIPE_RAID5: + case BTRFS_STRIPE_RAID6: + case BTRFS_STRIPE_RAID1C3: + case BTRFS_STRIPE_RAID1C4: + break; + default: + generic_err(leaf, slot, "invalid raid stripe encoding %u", + btrfs_stripe_extent_encoding(leaf, stripe_extent)); + return -EUCLEAN; + } + + return 0; +} + /* * Common point to switch the item-specific validation. */ @@ -1685,6 +1767,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_EXTENT_DATA_REF_KEY: ret = check_extent_data_ref(leaf, key, slot); break; + case BTRFS_RAID_STRIPE_KEY: + ret = check_raid_stripe_extent(leaf, key, slot); + break; } if (ret) @@ -2005,7 +2090,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * So we only check tree blocks which are read from disk, whose * generation <= fs_info->last_trans_committed.
*/ - if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info)) return 0; /* We have @first_key, so this @eb must have at least one item */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 365a1cc0a3c3..7d6729d9fd2f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -347,8 +347,7 @@ static int process_one_buffer(struct btrfs_root *log, } if (wc->pin) { - ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, - eb->len); + ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); if (ret) return ret; @@ -504,9 +503,9 @@ insert: found_size = btrfs_item_size(path->nodes[0], path->slots[0]); if (found_size > item_size) - btrfs_truncate_item(path, item_size, 1); + btrfs_truncate_item(trans, path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(path, item_size - found_size); + btrfs_extend_item(trans, path, item_size - found_size); } else if (ret) { return ret; } @@ -574,7 +573,7 @@ insert: } } no_copy: - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -767,7 +766,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } else if (ret == 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - ins.objectid, ins.offset, 0); + ins.objectid, ins.offset, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, key->objectid, offset, 0, false); @@ -890,7 +890,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, update_inode: btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); out: iput(inode); return ret; @@ -1445,7 +1445,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) goto out; } @@ -1483,8 +1483,7 @@ out: return ret; } -static int count_inode_extrefs(struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_path *path) +static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path) { int ret = 0; int name_len; @@ -1498,8 +1497,8 @@ static int count_inode_extrefs(struct btrfs_root *root, struct extent_buffer *leaf; while (1) { - ret = btrfs_find_one_extref(root, inode_objectid, offset, path, - &extref, &offset); + ret = btrfs_find_one_extref(inode->root, inode_objectid, offset, + path, &extref, &offset); if (ret) break; @@ -1527,8 +1526,7 @@ static int count_inode_extrefs(struct btrfs_root *root, return nlink; } -static int count_inode_refs(struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_path *path) +static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path) { int ret; struct btrfs_key key; @@ -1543,7 +1541,7 @@ static int count_inode_refs(struct btrfs_root *root, key.offset = (u64)-1; while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0); if (ret < 0) break; if (ret > 0) { @@ -1595,9 +1593,9 @@ process_slot: * will free the inode. 
*/ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; int ret; u64 nlink = 0; @@ -1607,13 +1605,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = count_inode_refs(root, BTRFS_I(inode), path); + ret = count_inode_refs(BTRFS_I(inode), path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(root, BTRFS_I(inode), path); + ret = count_inode_extrefs(BTRFS_I(inode), path); if (ret < 0) goto out; @@ -1623,7 +1621,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) goto out; } @@ -1685,7 +1683,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; } - ret = fixup_inode_link_count(trans, root, inode); + ret = fixup_inode_link_count(trans, inode); iput(inode); if (ret) break; @@ -1732,7 +1730,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, set_nlink(inode, 1); else inc_nlink(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); } else if (ret == -EEXIST) { ret = 0; } @@ -1939,7 +1937,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + ret = btrfs_update_inode(trans, BTRFS_I(dir)); } kfree(name.name); iput(dir); @@ -2483,7 +2481,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, drop_args.bytes_found); /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, - root, BTRFS_I(inode)); + BTRFS_I(inode)); } iput(inode); if (ret) @@ -2574,7 +2572,7 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans, btrfs_tree_unlock(eb); if (trans) { - ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); + ret = btrfs_pin_reserved_extent(trans, eb); if (ret) return ret; btrfs_redirty_list_add(trans->transaction, eb); @@ -2848,10 +2846,9 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, } /* - * btrfs_sync_log does sends a given tree log down to the disk and - * updates the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk only - * if it returns 0. + * Sends a given tree log down to the disk and updates the super blocks to + * record it. When this call is done, you know that any inodes previously + * logged are safely on disk only if it returns 0. * * Any other return value means you need to call btrfs_commit_transaction. 
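 * * As a sketch of the expected caller pattern (not code from this patch): an fsync path first tries ret = btrfs_sync_log(trans, root, &ctx); on ret == 0 the logged inodes are persistent and it can return, while on any other value it must fall back to btrfs_commit_transaction(trans).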
* Some of the edge cases for fsyncing directories that have had unlinks @@ -2961,7 +2958,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_root_node(&log->root_item, log->node); memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); - root->log_transid++; + btrfs_set_root_log_transid(root, root->log_transid + 1); log->log_transid = root->log_transid; root->log_start_pid = 0; /* @@ -2999,9 +2996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = update_log_root(trans, log, &new_root_item); if (ret) { - if (!list_empty(&root_log_ctx.list)) - list_del_init(&root_log_ctx.list); - + list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); btrfs_set_log_full_commit(trans); if (ret != -ENOSPC) @@ -3021,7 +3016,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); ret = btrfs_wait_tree_log_extents(log, mark); @@ -3136,8 +3130,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * someone else already started it. We use <= and not < because the * first log transaction has an ID of 0. */ - ASSERT(root->last_log_commit <= log_transid); - root->last_log_commit = log_transid; + ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid); + btrfs_set_root_last_log_commit(root, log_transid); out_wake_log_root: mutex_lock(&log_root_tree->log_mutex); @@ -3211,8 +3205,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, } } - clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); + extent_io_tree_release(&log->dirty_log_pages); extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); @@ -3530,7 +3523,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, last_offset = max(last_offset, curr_end); } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -4138,19 +4131,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime_nsec(inode)); /* * We do not need to set the nbytes field, in fact during a fast fsync @@ -4488,7 +4481,7 @@ copy_item: dst_index++; } - btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); btrfs_release_path(dst_path); out: kfree(ins_data); @@ -4693,7 +4686,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, &fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(fi)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -4722,7 +4715,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; 
int ins_nr = 0; - int start_slot; + int start_slot = 0; int ret; if (!(inode->flags & BTRFS_INODE_PREALLOC)) @@ -4841,13 +4834,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; - struct list_head extents; + LIST_HEAD(extents); struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; int num = 0; - INIT_LIST_HEAD(&extents); - write_lock(&tree->lock); list_for_each_entry_safe(em, n, &tree->modified_extents, list) { @@ -4923,12 +4914,12 @@ process: set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { - spin_lock_irq(&inode->ordered_tree.lock); + spin_lock_irq(&inode->ordered_tree_lock); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); atomic_inc(&trans->transaction->pending_ordered); } - spin_unlock_irq(&inode->ordered_tree.lock); + spin_unlock_irq(&inode->ordered_tree_lock); } btrfs_put_ordered_extent(ordered); } @@ -6794,8 +6785,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, while (true) { struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; + struct extent_buffer *leaf; + int slot; struct btrfs_key search_key; struct inode *inode; u64 ino; @@ -7206,9 +7197,7 @@ again: * each subsequent pass. */ if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(trans, - log->node->start, - log->node->len); + ret = btrfs_pin_extent_for_log_replay(trans, log->node); btrfs_put_root(log); if (!ret) diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 33606025513d..b4ac2b0cd235 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -223,7 +223,8 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, } /* - * ulist_del - delete one node from ulist + * Delete one node from ulist. + * * @ulist: ulist to remove node from * @val: value to delete * @aux: aux to delete diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 7c7001f42b14..5be74f9e47eb 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -124,7 +124,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, * An item with that type already exists. * Extend the item and store the new subid at the end. 
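 * * The item body is a packed array of little-endian u64 subids, so (illustratively) adding subid 5 to an item holding [1, 9] grows the item by sizeof(__le64) and yields [1, 9, 5]; lookups scan this array linearly.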
*/ - btrfs_extend_item(path, sizeof(subid_le)); + btrfs_extend_item(trans, path, sizeof(subid_le)); eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); @@ -139,7 +139,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); out: btrfs_free_path(path); @@ -221,7 +221,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, move_src = offset + sizeof(subid); move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); memmove_extent_buffer(eb, move_dst, move_src, move_len); - btrfs_truncate_item(path, item_size - sizeof(subid), 1); + btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1); out: btrfs_free_path(path); diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index c5ff16f9e9fa..66e2270b0dae 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -487,7 +487,7 @@ static int rollback_verity(struct btrfs_inode *inode) } inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -554,7 +554,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, } inode->ro_flags |= BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) goto end_trans; ret = del_orphan(trans, inode); @@ -715,7 +715,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - struct page *page; + struct folio *folio; u64 off = (u64)index << PAGE_SHIFT; loff_t merkle_pos = merkle_file_pos(inode); int ret; @@ -726,29 +726,36 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode, return ERR_PTR(-EFBIG); index += merkle_pos >> PAGE_SHIFT; again: - page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); - if (page) { - if (PageUptodate(page)) - return page; + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (!IS_ERR(folio)) { + if (folio_test_uptodate(folio)) + goto out; - lock_page(page); - /* - * We only insert uptodate pages, so !Uptodate has to be - * an error - */ - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); + folio_lock(folio); + /* If it's not uptodate after we have the lock, we got a read error. */ + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-EIO); } - unlock_page(page); - return page; + folio_unlock(folio); + goto out; } - page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS), + 0); + if (!folio) return ERR_PTR(-ENOMEM); + ret = filemap_add_folio(inode->i_mapping, folio, index, GFP_NOFS); + if (ret) { + folio_put(folio); + /* Did someone else insert a folio here? */ + if (ret == -EEXIST) + goto again; + return ERR_PTR(ret); + } + /* * Merkle item keys are indexed from byte 0 in the merkle tree. 
* They have the form: @@ -756,28 +763,19 @@ again: * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] */ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, - page_address(page), PAGE_SIZE, page); + folio_address(folio), PAGE_SIZE, &folio->page); if (ret < 0) { - put_page(page); + folio_put(folio); return ERR_PTR(ret); } if (ret < PAGE_SIZE) - memzero_page(page, ret, PAGE_SIZE - ret); + folio_zero_segment(folio, ret, PAGE_SIZE); - SetPageUptodate(page); - ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS); + folio_mark_uptodate(folio); + folio_unlock(folio); - if (!ret) { - /* Inserted and ready for fsverity */ - unlock_page(page); - } else { - put_page(page); - /* Did someone race us into inserting this page? */ - if (ret == -EEXIST) - goto again; - page = ERR_PTR(ret); - } - return page; +out: + return folio_file_page(folio, index); } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6aa9bf3661ac..f627674b37db 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -35,6 +35,7 @@ #include "relocation.h" #include "scrub.h" #include "super.h" +#include "raid-stripe-tree.h" #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ @@ -357,21 +358,19 @@ struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) } /* - * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid: if not NULL, copy the UUID to fs_devices::fsid - * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid + * Allocate new btrfs_fs_devices structure identified by a fsid. + * + * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to + * fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. */ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, - const u8 *metadata_fsid) +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) { struct btrfs_fs_devices *fs_devs; - ASSERT(fsid || !metadata_fsid); - fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); @@ -385,8 +384,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, if (fsid) { memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); - memcpy(fs_devs->metadata_uuid, - metadata_fsid ?: fsid, BTRFS_FSID_SIZE); + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); } return fs_devs; @@ -457,91 +455,41 @@ static noinline struct btrfs_fs_devices *find_fsid( return NULL; } -/* - * First check if the metadata_uuid is different from the fsid in the given - * fs_devices. Then check if the given fsid is the same as the metadata_uuid - * in the fs_devices. If it is, return true; otherwise, return false. - */ -static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices, - const u8 *fsid) -{ - return memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0; -} - -static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( - struct btrfs_super_block *disk_super) -{ - - struct btrfs_fs_devices *fs_devices; - - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by first scanning - * a device which didn't have its fsid/metadata_uuid changed - * at all and the CHANGING_FSID_V2 flag set. 
- */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid, - fs_devices->fsid)) - return fs_devices; - } - - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by a device that - * has an outdated pair of fsid/metadata_uuid and - * CHANGING_FSID_V2 flag set. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->metadata_uuid)) - return fs_devices; - } - - return find_fsid(disk_super->fsid, disk_super->metadata_uuid); -} - - static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, - int flush, struct block_device **bdev, + int flush, struct bdev_handle **bdev_handle, struct btrfs_super_block **disk_super) { + struct block_device *bdev; int ret; - *bdev = blkdev_get_by_path(device_path, flags, holder, NULL); + *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL); - if (IS_ERR(*bdev)) { - ret = PTR_ERR(*bdev); + if (IS_ERR(*bdev_handle)) { + ret = PTR_ERR(*bdev_handle); goto error; } + bdev = (*bdev_handle)->bdev; if (flush) - sync_blockdev(*bdev); - ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); + sync_blockdev(bdev); + ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { - blkdev_put(*bdev, holder); + bdev_release(*bdev_handle); goto error; } - invalidate_bdev(*bdev); - *disk_super = btrfs_read_dev_super(*bdev); + invalidate_bdev(bdev); + *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - blkdev_put(*bdev, holder); + bdev_release(*bdev_handle); goto error; } return 0; error: - *bdev = NULL; + *bdev_handle = NULL; return ret; } @@ -562,13 +510,13 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; - int ret = 0; + int ret; + bool freed = false; lockdep_assert_held(&uuid_mutex); - if (devt) - ret = -ENOENT; - + /* Return good status if there is no instance of devt. */ + ret = 0; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { mutex_lock(&fs_devices->device_list_mutex); @@ -579,8 +527,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device if (devt && devt != device->devt) continue; if (fs_devices->opened) { - /* for an already deleted device return 0 */ - if (devt && ret != 0) + if (devt) ret = -EBUSY; break; } @@ -590,7 +537,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device list_del(&device->dev_list); btrfs_free_device(device); - ret = 0; + freed = true; } mutex_unlock(&fs_devices->device_list_mutex); @@ -601,9 +548,81 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device } } + /* If there is at least one freed device return 0. */ + if (freed) + return 0; + return ret; } +static struct btrfs_fs_devices *find_fsid_by_device( + struct btrfs_super_block *disk_super, + dev_t devt, bool *same_fsid_diff_dev) +{ + struct btrfs_fs_devices *fsid_fs_devices; + struct btrfs_fs_devices *devt_fs_devices; + const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + bool found_by_devt = false; + + /* Find the fs_device by the usual method; if found, use it.
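+ * + * Summarizing the branches below: if the devt is already registered, reuse that fs_devices (a stale, never-opened entry yields NULL instead); if there is no devt match but the fsid is already in use by a different device, set *same_fsid_diff_dev so the caller generates a temp-fsid; otherwise this is a brand new device and NULL is returned.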
*/ + fsid_fs_devices = find_fsid(disk_super->fsid, + has_metadata_uuid ? disk_super->metadata_uuid : NULL); + + /* The temp_fsid feature is supported only with a single-device filesystem. */ + if (btrfs_super_num_devices(disk_super) != 1) + return fsid_fs_devices; + + /* + * A seed device is an integral component of the sprout device, which + * functions as a multi-device filesystem. So, the temp-fsid feature is + * not supported. + */ + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) + return fsid_fs_devices; + + /* Try to find a fs_devices by matching devt. */ + list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) { + struct btrfs_device *device; + + list_for_each_entry(device, &devt_fs_devices->devices, dev_list) { + if (device->devt == devt) { + found_by_devt = true; + break; + } + } + if (found_by_devt) + break; + } + + if (found_by_devt) { + /* Existing device. */ + if (fsid_fs_devices == NULL) { + if (devt_fs_devices->opened == 0) { + /* Stale device. */ + return NULL; + } else { + /* temp_fsid is mounting a subvol. */ + return devt_fs_devices; + } + } else { + /* Regular or temp_fsid device mounting a subvol. */ + return devt_fs_devices; + } + } else { + /* New device. */ + if (fsid_fs_devices == NULL) { + return NULL; + } else { + /* sb::fsid is already used, create a new temp_fsid. */ + *same_fsid_diff_dev = true; + return NULL; + } + } + + /* Not reached. */ +} + /* * This is only used on mount, and we are protected from competing things * messing with our fs_devices by the uuid_mutex, thus we do not need the @@ -613,7 +632,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -624,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &disk_super); + &bdev_handle, &disk_super); if (ret) return ret; @@ -648,21 +667,21 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { - if (bdev_read_only(bdev)) + if (bdev_read_only(bdev_handle->bdev)) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(bdev)) + if (!bdev_nonrot(bdev_handle->bdev)) fs_devices->rotating = true; - if (bdev_max_discard_sectors(bdev)) + if (bdev_max_discard_sectors(bdev_handle->bdev)) fs_devices->discardable = true; - device->bdev = bdev; + device->bdev_handle = bdev_handle; + device->bdev = bdev_handle->bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - device->holder = holder; fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && @@ -676,89 +695,19 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - blkdev_put(bdev, holder); + bdev_release(bdev_handle); return -EINVAL; } -/* - * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices - * being created with a disk that has already completed its fsid change. Such - * disk can belong to an fs which has its FSID changed or to one which doesn't. - * Handle both cases here.
- */ -static struct btrfs_fs_devices *find_fsid_inprogress( - struct btrfs_super_block *disk_super) -{ - struct btrfs_fs_devices *fs_devices; - - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->fsid)) - return fs_devices; - } - - return find_fsid(disk_super->fsid, NULL); -} - -static struct btrfs_fs_devices *find_fsid_changed( - struct btrfs_super_block *disk_super) +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) { - struct btrfs_fs_devices *fs_devices; - - /* - * Handles the case where scanned device is part of an fs that had - * multiple successful changes of FSID but currently device didn't - * observe it. Meaning our fsid will be different than theirs. We need - * to handle two subcases : - * 1 - The fs still continues to have different METADATA/FSID uuids. - * 2 - The fs is switched back to its original FSID (METADATA/FSID - * are equal). - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - /* Changed UUIDs */ - if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) && - memcmp(fs_devices->fsid, disk_super->fsid, - BTRFS_FSID_SIZE) != 0) - return fs_devices; + bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); - /* Unchanged UUIDs */ - if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) == 0 && - memcmp(fs_devices->fsid, disk_super->metadata_uuid, - BTRFS_FSID_SIZE) == 0) - return fs_devices; - } - - return NULL; + return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } -static struct btrfs_fs_devices *find_fsid_reverted_metadata( - struct btrfs_super_block *disk_super) -{ - struct btrfs_fs_devices *fs_devices; - - /* - * Handle the case where the scanned device is part of an fs whose last - * metadata UUID change reverted it to the original FSID. At the same - * time fs_devices was first created by another constituent device - * which didn't fully observe the operation. This results in an - * btrfs_fs_devices created with metadata/fsid different AND - * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the - * fs_devices equal to the FSID of the disk. 
- */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->fsid)) - return fs_devices; - } - - return NULL; -} /* * Add new device to list of registered devices * @@ -777,10 +726,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, u64 devid = btrfs_stack_device_id(&disk_super->dev_item); dev_t path_devt; int error; + bool same_fsid_diff_dev = false; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); - bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & - BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { + btrfs_err(NULL, +"device %s has incomplete metadata_uuid change, please use btrfstune to complete", + path); + return ERR_PTR(-EAGAIN); + } error = lookup_bdev(path, &path_devt); if (error) { @@ -789,27 +744,23 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(error); } - if (fsid_change_in_progress) { - if (!has_metadata_uuid) - fs_devices = find_fsid_inprogress(disk_super); - else - fs_devices = find_fsid_changed(disk_super); - } else if (has_metadata_uuid) { - fs_devices = find_fsid_with_metadata_uuid(disk_super); - } else { - fs_devices = find_fsid_reverted_metadata(disk_super); - if (!fs_devices) - fs_devices = find_fsid(disk_super->fsid, NULL); - } - + fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev); if (!fs_devices) { - fs_devices = alloc_fs_devices(disk_super->fsid, - has_metadata_uuid ? disk_super->metadata_uuid : NULL); + fs_devices = alloc_fs_devices(disk_super->fsid); if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); - fs_devices->fsid_change = fsid_change_in_progress; + if (has_metadata_uuid) + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, BTRFS_FSID_SIZE); + + if (same_fsid_diff_dev) { + generate_random_uuid(fs_devices->fsid); + fs_devices->temp_fsid = true; + pr_info("BTRFS: device %s using temp-fsid %pU\n", + path, fs_devices->fsid); + } mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); @@ -824,25 +775,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, &args); - /* - * If this disk has been pulled into an fs devices created by - * a device which had the CHANGING_FSID_V2 flag then replace the - * metadata_uuid/fsid values of the fs_devices. 
- */ - if (fs_devices->fsid_change && - found_transid > fs_devices->latest_generation) { + if (found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); - - if (has_metadata_uuid) - memcpy(fs_devices->metadata_uuid, - disk_super->metadata_uuid, - BTRFS_FSID_SIZE); - else - memcpy(fs_devices->metadata_uuid, - disk_super->fsid, BTRFS_FSID_SIZE); - - fs_devices->fsid_change = false; + memcpy(fs_devices->metadata_uuid, + btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); } } @@ -851,8 +788,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (fs_devices->opened) { btrfs_err(NULL, - "device %s belongs to fsid %pU, and the fs is already mounted", - path, fs_devices->fsid); +"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", + path, fs_devices->fsid, current->comm, + task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } @@ -995,7 +933,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) lockdep_assert_held(&uuid_mutex); - fs_devices = alloc_fs_devices(orig->fsid, NULL); + fs_devices = alloc_fs_devices(orig->fsid); if (IS_ERR(fs_devices)) return fs_devices; @@ -1066,9 +1004,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; - if (device->bdev) { - blkdev_put(device->bdev, device->holder); + if (device->bdev_handle) { + bdev_release(device->bdev_handle); device->bdev = NULL; + device->bdev_handle = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1113,7 +1052,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - blkdev_put(device->bdev, device->holder); + bdev_release(device->bdev_handle); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1354,14 +1293,19 @@ int btrfs_forget_devices(dev_t devt) /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock - * is read via pagecache + * is read via pagecache. + * + * With @mount_arg_dev it's a scan during mount time that will always register + * the device or return an error. Multi-device and seeding devices are registered + * in both cases. */ -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, + bool mount_arg_dev) { struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; - struct block_device *bdev; + struct bdev_handle *bdev_handle; u64 bytenr, bytenr_orig; int ret; @@ -1384,31 +1328,49 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. 
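 * * (Illustrative consequence, handled just below: when mount_arg_dev is false, e.g. a udev-triggered scan, a clean single-device non-seed filesystem is not registered at all and any stale entry for its devt is dropped; such devices are registered only from the mount path.)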
*/ - bdev = blkdev_get_by_path(path, flags, NULL, NULL); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + bdev_handle = bdev_open_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev_handle)) + return ERR_CAST(bdev_handle); bytenr_orig = btrfs_sb_offset(0); - ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); + ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } - disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); + disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr, + bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; } + if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && + !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) { + dev_t devt; + + ret = lookup_bdev(path, &devt); + if (ret) + btrfs_warn(NULL, "lookup bdev failed for path %s: %d", + path, ret); + else + btrfs_free_stale_devices(devt, NULL); + + pr_debug("BTRFS: skip registering single non-seed device %s\n", path); + device = NULL; + goto free_disk_super; + } + device = device_list_add(path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); +free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - blkdev_put(bdev, NULL); + bdev_release(bdev_handle); return device; } @@ -1424,9 +1386,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, lockdep_assert_held(&device->fs_info->chunk_mutex); - if (!find_first_extent_bit(&device->alloc_state, *start, - &physical_start, &physical_end, - CHUNK_ALLOCATED, NULL)) { + if (find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { if (in_range(physical_start, *start, len) || in_range(*start, physical_start, @@ -1438,18 +1400,18 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, return false; } -static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) +static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: - return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); + return BTRFS_DEVICE_RANGE_RESERVED; case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular * allocator, because we anyway use/reserve the first two zones * for superblock logging. */ - return ALIGN(start, device->zone_info->zone_size); + return 0; default: BUG(); } @@ -1581,36 +1543,35 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * correct usable device space, as device extent freed in current transaction * is not reported as available. 
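 * * For example (made-up numbers): on a device with allocated extents at [1M, 5M) and [8M, 10M), a request for 2M returns the 3M hole starting at 5M, while a request for 4M has to skip it and return the hole starting at 10M, provided it still fits below total_bytes.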
*/ -static int find_free_dev_extent_start(struct btrfs_device *device, - u64 num_bytes, u64 search_start, u64 *start, - u64 *len) +static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; + u64 search_start; u64 hole_size; u64 max_hole_start; - u64 max_hole_size; + u64 max_hole_size = 0; u64 extent_end; u64 search_end = device->total_bytes; int ret; int slot; struct extent_buffer *l; - search_start = dev_extent_search_start(device, search_start); + search_start = dev_extent_search_start(device); + max_hole_start = search_start; WARN_ON(device->zone_info && !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - max_hole_start = search_start; - max_hole_size = 0; - + if (!path) { + ret = -ENOMEM; + goto out; + } again: if (search_start >= search_end || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { @@ -1725,13 +1686,6 @@ out: return ret; } -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *len) -{ - /* FIXME use last free of some kind */ - return find_free_dev_extent_start(device, num_bytes, 0, start, len); -} - static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) @@ -1900,7 +1854,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -1917,15 +1871,13 @@ out: static void update_dev_time(const char *device_path) { struct path path; - struct timespec64 now; int ret; ret = kern_path(device_path, LOOKUP_FOLLOW, &path); if (ret) return; - now = current_time(d_inode(path.dentry)); - inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION); + inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); path_put(&path); } @@ -2095,7 +2047,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, void **holder) + struct bdev_handle **bdev_handle) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2204,7 +2156,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, btrfs_assign_next_active_device(device, NULL); - if (device->bdev) { + if (device->bdev_handle) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); @@ -2220,9 +2172,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and blkdev_put() will pull in the ->open_mutex on the - * block device and it's dependencies. Instead just flush the device - * and let the caller do the final blkdev_put. + * write lock, and bdev_release() will pull in the ->open_mutex on + * the block device and its dependencies. Instead, just flush the + * device and let the caller do the final bdev_release.
*/ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_scratch_superblocks(fs_info, device->bdev, @@ -2233,8 +2185,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } } - *bdev = device->bdev; - *holder = device->holder; + *bdev_handle = device->bdev_handle; synchronize_rcu(); btrfs_free_device(device); @@ -2371,7 +2322,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, const char *path) { struct btrfs_super_block *disk_super; - struct block_device *bdev; + struct bdev_handle *bdev_handle; int ret; if (!path || !path[0]) @@ -2389,7 +2340,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, - &bdev, &disk_super); + &bdev_handle, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; @@ -2402,7 +2353,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - blkdev_put(bdev, NULL); + bdev_release(bdev_handle); return 0; } @@ -2459,7 +2410,7 @@ static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) * Private copy of the seed devices, anchored at * fs_info->fs_devices->seed_list */ - seed_devices = alloc_fs_devices(NULL, NULL); + seed_devices = alloc_fs_devices(NULL); if (IS_ERR(seed_devices)) return seed_devices; @@ -2605,7 +2556,7 @@ next_slot: if (device->fs_devices->seeding) { btrfs_set_device_generation(leaf, dev_item, device->generation); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } path->slots[0]++; @@ -2622,7 +2573,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; @@ -2635,12 +2586,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); + if (IS_ERR(bdev_handle)) + return PTR_ERR(bdev_handle); - if (!btrfs_check_device_zone_type(fs_info, bdev)) { + if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) { ret = -EINVAL; goto error; } @@ -2652,11 +2603,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - sync_blockdev(bdev); + sync_blockdev(bdev_handle->bdev); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (device->bdev == bdev) { + if (device->bdev == bdev_handle->bdev) { ret = -EEXIST; rcu_read_unlock(); goto error; @@ -2672,7 +2623,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } device->fs_info = fs_info; - device->bdev = bdev; + device->bdev_handle = bdev_handle; + device->bdev = bdev_handle->bdev; ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; @@ -2693,12 +2645,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; device->total_bytes = - 
round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); + round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); @@ -2734,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!bdev_nonrot(bdev)) + if (!bdev_nonrot(device->bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -2856,7 +2807,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - blkdev_put(bdev, fs_info->bdev_holder); + bdev_release(bdev_handle); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -2903,7 +2854,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); @@ -2937,6 +2888,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, btrfs_set_super_total_bytes(super_copy, round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; + atomic64_add(diff, &fs_info->free_chunk_space); btrfs_device_set_total_bytes(device, new_size); btrfs_device_set_disk_total_bytes(device, new_size); @@ -3035,7 +2987,8 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) } /* - * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * Find the mapping containing the given logical extent. + * * @logical: Logical block offset in bytes. * @length: Length of extent in bytes. 
* @@ -3053,15 +3006,16 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, read_unlock(&em_tree->lock); if (!em) { - btrfs_crit(fs_info, "unable to find logical %llu length %llu", + btrfs_crit(fs_info, + "unable to find chunk map for logical %llu length %llu", logical, length); return ERR_PTR(-EINVAL); } - if (em->start > logical || em->start + em->len < logical) { + if (em->start > logical || em->start + em->len <= logical) { btrfs_crit(fs_info, - "found a bad mapping, wanted %llu-%llu, found %llu-%llu", - logical, length, em->start, em->start + em->len); + "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", + logical, logical + length, em->start, em->start + em->len); free_extent_map(em); return ERR_PTR(-EINVAL); } @@ -3491,7 +3445,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_flags(leaf, item, bctl->flags); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); @@ -4846,6 +4800,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 old_size = btrfs_device_get_total_bytes(device); u64 diff; u64 start; + u64 free_diff = 0; new_size = round_down(new_size, fs_info->sectorsize); start = new_size; @@ -4871,7 +4826,19 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, new_size); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes -= diff; - atomic64_sub(diff, &fs_info->free_chunk_space); + + /* + * The new free_chunk_space is new_size - used, so we have to + * subtract the delta of the old free_chunk_space which included + * old_size - used. If used > new_size then just subtract this + * entire device's free space. 
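+ *
+ * A worked example with illustrative numbers: old_size 100G,
+ * bytes_used 40G and new_size 60G give free_diff =
+ * (100G - 40G) - (60G - 40G) = 40G. If instead bytes_used were
+ * 70G (> new_size), free_diff = 100G - 70G = 30G, i.e. all of
+ * this device's remaining free space.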
+ */ + if (device->bytes_used < new_size) + free_diff = (old_size - device->bytes_used) - + (new_size - device->bytes_used); + else + free_diff = old_size - device->bytes_used; + atomic64_sub(free_diff, &fs_info->free_chunk_space); } /* @@ -5006,9 +4973,10 @@ done: if (ret) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, old_size); - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes += diff; - atomic64_add(diff, &fs_info->free_chunk_space); + atomic64_add(free_diff, &fs_info->free_chunk_space); + } mutex_unlock(&fs_info->chunk_mutex); } return ret; @@ -5117,7 +5085,7 @@ static void init_alloc_chunk_ctl_policy_regular( ASSERT(space_info); ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); - ctl->max_stripe_size = ctl->max_chunk_size; + ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G); if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); @@ -5888,6 +5856,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, } static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; @@ -5907,6 +5876,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_ bioc->fs_info = fs_info; bioc->replace_stripe_src = -1; bioc->full_stripe_logical = (u64)-1; + bioc->logical = logical; return bioc; } @@ -6211,19 +6181,61 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, return U64_MAX; } -static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, - u32 stripe_index, u64 stripe_offset, u32 stripe_nr) +static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, struct btrfs_io_stripe *dst, + struct map_lookup *map, u32 stripe_index, + u64 stripe_offset, u64 stripe_nr) { dst->dev = map->stripes[stripe_index].dev; + + if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type)) + return btrfs_get_raid_extent_offset(fs_info, logical, length, + map->type, stripe_index, dst); + dst->physical = map->stripes[stripe_index].physical + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); + return 0; } +/* + * Map one logical range to one or more physical ranges. + * + * @length: (Mandatory) mapped length of this run. + * One logical range can be split into different segments + * due to factors like zones and RAID0/5/6/10 stripe + * boundaries. + * + * @bioc_ret: (Mandatory) returned btrfs_io_context structure, + * which has one or more physical ranges (btrfs_io_stripe) + * recorded inside. + * Caller should call btrfs_put_bioc() to free it after use. + * + * @smap: (Optional) single physical range optimization. + * If the map request can be fulfilled by one single + * physical range, and this parameter is not NULL, + * then @bioc_ret would be NULL, and @smap would be + * updated. + * + * @mirror_num_ret: (Mandatory) returned mirror number if the original + * value is 0. + * + * Mirror number 0 means to choose any live mirrors. + * + * For non-RAID56 profiles, non-zero mirror_num means + * the Nth mirror. (e.g. mirror_num 1 means the first + * copy). + * + * For RAID56 profile, mirror 1 means rebuild from P and + * the remaining data stripes. + * + * For RAID6 profile, mirror > 2 means mark another + * data/P stripe error and rebuild from the remaining + * stripes.
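+ *
+ * A minimal call pattern, as an illustrative sketch only (error
+ * handling elided; @logical and @map_length are assumed to be set
+ * up by the caller):
+ *
+ *   struct btrfs_io_context *bioc = NULL;
+ *   struct btrfs_io_stripe smap = { 0 };
+ *   int mirror = 0;
+ *
+ *   ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ *                         &map_length, &bioc, &smap, &mirror);
+ *   if (!ret && bioc)
+ *           btrfs_put_bioc(bioc);
+ *
+ * When the single-range optimization applies, @bioc stays NULL and
+ * the mapping is returned in @smap instead.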
+ */ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map) + struct btrfs_io_stripe *smap, int *mirror_num_ret) { struct extent_map *em; struct map_lookup *map; @@ -6318,8 +6330,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) { + if (op != BTRFS_MAP_READ || mirror_num > 1) { /* + * Needs full stripe mapping. + * * Push stripe_nr back to the start of the full stripe * For those cases needing a full stripe, @stripe_nr * is the full stripe number. @@ -6342,19 +6356,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, stripe_index = 0; stripe_offset = 0; } else { - /* - * Mirror #0 or #1 means the original data block. - * Mirror #2 is RAID5 parity block. - * Mirror #3 is RAID6 Q block. - */ + ASSERT(mirror_num <= 1); + /* Just grab the data stripe directly. */ stripe_index = stripe_nr % data_stripes; stripe_nr /= data_stripes; - if (mirror_num > 1) - stripe_index = data_stripes + mirror_num - 2; /* We distribute the parity blocks across stripes */ stripe_index = (stripe_nr + stripe_index) % map->num_stripes; - if (op == BTRFS_MAP_READ && mirror_num <= 1) + if (op == BTRFS_MAP_READ && mirror_num < 1) mirror_num = 1; } } else { @@ -6393,18 +6402,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * I/O context structure. */ if (smap && num_alloc_stripes == 1 && - !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && - (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || - !dev_replace->tgtdev)) { - set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); + !(btrfs_need_stripe_tree_update(fs_info, map->type) && + op != BTRFS_MAP_READ) && + !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) { + ret = set_io_stripe(fs_info, op, logical, length, smap, map, + stripe_index, stripe_offset, stripe_nr); if (mirror_num_ret) *mirror_num_ret = mirror_num; *bioc_ret = NULL; - ret = 0; goto out; } - bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes); + bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes); if (!bioc) { ret = -ENOMEM; goto out; @@ -6418,7 +6427,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * * It's still mostly the same as other profiles, just with extra rotation. */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && (op != BTRFS_MAP_READ || mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes @@ -6430,22 +6439,35 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, */ bioc->full_stripe_logical = em->start + btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); - for (i = 0; i < num_stripes; i++) - set_io_stripe(&bioc->stripes[i], map, - (i + stripe_nr) % num_stripes, - stripe_offset, stripe_nr); + for (int i = 0; i < num_stripes; i++) { + ret = set_io_stripe(fs_info, op, logical, length, + &bioc->stripes[i], map, + (i + stripe_nr) % num_stripes, + stripe_offset, stripe_nr); + if (ret < 0) + break; + } } else { /* * For all other non-RAID56 profiles, just copy the target * stripe into the bioc. 
*/ for (i = 0; i < num_stripes; i++) { - set_io_stripe(&bioc->stripes[i], map, stripe_index, - stripe_offset, stripe_nr); + ret = set_io_stripe(fs_info, op, logical, length, + &bioc->stripes[i], map, stripe_index, + stripe_offset, stripe_nr); + if (ret < 0) + break; stripe_index++; } } + if (ret) { + *bioc_ret = NULL; + btrfs_put_bioc(bioc); + goto out; + } + if (op != BTRFS_MAP_READ) max_errors = btrfs_chunk_max_errors(map); @@ -6872,7 +6894,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (!btrfs_test_opt(fs_info, DEGRADED)) return ERR_PTR(-ENOENT); - fs_devices = alloc_fs_devices(fsid, NULL); + fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) return fs_devices; @@ -7505,7 +7527,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); out: btrfs_free_path(path); @@ -8047,7 +8069,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ASSERT(mirror_num > 0); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, - &bioc, smap, &mirror_ret, true); + &bioc, smap, &mirror_ret); if (ret < 0) return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b8c51f16ba86..9cc374864a79 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -90,13 +90,11 @@ struct btrfs_device { u64 generation; + struct bdev_handle *bdev_handle; struct block_device *bdev; struct btrfs_zoned_device_info *zone_info; - /* block device holder for blkdev_get/put */ - void *holder; - /* * Device's major-minor number. Must be set even if the device is not * opened (bdev == NULL), unless the device is missing. @@ -290,6 +288,19 @@ struct btrfs_fs_devices { * - Following shall be true at all times: * - metadata_uuid == btrfs_header::fsid * - metadata_uuid == btrfs_dev_item::fsid + * + * - Relations between fsid and metadata_uuid in sb and fs_devices: + * - Normal: + * fs_devices->fsid == fs_devices->metadata_uuid == sb->fsid + * sb->metadata_uuid == 0 + * + * - When the BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag is set: + * fs_devices->fsid == sb->fsid + * fs_devices->metadata_uuid == sb->metadata_uuid + * + * - When in-memory fs_devices->temp_fsid is true + * fs_devices->fsid = random + * fs_devices->metadata_uuid == sb->fsid */ u8 metadata_uuid[BTRFS_FSID_SIZE]; @@ -353,9 +364,10 @@ struct btrfs_fs_devices { bool rotating; /* Devices support TRIM/discard commands. */ bool discardable; - bool fsid_change; /* The filesystem is a seed filesystem. */ bool seeding; + /* The mount needs to use a randomly generated fsid. */ + bool temp_fsid; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -381,12 +393,12 @@ struct btrfs_fs_devices { struct btrfs_io_stripe { struct btrfs_device *dev; - union { - /* Block mapping */ - u64 physical; - /* For the endio handler */ - struct btrfs_io_context *bioc; - }; + /* Block mapping. */ + u64 physical; + u64 length; + bool is_scrub; + /* For the endio handler. */ + struct btrfs_io_context *bioc; }; struct btrfs_discard_stripe { @@ -419,6 +431,11 @@ struct btrfs_io_context { atomic_t error; u16 max_errors; + u64 logical; + u64 size; + /* Raid stripe tree ordered entry. */ + struct list_head rst_ordered_entry; + /* * The total number of stripes, including the extra duplicated * stripe for replace. 
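The fsid/metadata_uuid relations documented above are what the newly exported btrfs_sb_fsid_ptr() helper (see the volumes.h prototype below) has to encode. A minimal sketch of such a helper, assuming the standard btrfs_super_incompat_flags() accessor, and not necessarily the exact implementation:

/* Return the UUID that identifies this super block's metadata. */
u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
{
	bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
				  BTRFS_FEATURE_INCOMPAT_METADATA_UUID);

	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
}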
@@ -596,8 +613,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map); + struct btrfs_io_stripe *smap, int *mirror_num_ret); int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, struct btrfs_io_stripe *smap, u64 logical, u32 length, int mirror_num); @@ -611,7 +627,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, void btrfs_mapping_tree_free(struct extent_map_tree *tree); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags); +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, + bool mount_arg_dev); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); @@ -629,7 +646,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, void **holder); + struct bdev_handle **bdev_handle); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, @@ -650,8 +667,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_uuid_scan_kthread(void *data); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats); @@ -749,5 +764,6 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fc4b20c2688a..3cf236fb40a4 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -188,15 +188,15 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, if (old_data_len + name_len + sizeof(*di) == item_size) { /* No other xattrs packed in the same leaf item. */ if (size > old_data_len) - btrfs_extend_item(path, size - old_data_len); + btrfs_extend_item(trans, path, size - old_data_len); else if (size < old_data_len) - btrfs_truncate_item(path, data_size, 1); + btrfs_truncate_item(trans, path, data_size, 1); } else { /* There are other xattrs packed in the same item. 
*/ ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto out; - btrfs_extend_item(path, data_size); + btrfs_extend_item(trans, path, data_size); } ptr = btrfs_item_ptr(leaf, slot, char); @@ -205,7 +205,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is @@ -264,8 +264,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, goto out; inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inode_set_ctime_current(inode); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -407,8 +407,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, ret = btrfs_set_prop(trans, inode, name, value, size, flags); if (!ret) { inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inode_set_ctime_current(inode); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); } @@ -442,7 +442,7 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = { .set = btrfs_xattr_handler_set_prop, }; -const struct xattr_handler *btrfs_xattr_handlers[] = { +const struct xattr_handler * const btrfs_xattr_handlers[] = { &btrfs_security_xattr_handler, &btrfs_trusted_xattr_handler, &btrfs_user_xattr_handler, diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 1cd3fc0a8f17..118118ca3e1d 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -8,7 +8,7 @@ #include <linux/xattr.h> -extern const struct xattr_handler *btrfs_xattr_handlers[]; +extern const struct xattr_handler * const btrfs_xattr_handlers[]; int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 72b90bc19a19..188378ca19c7 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -65,6 +65,9 @@ #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) +static void wait_eb_writebacks(struct btrfs_block_group *block_group); +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written); + static inline bool sb_zone_is_full(const struct blk_zone *zone) { return (zone->cond == BLK_ZONE_COND_FULL) || @@ -465,8 +468,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) * use the cache. 
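* For scale, with illustrative figures: a 16T zoned device with
* 256M zones has 65536 zones, so the cache costs
* 65536 * sizeof(struct blk_zone) (64 bytes), i.e. about 4M of
* vmalloc'ed memory per device.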
*/ if (populate_cache && bdev_is_zoned(device->bdev)) { - zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * - zone_info->nr_zones); + zone_info->zone_cache = vcalloc(zone_info->nr_zones, + sizeof(struct blk_zone)); if (!zone_info->zone_cache) { btrfs_err_in_rcu(device->fs_info, "zoned: failed to allocate zone cache for %s", @@ -1279,21 +1282,284 @@ out: return ret; } +struct zone_info { + u64 physical; + u64 capacity; + u64 alloc_offset; +}; + +static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, + struct zone_info *info, unsigned long *active, + struct map_lookup *map) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *device = map->stripes[zone_idx].dev; + int dev_replace_is_ongoing = 0; + unsigned int nofs_flag; + struct blk_zone zone; + int ret; + + info->physical = map->stripes[zone_idx].physical; + + if (!device->bdev) { + info->alloc_offset = WP_MISSING_DEV; + return 0; + } + + /* Consider a zone as active if we can allow any number of active zones. */ + if (!device->zone_info->max_active_zones) + __set_bit(zone_idx, active); + + if (!btrfs_dev_is_sequential(device, info->physical)) { + info->alloc_offset = WP_CONVENTIONAL; + return 0; + } + + /* This zone will be used for allocation, so mark this zone non-empty. */ + btrfs_dev_clear_zone_empty(device, info->physical); + + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical); + up_read(&dev_replace->rwsem); + + /* + * The group is mapped to a sequential zone. Get the zone write pointer + * to determine the allocation offset within the zone. + */ + WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); + nofs_flag = memalloc_nofs_save(); + ret = btrfs_get_dev_zone(device, info->physical, &zone); + memalloc_nofs_restore(nofs_flag); + if (ret) { + if (ret != -EIO && ret != -EOPNOTSUPP) + return ret; + info->alloc_offset = WP_MISSING_DEV; + return 0; + } + + if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + btrfs_err_in_rcu(fs_info, + "zoned: unexpected conventional zone %llu on device %s (devid %llu)", + zone.start << SECTOR_SHIFT, rcu_str_deref(device->name), + device->devid); + return -EIO; + } + + info->capacity = (zone.capacity << SECTOR_SHIFT); + + switch (zone.cond) { + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + btrfs_err(fs_info, + "zoned: offline/readonly zone %llu on device %s (devid %llu)", + (info->physical >> device->zone_info->zone_size_shift), + rcu_str_deref(device->name), device->devid); + info->alloc_offset = WP_MISSING_DEV; + break; + case BLK_ZONE_COND_EMPTY: + info->alloc_offset = 0; + break; + case BLK_ZONE_COND_FULL: + info->alloc_offset = info->capacity; + break; + default: + /* Partially used zone. 
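+ * E.g. a write pointer 32768 sectors (16M) past the zone
+ * start yields alloc_offset = 16M, and the zone is counted
+ * as active.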
*/ + info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT); + __set_bit(zone_idx, active); + break; + } + + return 0; +} + +static int btrfs_load_block_group_single(struct btrfs_block_group *bg, + struct zone_info *info, + unsigned long *active) +{ + if (info->alloc_offset == WP_MISSING_DEV) { + btrfs_err(bg->fs_info, + "zoned: cannot recover write pointer for zone %llu", + info->physical); + return -EIO; + } + + bg->alloc_offset = info->alloc_offset; + bg->zone_capacity = info->capacity; + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + return 0; +} + +static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree"); + return -EINVAL; + } + + if (zone_info[0].alloc_offset == WP_MISSING_DEV) { + btrfs_err(bg->fs_info, + "zoned: cannot recover write pointer for zone %llu", + zone_info[0].physical); + return -EIO; + } + if (zone_info[1].alloc_offset == WP_MISSING_DEV) { + btrfs_err(bg->fs_info, + "zoned: cannot recover write pointer for zone %llu", + zone_info[1].physical); + return -EIO; + } + if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { + btrfs_err(bg->fs_info, + "zoned: write pointer offset mismatch of zones in DUP profile"); + return -EIO; + } + + if (test_bit(0, active) != test_bit(1, active)) { + if (!btrfs_zone_activate(bg)) + return -EIO; + } else if (test_bit(0, active)) { + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + + bg->alloc_offset = zone_info[0].alloc_offset; + bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity); + return 0; +} + +static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + int i; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && + !btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "zoned: write pointer offset mismatch of zones in %s profile", + btrfs_bg_type_to_raid_name(map->type)); + return -EIO; + } + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_test_opt(fs_info, DEGRADED) && + !btrfs_zone_activate(bg)) { + return -EIO; + } + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + /* In case a device is missing we have a cap of 0, so don't use it. 
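+ * E.g. capacities of 0 (missing device) and 1G yield a
+ * usable zone_capacity of 1G (illustrative figures).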
*/ + bg->zone_capacity = min_not_zero(zone_info[0].capacity, + zone_info[1].capacity); + } + + if (zone_info[0].alloc_offset != WP_MISSING_DEV) + bg->alloc_offset = zone_info[0].alloc_offset; + else + bg->alloc_offset = zone_info[i - 1].alloc_offset; + + return 0; +} + +static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (int i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_zone_activate(bg)) + return -EIO; + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + bg->zone_capacity += zone_info[i].capacity; + bg->alloc_offset += zone_info[i].alloc_offset; + } + + return 0; +} + +static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (int i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_zone_activate(bg)) + return -EIO; + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + + if ((i % map->sub_stripes) == 0) { + bg->zone_capacity += zone_info[i].capacity; + bg->alloc_offset += zone_info[i].alloc_offset; + } + } + + return 0; +} + int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; - struct btrfs_device *device; u64 logical = cache->start; u64 length = cache->length; + struct zone_info *zone_info = NULL; int ret; int i; - unsigned int nofs_flag; - u64 *alloc_offsets = NULL; - u64 *caps = NULL; - u64 *physical = NULL; unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1325,20 +1591,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } - alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); - if (!alloc_offsets) { - ret = -ENOMEM; - goto out; - } - - caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS); - if (!caps) { - ret = -ENOMEM; - goto out; - } - - physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); - if (!physical) { + zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); + if (!zone_info) { ret = -ENOMEM; goto out; } @@ -1350,98 +1604,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } for (i = 0; i < map->num_stripes; i++) { - bool is_sequential; - struct blk_zone zone; - struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - int dev_replace_is_ongoing = 0; - - device = 
map->stripes[i].dev; - physical[i] = map->stripes[i].physical; - - if (device->bdev == NULL) { - alloc_offsets[i] = WP_MISSING_DEV; - continue; - } - - is_sequential = btrfs_dev_is_sequential(device, physical[i]); - if (is_sequential) - num_sequential++; - else - num_conventional++; - - /* - * Consider a zone as active if we can allow any number of - * active zones. - */ - if (!device->zone_info->max_active_zones) - __set_bit(i, active); - - if (!is_sequential) { - alloc_offsets[i] = WP_CONVENTIONAL; - continue; - } - - /* - * This zone will be used for allocation, so mark this zone - * non-empty. - */ - btrfs_dev_clear_zone_empty(device, physical[i]); - - down_read(&dev_replace->rwsem); - dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); - up_read(&dev_replace->rwsem); - - /* - * The group is mapped to a sequential zone. Get the zone write - * pointer to determine the allocation offset within the zone. - */ - WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); - nofs_flag = memalloc_nofs_save(); - ret = btrfs_get_dev_zone(device, physical[i], &zone); - memalloc_nofs_restore(nofs_flag); - if (ret == -EIO || ret == -EOPNOTSUPP) { - ret = 0; - alloc_offsets[i] = WP_MISSING_DEV; - continue; - } else if (ret) { - goto out; - } - - if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { - btrfs_err_in_rcu(fs_info, - "zoned: unexpected conventional zone %llu on device %s (devid %llu)", - zone.start << SECTOR_SHIFT, - rcu_str_deref(device->name), device->devid); - ret = -EIO; + ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map); + if (ret) goto out; - } - - caps[i] = (zone.capacity << SECTOR_SHIFT); - switch (zone.cond) { - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - btrfs_err(fs_info, - "zoned: offline/readonly zone %llu on device %s (devid %llu)", - physical[i] >> device->zone_info->zone_size_shift, - rcu_str_deref(device->name), device->devid); - alloc_offsets[i] = WP_MISSING_DEV; - break; - case BLK_ZONE_COND_EMPTY: - alloc_offsets[i] = 0; - break; - case BLK_ZONE_COND_FULL: - alloc_offsets[i] = caps[i]; - break; - default: - /* Partially used zone */ - alloc_offsets[i] = - ((zone.wp - zone.start) << SECTOR_SHIFT); - __set_bit(i, active); - break; - } + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + num_conventional++; + else + num_sequential++; } if (num_sequential > 0) @@ -1465,63 +1635,24 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ - if (alloc_offsets[0] == WP_MISSING_DEV) { - btrfs_err(fs_info, - "zoned: cannot recover write pointer for zone %llu", - physical[0]); - ret = -EIO; - goto out; - } - cache->alloc_offset = alloc_offsets[0]; - cache->zone_capacity = caps[0]; - if (test_bit(0, active)) - set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); + ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: - if (map->type & BTRFS_BLOCK_GROUP_DATA) { - btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); - ret = -EINVAL; - goto out; - } - if (alloc_offsets[0] == WP_MISSING_DEV) { - btrfs_err(fs_info, - "zoned: cannot recover write pointer for zone %llu", - physical[0]); - ret = -EIO; - goto out; - } - if (alloc_offsets[1] == WP_MISSING_DEV) { - btrfs_err(fs_info, - "zoned: cannot recover write pointer for zone %llu", - physical[1]); 
- ret = -EIO; - goto out; - } - if (alloc_offsets[0] != alloc_offsets[1]) { - btrfs_err(fs_info, - "zoned: write pointer offset mismatch of zones in DUP profile"); - ret = -EIO; - goto out; - } - if (test_bit(0, active) != test_bit(1, active)) { - if (!btrfs_zone_activate(cache)) { - ret = -EIO; - goto out; - } - } else { - if (test_bit(0, active)) - set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, - &cache->runtime_flags); - } - cache->alloc_offset = alloc_offsets[0]; - cache->zone_capacity = min(caps[0], caps[1]); + ret = btrfs_load_block_group_dup(cache, map, zone_info, active); break; case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID1C3: + case BTRFS_BLOCK_GROUP_RAID1C4: + ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID0: + ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID10: + ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: - /* non-single profiles are not supported yet */ default: btrfs_err(fs_info, "zoned: profile %s not yet supported", btrfs_bg_type_to_raid_name(map->type)); @@ -1530,13 +1661,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: - if (cache->alloc_offset > fs_info->zone_size) { - btrfs_err(fs_info, - "zoned: invalid write pointer %llu in block group %llu", - cache->alloc_offset, cache->start); - ret = -EIO; - } - if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", @@ -1567,9 +1691,7 @@ out: cache->physical_map = NULL; } bitmap_free(active); - kfree(physical); - kfree(caps); - kfree(alloc_offsets); + kfree(zone_info); free_extent_map(em); return ret; @@ -1583,19 +1705,9 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) return; WARN_ON(cache->bytes_super != 0); - - /* Check for block groups never get activated */ - if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) && - cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) && - !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) && - cache->alloc_offset == 0) { - unusable = cache->length; - free = 0; - } else { - unusable = (cache->alloc_offset - cache->used) + - (cache->length - cache->zone_capacity); - free = cache->zone_capacity - cache->alloc_offset; - } + unusable = (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); + free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ cache->cached = BTRFS_CACHE_FINISHED; @@ -1616,7 +1728,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, - EXTENT_DIRTY | EXTENT_NOWAIT, NULL); + EXTENT_DIRTY, NULL); } bool btrfs_use_zone_append(struct btrfs_bio *bbio) @@ -1707,10 +1819,21 @@ void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) { struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_sum *sum = - list_first_entry(&ordered->list, typeof(*sum), list); - u64 logical = sum->logical; - u64 len = sum->len; + struct btrfs_ordered_sum *sum; + u64 logical, len; + + /* + * Write to pre-allocated region is for the data relocation, and so + * it should 
use WRITE operation. No split/rewrite are necessary. + */ + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) + return; + + ASSERT(!list_empty(&ordered->list)); + /* The ordered->list can be empty in the above pre-alloc case. */ + sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list); + logical = sum->logical; + len = sum->len; while (len < ordered->disk_num_bytes) { sum = list_next_entry(sum, list); @@ -1747,41 +1870,121 @@ out: } } -bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret) +static bool check_bg_is_active(struct btrfs_eb_write_context *ctx, + struct btrfs_block_group **active_bg) { - struct btrfs_block_group *cache; - bool ret = true; + const struct writeback_control *wbc = ctx->wbc; + struct btrfs_block_group *block_group = ctx->zoned_bg; + struct btrfs_fs_info *fs_info = block_group->fs_info; - if (!btrfs_is_zoned(fs_info)) + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) return true; - cache = btrfs_lookup_block_group(fs_info, eb->start); - if (!cache) - return true; + if (fs_info->treelog_bg == block_group->start) { + if (!btrfs_zone_activate(block_group)) { + int ret_fin = btrfs_zone_finish_one_bg(fs_info); - if (cache->meta_write_pointer != eb->start) { - btrfs_put_block_group(cache); - cache = NULL; - ret = false; - } else { - cache->meta_write_pointer = eb->start + eb->len; - } + if (ret_fin != 1 || !btrfs_zone_activate(block_group)) + return false; + } + } else if (*active_bg != block_group) { + struct btrfs_block_group *tgt = *active_bg; - *cache_ret = cache; + /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */ + lockdep_assert_held(&fs_info->zoned_meta_io_lock); - return ret; + if (tgt) { + /* + * If there is an unsent IO left in the allocated area, + * we cannot wait for them as it may cause a deadlock. + */ + if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) { + if (wbc->sync_mode == WB_SYNC_NONE || + (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)) + return false; + } + + /* Pivot active metadata/system block group. */ + btrfs_zoned_meta_io_unlock(fs_info); + wait_eb_writebacks(tgt); + do_zone_finish(tgt, true); + btrfs_zoned_meta_io_lock(fs_info); + if (*active_bg == tgt) { + btrfs_put_block_group(tgt); + *active_bg = NULL; + } + } + if (!btrfs_zone_activate(block_group)) + return false; + if (*active_bg != block_group) { + ASSERT(*active_bg == NULL); + *active_bg = block_group; + btrfs_get_block_group(block_group); + } + } + + return true; } -void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, - struct extent_buffer *eb) +/* + * Check if @ctx->eb is aligned to the write pointer. + * + * Return: + * 0: @ctx->eb is at the write pointer. You can write it. + * -EAGAIN: There is a hole. The caller should handle the case. + * -EBUSY: There is a hole, but the caller can just bail out. 
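+ *
+ * A sketch of the expected caller handling (the extent buffer
+ * writeback path is the real consumer):
+ *
+ *   ret = btrfs_check_meta_write_pointer(fs_info, ctx);
+ *   0       -> submit the extent buffer write
+ *   -EAGAIN -> keep the eb around and retry once the hole can
+ *              be filled
+ *   -EBUSY  -> bail out; another writer or a transaction commit
+ *              takes care of this eb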
+ */ +int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx) { - if (!btrfs_is_zoned(eb->fs_info) || !cache) - return; + const struct writeback_control *wbc = ctx->wbc; + const struct extent_buffer *eb = ctx->eb; + struct btrfs_block_group *block_group = ctx->zoned_bg; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + if (block_group) { + if (block_group->start > eb->start || + block_group->start + block_group->length <= eb->start) { + btrfs_put_block_group(block_group); + block_group = NULL; + ctx->zoned_bg = NULL; + } + } + + if (!block_group) { + block_group = btrfs_lookup_block_group(fs_info, eb->start); + if (!block_group) + return 0; + ctx->zoned_bg = block_group; + } + + if (block_group->meta_write_pointer == eb->start) { + struct btrfs_block_group **tgt; - ASSERT(cache->meta_write_pointer == eb->start + eb->len); - cache->meta_write_pointer = eb->start; + if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) + return 0; + + if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) + tgt = &fs_info->active_system_bg; + else + tgt = &fs_info->active_meta_bg; + if (check_bg_is_active(ctx, tgt)) + return 0; + } + + /* + * Since we may release fs_info->zoned_meta_io_lock, someone else may + * already have started writing this eb. In that case, we can just + * bail out. + */ + if (block_group->meta_write_pointer > eb->start) + return -EBUSY; + + /* If for_sync, this hole will be filled by a transaction commit. */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + return -EAGAIN; + return -EBUSY; } int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) @@ -1803,7 +2006,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, int i, ret; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &mapped_length, &bioc, NULL, NULL, 1); + &mapped_length, &bioc, NULL, NULL); if (ret || !bioc || mapped_length < PAGE_SIZE) { ret = -EIO; goto out_put_bioc; @@ -1879,10 +2082,10 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_space_info *space_info = block_group->space_info; struct map_lookup *map; struct btrfs_device *device; u64 physical; + const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); bool ret; int i; @@ -1891,7 +2094,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; - spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -1904,30 +2106,44 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } + spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { + struct btrfs_zoned_device_info *zinfo; + int reserved = 0; + device = map->stripes[i].dev; physical = map->stripes[i].physical; + zinfo = device->zone_info; - if (device->zone_info->max_active_zones == 0) + if (zinfo->max_active_zones == 0) continue; + if (is_data) + reserved = zinfo->reserved_active_zones; + /* + * For the data block group, leave active zones for one + * metadata block group and one system block group.
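+ *
+ * With the reservations set up at mount time (see
+ * btrfs_check_active_zone_reservation() below), that is 3 zones
+ * for SINGLE profiles (2 for metadata plus tree-log, 1 for
+ * system), so activating a data zone then requires
+ * active_zones_left > 3 on each stripe's device.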
+ */ + if (atomic_read(&zinfo->active_zones_left) <= reserved) { + ret = false; + spin_unlock(&fs_info->zone_active_bgs_lock); + goto out_unlock; + } + if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; + spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } + if (!is_data) + zinfo->reserved_active_zones--; } + spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); - WARN_ON(block_group->alloc_offset != 0); - if (block_group->zone_unusable == block_group->length) { - block_group->zone_unusable = block_group->length - block_group->zone_capacity; - space_info->bytes_zone_unusable -= block_group->zone_capacity; - } spin_unlock(&block_group->lock); - btrfs_try_granting_tickets(fs_info, space_info); - spin_unlock(&space_info->lock); /* For the active block group list */ btrfs_get_block_group(block_group); @@ -1940,7 +2156,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); return ret; } @@ -2006,6 +2221,10 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ * and block_group->meta_write_pointer for metadata. */ if (!fully_written) { + if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + return -EAGAIN; + } spin_unlock(&block_group->lock); ret = btrfs_inc_block_group_ro(block_group, false); @@ -2034,7 +2253,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ return 0; } - if (block_group->reserved) { + if (block_group->reserved || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags)) { spin_unlock(&block_group->lock); btrfs_dec_block_group_ro(block_group); return -EAGAIN; @@ -2043,6 +2264,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); block_group->alloc_offset = block_group->zone_capacity; + if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) + block_group->meta_write_pointer = block_group->start + + block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); btrfs_clear_data_reloc_bg(block_group); @@ -2052,18 +2276,21 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; + struct btrfs_zoned_device_info *zinfo = device->zone_info; - if (device->zone_info->max_active_zones == 0) + if (zinfo->max_active_zones == 0) continue; ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, + zinfo->zone_size >> SECTOR_SHIFT, GFP_NOFS); if (ret) return ret; + if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); } @@ -2102,8 +2329,10 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); + spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; + int reserved = 0; if 
(!device->bdev) continue; @@ -2113,17 +2342,21 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) break; } + if (flags & BTRFS_BLOCK_GROUP_DATA) + reserved = zinfo->reserved_active_zones; + switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ - ret = (atomic_read(&zinfo->active_zones_left) >= 1); + ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved)); break; case BTRFS_BLOCK_GROUP_DUP: - ret = (atomic_read(&zinfo->active_zones_left) >= 2); + ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved)); break; } if (ret) break; } + spin_unlock(&fs_info->zone_active_bgs_lock); mutex_unlock(&fs_info->chunk_mutex); if (!ret) @@ -2265,7 +2498,10 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica /* All relocation extents are written. */ if (block_group->start + block_group->alloc_offset == logical + length) { - /* Now, release this block group for further allocations. */ + /* + * Now, release this block group for further allocations and + * zone finish. + */ clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); } @@ -2289,7 +2525,8 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) spin_lock(&block_group->lock); if (block_group->reserved || block_group->alloc_offset == 0 || - (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); continue; } @@ -2365,3 +2602,55 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, return 0; } + +/* + * Reserve zones for one metadata block group, one tree-log block group, and one + * system block group. + */ +void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_block_group *block_group; + struct btrfs_device *device; + /* Reserve zones for normal SINGLE metadata and tree-log block group. */ + unsigned int metadata_reserve = 2; + /* Reserve a zone for SINGLE system block group. */ + unsigned int system_reserve = 1; + + if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) + return; + + /* + * This function is called from the mount context. So, there is no + * parallel process touching the bits. No need for read_seqretry(). + */ + if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP) + metadata_reserve = 4; + if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP) + system_reserve = 2; + + /* Apply the reservation on all the devices. */ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + device->zone_info->reserved_active_zones = + metadata_reserve + system_reserve; + } + mutex_unlock(&fs_devices->device_list_mutex); + + /* Release reservation for currently active block groups. 
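+ * E.g. with DUP metadata and DUP system chunks the per-device
+ * reservation above is 4 + 2 = 6 zones; each stripe of an
+ * already-active metadata/system block group then reduces the
+ * reservation by one here.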
*/ + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + struct map_lookup *map = block_group->physical_map; + + if (!(block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) + continue; + + for (int i = 0; i < map->num_stripes; i++) + map->stripes[i].dev->zone_info->reserved_active_zones--; + } + spin_unlock(&fs_info->zone_active_bgs_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 27322b926038..b9cec523b778 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -22,6 +22,11 @@ struct btrfs_zoned_device_info { u8 zone_size_shift; u32 nr_zones; unsigned int max_active_zones; + /* + * Reserved active zones for one metadata and one system block group. + * It can vary per-device depending on the allocation status. + */ + int reserved_active_zones; atomic_t active_zones_left; unsigned long *seq_zones; unsigned long *empty_zones; @@ -58,11 +63,8 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); -bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret); -void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, - struct extent_buffer *eb); +int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx); int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); @@ -81,6 +83,7 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish); +void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -189,17 +192,10 @@ static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } -static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret) -{ - return true; -} - -static inline void btrfs_revert_meta_write_pointer( - struct btrfs_block_group *cache, - struct extent_buffer *eb) +static inline int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx) { + return 0; } static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, @@ -262,6 +258,8 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, return 0; } +static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index e7ac4ec809a4..5511766485cd 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -145,7 +145,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) } /* - * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds + * Calculate monotonic memory bounds. * * It is possible based on the level configurations that a higher level * workspace uses less memory than a lower level workspace. 
In order to reuse @@ -218,7 +218,8 @@ void zstd_cleanup_workspace_manager(void) } /* - * zstd_find_workspace - find workspace + * Find workspace for given level. + * * @level: compression level * * This iterates over the set bits in the active_map beginning at the requested @@ -256,7 +257,8 @@ static struct list_head *zstd_find_workspace(unsigned int level) } /* - * zstd_get_workspace - zstd's get_workspace + * Zstd get_workspace for level. + * * @level: compression level * * If @level is 0, then any compression level can be used. Therefore, we begin @@ -296,7 +298,8 @@ again: } /* - * zstd_put_workspace - zstd put_workspace + * Zstd put_workspace. + * * @ws: list_head for the workspace * * When putting back a workspace, we only need to update the LRU if we are of