Diffstat (limited to 'fs/btrfs'): 70 files changed, 7975 insertions(+), 5741 deletions(-)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 4188ba3fd8c3..99f9995670ea 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -17,6 +17,7 @@ subdir-ccflags-y += $(condflags) subdir-ccflags-y += -Wno-missing-field-initializers subdir-ccflags-y += -Wno-sign-compare subdir-ccflags-y += -Wno-type-limits +subdir-ccflags-y += -Wno-shift-negative-value obj-$(CONFIG_BTRFS_FS) := btrfs.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0a0d0eccee4e..548d6a5477b4 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) return acl; } -static int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, int type) +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type) { int ret, size = 0; const char *name; @@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, if (ret) return ret; } - ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type); + ret = __btrfs_set_acl(NULL, inode, acl, type); if (ret) inode->i_mode = old_mode; return ret; } - -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) -{ - struct posix_acl *default_acl, *acl; - int ret = 0; - - /* this happens with subvols */ - if (!dir) - return 0; - - ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (ret) - return ret; - - if (default_acl) { - ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl, - ACL_TYPE_DEFAULT); - posix_acl_release(default_acl); - } - - if (acl) { - if (!ret) - ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl, - ACL_TYPE_ACCESS); - posix_acl_release(acl); - } - - if (!default_acl && !acl) - cache_no_acl(inode); - return ret; -} diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 43c89952b7d2..aac240430efe 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -15,13 +15,12 @@ enum { WORK_DONE_BIT, WORK_ORDER_DONE_BIT, - WORK_HIGH_PRIO_BIT, }; #define NO_THRESHOLD (-1) #define DFT_THRESHOLD (32) -struct __btrfs_workqueue { +struct btrfs_workqueue { struct workqueue_struct *normal_wq; /* File system this workqueue services */ @@ -48,12 +47,7 @@ struct __btrfs_workqueue { spinlock_t thres_lock; }; -struct btrfs_workqueue { - struct __btrfs_workqueue *normal; - struct __btrfs_workqueue *high; -}; - -struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq) { return wq->fs_info; } @@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work) bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) { /* - * We could compare wq->normal->pending with num_online_cpus() + * We could compare wq->pending with num_online_cpus() * to support "thresh == NO_THRESHOLD" case, but it requires * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's * postpone it until someone needs the support of that case. 
*/ - if (wq->normal->thresh == NO_THRESHOLD) + if (wq->thresh == NO_THRESHOLD) return false; - return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; + return atomic_read(&wq->pending) > wq->thresh * 2; } -static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, - unsigned int flags, int limit_active, int thresh) +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, + int limit_active, int thresh) { - struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; @@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, ret->thresh = thresh; } - if (flags & WQ_HIGHPRI) - ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags, - ret->current_active, name); - else - ret->normal_wq = alloc_workqueue("btrfs-%s", flags, - ret->current_active, name); + ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active, + name); if (!ret->normal_wq) { kfree(ret); return NULL; @@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, INIT_LIST_HEAD(&ret->ordered_list); spin_lock_init(&ret->list_lock); spin_lock_init(&ret->thres_lock); - trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); - return ret; -} - -static inline void -__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); - -struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, - const char *name, - unsigned int flags, - int limit_active, - int thresh) -{ - struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); - - if (!ret) - return NULL; - - ret->normal = __btrfs_alloc_workqueue(fs_info, name, - flags & ~WQ_HIGHPRI, - limit_active, thresh); - if (!ret->normal) { - kfree(ret); - return NULL; - } - - if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(fs_info, name, flags, - limit_active, thresh); - if (!ret->high) { - __btrfs_destroy_workqueue(ret->normal); - kfree(ret); - return NULL; - } - } + trace_btrfs_workqueue_alloc(ret, name); return ret; } @@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, * This hook WILL be called in IRQ handler context, * so workqueue_set_max_active MUST NOT be called in this hook */ -static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) +static inline void thresh_queue_hook(struct btrfs_workqueue *wq) { if (wq->thresh == NO_THRESHOLD) return; @@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) * This hook is called in kthread content. * So workqueue_set_max_active is called here. 
*/ -static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) +static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; @@ -217,7 +173,7 @@ out: } } -static void run_ordered_work(struct __btrfs_workqueue *wq, +static void run_ordered_work(struct btrfs_workqueue *wq, struct btrfs_work *self) { struct list_head *list = &wq->ordered_list; @@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) { struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); - struct __btrfs_workqueue *wq; + struct btrfs_workqueue *wq = work->wq; int need_order = 0; /* @@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work) */ if (work->ordered_func) need_order = 1; - wq = work->wq; trace_btrfs_work_sched(work); thresh_exec_hook(wq); @@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, work->flags = 0; } -static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, - struct btrfs_work *work) +void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work) { unsigned long flags; @@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, queue_work(wq->normal_wq, &work->normal_work); } -void btrfs_queue_work(struct btrfs_workqueue *wq, - struct btrfs_work *work) -{ - struct __btrfs_workqueue *dest_wq; - - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) - dest_wq = wq->high; - else - dest_wq = wq->normal; - __btrfs_queue_work(dest_wq, work); -} - -static inline void -__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) -{ - destroy_workqueue(wq->normal_wq); - trace_btrfs_workqueue_destroy(wq); - kfree(wq); -} - void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) { if (!wq) return; - if (wq->high) - __btrfs_destroy_workqueue(wq->high); - __btrfs_destroy_workqueue(wq->normal); + destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); kfree(wq); } void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active) { - if (!wq) - return; - wq->normal->limit_active = limit_active; - if (wq->high) - wq->high->limit_active = limit_active; -} - -void btrfs_set_work_high_priority(struct btrfs_work *work) -{ - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); + if (wq) + wq->limit_active = limit_active; } void btrfs_flush_workqueue(struct btrfs_workqueue *wq) { - if (wq->high) - flush_workqueue(wq->high->normal_wq); - - flush_workqueue(wq->normal->normal_wq); + flush_workqueue(wq->normal_wq); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 3204daa51b95..07960529b360 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -11,8 +11,6 @@ struct btrfs_fs_info; struct btrfs_workqueue; -/* Internal use only */ -struct __btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); typedef void (*btrfs_work_func_t)(struct work_struct *arg); @@ -25,7 +23,7 @@ struct btrfs_work { /* Don't touch things below */ struct work_struct normal_work; struct list_head ordered_list; - struct __btrfs_workqueue *wq; + struct btrfs_workqueue *wq; unsigned long flags; }; @@ -40,9 +38,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work); void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); -void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work); -struct 
btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); void btrfs_flush_workqueue(struct btrfs_workqueue *wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index c9ee579bc5a6..ebc392ea1d74 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -789,11 +789,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_pref(ref); free_extent_buffer(eb); return -EIO; } + if (lock) btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) @@ -1335,7 +1337,8 @@ again: if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 8202ad6aa131..ede389f2602d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -168,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct rb_node **p; struct rb_node *parent = NULL; struct btrfs_block_group *cache; + bool leftmost = true; ASSERT(block_group->length != 0); - spin_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_node; + write_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_root.rb_node; while (*p) { parent = *p; @@ -181,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, p = &(*p)->rb_left; } else if (block_group->start > cache->start) { p = &(*p)->rb_right; + leftmost = false; } else { - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); return -EEXIST; } } rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color(&block_group->cache_node, - &info->block_group_cache_tree); + rb_insert_color_cached(&block_group->cache_node, + &info->block_group_cache_tree, leftmost); - if (info->first_logical_byte > block_group->start) - info->first_logical_byte = block_group->start; - - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); return 0; } @@ -210,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search( struct rb_node *n; u64 end, start; - spin_lock(&info->block_group_cache_lock); - n = info->block_group_cache_tree.rb_node; + read_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_root.rb_node; while (n) { cache = rb_entry(n, struct btrfs_block_group, cache_node); @@ -233,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search( break; } } - if (ret) { + if (ret) btrfs_get_block_group(ret); - if (bytenr == 0 && info->first_logical_byte > ret->start) - info->first_logical_byte = ret->start; - } - spin_unlock(&info->block_group_cache_lock); + read_unlock(&info->block_group_cache_lock); return ret; } @@ -267,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group( struct btrfs_fs_info *fs_info = cache->fs_info; struct rb_node *node; - spin_lock(&fs_info->block_group_cache_lock); + read_lock(&fs_info->block_group_cache_lock); /* If our block group was removed, we need a full search. 
*/ if (RB_EMPTY_NODE(&cache->cache_node)) { const u64 next_bytenr = cache->start + cache->length; - spin_unlock(&fs_info->block_group_cache_lock); + read_unlock(&fs_info->block_group_cache_lock); btrfs_put_block_group(cache); - cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; + return btrfs_lookup_first_block_group(fs_info, next_bytenr); } node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); @@ -284,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group( btrfs_get_block_group(cache); } else cache = NULL; - spin_unlock(&fs_info->block_group_cache_lock); + read_unlock(&fs_info->block_group_cache_lock); return cache; } -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +/** + * Check if we can do a NOCOW write for a given extent. + * + * @fs_info: The filesystem information object. + * @bytenr: Logical start address of the extent. + * + * Check if we can do a NOCOW write for the given extent, and increments the + * number of NOCOW writers in the block group that contains the extent, as long + * as the block group exists and it's currently not in read-only mode. + * + * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller + * is responsible for calling btrfs_dec_nocow_writers() later. + * + * Or NULL if we can not do a NOCOW write + */ +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr) { struct btrfs_block_group *bg; - bool ret = true; + bool can_nocow = true; bg = btrfs_lookup_block_group(fs_info, bytenr); if (!bg) - return false; + return NULL; spin_lock(&bg->lock); if (bg->ro) - ret = false; + can_nocow = false; else atomic_inc(&bg->nocow_writers); spin_unlock(&bg->lock); - /* No put on block group, done by btrfs_dec_nocow_writers */ - if (!ret) + if (!can_nocow) { btrfs_put_block_group(bg); + return NULL; + } - return ret; + /* No put on block group, done by btrfs_dec_nocow_writers(). */ + return bg; } -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +/** + * Decrement the number of NOCOW writers in a block group. + * + * @bg: The block group. + * + * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), + * and on the block group returned by that call. Typically this is called after + * creating an ordered extent for a NOCOW write, to prevent races with scrub and + * relocation. + * + * After this call, the caller should not use the block group anymore. It it wants + * to use it, then it should get a reference on it before calling this function. + */ +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg) { - struct btrfs_block_group *bg; - - bg = btrfs_lookup_block_group(fs_info, bytenr); - ASSERT(bg); if (atomic_dec_and_test(&bg->nocow_writers)) wake_up_var(&bg->nocow_writers); - /* - * Once for our lookup and once for the lookup done by a previous call - * to btrfs_inc_nocow_writers() - */ - btrfs_put_block_group(bg); + + /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). 
*/ btrfs_put_block_group(bg); } @@ -772,10 +792,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only cache->has_caching_ctl = 1; spin_unlock(&cache->lock); - spin_lock(&fs_info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); btrfs_get_block_group(cache); @@ -957,17 +977,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (ret) goto out; - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&block_group->cache_node, - &fs_info->block_group_cache_tree); + write_lock(&fs_info->block_group_cache_lock); + rb_erase_cached(&block_group->cache_node, + &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); /* Once for the block groups rbtree */ btrfs_put_block_group(block_group); - if (fs_info->first_logical_byte == block_group->start) - fs_info->first_logical_byte = (u64)-1; - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); /* @@ -992,7 +1010,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (block_group->cached == BTRFS_CACHE_STARTED) btrfs_wait_block_group_cache_done(block_group); if (block_group->has_caching_ctl) { - spin_lock(&fs_info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); if (!caching_ctl) { struct btrfs_caching_control *ctl; @@ -1006,7 +1024,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } if (caching_ctl) list_del_init(&caching_ctl->list); - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); if (caching_ctl) { /* Once for the caching bgs list and once for us. */ btrfs_put_caching_control(caching_ctl); @@ -1367,6 +1385,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) goto next; } + ret = btrfs_zone_finish(block_group); + if (ret < 0) { + btrfs_dec_block_group_ro(block_group); + if (ret == -EAGAIN) + ret = 0; + goto next; + } + /* * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. 
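The kernel-doc added earlier in this block-group.c diff changes the NOCOW accounting API: btrfs_inc_nocow_writers() now returns the block group itself (or NULL when a NOCOW write is not possible), and btrfs_dec_nocow_writers() takes that pointer directly instead of repeating the bytenr lookup. A minimal sketch of the caller pattern this implies; fallback_to_cow() is a hypothetical stand-in for the caller's COW path, not a function from this diff:

	struct btrfs_block_group *bg;

	bg = btrfs_inc_nocow_writers(fs_info, disk_bytenr);
	if (!bg) {
		/* No such block group, or it is read-only: must COW instead. */
		return fallback_to_cow(inode, start, end);	/* hypothetical */
	}

	/* ... create the ordered extent for the NOCOW write ... */

	/*
	 * Drops both the writer count and the reference taken by
	 * btrfs_inc_nocow_writers(); bg must not be used afterwards
	 * unless the caller grabbed its own reference first.
	 */
	btrfs_dec_nocow_writers(bg);

Compared to the old bool-returning interface this saves one rbtree lookup per NOCOW write and makes the inc/dec pairing explicit in the type signature.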
@@ -1512,6 +1538,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, return bg1->used > bg2->used; } +static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +{ + if (btrfs_is_zoned(fs_info)) + return btrfs_zoned_should_reclaim(fs_info); + return true; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1522,8 +1555,15 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + if (!btrfs_should_reclaim(fs_info)) + return; + + sb_start_write(fs_info->sb); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + sb_end_write(fs_info->sb); return; + } /* * Long running balances can keep us blocked here for eternity, so @@ -1531,6 +1571,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return; } @@ -1605,6 +1646,7 @@ next: spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) @@ -1686,35 +1728,13 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; struct btrfs_key found_key; - struct extent_buffer *leaf; - int slot; - - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; - - while (1) { - slot = path->slots[0]; - leaf = path->nodes[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, slot); + btrfs_for_each_slot(root, key, &found_key, path, ret) { if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = read_bg_from_eb(fs_info, &found_key, path); - break; + return read_bg_from_eb(fs_info, &found_key, path); } - - path->slots[0]++; } -out: return ret; } @@ -2006,6 +2026,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); cache->flags = btrfs_stack_block_group_flags(bgi); + cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); set_free_space_tree_thresholds(cache); @@ -2288,7 +2309,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, spin_lock(&block_group->lock); btrfs_set_stack_block_group_used(&bgi, block_group->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); + block_group->global_root_id); btrfs_set_stack_block_group_flags(&bgi, block_group->flags); key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2444,6 +2465,27 @@ next: btrfs_trans_release_chunk_metadata(trans); } +/* + * For extent tree v2 we use the block_group_item->chunk_offset to point at our + * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. + */ +static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +{ + u64 div = SZ_1G; + u64 index; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return BTRFS_FIRST_CHUNK_TREE_OBJECTID; + + /* If we have a smaller fs index based on 128MiB. 
*/ + if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) + div = SZ_128M; + + offset = div64_u64(offset, div); + div64_u64_rem(offset, fs_info->nr_global_roots, &index); + return index; +} + struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size) @@ -2464,6 +2506,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + cache->global_root_id = calculate_global_root_id(fs_info, cache->start); + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; @@ -2473,12 +2517,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - /* - * New block group is likely to be used soon. Try to activate it now. - * Failure is OK for now. - */ - btrfs_zone_activate(cache); - ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2693,7 +2731,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, bi = btrfs_item_ptr_offset(leaf, path->slots[0]); btrfs_set_stack_block_group_used(&bgi, cache->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); + cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); btrfs_mark_buffer_dirty(leaf); @@ -2916,7 +2954,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_path *path = NULL; LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); @@ -2982,7 +3019,6 @@ again: cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; /* @@ -3083,7 +3119,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) int should_put; struct btrfs_path *path; struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3141,7 +3176,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; list_add_tail(&cache->io_list, io); } else { @@ -3200,6 +3234,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) return ret; } +static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, + u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = div_factor_fine(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. 
+ */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { @@ -3222,6 +3281,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { + bool reclaim; + cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { ret = -ENOENT; @@ -3267,6 +3328,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; + + reclaim = should_reclaim_block_group(cache, num_bytes); spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3293,6 +3356,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, if (!alloc && old_val == 0) { if (!btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_mark_bg_unused(cache); + } else if (!alloc && reclaim) { + btrfs_mark_bg_to_reclaim(cache); } btrfs_put_block_group(cache); @@ -3425,7 +3490,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) { struct btrfs_block_group *bg; int ret; @@ -3512,7 +3577,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) out: btrfs_trans_release_chunk_metadata(trans); - return ret; + if (ret) + return ERR_PTR(ret); + + btrfs_get_block_group(bg); + return bg; } /* @@ -3627,10 +3696,17 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; + struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; + bool from_extent_allocation = false; int ret = 0; + if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) { + from_extent_allocation = true; + force = CHUNK_ALLOC_FORCE; + } + /* Don't re-enter if we're already allocating a chunk */ if (trans->allocating_chunk) return -ENOSPC; @@ -3720,9 +3796,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, force_metadata_allocation(fs_info); } - ret = do_chunk_alloc(trans, flags); + ret_bg = do_chunk_alloc(trans, flags); trans->allocating_chunk = false; + if (IS_ERR(ret_bg)) { + ret = PTR_ERR(ret_bg); + } else if (from_extent_allocation) { + /* + * New block group is likely to be used soon. Try to activate + * it now. Failure is OK for now. 
+ */ + btrfs_zone_activate(ret_bg); + } + + if (!ret) + btrfs_put_block_group(ret_bg); + spin_lock(&space_info->lock); if (ret < 0) { if (ret == -ENOSPC) @@ -3913,14 +4002,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - spin_lock(&info->block_group_cache_lock); + write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); } - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); spin_lock(&info->unused_bgs_lock); while (!list_empty(&info->unused_bgs)) { @@ -3950,14 +4039,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->zone_active_bgs_lock); - spin_lock(&info->block_group_cache_lock); - while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + write_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, cache_node); - rb_erase(&block_group->cache_node, - &info->block_group_cache_tree); + rb_erase_cached(&block_group->cache_node, + &info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); @@ -3980,9 +4069,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) ASSERT(block_group->swap_extents == 0); btrfs_put_block_group(block_group); - spin_lock(&info->block_group_cache_lock); + write_lock(&info->block_group_cache_lock); } - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); btrfs_release_global_block_rsv(info); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 5878b7ce3b78..35e0e860cc0b 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -35,11 +35,15 @@ enum btrfs_discard_state { * the FS with empty chunks * * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from + * find_free_extent() that also activaes the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, CHUNK_ALLOC_LIMITED, CHUNK_ALLOC_FORCE, + CHUNK_ALLOC_FORCE_FOR_EXTENT, }; struct btrfs_caching_control { @@ -68,6 +72,7 @@ struct btrfs_block_group { u64 bytes_super; u64 flags; u64 cache_generation; + u64 global_root_id; /* * If the free space extent count exceeds this number, convert the block @@ -99,6 +104,7 @@ struct btrfs_block_group { unsigned int relocating_repair:1; unsigned int chunk_item_inserted:1; unsigned int zone_is_active:1; + unsigned int zoned_data_reloc_ongoing:1; int disk_cache_state; @@ -207,6 +213,8 @@ struct btrfs_block_group { u64 meta_write_pointer; struct map_lookup *physical_map; struct list_head active_bg_list; + struct work_struct zone_finish_work; + struct extent_buffer *last_eb; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -249,8 +257,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_dec_nocow_writers(struct 
btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b3e46aabc3d8..33811e896623 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -14,6 +14,13 @@ #include "delayed-inode.h" /* + * Since we search a directory based on f_pos (struct dir_context::pos) we have + * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so + * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()). + */ +#define BTRFS_DIR_START_INDEX 2 + +/* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the @@ -173,8 +180,9 @@ struct btrfs_inode { u64 disk_i_size; /* - * if this is a directory then index_cnt is the counter for the index - * number for new files that are created + * If this is a directory then index_cnt is the counter for the index + * number for new files that are created. For an empty directory, this + * must be initialized to BTRFS_DIR_START_INDEX. */ u64 index_cnt; @@ -333,6 +341,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) spin_unlock(&inode->lock); } +/* + * Should be called while holding the inode's VFS lock in exclusive mode or in a + * context where no one else can access the inode concurrently (during inode + * creation or when loading an inode from disk). + */ +static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) +{ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + /* + * The inode may have been part of a reflink operation in the last + * transaction that modified it, and then a fsync has reset the + * last_reflink_trans to avoid subsequent fsyncs in the same + * transaction to do unnecessary work. So update last_reflink_trans + * to the last_trans value (we have to be pessimistic and assume a + * reflink happened). + * + * The ->last_trans is protected by the inode's spinlock and we can + * have a concurrent ordered extent completion update it. Also set + * last_reflink_trans to ->last_trans only if the former is less than + * the later, because we can be called in a context where + * last_reflink_trans was set to the current transaction generation + * while ->last_trans was not yet updated in the current transaction, + * and therefore has a lower value. + */ + spin_lock(&inode->lock); + if (inode->last_reflink_trans < inode->last_trans) + inode->last_reflink_trans = inode->last_trans; + spin_unlock(&inode->lock); +} + static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { bool ret = false; @@ -346,30 +384,16 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) return ret; } -struct btrfs_dio_private { - struct inode *inode; - - /* - * Since DIO can use anonymous page, we cannot use page_offset() to - * grab the file offset, thus need a dedicated member for file offset. - */ - u64 file_offset; - u64 disk_bytenr; - /* Used for bio::bi_size */ - u32 bytes; - - /* - * References to this structure. There is one reference per in-flight - * bio plus one while we're still setting up. 
- */ - refcount_t refs; - - /* dio_bio came from fs/direct-io.c */ - struct bio *dio_bio; - - /* Array of checksums */ - u8 csums[]; -}; +/* + * Check if the inode has flags compatible with compression + */ +static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) + return false; + return true; +} /* * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7e9f90fa0388..5d20137b7b67 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -78,7 +78,6 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/mutex.h> -#include <linux/genhd.h> #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/string.h> @@ -1553,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); - for (i = 0; i < num_pages; i++) { - block_ctx->pagev[i] = alloc_page(GFP_NOFS); - if (!block_ctx->pagev[i]) - return -1; - } + ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev); + if (ret) + return ret; dev_bytenr = block_ctx->dev_bytenr; for (i = 0; i < num_pages;) { struct bio *bio; unsigned int j; - bio = btrfs_bio_alloc(num_pages - i); - bio_set_dev(bio, block_ctx->dev->bdev); + bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, + REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = dev_bytenr >> 9; - bio->bi_opf = REQ_OP_READ; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -2034,7 +2030,7 @@ continue_loop: static void btrfsic_bio_end_io(struct bio *bp) { - struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + struct btrfsic_block *block = bp->bi_private; int iodone_w_error; /* mutex is not held! 
This is not save if IO is not yet completed @@ -2636,100 +2632,93 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) &btrfsic_dev_state_hashtable); } -static void __btrfsic_submit_bio(struct bio *bio) +static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) { - struct btrfsic_dev_state *dev_state; + unsigned int segs = bio_segments(bio); + u64 dev_bytenr = 512 * bio->bi_iter.bi_sector; + u64 cur_bytenr = dev_bytenr; + struct bvec_iter iter; + struct bio_vec bvec; + char **mapped_datav; + int bio_is_patched = 0; + int i = 0; + + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + pr_info( +"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + bio_op(bio), bio->bi_opf, segs, + bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - if (!btrfsic_is_initialized) + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); + if (!mapped_datav) return; - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bio() is also called before - * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); - if (NULL != dev_state && - (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - int i = 0; - u64 dev_bytenr; - u64 cur_bytenr; - struct bio_vec bvec; - struct bvec_iter iter; - int bio_is_patched; - char **mapped_datav; - unsigned int segs = bio_segments(bio); - - dev_bytenr = 512 * bio->bi_iter.bi_sector; - bio_is_patched = 0; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - - mapped_datav = kmalloc_array(segs, - sizeof(*mapped_datav), GFP_NOFS); - if (!mapped_datav) - goto leave; - cur_bytenr = dev_bytenr; - - bio_for_each_segment(bvec, bio, iter) { - BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = page_address(bvec.bv_page); - i++; - - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) - pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec.bv_len, bvec.bv_offset); - cur_bytenr += bvec.bv_len; - } - btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_datav, segs, - bio, &bio_is_patched, - bio->bi_opf); - kfree(mapped_datav); - } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - pr_info( -"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->bdev); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_len != PAGE_SIZE); + mapped_datav[i] = page_address(bvec.bv_page); + i++; - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } + if (dev_state->state->print_mask & + 
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) + pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", + i, cur_bytenr, bvec.bv_len, bvec.bv_offset); + cur_bytenr += bvec.bv_len; } -leave: - mutex_unlock(&btrfsic_mutex); + + btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, + bio, &bio_is_patched, bio->bi_opf); + kfree(mapped_datav); } -void btrfsic_submit_bio(struct bio *bio) +static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) { - __btrfsic_submit_bio(bio); - submit_bio(bio); + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_bdev); + + if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = bio->bi_opf; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } else if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) { + pr_info( +"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", + dev_state->bdev); + } } -int btrfsic_submit_bio_wait(struct bio *bio) +void btrfsic_check_bio(struct bio *bio) { - __btrfsic_submit_bio(bio); - return submit_bio_wait(bio); + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return; + + /* + * We can be called before btrfsic_mount, so there might not be a + * dev_state. + */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); + mutex_lock(&btrfsic_mutex); + if (dev_state) { + if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio)) + btrfsic_check_write_bio(bio, dev_state); + else if (bio->bi_opf & REQ_PREFLUSH) + btrfsic_check_flush_bio(bio, dev_state); + } + mutex_unlock(&btrfsic_mutex); } int btrfsic_mount(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index bcc730a06cb5..e4c8aed7996f 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -7,11 +7,9 @@ #define BTRFS_CHECK_INTEGRITY_H #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -void btrfsic_submit_bio(struct bio *bio); -int btrfsic_submit_bio_wait(struct bio *bio); +void btrfsic_check_bio(struct bio *bio); #else -#define btrfsic_submit_bio submit_bio -#define btrfsic_submit_bio_wait submit_bio_wait +static inline void btrfsic_check_bio(struct bio *bio) { } #endif int btrfsic_mount(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 71e5b2e9a1ba..f4564f32f6d9 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -219,7 +219,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b bi_size += bvec->bv_len; if (bio->bi_status) - cb->errors = 1; + cb->status = bio->bi_status; ASSERT(bi_size && bi_size <= cb->compressed_len); last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, @@ -234,7 +234,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b return last_io; } -static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio) +static void finish_compressed_bio_read(struct compressed_bio *cb) { unsigned int index; struct page *page; @@ -247,19 
+247,18 @@ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bi } /* Do io completion on the original bio */ - if (cb->errors) { - bio_io_error(cb->orig_bio); + if (cb->status != BLK_STS_OK) { + cb->orig_bio->bi_status = cb->status; + bio_endio(cb->orig_bio); } else { struct bio_vec *bvec; struct bvec_iter_all iter_all; - ASSERT(bio); - ASSERT(!bio->bi_status); /* * We have verified the checksum already, set page checked so * the end_io handlers know about it */ - ASSERT(!bio_flagged(bio, BIO_CLONED)); + ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED)); bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { u64 bvec_start = page_offset(bvec->bv_page) + bvec->bv_offset; @@ -308,7 +307,7 @@ static void end_compressed_bio_read(struct bio *bio) * Some IO in this cb have failed, just skip checksum as there * is no way it could be correct. */ - if (cb->errors == 1) + if (cb->status != BLK_STS_OK) goto csum_failed; inode = cb->inode; @@ -324,8 +323,8 @@ static void end_compressed_bio_read(struct bio *bio) csum_failed: if (ret) - cb->errors = 1; - finish_compressed_bio_read(cb, bio); + cb->status = errno_to_blk_status(ret); + finish_compressed_bio_read(cb); out: bio_put(bio); } @@ -342,11 +341,12 @@ static noinline void end_compressed_writeback(struct inode *inode, unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct page *pages[16]; unsigned long nr_pages = end_index - index + 1; + const int errno = blk_status_to_errno(cb->status); int i; int ret; - if (cb->errors) - mapping_set_error(inode->i_mapping, -EIO); + if (errno) + mapping_set_error(inode->i_mapping, errno); while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, @@ -358,7 +358,7 @@ static noinline void end_compressed_writeback(struct inode *inode, continue; } for (i = 0; i < ret; i++) { - if (cb->errors) + if (errno) SetPageError(pages[i]); btrfs_page_clamp_clear_writeback(fs_info, pages[i], cb->start, cb->len); @@ -381,9 +381,10 @@ static void finish_compressed_bio_write(struct compressed_bio *cb) */ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, - !cb->errors); + cb->status == BLK_STS_OK); - end_compressed_writeback(inode, cb); + if (cb->writeback) + end_compressed_writeback(inode, cb); /* Note, our inode could be gone now */ /* @@ -424,7 +425,6 @@ out: } static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, - struct compressed_bio *cb, struct bio *bio, int mirror_num) { blk_status_t ret; @@ -506,7 +506,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct page **compressed_pages, unsigned int nr_pages, unsigned int write_flags, - struct cgroup_subsys_state *blkcg_css) + struct cgroup_subsys_state *blkcg_css, + bool writeback) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; @@ -524,16 +525,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (!cb) return BLK_STS_RESOURCE; refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); - cb->errors = 0; + cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; cb->len = len; cb->mirror_num = 0; cb->compressed_pages = compressed_pages; cb->compressed_len = compressed_len; + cb->writeback = writeback; cb->orig_bio = NULL; cb->nr_pages = nr_pages; + if (blkcg_css) + kthread_associate_blkcg(blkcg_css); + while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; unsigned int 
index = offset >> PAGE_SHIFT; @@ -552,6 +557,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bio = NULL; goto finish_cb; } + if (blkcg_css) + bio->bi_opf |= REQ_CGROUP_PUNT; } /* * We should never reach next_stripe_start start as we will @@ -591,12 +598,12 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (submit) { if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); + ret = btrfs_csum_one_bio(inode, bio, start, true); if (ret) goto finish_cb; } - ret = submit_compressed_bio(fs_info, cb, bio, 0); + ret = submit_compressed_bio(fs_info, bio, 0); if (ret) goto finish_cb; bio = NULL; @@ -609,6 +616,9 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, return 0; finish_cb: + if (blkcg_css) + kthread_associate_blkcg(NULL); + if (bio) { bio->bi_status = ret; bio_endio(bio); @@ -791,15 +801,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) +void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned int compressed_len; - unsigned int nr_pages; - unsigned int pg_index; struct bio *comp_bio = NULL; const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_byte = disk_bytenr; @@ -808,8 +816,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - blk_status_t ret = BLK_STS_RESOURCE; - int faili = 0; + blk_status_t ret; + int ret2; + int i; u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -821,17 +830,21 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); - if (!em) - return BLK_STS_IOERR; + if (!em) { + ret = BLK_STS_IOERR; + goto out; + } ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); - if (!cb) + if (!cb) { + ret = BLK_STS_RESOURCE; goto out; + } refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); - cb->errors = 0; + cb->status = BLK_STS_OK; cb->inode = inode; cb->mirror_num = mirror_num; sums = cb->sums; @@ -840,30 +853,26 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, em_len = em->len; em_start = em->start; - free_extent_map(em); - em = NULL; - cb->len = bio->bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = extent_compress_type(bio_flags); + cb->compress_type = em->compress_type; cb->orig_bio = bio; - nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_NOFS); - if (!cb->compressed_pages) - goto fail1; - - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); - if (!cb->compressed_pages[pg_index]) { - faili = pg_index - 1; - ret = BLK_STS_RESOURCE; - goto fail2; - } + free_extent_map(em); + em = NULL; + + cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_pages = 
kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); + if (!cb->compressed_pages) { + ret = BLK_STS_RESOURCE; + goto fail; + } + + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + if (ret2) { + ret = BLK_STS_RESOURCE; + goto fail; } - faili = nr_pages - 1; - cb->nr_pages = nr_pages; add_ra_bio_pages(inode, em_start + em_len, cb); @@ -932,26 +941,29 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, fs_info->sectorsize); sums += fs_info->csum_size * nr_sectors; - ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num); + ret = submit_compressed_bio(fs_info, comp_bio, mirror_num); if (ret) goto finish_cb; comp_bio = NULL; } } - return 0; + return; -fail2: - while (faili >= 0) { - __free_page(cb->compressed_pages[faili]); - faili--; +fail: + if (cb->compressed_pages) { + for (i = 0; i < cb->nr_pages; i++) { + if (cb->compressed_pages[i]) + __free_page(cb->compressed_pages[i]); + } } kfree(cb->compressed_pages); -fail1: kfree(cb); out: free_extent_map(em); - return ret; + bio->bi_status = ret; + bio_endio(bio); + return; finish_cb: if (comp_bio) { comp_bio->bi_status = ret; @@ -959,7 +971,7 @@ finish_cb: } /* All bytes of @cb is submitted, endio will free @cb */ if (cur_disk_byte == disk_bytenr + compressed_len) - return ret; + return; wait_var_event(cb, refcount_read(&cb->pending_sectors) == (disk_bytenr + compressed_len - cur_disk_byte) >> @@ -970,8 +982,7 @@ finish_cb: */ ASSERT(refcount_read(&cb->pending_sectors)); /* Now we are the only one referring @cb, can finish it safely. */ - finish_compressed_bio_read(cb, NULL); - return ret; + finish_compressed_bio_read(cb); } /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 56eef0821e3e..2707404389a5 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -22,6 +22,8 @@ struct btrfs_inode; /* Maximum length of compressed data stored on disk */ #define BTRFS_MAX_COMPRESSED (SZ_128K) +static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); + /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) @@ -52,8 +54,11 @@ struct compressed_bio { /* The compression algorithm for this bio */ u8 compress_type; + /* Whether this is a write for writeback. 
*/ + bool writeback; + /* IO errors */ - u8 errors; + blk_status_t status; int mirror_num; /* for reads, this is the bio we are copying the data into */ @@ -95,9 +100,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct page **compressed_pages, unsigned int nr_pages, unsigned int write_flags, - struct cgroup_subsys_state *blkcg_css); -blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); + struct cgroup_subsys_state *blkcg_css, + bool writeback); +void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a7db3f6f1b7b..6e556031a8f3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -16,6 +16,7 @@ #include "volumes.h" #include "qgroup.h" #include "tree-mod-log.h" +#include "tree-checker.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -342,7 +343,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, int level = btrfs_header_level(buf); ret = btrfs_set_disk_extent_flags(trans, buf, - new_flags, level, 0); + new_flags, level); if (ret) return ret; } @@ -846,9 +847,11 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, btrfs_header_owner(parent), btrfs_node_ptr_generation(parent, slot), level - 1, &first_key); - if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) + return eb; + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); - eb = ERR_PTR(-EIO); + return ERR_PTR(-EIO); } return eb; @@ -1388,12 +1391,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } /* - * helper function for btrfs_search_slot. The goal is to find a block - * in cache without setting the path to blocking. If we find the block - * we return zero and the path is unchanged. + * Helper function for btrfs_search_slot() and other functions that do a search + * on a btree. The goal is to find a tree block in the cache (the radix tree at + * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read + * its pages from disk. * - * If we can't find the block, we set the path blocking and do some - * reada. -EAGAIN is returned and the search must be repeated. + * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the + * whole btree search, starting again from the current root node. */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, @@ -1407,12 +1411,21 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct btrfs_key first_key; int ret; int parent_level; + bool unlock_up; + unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); + /* + * If we need to read an extent buffer from disk and we are holding locks + * on upper level nodes, we unlock all the upper nodes before reading the + * extent buffer, and then return -EAGAIN to the caller as it needs to + * restart the search. We don't release the lock on the current level + * because we need to walk this node to figure out which blocks to read. 
+ */ tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) @@ -1434,47 +1447,61 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, return 0; } + if (unlock_up) + btrfs_unlock_up_safe(p, level + 1); + /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); - if (!ret) { - *eb_ret = tmp; - return 0; + ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); + if (ret) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; } - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; + if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EUCLEAN; + } + + if (unlock_up) + ret = -EAGAIN; + + goto out; } - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. Don't release the lock on the current - * level because we need to walk this node to figure - * out which blocks to read. - */ - btrfs_unlock_up_safe(p, level + 1); + if (unlock_up) { + btrfs_unlock_up_safe(p, level + 1); + ret = -EAGAIN; + } else { + ret = 0; + } if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - ret = -EAGAIN; tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, gen, parent_level - 1, &first_key); - if (!IS_ERR(tmp)) { - /* - * If the read above didn't mark this buffer up to date, - * it will never end up being up to date. Set ret to EIO now - * and give up so that our caller doesn't loop forever - * on our EAGAINs. - */ - if (!extent_buffer_uptodate(tmp)) - ret = -EIO; - free_extent_buffer(tmp); + if (IS_ERR(tmp)) { + btrfs_release_path(p); + return PTR_ERR(tmp); + } + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!extent_buffer_uptodate(tmp)) + ret = -EIO; + +out: + if (ret == 0) { + *eb_ret = tmp; } else { - ret = PTR_ERR(tmp); + free_extent_buffer(tmp); + btrfs_release_path(p); } - btrfs_release_path(p); return ret; } @@ -2277,6 +2304,43 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, return ret; } +/** + * Search for a valid slot for the given path. + * + * @root: The root node of the tree. + * @key: Will contain a valid item if found. + * @path: The starting point to validate the slot. + * + * Return: 0 if the item is valid + * 1 if not found + * <0 if error. + */ +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + while (1) { + int ret; + const int slot = path->slots[0]; + const struct extent_buffer *leaf = path->nodes[0]; + + /* This is where we start walking the path. */ + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If we've reached the last slot in this leaf we need + * to go to the next leaf and reset the path. + */ + ret = btrfs_next_leaf(root, path); + if (ret) + return ret; + continue; + } + /* Store the found, valid item in @key. */ + btrfs_item_key_to_cpu(leaf, key, slot); + break; + } + return 0; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. 
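The btrfs_get_next_valid_item() helper documented just above is what the new btrfs_for_each_slot() iteration seen earlier in find_first_block_group() builds on: it either leaves the path at a valid slot and fills in the key (returns 0), runs off the end of the tree (returns 1), or fails (returns <0). A hedged sketch of an equivalent open-coded loop; process_item() is a hypothetical consumer, not part of this diff:

	struct btrfs_key search_key = { 0 };	/* start at the first item */
	struct btrfs_key found_key;
	int ret;

	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	while (ret >= 0) {
		ret = btrfs_get_next_valid_item(root, &found_key, path);
		if (ret)	/* 1: no more items, <0: error */
			break;
		/* Path is at a valid slot and found_key describes it. */
		process_item(path->nodes[0], path->slots[0], &found_key);	/* hypothetical */
		path->slots[0]++;	/* advance; the helper hops leaves as needed */
	}
	btrfs_release_path(path);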
@@ -2990,16 +3054,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (free_space < data_size) goto out_unlock; - /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; - free_space = btrfs_leaf_free_space(right); - if (free_space < data_size) - goto out_unlock; - left_nritems = btrfs_header_nritems(left); if (left_nritems == 0) goto out_unlock; @@ -3224,7 +3283,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - /* cow and double check */ ret = btrfs_cow_block(trans, root, left, path->nodes[1], slot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -3235,12 +3293,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - free_space = btrfs_leaf_free_space(left); - if (free_space < data_size) { - ret = 1; - goto out; - } - if (check_sibling_keys(left, right)) { ret = -EUCLEAN; goto out; @@ -4170,24 +4222,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; - u32 last_off; - u32 dsize = 0; int ret = 0; int wret; - int i; u32 nritems; leaf = path->nodes[0]; - last_off = btrfs_item_offset(leaf, slot + nr - 1); - - for (i = 0; i < nr; i++) - dsize += btrfs_item_size(leaf, slot + i); - nritems = btrfs_header_nritems(leaf); if (slot + nr != nritems) { - int data_end = leaf_data_end(leaf); + const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); + const int data_end = leaf_data_end(leaf); struct btrfs_map_token token; + u32 dsize = 0; + int i; + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size(leaf, slot + i); memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, @@ -4227,24 +4277,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, fixup_low_keys(path, &disk_key, 1); } - /* delete the leaf if it is mostly empty */ + /* + * Try to delete the leaf if it is mostly empty. We do this by + * trying to move all its items into its left and right neighbours. + * If we can't move all the items, then we don't delete it - it's + * not ideal, but future insertions might fill the leaf with more + * items, or items from other leaves might be moved later into our + * leaf due to deletions on those leaves. + */ if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { + u32 min_push_space; + /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below */ slot = path->slots[1]; atomic_inc(&leaf->refs); - - wret = push_leaf_left(trans, root, path, 1, 1, - 1, (u32)-1); + /* + * We want to be able to at least push one item to the + * left neighbour leaf, and that's the first item. + */ + min_push_space = sizeof(struct btrfs_item) + + btrfs_item_size(leaf, 0); + wret = push_leaf_left(trans, root, path, 0, + min_push_space, 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, - 1, 1, 0); + /* + * If we were not able to push all items from our + * leaf to its left neighbour, then attempt to + * either push all the remaining items to the + * right neighbour or none. There's no advantage + * in pushing only some items, instead of all, as + * it's pointless to end up with a leaf having + * too few items while the neighbours can be full + * or nearly full. 
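The minimum-space thresholds passed to the two push helpers in the code completing below capture this all-or-nothing policy: at least the first item must fit when pushing left, and everything remaining must fit when pushing right. A rough model of that space accounting (sizes invented for illustration, item headers approximated by a constant):

    #include <stdio.h>

    #define LEAF_DATA_SIZE 16384
    #define ITEM_HEADER_SIZE 25 /* stand-in for sizeof(struct btrfs_item) */

    /* Space a run of items occupies: headers plus data payloads. */
    static unsigned int leaf_space_used(const unsigned int *data_size,
                                        int start, int nr)
    {
        unsigned int used = 0;

        for (int i = start; i < start + nr; i++)
            used += ITEM_HEADER_SIZE + data_size[i];
        return used;
    }

    int main(void)
    {
        unsigned int items[] = { 100, 2000, 50, 300 };
        int nritems = 4;

        /* Left push: enough room for at least the first item? */
        unsigned int min_left = ITEM_HEADER_SIZE + items[0];
        /* Right push: all remaining items move, or none do. */
        unsigned int min_right = leaf_space_used(items, 0, nritems);

        printf("min space to push left:  %u\n", min_left);
        printf("min space to push right: %u\n", min_right);
        printf("leaf mostly empty (under 1/3 used): %s\n",
               min_right < LEAF_DATA_SIZE / 3 ? "yes" : "no");
        return 0;
    }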
+ */ + nritems = btrfs_header_nritems(leaf); + min_push_space = leaf_space_used(leaf, 0, nritems); + wret = push_leaf_right(trans, root, path, 0, + min_push_space, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ebb2d109e8bb..415bf1823fb3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -49,6 +49,7 @@ extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; +struct btrfs_ioctl_encoded_io_args; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ @@ -148,6 +149,8 @@ enum { /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT }; #define BTRFS_BACKREF_REV_MAX 256 @@ -274,8 +277,14 @@ struct btrfs_super_block { /* the UUID written into btree blocks */ u8 metadata_uuid[BTRFS_FSID_SIZE]; + /* Extent tree v2 */ + __le64 block_group_root; + __le64 block_group_root_generation; + u8 block_group_root_level; + /* future expansion */ - __le64 reserved[28]; + u8 reserved8[7]; + __le64 reserved[25]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; @@ -300,6 +309,26 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ @@ -314,6 +343,7 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ BTRFS_FEATURE_INCOMPAT_ZONED) +#endif #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -636,6 +666,7 @@ struct btrfs_fs_info { struct btrfs_root *quota_root; struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -644,13 +675,13 @@ struct btrfs_fs_info { rwlock_t global_root_lock; struct rb_root global_root_tree; - spinlock_t fs_roots_radix_lock; - struct radix_tree_root fs_roots_radix; + /* The xarray that holds all the FS roots */ + spinlock_t fs_roots_lock; + struct xarray fs_roots; /* block group cache stuff */ - spinlock_t block_group_cache_lock; - u64 first_logical_byte; - struct rb_root block_group_cache_tree; + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; /* keep track of unallocated space */ atomic64_t free_chunk_space; @@ -817,12 +848,13 @@ struct btrfs_fs_info { * two */ struct btrfs_workqueue *workers; + struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue 
*delalloc_workers; struct btrfs_workqueue *flush_workers; struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; struct btrfs_workqueue *endio_raid56_workers; - struct btrfs_workqueue *rmw_workers; + struct workqueue_struct *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; @@ -915,9 +947,9 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; - struct btrfs_workqueue *scrub_workers; - struct btrfs_workqueue *scrub_wr_completion_workers; - struct btrfs_workqueue *scrub_parity_workers; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity_workers; struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -963,10 +995,10 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer radix tree */ + /* Extent buffer xarray */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; + struct xarray extent_buffers; /* next backup root to be overwritten */ int backup_root_index; @@ -1014,10 +1046,7 @@ struct btrfs_fs_info { * Zone size > 0 when in ZONED mode, otherwise it's used for a check * if the mode is enabled */ - union { - u64 zone_size; - u64 zoned; - }; + u64 zone_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; @@ -1029,6 +1058,9 @@ struct btrfs_fs_info { */ spinlock_t relocation_bg_lock; u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; + + u64 nr_global_roots; spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; @@ -1087,7 +1119,8 @@ enum { */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, - BTRFS_ROOT_IN_RADIX, + /* The root is tracked in fs_info::fs_roots */ + BTRFS_ROOT_REGISTERED, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, @@ -1191,10 +1224,10 @@ struct btrfs_root { struct rb_root inode_tree; /* - * radix tree that keeps track of delayed nodes of every inode, - * protected by inode_lock + * Xarray that keeps track of delayed nodes of every inode, protected + * by inode_lock */ - struct radix_tree_root delayed_nodes_tree; + struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later @@ -1297,6 +1330,8 @@ struct btrfs_replace_extent_info { * existing extent into a file range. */ bool is_new_extent; + /* Indicate if we should update the inode's mtime and ctime. */ + bool update_times; /* Meaningful only if is_new_extent is true. 
*/ int qgroup_reserved; /* @@ -1609,25 +1644,25 @@ DECLARE_BTRFS_SETGET_BITS(64) static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } @@ -1658,8 +1693,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { - BUILD_BUG_ON(sizeof(u64) != - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } @@ -1667,8 +1702,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { - BUILD_BUG_ON(sizeof(u64) != - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } @@ -2328,6 +2363,17 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); +/* + * For extent tree v2 we overload the extent root with the block group root, as + * we will have multiple extent roots. 
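The accessors defined just below come from the same BTRFS_SETGET_STACK_FUNCS machinery that this file converts from BUILD_BUG_ON() to static_assert(); unlike BUILD_BUG_ON(), a C11 static_assert needs no function body to live in, so macro-generated helpers can carry the width check directly. A reduced sketch of the pattern (host-endian fields instead of the on-disk little-endian handling):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct backup_root {
        uint64_t extent_root;
        uint8_t extent_root_level;
    };

    /* Generate get/set helpers for one member, pinning its width. */
    #define SETGET_FUNCS(name, type, member, bits)                        \
    static inline uint##bits##_t get_##name(const type *s)               \
    {                                                                     \
        static_assert(sizeof(uint##bits##_t) ==                          \
                      sizeof(((type *)0)->member),                       \
                      "member width mismatch");                          \
        return s->member;                                                 \
    }                                                                     \
    static inline void set_##name(type *s, uint##bits##_t val)           \
    {                                                                     \
        s->member = val;                                                  \
    }

    SETGET_FUNCS(backup_extent_root, struct backup_root, extent_root, 64)
    SETGET_FUNCS(backup_extent_root_level, struct backup_root,
                 extent_root_level, 8)

    int main(void)
    {
        struct backup_root b;

        memset(&b, 0, sizeof(b));
        set_backup_extent_root(&b, 30408704ULL);
        set_backup_extent_root_level(&b, 1);
        printf("root=%llu level=%u\n",
               (unsigned long long)get_backup_extent_root(&b),
               (unsigned)get_backup_extent_root_level(&b));
        return 0;
    }

A mismatched member width fails at compile time here, which is exactly the property the kernel macros rely on when a field like extent_root is reused for the block group root.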
+ */ +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level, + struct btrfs_root_backup, extent_root_level, 8); + /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); @@ -2462,6 +2508,13 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block, + block_group_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation, + struct btrfs_super_block, + block_group_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block, + block_group_root_level, 8); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); @@ -2732,7 +2785,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict); + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2759,8 +2813,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, - int level, int is_data); + struct extent_buffer *eb, u64 flags, int level); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, @@ -2839,7 +2892,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); @@ -2986,6 +3040,35 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path); +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path); + +/* + * Search in @root for a given @key, and store the slot found in @found_key. + * + * @root: The root node of the tree. + * @key: The key we are looking for. + * @found_key: Will hold the found item. + * @path: Holds the current slot/leaf. + * @iter_ret: Contains the value returned from btrfs_search_slot or + * btrfs_get_next_valid_item, whichever was executed last. 
+ * + * The @iter_ret is an output variable that will contain the return value of + * btrfs_search_slot, if it encountered an error, or the value returned from + * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid + * slot was found, 1 if there were no more leaves, and <0 if there was an error. + * + * It's recommended to use a separate variable for iter_ret and then use it to + * set the function return value so there's no confusion of the 0/1/errno + * values stemming from btrfs_search_slot. + */ +#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \ + for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \ + (iter_ret) >= 0 && \ + (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \ + (path)->slots[0]++ \ + ) + static inline int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *p, u64 time_seq) { @@ -3137,7 +3220,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* file-item.c */ -struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); @@ -3155,7 +3237,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 file_start, int contig); + u64 offset, bool one_ordered); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, @@ -3171,8 +3253,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ -blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); @@ -3202,10 +3284,28 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - struct btrfs_root *parent_root, - struct user_namespace *mnt_userns); +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* + * Output from btrfs_new_inode_prepare(), input to + * btrfs_create_new_inode(). 
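The split this struct supports — gather everything fallible before the transaction starts, keep the transactional step itself simple — can be sketched in miniature as follows (reduced stand-in types and fields, not the kernel structs):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Inputs first, then outputs produced by the prepare step and
     * consumed by the create step. */
    struct new_inode_args {
        /* input */
        const char *name;
        int subvol;
        /* output of prepare(), input to create() */
        char *acl;
    };

    static int new_inode_prepare(struct new_inode_args *args,
                                 unsigned int *trans_items)
    {
        /* Allocations and ACL lookups happen before the transaction. */
        args->acl = strdup(args->subvol ? "default-acl" : "inherited-acl");
        if (!args->acl)
            return -1;
        *trans_items = 5; /* e.g. inode item, dir items, backref, ... */
        return 0;
    }

    static int create_new_inode(struct new_inode_args *args)
    {
        /* Inside the transaction: only cheap, non-failing setup left. */
        printf("creating %s with %s\n", args->name, args->acl);
        return 0;
    }

    static void new_inode_args_destroy(struct new_inode_args *args)
    {
        free(args->acl);
    }

    int main(void)
    {
        struct new_inode_args args = { .name = "dir1", .subvol = 0 };
        unsigned int trans_items;

        if (new_inode_prepare(&args, &trans_items))
            return 1;
        /* ... start a transaction reserving trans_items items ... */
        create_new_inode(&args);
        new_inode_args_destroy(&args);
        return 0;
    }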
+ */ + struct posix_acl *default_acl; + struct posix_acl *acl; +}; +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_clear_delalloc_extent(struct inode *inode, @@ -3216,7 +3316,6 @@ void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); -int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); @@ -3256,9 +3355,14 @@ int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + +ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); + extern const struct dentry_operations btrfs_dentry_operations; -extern const struct iomap_ops btrfs_dio_iomap_ops; -extern const struct iomap_dio_ops btrfs_dio_ops; /* Inode locking type flags, by default the exclusive lock is taken */ #define BTRFS_ILOCK_SHARED (1U << 0) @@ -3270,6 +3374,7 @@ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3318,6 +3423,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, @@ -3343,11 +3450,29 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } -#ifdef CONFIG_PRINTK +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_printk(fs_info, fmt, args...) \ +do { \ + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ + _btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + __printf(2, 3) __cold -void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#elif defined(CONFIG_PRINTK) + +#define btrfs_printk(fs_info, fmt, args...) 
\ + _btrfs_printk(fs_info, fmt, ##args) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + #else + #define btrfs_printk(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, fmt, ##args) #endif @@ -3598,12 +3723,25 @@ do { \ __LINE__, (errno)); \ } while (0) +#ifdef CONFIG_PRINTK_INDEX + #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ +do { \ + printk_index_subsys_emit( \ + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ + KERN_CRIT, fmt); \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ } while (0) +#else + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +#endif + #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ @@ -3756,15 +3894,16 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); #else #define btrfs_get_acl NULL #define btrfs_set_acl NULL -static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) { - return 0; + return -EOPNOTSUPP; } #endif @@ -3774,7 +3913,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -3869,7 +4008,7 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { - return fs_info->zoned != 0; + return fs_info->zone_size > 0; } static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) @@ -3886,5 +4025,8 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) #define PageOrdered(page) PagePrivate2(page) #define SetPageOrdered(page) SetPagePrivate2(page) #define ClearPageOrdered(page) ClearPagePrivate2(page) +#define folio_test_ordered(folio) folio_test_private_2(folio) +#define folio_set_ordered(folio) folio_set_private_2(folio) +#define folio_clear_ordered(folio) folio_clear_private_2(folio) #endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index fb46a28f5065..36ab0859a263 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -270,11 +270,11 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, } static void calc_inode_reservations(struct btrfs_fs_info *fs_info, - u64 num_bytes, u64 *meta_reserve, - u64 *qgroup_reserve) + u64 num_bytes, u64 disk_num_bytes, + u64 
*meta_reserve, u64 *qgroup_reserve) { u64 nr_extents = count_max_extents(num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, @@ -288,7 +288,8 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, *qgroup_reserve = nr_extents * fs_info->nodesize; } -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -307,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) * If we have a transaction open (can happen if we call truncate_block * from truncate), then we need FLUSH_LIMIT so we don't deadlock. */ - if (btrfs_is_free_space_inode(inode)) { + if (noflush || btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; } else { if (current->journal_info) @@ -318,6 +319,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) } num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize); /* * We always want to do it this way, every other way is wrong and ends @@ -329,9 +331,10 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) * everything out and try again, which is bad. This way we just * over-reserve slightly, and clean up the mess when we are done. */ - calc_inode_reservations(fs_info, num_bytes, &meta_reserve, - &qgroup_reserve); - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, + &meta_reserve, &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, + noflush); if (ret) return ret; ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); @@ -349,7 +352,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += num_bytes; + inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -454,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(inode, len); + ret = btrfs_delalloc_reserve_metadata(inode, len, len, false); if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len); extent_changeset_free(*reserved); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 748bf6b0d860..66779ab3ed4a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( } spin_lock(&root->inode_lock); - node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + node = xa_load(&root->delayed_nodes, ino); if (node) { if (btrfs_inode->delayed_node) { @@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( /* * It's possible that we're racing into the middle of removing - * this node from the radix tree. In this case, the refcount + * this node from the xarray. 
In this case, the refcount * was zero and it should never go back to one. Just return - * NULL like it was never in the radix at all; our release + * NULL like it was never in the xarray at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. * - * If this node is properly in the radix, we want to bump the + * If this node is properly in the xarray, we want to bump the * refcount twice, once for the inode and once for this get * operation. */ @@ -128,36 +128,30 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( u64 ino = btrfs_ino(btrfs_inode); int ret; -again: - node = btrfs_get_delayed_node(btrfs_inode); - if (node) - return node; + do { + node = btrfs_get_delayed_node(btrfs_inode); + if (node) + return node; - node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); - if (!node) - return ERR_PTR(-ENOMEM); - btrfs_init_delayed_node(node, root, ino); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); - /* cached in the btrfs inode and can be accessed */ - refcount_set(&node->refs, 2); + /* Cached in the inode and can be accessed */ + refcount_set(&node->refs, 2); - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - kmem_cache_free(delayed_node_cache, node); - return ERR_PTR(ret); - } - - spin_lock(&root->inode_lock); - ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); - if (ret == -EEXIST) { - spin_unlock(&root->inode_lock); - kmem_cache_free(delayed_node_cache, node); - radix_tree_preload_end(); - goto again; - } + spin_lock(&root->inode_lock); + ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS); + if (ret) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + if (ret != -EBUSY) + return ERR_PTR(ret); + } + } while (ret); btrfs_inode->delayed_node = node; spin_unlock(&root->inode_lock); - radix_tree_preload_end(); return node; } @@ -276,8 +270,7 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. 
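Both sides of this race hinge on refcount_inc_not_zero(): the lookup obtains a reference only while the count is still above zero, and a zero count means a release is already in flight. The usual compare-and-swap loop behind such a primitive, as a userspace sketch with C11 atomics (not the kernel implementation):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Take a reference only if the object is still live (count > 0).
     * On a zero count the object must be treated as never found.
     */
    static bool refcount_inc_not_zero(atomic_uint *refs)
    {
        unsigned int old = atomic_load(refs);

        do {
            if (old == 0)
                return false;
        } while (!atomic_compare_exchange_weak(refs, &old, old + 1));
        return true;
    }

    int main(void)
    {
        atomic_uint live = 2, dying = 0;

        printf("live:  %s\n",
               refcount_inc_not_zero(&live) ? "got ref" : "miss");
        printf("dying: %s\n",
               refcount_inc_not_zero(&dying) ? "got ref" : "miss");
        return 0;
    }

Treating the failed increment as "not found" is what lets the release path delete the entry without taking extra locks against concurrent lookups.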
*/ ASSERT(refcount_read(&delayed_node->refs) == 0); - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); + xa_erase(&root->delayed_nodes, delayed_node->inode_id); spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } @@ -1870,34 +1863,35 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - u64 inode_id = 0; + unsigned long index = 0; + struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_node *delayed_nodes[8]; - int i, n; while (1) { + int n = 0; + spin_lock(&root->inode_lock); - n = radix_tree_gang_lookup(&root->delayed_nodes_tree, - (void **)delayed_nodes, inode_id, - ARRAY_SIZE(delayed_nodes)); - if (!n) { + if (xa_empty(&root->delayed_nodes)) { spin_unlock(&root->inode_lock); - break; + return; } - inode_id = delayed_nodes[n - 1]->inode_id + 1; - for (i = 0; i < n; i++) { + xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) - delayed_nodes[i] = NULL; + if (refcount_inc_not_zero(&delayed_node->refs)) { + delayed_nodes[n] = delayed_node; + n++; + } + if (n >= ARRAY_SIZE(delayed_nodes)) + break; } + index++; spin_unlock(&root->inode_lock); - for (i = 0; i < n; i++) { - if (!delayed_nodes[i]) - continue; + for (int i = 0; i < n; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 4176df149d04..99f37fca2e96 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -930,7 +930,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); - BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; @@ -1103,8 +1102,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return -ENOMEM; init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, - false); + BTRFS_UPDATE_DELAYED_HEAD, false, false); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 91a3aabad150..d6304b690ec4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op { u8 level; bool update_key; bool update_flags; - bool is_data; u64 flags_to_set; }; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 62b9651ea662..a7dd6ba25e99 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -243,6 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device **device_out) { + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct block_device *bdev; struct rcu_string *name; @@ -271,7 +272,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); @@ -302,6 
+303,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, goto error; } rcu_assign_pointer(device->name, name); + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error; set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; @@ -320,17 +324,17 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); - device->fs_devices = fs_info->fs_devices; + device->fs_devices = fs_devices; ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error; - mutex_lock(&fs_info->fs_devices->device_list_mutex); - list_add(&device->dev_list, &fs_info->fs_devices->devices); - fs_info->fs_devices->num_devices++; - fs_info->fs_devices->open_devices++; - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); + list_add(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; + fs_devices->open_devices++; + mutex_unlock(&fs_devices->device_list_mutex); *device_out = device; return 0; @@ -470,6 +474,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_block_group *cache; struct btrfs_trans_handle *trans; + int iter_ret = 0; int ret = 0; u64 chunk_offset; @@ -520,29 +525,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto free_path; - if (ret > 0) { - if (path->slots[0] >= - btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto free_path; - if (ret > 0) { - ret = 0; - goto free_path; - } - } else { - ret = 0; - } - } - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - - btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != src_dev->devid) break; @@ -553,30 +537,23 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, if (found_key.offset < key.offset) break; - dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); cache = btrfs_lookup_block_group(fs_info, chunk_offset); if (!cache) - goto skip; + continue; spin_lock(&cache->lock); cache->to_copy = 1; spin_unlock(&cache->lock); btrfs_put_block_group(cache); - -skip: - ret = btrfs_next_item(root, path); - if (ret != 0) { - if (ret > 0) - ret = 0; - break; - } } + if (iter_ret < 0) + ret = iter_ret; -free_path: btrfs_free_path(path); unlock: mutex_unlock(&fs_info->chunk_mutex); @@ -730,7 +707,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - /* Commit dev_replace state and reserve 1 item for it. */ + /* + * Commit dev_replace state and reserve 1 item for it. + * This is crucial to ensure we won't miss copying extents for new block + * groups that are allocated after we started the device replace, and + * must be done after setting up the device replace state. 
+ */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -872,6 +854,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *tgt_device; struct btrfs_device *src_device; struct btrfs_root *root = fs_info->tree_root; @@ -921,12 +904,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, WARN_ON(ret); /* Prevent write_all_supers() during the finishing procedure */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); /* Prevent new chunks being allocated on the source device */ mutex_lock(&fs_info->chunk_mutex); if (!list_empty(&src_device->post_commit_list)) { - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); mutex_unlock(&fs_info->chunk_mutex); } else { break; @@ -963,7 +946,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(tgt_device); @@ -992,8 +975,8 @@ error: btrfs_assign_next_active_device(src_device, tgt_device); - list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); - fs_info->fs_devices->rw_devices++; + list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); + fs_devices->rw_devices++; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); @@ -1016,7 +999,7 @@ error: * belong to this filesystem. */ mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* replace the sysfs entry */ btrfs_sysfs_remove_device(src_device); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 3b532bab0755..72fb2c518a2b 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -325,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const char *name, int name_len) { - struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - u32 nritems; int ret; key.objectid = dirid; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ERR_PTR(ret); - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - - while (1) { - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_for_each_slot(root, &key, &key, path, ret) { if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; @@ -362,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root, name, name_len); if (di) return di; - - path->slots[0]++; } - return NULL; + /* Adjust return code if the key was not found in the next leaf. 
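This dir-item conversion is a typical btrfs_for_each_slot() caller: the loop body only ever sees valid slots, and when the loop ends, a positive iterator value just means the walk ran past the last leaf, which the code completing below folds into "not found". The three-way 0/1/<0 protocol in miniature, over a plain array (hypothetical helpers, with a trivial initialization in place of btrfs_search_slot()):

    #include <stdio.h>

    struct cursor { int slot; };

    static const int keys[] = { 10, 20, 30 };
    static const int nkeys = 3;

    /* 0: cursor points at a valid item, 1: exhausted, <0: error. */
    static int next_valid_item(struct cursor *c)
    {
        if (c->slot >= nkeys)
            return 1;
        return 0;
    }

    #define for_each_slot(c, iter_ret)                            \
        for ((c)->slot = 0, iter_ret = 0;                         \
             (iter_ret) >= 0 &&                                   \
             (iter_ret = next_valid_item(c)) == 0;                \
             (c)->slot++)

    int main(void)
    {
        struct cursor c;
        int iter_ret;

        for_each_slot(&c, iter_ret)
            printf("key %d\n", keys[c.slot]);

        /* A positive value just means we walked off the end. */
        return iter_ret < 0 ? 1 : 0;
    }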
*/ + if (ret > 0) + ret = 0; + + return ERR_PTR(ret); } struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 48590a380762..4ba005c41983 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5,7 +5,6 @@ #include <linux/fs.h> #include <linux/blkdev.h> -#include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/workqueue.h> #include <linux/kthread.h> @@ -374,9 +373,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * @level: expected level, mandatory check * @first_key: expected key of first slot, skip check if NULL */ -static int btree_read_extent_buffer_pages(struct extent_buffer *eb, - u64 parent_transid, int level, - struct btrfs_key *first_key) +int btrfs_read_extent_buffer(struct extent_buffer *eb, + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -441,17 +440,31 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) else ret = btrfs_check_leaf_full(eb); - if (ret < 0) { - btrfs_print_tree(eb, 0); + if (ret < 0) + goto error; + + /* + * Also check the generation, the eb reached here must be newer than + * last committed. Or something seriously wrong happened. + */ + if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + ret = -EUCLEAN; btrfs_err(fs_info, - "block=%llu write time tree block corruption detected", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return ret; + "block=%llu bad generation, have %llu expect > %llu", + eb->start, btrfs_header_generation(eb), + fs_info->last_trans_committed); + goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0; + +error: + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, "block=%llu write time tree block corruption detected", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return ret; } /* Checksum all dirty extent buffers in one bio_vec */ @@ -472,7 +485,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, fs_info->nodesize); - /* A dirty eb shouldn't disappear from buffer_radix */ + /* A dirty eb shouldn't disappear from extent_buffers */ if (WARN_ON(!eb)) return -EUCLEAN; @@ -505,7 +518,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec u64 found_start; struct extent_buffer *eb; - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return csum_dirty_subpage_buffers(fs_info, bvec); eb = (struct extent_buffer *)page->private; @@ -690,7 +703,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ASSERT(page->private); - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return validate_subpage_buffer(page, start, end, mirror); eb = (struct extent_buffer *)page->private; @@ -836,8 +849,7 @@ static void run_one_async_free(struct btrfs_work *work) } blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 dio_file_offset, + int mirror_num, u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -860,9 +872,9 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, async->status = 0; if (op_is_sync(bio->bi_opf)) - btrfs_set_work_high_priority(&async->work); - - 
btrfs_queue_work(fs_info->workers, &async->work); + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); return 0; } @@ -906,8 +918,7 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, return true; } -blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) +void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); blk_status_t ret; @@ -919,31 +930,25 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, */ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_METADATA); - if (ret) - goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (!ret) + ret = btrfs_map_bio(fs_info, bio, mirror_num); } else if (!should_async_write(fs_info, BTRFS_I(inode))) { ret = btree_csum_one_bio(bio); - if (ret) - goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (!ret) + ret = btrfs_map_bio(fs_info, bio, mirror_num); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - 0, btree_submit_bio_start); + btree_submit_bio_start); } - if (ret) - goto out_w_error; - return 0; - -out_w_error: - bio->bi_status = ret; - bio_endio(bio); - return ret; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + } } #ifdef CONFIG_MIGRATION @@ -991,49 +996,48 @@ static int btree_writepages(struct address_space *mapping, return btree_write_cache_pages(mapping, wbc); } -static int btree_releasepage(struct page *page, gfp_t gfp_flags) +static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (PageWriteback(page) || PageDirty(page)) - return 0; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return false; - return try_release_extent_buffer(page); + return try_release_extent_buffer(&folio->page); } -static void btree_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void btree_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - extent_invalidatepage(tree, page, offset); - btree_releasepage(page, GFP_NOFS); - if (PagePrivate(page)) { - btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, - "page private not zero on page %llu", - (unsigned long long)page_offset(page)); - detach_page_private(page); + tree = &BTRFS_I(folio->mapping->host)->io_tree; + extent_invalidate_folio(tree, folio, offset); + btree_release_folio(folio, GFP_NOFS); + if (folio_get_private(folio)) { + btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + "folio private not zero on folio %llu", + (unsigned long long)folio_pos(folio)); + folio_detach_private(folio); } } -static int btree_set_page_dirty(struct page *page) -{ #ifdef DEBUG - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); +static bool btree_dirty_folio(struct address_space *mapping, + struct folio *folio) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; - u64 page_start = page_offset(page); + u64 page_start = folio_pos(folio); if (fs_info->sectorsize == PAGE_SIZE) { - BUG_ON(!PagePrivate(page)); - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); BUG_ON(!eb); 
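Two patterns recur in the submission rework earlier in this file: synchronous bios are queued on the new dedicated high-priority workqueue, and every failure funnels to a single point that stamps bi_status and ends the bio. Reduced to a runnable mock (invented stand-in types; op_is_sync() modeled as a REQ_SYNC flag test):

    #include <stdbool.h>
    #include <stdio.h>

    #define REQ_SYNC 1

    struct bio { int opf; int status; bool fail_csum; };

    static bool op_is_sync(int opf) { return opf & REQ_SYNC; }

    /* Stand-in for btree_csum_one_bio(): fails when asked to. */
    static int csum_one_bio(struct bio *bio)
    {
        return bio->fail_csum ? 5 : 0; /* 5 ~ an I/O error status */
    }

    static void bio_endio(struct bio *bio)
    {
        printf("bio completed, status=%d\n", bio->status);
    }

    static void submit_bio(struct bio *bio)
    {
        int ret = csum_one_bio(bio);

        if (!ret)
            printf("queued on %s workqueue\n",
                   op_is_sync(bio->opf) ? "hipri" : "normal");

        /* One error-completion point instead of scattered gotos. */
        if (ret) {
            bio->status = ret;
            bio_endio(bio);
        }
    }

    int main(void)
    {
        struct bio sync_ok = { .opf = REQ_SYNC };
        struct bio async_ok = { 0 };
        struct bio broken = { .fail_csum = true };

        submit_bio(&sync_ok);
        submit_bio(&async_ok);
        submit_bio(&broken);
        return 0;
    }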
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_write_locked(eb); - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); ASSERT(subpage->dirty_bitmap); while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { @@ -1059,18 +1063,20 @@ static int btree_set_page_dirty(struct page *page) cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); } -#endif - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } +#else +#define btree_dirty_folio filemap_dirty_folio +#endif static const struct address_space_operations btree_aops = { .writepages = btree_writepages, - .releasepage = btree_releasepage, - .invalidatepage = btree_invalidatepage, + .release_folio = btree_release_folio, + .invalidate_folio = btree_invalidate_folio, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif - .set_page_dirty = btree_set_page_dirty, + .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( @@ -1103,12 +1109,15 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(buf, parent_transid, - level, first_key); + ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } + if (btrfs_check_eb_owner(buf, owner_root)) { + free_extent_buffer_stale(buf); + return ERR_PTR(-EUCLEAN); + } return buf; } @@ -1149,7 +1158,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); btrfs_init_root_block_rsv(root); @@ -1201,9 +1210,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&root->leak_list); - spin_lock(&fs_info->fs_roots_radix_lock); + spin_lock(&fs_info->fs_roots_lock); list_add_tail(&root->leak_list, &fs_info->allocated_roots); - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); #endif } @@ -1289,12 +1298,33 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, return root; } +static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *block_group; + u64 ret; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + if (bytenr) + block_group = btrfs_lookup_block_group(fs_info, bytenr); + else + block_group = btrfs_lookup_first_block_group(fs_info, bytenr); + ASSERT(block_group); + if (!block_group) + return 0; + ret = block_group->global_root_id; + btrfs_put_block_group(block_group); + + return ret; +} + struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_key key = { .objectid = BTRFS_CSUM_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, - .offset = 0, + .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); @@ -1305,7 +1335,7 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) struct btrfs_key key = { .objectid = BTRFS_EXTENT_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, - .offset = 0, + .offset = 
btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); @@ -1522,10 +1552,28 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, ret = PTR_ERR(root->node); root->node = NULL; goto fail; - } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + } + if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; goto fail; } + + /* + * For real fs, and not log/reloc trees, root owner must + * match its root node owner + */ + if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + root->root_key.objectid != btrfs_header_owner(root->node)) { + btrfs_crit(fs_info, +"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", + root->root_key.objectid, root->node->start, + btrfs_header_owner(root->node), + root->root_key.objectid); + ret = -EUCLEAN; + goto fail; + } root->commit_root = btrfs_root_node(root); return root; fail: @@ -1611,12 +1659,11 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, { struct btrfs_root *root; - spin_lock(&fs_info->fs_roots_radix_lock); - root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)root_id); + spin_lock(&fs_info->fs_roots_lock); + root = xa_load(&fs_info->fs_roots, (unsigned long)root_id); if (root) root = btrfs_grab_root(root); - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); return root; } @@ -1658,20 +1705,14 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - ret = radix_tree_preload(GFP_NOFS); - if (ret) - return ret; - - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - root); + spin_lock(&fs_info->fs_roots_lock); + ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid, + root, GFP_NOFS); if (ret == 0) { btrfs_grab_root(root); - set_bit(BTRFS_ROOT_IN_RADIX, &root->state); + set_bit(BTRFS_ROOT_REGISTERED, &root->state); } - spin_unlock(&fs_info->fs_roots_radix_lock); - radix_tree_preload_end(); + spin_unlock(&fs_info->fs_roots_lock); return ret; } @@ -1727,6 +1768,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->uuid_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); + btrfs_put_root(fs_info->block_group_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1812,9 +1854,10 @@ again: ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - btrfs_put_root(root); - if (ret == -EEXIST) + if (ret == -EEXIST) { + btrfs_put_root(root); goto again; + } goto fail; } return root; @@ -1925,8 +1968,7 @@ static void end_workqueue_fn(struct btrfs_work *work) static int cleaner_kthread(void *arg) { - struct btrfs_root *root = arg; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = arg; int again; while (1) { @@ -1959,7 +2001,7 @@ static int cleaner_kthread(void *arg) btrfs_run_delayed_iputs(fs_info); - again = btrfs_clean_one_deleted_snapshot(root); + again = btrfs_clean_one_deleted_snapshot(fs_info); mutex_unlock(&fs_info->cleaner_mutex); /* @@ -2095,8 +2137,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; - struct btrfs_root *extent_root = btrfs_extent_root(info, 0); - struct btrfs_root 
*csum_root = btrfs_csum_root(info, 0); root_backup = info->super_for_commit->super_roots + next_backup; @@ -2121,11 +2161,30 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - btrfs_set_backup_extent_root(root_backup, extent_root->node->start); - btrfs_set_backup_extent_root_gen(root_backup, - btrfs_header_generation(extent_root->node)); - btrfs_set_backup_extent_root_level(root_backup, - btrfs_header_level(extent_root->node)); + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) { + btrfs_set_backup_block_group_root(root_backup, + info->block_group_root->node->start); + btrfs_set_backup_block_group_root_gen(root_backup, + btrfs_header_generation(info->block_group_root->node)); + btrfs_set_backup_block_group_root_level(root_backup, + btrfs_header_level(info->block_group_root->node)); + } else { + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); + struct btrfs_root *csum_root = btrfs_csum_root(info, 0); + + btrfs_set_backup_extent_root(root_backup, + extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(extent_root->node)); + + btrfs_set_backup_csum_root(root_backup, csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(csum_root->node)); + } /* * we might commit during log recovery, which happens before we set @@ -2146,12 +2205,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); - btrfs_set_backup_csum_root(root_backup, csum_root->node->start); - btrfs_set_backup_csum_root_gen(root_backup, - btrfs_header_generation(csum_root->node)); - btrfs_set_backup_csum_root_level(root_backup, - btrfs_header_level(csum_root->node)); - btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); btrfs_set_backup_bytes_used(root_backup, @@ -2217,10 +2270,12 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); btrfs_destroy_workqueue(fs_info->endio_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); - btrfs_destroy_workqueue(fs_info->rmw_workers); + if (fs_info->rmw_workers) + destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -2269,6 +2324,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); + free_root_extent_buffers(info->block_group_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2286,9 +2342,9 @@ void btrfs_put_root(struct btrfs_root *root) btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG - spin_lock(&root->fs_info->fs_roots_radix_lock); + spin_lock(&root->fs_info->fs_roots_lock); list_del_init(&root->leak_list); - spin_unlock(&root->fs_info->fs_roots_radix_lock); + 
spin_unlock(&root->fs_info->fs_roots_lock); #endif kfree(root); } @@ -2296,28 +2352,21 @@ void btrfs_put_root(struct btrfs_root *root) void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { - int ret; - struct btrfs_root *gang[8]; - int i; + struct btrfs_root *root; + unsigned long index = 0; while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&gang[0]->root_list); + root = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&root->root_list); - if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) - btrfs_drop_and_free_fs_root(fs_info, gang[0]); - btrfs_put_root(gang[0]); + if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) + btrfs_drop_and_free_fs_root(fs_info, root); + btrfs_put_root(root); } - while (1) { - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, 0, - ARRAY_SIZE(gang)); - if (!ret) - break; - for (i = 0; i < ret; i++) - btrfs_drop_and_free_fs_root(fs_info, gang[i]); + xa_for_each(&fs_info->fs_roots, index, root) { + btrfs_drop_and_free_fs_root(fs_info, root); } } @@ -2394,7 +2443,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - btrfs_alloc_workqueue(fs_info, "worker", + btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); + fs_info->hipri_workers = + btrfs_alloc_workqueue(fs_info, "worker-high", flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = @@ -2426,8 +2477,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->endio_raid56_workers = btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, max_active, 4); - fs_info->rmw_workers = - btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); + fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); @@ -2442,8 +2492,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->discard_ctl.discard_workers = alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); - if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->flush_workers && + if (!(fs_info->workers && fs_info->hipri_workers && + fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && @@ -2504,11 +2554,13 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = NULL; btrfs_put_root(log_tree_root); return ret; - } else if (!extent_buffer_uptodate(log_tree_root->node)) { + } + if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; } + /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { @@ -2533,6 +2585,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, { struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *root; + u64 max_global_id = 0; int ret; struct btrfs_key key = { .objectid = objectid, @@ -2568,6 +2621,13 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, break; btrfs_release_path(path); + /* + * Just worry about this for extent tree, it'll be the same for + * everybody. 
+ */ + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + max_global_id = max(max_global_id, key.offset); + found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { @@ -2585,6 +2645,9 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, } btrfs_release_path(path); + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + fs_info->nr_global_roots = max_global_id + 1; + if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); @@ -2752,12 +2815,14 @@ static int validate_super(struct btrfs_fs_info *fs_info, } /* - * For 4K page size, we only support 4K sector size. - * For 64K page size, we support 64K and 4K sector sizes. + * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * + * We can support 16K sectorsize with 64K page size without problem, + * but such sectorsize/pagesize combination doesn't make much sense. + * 4K will be our future standard, PAGE_SIZE is supported from the very + * beginning. */ - if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || - (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && - sectorsize != SZ_64K))) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2930,6 +2995,56 @@ out: return ret; } +static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) +{ + int ret = 0; + + root->node = read_tree_block(root->fs_info, bytenr, + root->root_key.objectid, gen, level, NULL); + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); + root->node = NULL; + return ret; + } + if (!extent_buffer_uptodate(root->node)) { + free_extent_buffer(root->node); + root->node = NULL; + return -EIO; + } + + btrfs_set_root_node(&root->root_item, root->node); + root->commit_root = btrfs_root_node(root); + btrfs_set_root_refs(&root->root_item, 1); + return ret; +} + +static int load_important_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *sb = fs_info->super_copy; + u64 gen, bytenr; + int level, ret; + + bytenr = btrfs_super_root(sb); + gen = btrfs_super_generation(sb); + level = btrfs_super_root_level(sb); + ret = load_super_root(fs_info->tree_root, bytenr, gen, level); + if (ret) { + btrfs_warn(fs_info, "couldn't read tree root"); + return ret; + } + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + bytenr = btrfs_super_block_group_root(sb); + gen = btrfs_super_block_group_root_generation(sb); + level = btrfs_super_block_group_root_level(sb); + ret = load_super_root(fs_info->block_group_root, bytenr, gen, level); + if (ret) + btrfs_warn(fs_info, "couldn't read block group root"); + return ret; +} + static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) { int backup_index = find_newest_super_backup(fs_info); @@ -2939,10 +3054,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) int ret = 0; int i; - for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { - u64 generation; - int level; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + struct btrfs_root *root; + root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID, + GFP_KERNEL); + if (!root) + return -ENOMEM; + fs_info->block_group_root = root; + } + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { if (handle_error) { if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); @@ -2967,29 +3089,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) if (ret < 0) return ret; } - generation 
= btrfs_super_generation(sb); - level = btrfs_super_root_level(sb); - tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), - BTRFS_ROOT_TREE_OBJECTID, - generation, level, NULL); - if (IS_ERR(tree_root->node)) { - handle_error = true; - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - btrfs_warn(fs_info, "couldn't read tree root"); - continue; - } else if (!extent_buffer_uptodate(tree_root->node)) { + ret = load_important_roots(fs_info); + if (ret) { handle_error = true; - ret = -EIO; - btrfs_warn(fs_info, "error while reading tree root"); continue; } - btrfs_set_root_node(&tree_root->root_item, tree_root->node); - tree_root->commit_root = btrfs_root_node(tree_root); - btrfs_set_root_refs(&tree_root->root_item, 1); - /* * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user @@ -3009,8 +3115,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) } /* All successful */ - fs_info->generation = generation; - fs_info->last_trans_committed = generation; + fs_info->generation = btrfs_header_generation(tree_root->node); + fs_info->last_trans_committed = fs_info->generation; fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ @@ -3028,8 +3134,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC); + xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -3037,7 +3143,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->fs_roots_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); @@ -3053,6 +3159,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); + mutex_init(&fs_info->zoned_data_reloc_io_lock); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -3104,9 +3211,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); - spin_lock_init(&fs_info->block_group_cache_lock); - fs_info->block_group_cache_tree = RB_ROOT; - fs_info->first_logical_byte = (u64)-1; + rwlock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree = RB_ROOT_CACHED; extent_io_tree_init(fs_info, &fs_info->excluded_extents, IO_TREE_FS_EXCLUDED_EXTENTS, NULL); @@ -3190,7 +3296,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block static int btrfs_uuid_rescan_kthread(void *data) { - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + struct btrfs_fs_info *fs_info = data; int ret; /* @@ -3268,7 +3374,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load - * them into the fs_info->fs_roots_radix tree. 
This must be done before + * them into the fs_info->fs_roots. This must be done before * calling btrfs_orphan_cleanup() on the tree root. If we don't do it * first, then btrfs_orphan_cleanup() will delete a dead root's orphan * item before the root's tree is deleted - this means that if we unmount @@ -3293,7 +3399,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) up_read(&fs_info->cleanup_work_sem); mutex_lock(&fs_info->cleaner_mutex); - ret = btrfs_recover_relocation(fs_info->tree_root); + ret = btrfs_recover_relocation(fs_info); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); @@ -3506,7 +3612,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { btrfs_err(fs_info, - "cannot mount because of unsupported optional features (%llx)", + "cannot mount because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_alloc; @@ -3544,7 +3650,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!sb_rdonly(sb) && features) { btrfs_err(fs_info, - "cannot mount read-write because of unsupported optional features (%llx)", + "cannot mount read-write because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_alloc; @@ -3553,17 +3659,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; + /* + * V1 space cache has some hardcoded PAGE_SIZE usage, and is + * going to be deprecated. + * + * Force to use v2 cache for subpage case. + */ + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(fs_info, FREE_SPACE_TREE, + "forcing free space tree for sector size %u with page size %lu", + sectorsize, PAGE_SIZE); + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - if (btrfs_super_incompat_flags(fs_info->super_copy) & - BTRFS_FEATURE_INCOMPAT_RAID56) { - btrfs_err(fs_info, - "RAID56 is not yet supported for sector size %u with page size %lu", - sectorsize, PAGE_SIZE); - err = -EINVAL; - goto fail_alloc; - } subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); if (!subpage_info) goto fail_alloc; @@ -3594,21 +3703,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device generation = btrfs_super_chunk_root_generation(disk_super); level = btrfs_super_chunk_root_level(disk_super); - - chunk_root->node = read_tree_block(fs_info, - btrfs_super_chunk_root(disk_super), - BTRFS_CHUNK_TREE_OBJECTID, - generation, level, NULL); - if (IS_ERR(chunk_root->node) || - !extent_buffer_uptodate(chunk_root->node)) { + ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super), + generation, level); + if (ret) { btrfs_err(fs_info, "failed to read chunk root"); - if (!IS_ERR(chunk_root->node)) - free_extent_buffer(chunk_root->node); - chunk_root->node = NULL; goto fail_tree_roots; } - btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); - chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, offsetof(struct btrfs_header, chunk_tree_uuid), @@ -3728,7 +3828,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } - fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + fs_info->cleaner_kthread = 
kthread_run(cleaner_kthread, fs_info, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) goto fail_sysfs; @@ -4033,8 +4133,9 @@ static int write_dev_supers(struct btrfs_device *device, * to do I/O, so we don't lose the ability to do integrity * checking. */ - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, device->bdev); + bio = bio_alloc(device->bdev, 1, + REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOFS); bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; @@ -4046,11 +4147,11 @@ static int write_dev_supers(struct btrfs_device *device, * go down lazy and there's a short window where the on-disk * copies might still contain the older version. */ - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); if (btrfs_advance_sb_log(device, i)) errors++; @@ -4131,6 +4232,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) */ static void btrfs_end_empty_barrier(struct bio *bio) { + bio_uninit(bio); complete(bio->bi_private); } @@ -4140,7 +4242,7 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* @@ -4153,19 +4255,18 @@ static void write_dev_flush(struct btrfs_device *device) * of simplicity, since this is a debug tool and not meant for use in * non-debug builds. */ - struct request_queue *q = bdev_get_queue(device->bdev); - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (!bdev_write_cache(device->bdev)) return; #endif - bio_reset(bio); + bio_init(bio, device->bdev, NULL, 0, + REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; - bio_set_dev(bio, device->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } @@ -4174,7 +4275,7 @@ static void write_dev_flush(struct btrfs_device *device) */ static blk_status_t wait_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK; @@ -4398,12 +4499,11 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, { bool drop_ref = false; - spin_lock(&fs_info->fs_roots_radix_lock); - radix_tree_delete(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid); - if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + spin_lock(&fs_info->fs_roots_lock); + xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state)) drop_ref = true; - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); @@ -4419,50 +4519,48 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { - u64 root_objectid = 0; - struct btrfs_root *gang[8]; - int i = 0; + struct btrfs_root *roots[8]; + unsigned long index = 0; + int i; int err = 0; - unsigned int ret = 0; + int grabbed; while (1) { - 
spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang)); - if (!ret) { - spin_unlock(&fs_info->fs_roots_radix_lock); - break; + struct btrfs_root *root; + + spin_lock(&fs_info->fs_roots_lock); + if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) { + spin_unlock(&fs_info->fs_roots_lock); + return err; } - root_objectid = gang[ret - 1]->root_key.objectid + 1; - for (i = 0; i < ret; i++) { - /* Avoid to grab roots in dead_roots */ - if (btrfs_root_refs(&gang[i]->root_item) == 0) { - gang[i] = NULL; - continue; - } - /* grab all the search result for later use */ - gang[i] = btrfs_grab_root(gang[i]); + grabbed = 0; + xa_for_each_start(&fs_info->fs_roots, index, root, index) { + /* Avoid grabbing roots in dead_roots */ + if (btrfs_root_refs(&root->root_item) > 0) + roots[grabbed++] = btrfs_grab_root(root); + if (grabbed >= ARRAY_SIZE(roots)) + break; } - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); - for (i = 0; i < ret; i++) { - if (!gang[i]) + for (i = 0; i < grabbed; i++) { + if (!roots[i]) continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); + index = roots[i]->root_key.objectid; + err = btrfs_orphan_cleanup(roots[i]); if (err) - break; - btrfs_put_root(gang[i]); + goto out; + btrfs_put_root(roots[i]); } - root_objectid++; + index++; } - /* release the uncleaned roots due to error */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); +out: + /* Release the roots that remain uncleaned due to error */ + for (; i < grabbed; i++) { + if (roots[i]) + btrfs_put_root(roots[i]); } return err; } @@ -4534,6 +4632,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + + /* + * We may have the reclaim task running and relocating a data block group, + * in which case it may create delayed iputs. So stop it before we park + * the cleaner kthread, otherwise we can get new delayed iputs after + * parking the cleaner, and that can make the async reclaim task hang + * if it's waiting for delayed iputs to complete, since the cleaner is + * parked and cannot run delayed iputs - this will make us hang when + * trying to stop the async reclaim task. + */ + cancel_work_sync(&fs_info->reclaim_bgs_work); /* * We don't want the cleaner to start new transactions, add more delayed * iputs, etc. while we're closing.
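Editorial aside on the radix tree to XArray conversion above: unlike radix_tree_insert(), xa_insert() allocates its internal nodes itself, which is why the radix_tree_preload()/radix_tree_preload_end() pair disappears from btrfs_insert_fs_root(). One subtlety worth double-checking, based on my reading of the XArray documentation rather than anything stated in this patch: xa_insert() reports an already occupied index as -EBUSY, whereas radix_tree_insert() returned -EEXIST, so the ret == -EEXIST retry in btrfs_get_root() may want a translation roughly like this sketch:

	/*
	 * Sketch only, not from the patch: map the XArray collision error
	 * to the value callers of the old radix tree code still expect.
	 */
	ret = xa_insert(&fs_info->fs_roots,
			(unsigned long)root->root_key.objectid,
			root, GFP_NOFS);
	if (ret == -EBUSY)
		ret = -EEXIST;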
We can't use kthread_stop() yet @@ -4574,8 +4683,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->reclaim_bgs_work); - /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4757,13 +4864,6 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) __btrfs_btree_balance_dirty(fs_info, 0); } -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, - struct btrfs_key *first_key) -{ - return btree_read_extent_buffer_pages(buf, parent_transid, - level, first_key); -} - static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { /* cleanup FS via transaction */ @@ -4779,31 +4879,28 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) { - struct btrfs_root *gang[8]; - u64 root_objectid = 0; - int ret; - - spin_lock(&fs_info->fs_roots_radix_lock); - while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang))) != 0) { - int i; + unsigned long index = 0; + int grabbed = 0; + struct btrfs_root *roots[8]; - for (i = 0; i < ret; i++) - gang[i] = btrfs_grab_root(gang[i]); - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_lock(&fs_info->fs_roots_lock); + while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index, + ULONG_MAX, 8, XA_PRESENT))) { + for (int i = 0; i < grabbed; i++) + roots[i] = btrfs_grab_root(roots[i]); + spin_unlock(&fs_info->fs_roots_lock); - for (i = 0; i < ret; i++) { - if (!gang[i]) + for (int i = 0; i < grabbed; i++) { + if (!roots[i]) continue; - root_objectid = gang[i]->root_key.objectid; - btrfs_free_log(NULL, gang[i]); - btrfs_put_root(gang[i]); + index = roots[i]->root_key.objectid; + btrfs_free_log(NULL, roots[i]); + btrfs_put_root(roots[i]); } - root_objectid++; - spin_lock(&fs_info->fs_roots_radix_lock); + index++; + spin_lock(&fs_info->fs_roots_lock); } - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); btrfs_free_log_root_tree(NULL, fs_info); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5e8bef4b7563..4ee8c42c9f78 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -87,8 +87,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -111,6 +110,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) { + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return fs_info->block_group_root; return btrfs_extent_root(fs_info, 0); } @@ -118,13 +119,12 @@ void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, - struct btrfs_key *first_key); +int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 
parent_transid, + int level, struct btrfs_key *first_key); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 dio_file_offset, + int mirror_num, u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 04083ee5ae6e..c3eb52dbe61c 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -244,8 +244,8 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset); +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 96427b1ecac3..4157ecc27d4b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -598,7 +598,7 @@ fail: static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop, int *last_ref) + int refs_to_drop) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -631,7 +631,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); - *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -896,7 +895,13 @@ again: err = -ENOENT; while (1) { if (ptr >= end) { - WARN_ON(ptr > end); + if (ptr > end) { + err = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + } break; } iref = (struct btrfs_extent_inline_ref *)ptr; @@ -1072,8 +1077,7 @@ static noinline_for_stack void update_inline_extent_backref(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op, - int *last_ref) + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_item *ei; @@ -1121,7 +1125,6 @@ void update_inline_extent_backref(struct btrfs_path *path, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { - *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1166,8 +1169,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, } return -EUCLEAN; } - update_inline_extent_backref(path, iref, refs_to_add, - extent_op, NULL); + update_inline_extent_backref(path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { setup_inline_extent_backref(trans->fs_info, path, iref, parent, root_objectid, owner, offset, @@ -1181,21 +1183,17 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref 
*iref, - int refs_to_drop, int is_data, int *last_ref) + int refs_to_drop, int is_data) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); - if (iref) { - update_inline_extent_backref(path, iref, -refs_to_drop, NULL, - last_ref); - } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop, - last_ref); - } else { - *last_ref = 1; + if (iref) + update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + else if (is_data) + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + else ret = btrfs_del_item(trans, root, path); - } return ret; } @@ -1247,7 +1245,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (size) { ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += size; else if (ret != -EOPNOTSUPP) @@ -1264,7 +1262,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (bytes_left) { ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += bytes_left; } @@ -1299,7 +1297,7 @@ static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, &discarded); discarded += src_disc; - } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { + } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); } else { ret = 0; @@ -1585,12 +1583,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; - int metadata = !extent_op->is_data; + int metadata = 1; if (TRANS_ABORTED(trans)) return 0; - if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) metadata = 0; path = btrfs_alloc_path(); @@ -2188,7 +2186,7 @@ out: int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags, - int level, int is_data) + int level) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2200,7 +2198,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; - extent_op->is_data = is_data ? 
true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); @@ -2365,15 +2362,10 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict) + u64 bytenr, bool strict, struct btrfs_path *path) { - struct btrfs_path *path; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - do { ret = check_committed_ref(root, path, objectid, offset, bytenr, strict); @@ -2384,7 +2376,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, } while (ret == -EAGAIN); out: - btrfs_free_path(path); + btrfs_release_path(path); if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0); return ret; @@ -2505,24 +2497,21 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) return ret; } -static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) +static u64 first_logical_byte(struct btrfs_fs_info *fs_info) { - struct btrfs_block_group *cache; - u64 bytenr; - - spin_lock(&fs_info->block_group_cache_lock); - bytenr = fs_info->first_logical_byte; - spin_unlock(&fs_info->block_group_cache_lock); - - if (bytenr < (u64)-1) - return bytenr; + struct rb_node *leftmost; + u64 bytenr = 0; - cache = btrfs_lookup_first_block_group(fs_info, search_start); - if (!cache) - return 0; + read_lock(&fs_info->block_group_cache_lock); + /* Get the block group with the lowest logical start address. */ + leftmost = rb_first_cached(&fs_info->block_group_cache_tree); + if (leftmost) { + struct btrfs_block_group *bg; - bytenr = cache->start; - btrfs_put_block_group(cache); + bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); + bytenr = bg->start; + } + read_unlock(&fs_info->block_group_cache_lock); return bytenr; } @@ -2766,12 +2755,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_unlock(&cache->lock); if (!readonly && return_free_space && global_rsv->space_info == space_info) { - u64 to_add = len; - spin_lock(&global_rsv->lock); if (!global_rsv->full) { - to_add = min(len, global_rsv->size - - global_rsv->reserved); + u64 to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; btrfs_space_info_update_bytes_may_use(fs_info, space_info, to_add); @@ -2862,6 +2850,35 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +static int do_free_extent_accounting(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool is_data) +{ + int ret; + + if (is_data) { + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(trans->fs_info, bytenr); + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = add_to_free_space_tree(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); + if (ret) + btrfs_abort_transaction(trans, ret); + + return ret; +} + /* * Drop one or more refs of @node. 
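Editor's note: the first_logical_byte() rewrite above relies on RB_ROOT_CACHED keeping a pointer to the leftmost node, so rb_first_cached() is O(1) rather than a walk down the tree. The insertion side, which lives elsewhere in this series and is not part of this hunk, has to maintain that cache via a leftmost hint; a minimal sketch with illustrative names only:

	/* Sketch: insert into an rb_root_cached, maintaining the leftmost hint. */
	static void insert_block_group_sketch(struct rb_root_cached *tree,
					      struct btrfs_block_group *bg)
	{
		struct rb_node **p = &tree->rb_root.rb_node;
		struct rb_node *parent = NULL;
		bool leftmost = true;

		while (*p) {
			struct btrfs_block_group *cur;

			parent = *p;
			cur = rb_entry(parent, struct btrfs_block_group, cache_node);
			if (bg->start < cur->start) {
				p = &(*p)->rb_left;
			} else {
				p = &(*p)->rb_right;
				/* Something sorts before us, we are not leftmost. */
				leftmost = false;
			}
		}
		rb_link_node(&bg->cache_node, parent, p);
		rb_insert_color_cached(&bg->cache_node, tree, leftmost);
	}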
* @@ -2943,7 +2960,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; - int last_ref = 0; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); extent_root = btrfs_extent_root(info, bytenr); @@ -3010,8 +3026,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, - NULL, refs_to_drop, is_data, - &last_ref); + NULL, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3136,8 +3151,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, - iref, refs_to_drop, is_data, - &last_ref); + iref, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3182,7 +3196,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -3191,28 +3204,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - if (is_data) { - struct btrfs_root *csum_root; - csum_root = btrfs_csum_root(info, bytenr); - ret = btrfs_del_csums(trans, csum_root, bytenr, - num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } - - ret = add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - - ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); } btrfs_release_path(path); @@ -3808,8 +3800,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, /* Check RO and no space case before trying to activate it */ spin_lock(&block_group->lock); - if (block_group->ro || - block_group->alloc_offset == block_group->zone_capacity) { + if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. @@ -3841,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro) { + if (block_group->ro || block_group->zoned_data_reloc_ongoing) { ret = 1; goto out; } @@ -3903,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc) + if (ret && ffe_ctl->for_data_reloc && + fs_info->data_reloc_bg == block_group->start) { + /* + * Do not allow further allocations from this block group. + * Compared to increasing the ->ro, setting the + * ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable allocations here to avoid allocating a + * regular (non-relocation data) extent. With a mix of + * relocation extents and regular extents, we can dispatch + * WRITE commands (for relocation extents) and ZONE APPEND + * commands (for regular extents) at the same time to the same + * zone, which easily breaks the write pointer.
+ */ + block_group->zoned_data_reloc_ongoing = 1; fs_info->data_reloc_bg = 0; + } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -4087,7 +4094,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ if (ret == -ENOSPC) @@ -4277,7 +4284,7 @@ static noinline int find_free_extent(struct btrfs_root *root, return ret; ffe_ctl->search_start = max(ffe_ctl->search_start, - first_logical_byte(fs_info, 0)); + first_logical_byte(fs_info)); ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, @@ -4605,6 +4612,28 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, return ret; } +static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + + ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + if (ret) + return ret; + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); + if (ret) { + ASSERT(!ret); + btrfs_err(fs_info, "update block group failed for %llu %llu", + bytenr, num_bytes); + return ret; + } + + trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); + return 0; +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, @@ -4665,18 +4694,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - ins->objectid, ins->offset); - BUG(); - } - trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); - return ret; + return alloc_reserved_extent(trans, ins->objectid, ins->offset); } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, @@ -4694,7 +4712,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 num_bytes; u64 flags = extent_op->flags_to_set; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); @@ -4704,12 +4721,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { extent_key.offset = ref->level; extent_key.type = BTRFS_METADATA_ITEM_KEY; - num_bytes = fs_info->nodesize; } else { extent_key.offset = node->num_bytes; extent_key.type = BTRFS_EXTENT_ITEM_KEY; size += sizeof(*block_info); - num_bytes = node->num_bytes; } path = btrfs_alloc_path(); @@ -4754,22 +4769,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, extent_key.objectid, - num_bytes); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, extent_key.objectid, - fs_info->nodesize, true); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - extent_key.objectid, 
extent_key.offset); - BUG(); - } - - trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, - fs_info->nodesize); - return ret; + return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -4971,7 +4971,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_key = skinny_metadata ? false : true; extent_op->update_flags = true; - extent_op->is_data = false; extent_op->level = level; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, @@ -5156,7 +5155,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, eb, flag, - btrfs_header_level(eb), 0); + btrfs_header_level(eb)); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -5830,7 +5829,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_qgroup_convert_reserved_meta(root, INT_MAX); btrfs_qgroup_free_meta_all_pertrans(root); - if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) btrfs_add_dropped_root(trans, root); else btrfs_put_root(root); @@ -5999,7 +5998,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) *trimmed = 0; /* Discard not supported = nothing to do. */ - if (!blk_queue_discard(bdev_get_queue(device->bdev))) + if (!bdev_max_discard_sectors(device->bdev)) return 0; /* Not writable = nothing to do. */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4c91060d103a..04e36343da3a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/page-flags.h> +#include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> @@ -28,6 +29,7 @@ #include "subpage.h" #include "zoned.h" #include "block-group.h" +#include "compression.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -75,6 +77,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) if (!fs_info->allocated_ebs.next) return; + WARN_ON(!list_empty(&fs_info->allocated_ebs)); spin_lock_irqsave(&fs_info->eb_leak_lock, flags); while (!list_empty(&fs_info->allocated_ebs)) { eb = list_first_entry(&fs_info->allocated_ebs, @@ -135,6 +138,17 @@ struct tree_entry { struct rb_node rb_node; }; +/* + * Structure to record info about the bio being assembled, and other info like + * how many bytes remain before the stripe/ordered extent boundary.
+ */ +struct btrfs_bio_ctrl { + struct bio *bio; + enum btrfs_compression_type compress_type; + u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; +}; + struct extent_page_data { struct btrfs_bio_ctrl bio_ctrl; /* tells writepage not to lock the state bits for this range @@ -164,24 +178,27 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, return ret; } -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +static void submit_one_bio(struct bio *bio, int mirror_num, + enum btrfs_compression_type compress_type) { - blk_status_t ret = 0; struct extent_io_tree *tree = bio->bi_private; bio->bi_private = NULL; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); + if (is_data_inode(tree->private_data)) - ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, - bio_flags); + btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + compress_type); else - ret = btrfs_submit_metadata_bio(tree->private_data, bio, - mirror_num, bio_flags); - - return blk_status_to_errno(ret); + btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num); + /* + * Above submission hooks will handle the error by ending the bio, + * which will do the cleanup properly. So here we should not return + * any error, or the caller of submit_extent_page() will do cleanup + * again, causing problems. + */ } /* Cleanup unsubmitted bios */ @@ -202,13 +219,12 @@ static void end_write_bio(struct extent_page_data *epd, int ret) * Return 0 if everything is OK. * Return <0 for error. */ -static int __must_check flush_write_bio(struct extent_page_data *epd) +static void flush_write_bio(struct extent_page_data *epd) { - int ret = 0; struct bio *bio = epd->bio_ctrl.bio; if (bio) { - ret = submit_one_bio(bio, 0, 0); + submit_one_bio(bio, 0, 0); /* * Clean up of epd->bio is handled by its endio function. 
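Editor's note: submit_one_bio() and flush_write_bio() turning void above leans on the block layer convention that a failed submission is reported through the bio itself, after which the endio handler owns the cleanup (page unlocking and so on). A generic sketch of that convention, not code from this patch:

	/* Sketch: report a submission failure through the bio itself. */
	static void fail_bio_sketch(struct bio *bio, blk_status_t status)
	{
		/* Record the error and run ->bi_end_io, which does the cleanup. */
		bio->bi_status = status;
		bio_endio(bio);
	}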
* And endio is either triggered by successful bio execution @@ -218,7 +234,6 @@ static int __must_check flush_write_bio(struct extent_page_data *epd) */ epd->bio_ctrl.bio = NULL; } - return ret; } int __init extent_state_cache_init(void) @@ -1507,17 +1522,17 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { + struct address_space *mapping = inode->i_mapping; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - __set_page_dirty_nobuffers(page); - account_page_redirty(page); - put_page(page); - index++; + folio = filemap_get_folio(mapping, index); + filemap_dirty_folio(mapping, folio); + folio_account_redirty(folio); + index += folio_nr_pages(folio); + folio_put(folio); } } @@ -2303,12 +2318,13 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num) { - struct bio *bio; struct btrfs_device *dev; + struct bio_vec bvec; + struct bio bio; u64 map_length = 0; u64 sector; struct btrfs_io_context *bioc = NULL; - int ret; + int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); @@ -2316,8 +2332,6 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_repair_one_zone(fs_info, logical)) return 0; - bio = btrfs_bio_alloc(1); - bio->bi_iter.bi_size = 0; map_length = length; /* @@ -2335,52 +2349,50 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, */ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &map_length, &bioc, 0); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; ASSERT(bioc->mirror_num == 1); } else { ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; BUG_ON(mirror_num != bioc->mirror_num); } sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - bio->bi_iter.bi_sector = sector; dev = bioc->stripes[bioc->mirror_num - 1].dev; btrfs_put_bioc(bioc); + if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + ret = -EIO; + goto out_counter_dec; } - bio_set_dev(bio, dev->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; - bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(bio)) { + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = sector; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { /* try to remap that extent elsewhere? 
*/ - btrfs_bio_counter_dec(fs_info); - bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - return -EIO; + goto out_bio_uninit; } btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", ino, start, rcu_str_deref(dev->name), sector); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return 0; + return ret; } int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) @@ -2527,7 +2539,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode failrec->start = start; failrec->len = sectorsize; failrec->this_mirror = 0; - failrec->bio_flags = 0; + failrec->compress_type = BTRFS_COMPRESS_NONE; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, failrec->len); @@ -2551,8 +2563,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode logical = em->block_start + logical; if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, em->compress_type); + failrec->compress_type = em->compress_type; } btrfs_debug(fs_info, @@ -2610,6 +2621,7 @@ static bool btrfs_check_repairable(struct inode *inode, * a good copy of the failed sector and if we succeed, we have setup * everything for repair_io_failure to do the rest for us. */ + ASSERT(failed_mirror); failrec->failed_mirror = failed_mirror; failrec->this_mirror++; if (failrec->this_mirror == failed_mirror) @@ -2639,7 +2651,6 @@ int btrfs_repair_one_sector(struct inode *inode, const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; struct btrfs_bio *repair_bbio; - blk_status_t status; btrfs_debug(fs_info, "repair read error: read error at %llu", start); @@ -2658,6 +2669,7 @@ int btrfs_repair_one_sector(struct inode *inode, repair_bio = btrfs_bio_alloc(1); repair_bbio = btrfs_bio(repair_bio); + repair_bbio->file_offset = start; repair_bio->bi_opf = REQ_OP_READ; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; @@ -2678,13 +2690,13 @@ int btrfs_repair_one_sector(struct inode *inode, "repair read error: submitting new read to mirror %d", failrec->this_mirror); - status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, - failrec->bio_flags); - if (status) { - free_io_failure(failure_tree, tree, failrec); - bio_put(repair_bio); - } - return blk_status_to_errno(status); + /* + * At this point we have a bio, so any errors from submit_bio_hook() + * will be handled by the endio on the repair_bio, so we can't return an + * error here. 
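Editor's note: repair_io_failure() above is one of several call sites this series converts to the on-stack bio pattern: bio_init() with a caller-provided bio_vec, a synchronous submit_bio_wait(), then bio_uninit() instead of bio_put(). A condensed sketch with hypothetical names, assuming a single-page write:

	/* Sketch: synchronous one-off I/O with an on-stack bio. */
	static int sync_write_page_sketch(struct block_device *bdev,
					  struct page *page, sector_t sector,
					  unsigned int len)
	{
		struct bio_vec bvec;
		struct bio bio;
		int ret;

		bio_init(&bio, bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
		bio.bi_iter.bi_sector = sector;
		__bio_add_page(&bio, page, len, 0);
		ret = submit_bio_wait(&bio);
		/* Pairs with bio_init(); there is no reference to put. */
		bio_uninit(&bio);
		return ret;
	}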
+ */ + submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type); + return BLK_STS_OK; } static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) @@ -2709,18 +2721,19 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_page_set_error(fs_info, page, start, len); } - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) unlock_page(page); else btrfs_subpage_end_reader(fs_info, page, start, len); } -static blk_status_t submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - unsigned int error_bitmap, - submit_bio_hook_t *submit_bio_hook) +static blk_status_t submit_data_read_repair(struct inode *inode, + struct bio *failed_bio, + u32 bio_offset, struct page *page, + unsigned int pgoff, + u64 start, u64 end, + int failed_mirror, + unsigned int error_bitmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; @@ -2730,6 +2743,9 @@ static blk_status_t submit_read_repair(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + /* This repair is only for data */ + ASSERT(is_data_inode(inode)); + /* We're here because we had some read errors or csum mismatch */ ASSERT(error_bitmap); @@ -2758,7 +2774,7 @@ static blk_status_t submit_read_repair(struct inode *inode, ret = btrfs_repair_one_sector(inode, failed_bio, bio_offset + offset, page, pgoff + offset, start + offset, - failed_mirror, submit_bio_hook); + failed_mirror, btrfs_submit_data_bio); if (!ret) { /* * We have submitted the read repair, the page release @@ -2942,7 +2958,7 @@ update: static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { ASSERT(PageLocked(page)); - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page)); @@ -2950,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* - * Find extent buffer for a givne bytenr. + * Find extent buffer for a given bytenr. * * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking * in endio context. @@ -2964,16 +2980,14 @@ static struct extent_buffer *find_extent_buffer_readpage( * For regular sectorsize, we can use page->private to grab extent * buffer */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { ASSERT(PagePrivate(page) && page->private); return (struct extent_buffer *)page->private; } - /* For subpage case, we need to lookup buffer radix tree */ - rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - bytenr >> fs_info->sectorsize_bits); - rcu_read_unlock(); + /* For subpage case, we need to lookup extent buffer xarray */ + eb = xa_load(&fs_info->extent_buffers, + bytenr >> fs_info->sectorsize_bits); ASSERT(eb); return eb; } @@ -3068,13 +3082,21 @@ static void end_bio_extent_readpage(struct bio *bio) if (is_data_inode(inode)) { /* - * btrfs_submit_read_repair() will handle all the good + * If we failed to submit the IO at all we'll have a + * mirror_num == 0, in which case we need to just mark + * the page with an error and unlock it and carry on. + */ + if (mirror == 0) + goto readpage_ok; + + /* + * submit_data_read_repair() will handle all the good * and bad sectors, we just continue to the next bvec. 
*/ - submit_read_repair(inode, bio, bio_offset, page, - start - page_offset(page), start, - end, mirror, error_bitmap, - btrfs_submit_data_bio); + submit_data_read_repair(inode, bio, bio_offset, page, + start - page_offset(page), + start, end, mirror, + error_bitmap); ASSERT(bio_offset + len > bio_offset); bio_offset += len; @@ -3123,6 +3145,42 @@ readpage_ok: bio_put(bio); } +/** + * Populate every free slot in a provided array with pages. + * + * @nr_pages: number of pages to allocate + * @page_array: the array to fill with pages; any existing non-null entries in + * the array will be skipped + * + * Return: 0 if all pages were able to be allocated; + * -ENOMEM otherwise, and the caller is responsible for freeing all + * non-null page pointers in the array. + */ +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) +{ + unsigned int allocated; + + for (allocated = 0; allocated < nr_pages;) { + unsigned int last = allocated; + + allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); + + if (allocated == nr_pages) + return 0; + + /* + * During this iteration, no page could be allocated, even + * though alloc_pages_bulk_array() falls back to alloc_page() + * if it could not bulk-allocate. So we must be out of memory. + */ + if (allocated == last) + return -ENOMEM; + + memalloc_retry_wait(GFP_NOFS); + } + return 0; +} + /* * Initialize the members up to but not including 'bio'. Use after allocating a * new bio by bio_alloc_bioset as it does not initialize the bytes outside of @@ -3143,18 +3201,18 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) struct bio *bio; ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); + bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset); btrfs_bio_init(btrfs_bio(bio)); return bio; } -struct bio *btrfs_bio_clone(struct bio *bio) +struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio) { struct btrfs_bio *bbio; struct bio *new; /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); + new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset); bbio = btrfs_bio(new); btrfs_bio_init(bbio); bbio->iter = bio->bi_iter; @@ -3169,7 +3227,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) ASSERT(offset <= UINT_MAX && size <= UINT_MAX); /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); bbio = btrfs_bio(bio); @@ -3189,7 +3247,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) * a contiguous page to the previous one * @size: portion of page that we want to write * @pg_offset: starting offset in the page - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. 
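Editor's note: since btrfs_alloc_page_array() above can fail with the array partially populated, cleanup is explicitly the caller's job, and the array must start out zeroed because alloc_pages_bulk_array() skips non-NULL slots. A hypothetical caller, as a sketch:

	/* Sketch: pages[] must be zero-filled by the caller before the call. */
	static int alloc_page_array_sketch(struct page **pages, unsigned int nr)
	{
		int ret;

		ret = btrfs_alloc_page_array(nr, pages);
		if (ret) {
			unsigned int i;

			/* On -ENOMEM, release whatever was allocated. */
			for (i = 0; i < nr; i++)
				if (pages[i])
					__free_page(pages[i]);
		}
		return ret;
	}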
* @@ -3201,7 +3259,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, unsigned int size, unsigned int pg_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct bio *bio = bio_ctrl->bio; u32 bio_size = bio->bi_iter.bi_size; @@ -3213,10 +3271,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); - if (bio_ctrl->bio_flags != bio_flags) + if (bio_ctrl->compress_type != compress_type) return 0; - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; @@ -3259,7 +3317,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, * The split happens for real compressed bio, which happens in * btrfs_submit_compressed_read/write(). */ - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { bio_ctrl->len_to_oe_boundary = U32_MAX; bio_ctrl->len_to_stripe_boundary = U32_MAX; return 0; @@ -3302,7 +3360,7 @@ static int alloc_new_bio(struct btrfs_inode *inode, unsigned int opf, bio_end_io_t end_io_func, u64 disk_bytenr, u32 offset, u64 file_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; @@ -3313,37 +3371,49 @@ static int alloc_new_bio(struct btrfs_inode *inode, * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. */ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; - bio_ctrl->bio_flags = bio_flags; + bio_ctrl->compress_type = compress_type; bio->bi_end_io = end_io_func; bio->bi_private = &inode->io_tree; - bio->bi_write_hint = inode->vfs_inode.i_write_hint; bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) goto error; - if (wbc) { - struct block_device *bdev; - bdev = fs_info->fs_devices->latest_dev->bdev; - bio_set_dev(bio, bdev); - wbc_init_bio(wbc, bio); - } - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *device; + if (wbc) { + /* + * For zone append writes, the correct block_device (the one we + * will actually write to) must be set in the bio so that its + * hardware limits are respected. Look it up here: + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct btrfs_device *dev; + + dev = btrfs_zoned_get_device(fs_info, disk_bytenr, + fs_info->sectorsize); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto error; + } - device = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(device)) { - ret = PTR_ERR(device); - goto error; + bio_set_dev(bio, dev->bdev); + } else { + /* + * Otherwise pick the last added device to support + * cgroup writeback. For multi-device file systems this + * means blk-cgroup policies have to always be set on the + * last added/replaced device. This is a bit odd but has + * been like that for a long time.
+ */ + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); } - - btrfs_bio(bio)->device = device; + wbc_init_bio(wbc, bio); + } else { + ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } return 0; error: @@ -3365,7 +3435,7 @@ error: * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write * @prev_bio_flags: flags of previous bio to see if we can merge the current one - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compress type for current bio */ static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, @@ -3374,7 +3444,7 @@ static int submit_extent_page(unsigned int opf, size_t size, unsigned long pg_offset, bio_end_io_t end_io_func, int mirror_num, - unsigned long bio_flags, + enum btrfs_compression_type compress_type, bool force_bio_submit) { int ret = 0; @@ -3386,10 +3456,8 @@ static int submit_extent_page(unsigned int opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); if (force_bio_submit && bio_ctrl->bio) { - ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags); + submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); bio_ctrl->bio = NULL; - if (ret < 0) - return ret; } while (cur < pg_offset + size) { @@ -3401,7 +3469,7 @@ static int submit_extent_page(unsigned int opf, ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, end_io_func, disk_bytenr, offset, page_offset(page) + cur, - bio_flags); + compress_type); if (ret < 0) return ret; } @@ -3409,14 +3477,14 @@ static int submit_extent_page(unsigned int opf, * We must go through btrfs_bio_add_page() to ensure each * page range won't cross various boundaries. */ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, size - offset, pg_offset + offset, - bio_flags); + compress_type); else added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr + offset, size - offset, - pg_offset + offset, bio_flags); + pg_offset + offset, compress_type); /* Metadata page range should never be split */ if (!is_data_inode(&inode->vfs_inode)) @@ -3430,11 +3498,8 @@ static int submit_extent_page(unsigned int opf, if (added < size - offset) { /* The bio should contain some page(s) */ ASSERT(bio_ctrl->bio->bi_iter.bi_size); - ret = submit_one_bio(bio_ctrl->bio, mirror_num, - bio_ctrl->bio_flags); + submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); bio_ctrl->bio = NULL; - if (ret < 0) - return ret; } cur += added; } @@ -3457,7 +3522,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, if (page->mapping) lockdep_assert_held(&page->mapping->private_lock); - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { if (!PagePrivate(page)) attach_page_private(page, eb); else @@ -3492,7 +3557,7 @@ int set_page_extent_mapped(struct page *page) fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); @@ -3509,7 +3574,7 @@ void clear_page_extent_mapped(struct page *page) return; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_detach_subpage(fs_info, page); detach_page_private(page); @@ -3534,7 +3599,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, } em = 
btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR_OR_NULL(em)) { + if (em_cached && !IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; @@ -3548,7 +3613,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start) { @@ -3563,7 +3628,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 cur_end; struct extent_map *em; int ret = 0; - int nr = 0; size_t pg_offset = 0; size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; @@ -3608,25 +3672,23 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); - if (IS_ERR_OR_NULL(em)) { + if (IS_ERR(em)) { unlock_extent(tree, cur, end); end_page_read(page, false, cur, end + 1 - cur); + ret = PTR_ERR(em); break; } extent_offset = cur - em->start; BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - this_bio_flag |= EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&this_bio_flag, - em->compress_type); - } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) + if (this_bio_flag != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; else disk_bytenr = em->block_start + extent_offset; @@ -3721,11 +3783,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, end_bio_extent_readpage, 0, this_bio_flag, force_bio_submit); - if (!ret) { - nr++; - } else { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + if (ret) { + /* + * We have to unlock the remaining range, or the page + * will never be unlocked. + */ + unlock_extent(tree, cur, end); + end_page_read(page, false, cur, end + 1 - cur); goto out; } cur = cur + iosize; @@ -3735,6 +3799,27 @@ out: return ret; } +int btrfs_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + /* + * If btrfs_do_readpage() failed we will want to submit the assembled + * bio to do the cleanup. + */ + if (bio_ctrl.bio) + submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); + return ret; +} + static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, @@ -3753,12 +3838,6 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } } -static void update_nr_written(struct writeback_control *wbc, - unsigned long nr_written) -{ - wbc->nr_to_write -= nr_written; -} - /* * helper for __extent_writepage, doing all of the delayed allocation setup. 
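btrfs_do_readpage() can become static because btrfs_read_folio() is now the only entry point exported from this file. The corresponding aops hookup is not in these hunks; a minimal sketch of that wiring, assuming the usual read_folio prototype (illustrative, not the full operations table):

/* Sketch: wiring a read_folio entry point into an aops table. */
static const struct address_space_operations example_aops = {
        .read_folio     = btrfs_read_folio,
        /* .writepages, .release_folio, .invalidate_folio, ... */
};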
* @@ -3858,7 +3937,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (!btrfs_is_subpage(fs_info, page)) { *start = page_offset(page); *end = page_offset(page) + PAGE_SIZE; return; @@ -3901,10 +3980,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u64 extent_offset; u64 block_start; struct extent_map *em; + int saved_ret = 0; int ret = 0; int nr = 0; u32 opf = REQ_OP_WRITE; const unsigned int write_flags = wbc_to_write_flags(wbc); + bool has_error = false; bool compressed; ret = btrfs_writepage_cow_fixup(page); @@ -3919,7 +4000,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - update_nr_written(wbc, 1); + wbc->nr_to_write--; while (cur <= end) { u64 disk_bytenr; @@ -3951,9 +4032,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); - if (IS_ERR_OR_NULL(em)) { + if (IS_ERR(em)) { btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); + has_error = true; + if (!saved_ret) + saved_ret = ret; break; } @@ -4017,6 +4101,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, end_bio_extent_writepage, 0, 0, false); if (ret) { + has_error = true; + if (!saved_ret) + saved_ret = ret; + btrfs_page_set_error(fs_info, page, cur, iosize); if (PageWriteback(page)) btrfs_page_clear_writeback(fs_info, page, cur, @@ -4030,8 +4118,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * If we finish without problem, we should not only clear page dirty, * but also empty subpage dirty bits */ - if (!ret) + if (!has_error) btrfs_page_assert_not_dirty(fs_info, page); + else + ret = saved_ret; *nr_ret = nr; return ret; } @@ -4048,6 +4138,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u64 page_start = page_offset(page); @@ -4068,8 +4159,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - unlock_page(page); + folio_invalidate(folio, 0, folio_size(folio)); + folio_unlock(folio); return 0; } @@ -4161,9 +4252,6 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) static void end_extent_buffer_writeback(struct extent_buffer *eb) { - if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) - btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -4183,14 +4271,12 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb struct extent_page_data *epd) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages, failed_page_nr; + int i, num_pages; int flush = 0; int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - 
ret = flush_write_bio(epd); - if (ret < 0) - return ret; + flush_write_bio(epd); flush = 1; btrfs_tree_lock(eb); } @@ -4200,9 +4286,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!epd->sync_io) return 0; if (!flush) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + flush_write_bio(epd); flush = 1; } while (1) { @@ -4240,7 +4324,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb * Subpage metadata doesn't use page locking at all, so we can skip * the page locking. */ - if (!ret || fs_info->sectorsize < PAGE_SIZE) + if (!ret || fs_info->nodesize < PAGE_SIZE) return ret; num_pages = num_extent_pages(eb); @@ -4249,14 +4333,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - int err; - - err = flush_write_bio(epd); - if (err < 0) { - ret = err; - failed_page_nr = i; - goto err_unlock; - } + flush_write_bio(epd); flush = 1; } lock_page(p); @@ -4264,25 +4341,6 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb } return ret; -err_unlock: - /* Unlock already locked pages */ - for (i = 0; i < failed_page_nr; i++) - unlock_page(eb->pages[i]); - /* - * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. - * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can - * be made and undo everything done before. - */ - btrfs_tree_lock(eb); - spin_lock(&eb->refs_lock); - set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - end_extent_buffer_writeback(eb); - spin_unlock(&eb->refs_lock); - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, - fs_info->dirty_metadata_batch); - btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - btrfs_tree_unlock(eb); - return ret; } static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) @@ -4377,8 +4435,8 @@ static struct extent_buffer *find_extent_buffer_nolock( struct extent_buffer *eb; rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); + eb = xa_load(&fs_info->extent_buffers, + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); return eb; @@ -4400,7 +4458,7 @@ static void end_bio_subpage_eb_writepage(struct bio *bio) struct bvec_iter_all iter_all; fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->nodesize < PAGE_SIZE); ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -4552,7 +4610,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, * dirty anymore, we have submitted a page. Update nr_written in wbc. 
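find_extent_buffer_nolock() keeps its lockless shape across the radix-tree to xarray switch: look up under RCU, then try to take a reference that may legitimately fail for a dying buffer. The idiom in isolation (sketch; the wrapper name is illustrative):

/* Sketch: RCU xarray lookup plus a conditional refcount grab. */
static struct extent_buffer *example_eb_lookup(struct btrfs_fs_info *fs_info,
                                               u64 start)
{
        struct extent_buffer *eb;

        rcu_read_lock();
        eb = xa_load(&fs_info->extent_buffers,
                     start >> fs_info->sectorsize_bits);
        /* Refuse a 0 -> 1 transition; that would resurrect a freed eb. */
        if (eb && !atomic_inc_not_zero(&eb->refs))
                eb = NULL;
        rcu_read_unlock();
        return eb;
}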
*/ if (no_dirty_ebs) - update_nr_written(wbc, 1); + wbc->nr_to_write--; return ret; } @@ -4588,7 +4646,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, break; } disk_bytenr += PAGE_SIZE; - update_nr_written(wbc, 1); + wbc->nr_to_write--; unlock_page(p); } @@ -4727,7 +4785,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!PagePrivate(page)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc, epd); spin_lock(&mapping->private_lock); @@ -4780,11 +4838,11 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return ret; } if (cache) { - /* Impiles write in zoned mode */ + /* + * Implies write in zoned mode. Mark the last eb in a block group. + */ + btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); - /* Mark the last eb in a block group */ - if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) - set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); } ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); @@ -4902,13 +4960,19 @@ retry: * if the fs already has error. */ if (!BTRFS_FS_ERROR(fs_info)) { - ret = flush_write_bio(&epd); + flush_write_bio(&epd); } else { ret = -EROFS; end_write_bio(&epd, ret); } out: btrfs_zoned_meta_io_unlock(fs_info); + /* + * We can get ret > 0 from submit_extent_page() indicating how many ebs + * were submitted. Reset it to 0 to avoid false alerts for the caller. + */ + if (ret > 0) + ret = 0; return ret; } @@ -5010,8 +5074,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); + flush_write_bio(epd); lock_page(page); } @@ -5021,10 +5084,8 @@ retry: } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); - } + if (PageWriteback(page)) + flush_write_bio(epd); wait_on_page_writeback(page); } @@ -5064,9 +5125,8 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. 
*/ - ret = flush_write_bio(epd); - if (!ret) - goto retry; + flush_write_bio(epd); + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) @@ -5092,8 +5152,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) return ret; } - ret = flush_write_bio(&epd); - ASSERT(ret <= 0); + flush_write_bio(&epd); return ret; } @@ -5155,7 +5214,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) } if (!found_error) - ret = flush_write_bio(&epd); + flush_write_bio(&epd); else end_write_bio(&epd, ret); @@ -5182,13 +5241,14 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); - btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); ASSERT(ret <= 0); if (ret < 0) { + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); end_write_bio(&epd, ret); return ret; } - ret = flush_write_bio(&epd); + flush_write_bio(&epd); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; } @@ -5211,24 +5271,22 @@ void extent_readahead(struct readahead_control *rac) if (em_cached) free_extent_map(em_cached); - if (bio_ctrl.bio) { - if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) - return; - } + if (bio_ctrl.bio) + submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); } /* - * basic invalidatepage code, this waits on any locked or writeback - * ranges corresponding to the page, and then deletes any extent state + * basic invalidate_folio code, this waits on any locked or writeback + * ranges corresponding to the folio, and then deletes any extent state * records from the tree */ -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset) +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset) { struct extent_state *cached_state = NULL; - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - size_t blocksize = page->mapping->host->i_sb->s_blocksize; + u64 start = folio_pos(folio); + u64 end = start + folio_size(folio) - 1; + size_t blocksize = folio->mapping->host->i_sb->s_blocksize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -5238,7 +5296,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, return 0; lock_extent_bits(tree, start, end, &cached_state); - wait_on_page_writeback(page); + folio_wait_writeback(folio); /* * Currently for btree io tree, only EXTENT_LOCKED is utilized, @@ -5250,7 +5308,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, } /* - * a helper for releasepage, this tests for areas of the page that + * a helper for release_folio, this tests for areas of the page that * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ @@ -5286,7 +5344,7 @@ static int try_release_extent_state(struct extent_io_tree *tree, } /* - * a helper for releasepage. As long as there are no locked extents + * a helper for release_folio. 
As long as there are no locked extents * in the range corresponding to the page, both state records and extent * map records are removed */ @@ -5390,7 +5448,7 @@ static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, break; len = ALIGN(len, sectorsize); em = btrfs_get_extent_fiemap(inode, offset, len); - if (IS_ERR_OR_NULL(em)) + if (IS_ERR(em)) return em; /* if this isn't a hole return it */ @@ -5783,7 +5841,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag return; } - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -5890,9 +5948,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { int i; - struct page *p; struct extent_buffer *new; int num_pages = num_extent_pages(src); + int ret; new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) @@ -5905,22 +5963,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + memset(new->pages, 0, sizeof(*new->pages) * num_pages); + ret = btrfs_alloc_page_array(num_pages, new->pages); + if (ret) { + btrfs_release_extent_buffer(new); + return NULL; + } + for (i = 0; i < num_pages; i++) { int ret; + struct page *p = new->pages[i]; - p = alloc_page(GFP_NOFS); - if (!p) { - btrfs_release_extent_buffer(new); - return NULL; - } ret = attach_extent_buffer_page(new, p, NULL); if (ret < 0) { - put_page(p); btrfs_release_extent_buffer(new); return NULL; } WARN_ON(PageDirty(p)); - new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } set_extent_buffer_uptodate(new); @@ -5934,31 +5993,36 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int num_pages; int i; + int ret; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; num_pages = num_extent_pages(eb); + ret = btrfs_alloc_page_array(num_pages, eb->pages); + if (ret) + goto err; + for (i = 0; i < num_pages; i++) { - int ret; + struct page *p = eb->pages[i]; - eb->pages[i] = alloc_page(GFP_NOFS); - if (!eb->pages[i]) - goto err; - ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); + ret = attach_extent_buffer_page(eb, p, NULL); if (ret < 0) goto err; } + set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; err: - for (; i > 0; i--) { - detach_extent_buffer_page(eb, eb->pages[i - 1]); - __free_page(eb->pages[i - 1]); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) { + detach_extent_buffer_page(eb, eb->pages[i]); + __free_page(eb->pages[i]); + } } __free_extent_buffer(eb); return NULL; @@ -5980,10 +6044,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or - * calling releasepage when the tree reference is the only reference. + * calling release_folio when the tree reference is the only reference. * * In both cases, care is taken to ensure that the extent_buffer's - * pages are not under io. However, releasepage can be concurrently + * pages are not under io. 
However, release_folio can be concurrently * called with creating new references, which is prone to race * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. @@ -6065,24 +6129,22 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; -again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; - } - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) + + do { + ret = xa_insert(&fs_info->extent_buffers, + start >> fs_info->sectorsize_bits, + eb, GFP_NOFS); + if (ret == -ENOMEM) { + exists = ERR_PTR(ret); goto free_eb; - else - goto again; - } + } + if (ret == -EBUSY) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + } + } while (ret); + check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6103,7 +6165,7 @@ static struct extent_buffer *grab_extent_buffer( * don't try to insert two ebs for the same bytenr. So here we always * return NULL and just continue. */ - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return NULL; /* Page not yet attached to an extent buffer */ @@ -6125,6 +6187,30 @@ static struct extent_buffer *grab_extent_buffer( return NULL; } +static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +{ + if (!IS_ALIGNED(start, fs_info->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return -EINVAL; + } + + if (fs_info->nodesize < PAGE_SIZE && + offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + btrfs_err(fs_info, + "tree block crosses page boundary, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + if (fs_info->nodesize >= PAGE_SIZE && + !IS_ALIGNED(start, PAGE_SIZE)) { + btrfs_err(fs_info, + "tree block is not page aligned, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + return 0; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { @@ -6139,10 +6225,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int uptodate = 1; int ret; - if (!IS_ALIGNED(start, fs_info->sectorsize)) { - btrfs_err(fs_info, "bad tree block start %llu", start); + if (check_eb_alignment(fs_info, start)) return ERR_PTR(-EINVAL); - } #if BITS_PER_LONG == 32 if (start >= MAX_LFS_FILESIZE) { @@ -6155,14 +6239,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_warn_32bit_limit(fs_info); #endif - if (fs_info->sectorsize < PAGE_SIZE && - offset_in_page(start) + len > PAGE_SIZE) { - btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %lu", - start, len); - return ERR_PTR(-EINVAL); - } - eb = find_extent_buffer(fs_info, start); if (eb) return eb; @@ -6192,7 +6268,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * page, but it may change in the future for 16K page size * support, so we still preallocate the memory in the loop. 
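xa_insert() allocates internally and reports a collision as -EBUSY, which is why both insertion sites can drop the radix_tree_preload() dance and loop instead. The skeleton of that loop (sketch; the function name is illustrative):

/* Sketch: insert-or-reuse loop with xa_insert(). */
static struct extent_buffer *example_insert_eb(struct btrfs_fs_info *fs_info,
                                               struct extent_buffer *eb)
{
        struct extent_buffer *exists;
        int ret;

        do {
                ret = xa_insert(&fs_info->extent_buffers,
                                eb->start >> fs_info->sectorsize_bits,
                                eb, GFP_NOFS);
                if (ret == -ENOMEM)
                        return ERR_PTR(-ENOMEM);        /* caller frees eb */
                if (ret == -EBUSY) {
                        /* Lost the race: take the winner's buffer... */
                        exists = find_extent_buffer(fs_info, eb->start);
                        if (exists)
                                return exists;
                        /* ...unless it died already, then try again. */
                }
        } while (ret);
        return eb;
}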
*/ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE) { prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); @@ -6236,39 +6312,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* * We can't unlock the pages just yet since the extent buffer * hasn't been properly inserted in the radix tree, this - * opens a race with btree_releasepage which can free a page + * opens a race with btree_release_folio which can free a page * while we are still filling in all pages for the buffer and * we could crash. */ } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; - } - - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) + + do { + ret = xa_insert(&fs_info->extent_buffers, + start >> fs_info->sectorsize_bits, + eb, GFP_NOFS); + if (ret == -ENOMEM) { + exists = ERR_PTR(ret); goto free_eb; - else - goto again; - } + } + if (ret == -EBUSY) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + } + } while (ret); + /* add one reference for the tree */ check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * Now it's safe to unlock the pages because any calls to - * btree_releasepage will correctly detect that a page belongs to a + * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ for (i = 0; i < num_pages; i++) @@ -6306,10 +6379,8 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); - spin_lock(&fs_info->buffer_lock); - radix_tree_delete(&fs_info->buffer_radix, - eb->start >> fs_info->sectorsize_bits); - spin_unlock(&fs_info->buffer_lock); + xa_erase(&fs_info->extent_buffers, + eb->start >> fs_info->sectorsize_bits); } else { spin_unlock(&eb->refs_lock); } @@ -6411,7 +6482,7 @@ void clear_extent_buffer_dirty(const struct extent_buffer *eb) int num_pages; struct page *page; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); num_pages = num_extent_pages(eb); @@ -6443,7 +6514,7 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->sectorsize < PAGE_SIZE; + bool subpage = eb->fs_info->nodesize < PAGE_SIZE; /* * For subpage case, we can have other extent buffers in the @@ -6483,9 +6554,18 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - if (page) - btrfs_page_clear_uptodate(fs_info, page, - eb->start, eb->len); + if (!page) + continue; + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. 
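Note the predicate change running through these hunks: data paths now ask btrfs_is_subpage(), while metadata paths compare nodesize, not sectorsize, against PAGE_SIZE. An extent buffer's pages need subpage bookkeeping only when tree blocks are smaller than a page; the data sector size no longer decides that. As a one-liner (sketch; the helper name is illustrative):

/* Sketch: when extent buffer pages need subpage bookkeeping. */
static inline bool example_meta_is_subpage(const struct btrfs_fs_info *fs_info)
{
        /* Several tree blocks share one page only if nodesize < PAGE_SIZE. */
        return fs_info->nodesize < PAGE_SIZE;
}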
+ */ + if (fs_info->nodesize >= PAGE_SIZE) + ClearPageUptodate(page); + else + btrfs_subpage_clear_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6500,7 +6580,16 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + SetPageUptodate(page); + else + btrfs_subpage_set_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6556,12 +6645,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, atomic_dec(&eb->io_pages); } if (bio_ctrl.bio) { - int tmp; - - tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); + submit_one_bio(bio_ctrl.bio, mirror_num, 0); bio_ctrl.bio = NULL; - if (tmp < 0) - return tmp; } if (ret || wait != WAIT_COMPLETE) return ret; @@ -6595,7 +6680,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) return -EIO; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return read_extent_buffer_subpage(eb, wait, mirror_num); num_pages = num_extent_pages(eb); @@ -6638,7 +6723,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); /* - * It is possible for releasepage to clear the TREE_REF bit before we + * It is possible for release_folio to clear the TREE_REF bit before we * set io_pages. See check_buffer_tree_ref for a more detailed comment. */ check_buffer_tree_ref(eb); @@ -6674,10 +6759,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } if (bio_ctrl.bio) { - err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); + submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type); bio_ctrl.bio = NULL; - if (err) - return err; } if (ret || wait != WAIT_COMPLETE) @@ -6850,7 +6933,7 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, * would have !PageUptodate && !PageError, as we clear PageError before * reading. 
*/ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE) { bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, @@ -6952,7 +7035,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, ASSERT(dst->len == src->len); - if (dst->fs_info->sectorsize == PAGE_SIZE) { + if (dst->fs_info->nodesize >= PAGE_SIZE) { num_pages = num_extent_pages(dst); for (i = 0; i < num_pages; i++) copy_page(page_address(dst->pages[i]), @@ -6961,7 +7044,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, size_t src_offset = get_eb_offset_in_page(src, 0); size_t dst_offset = get_eb_offset_in_page(dst, 0); - ASSERT(src->fs_info->sectorsize < PAGE_SIZE); + ASSERT(src->fs_info->nodesize < PAGE_SIZE); memcpy(page_address(dst->pages[0]) + dst_offset, page_address(src->pages[0]) + src_offset, src->len); @@ -7242,42 +7325,25 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } -#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; - struct extent_buffer *found = NULL; + struct extent_buffer *eb; + unsigned long index; u64 page_start = page_offset(page); - u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - while (cur < page_start + PAGE_SIZE) { - int ret; - int i; - - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, - (void **)gang, cur >> fs_info->sectorsize_bits, - min_t(unsigned int, GANG_LOOKUP_SIZE, - PAGE_SIZE / fs_info->nodesize)); - if (ret == 0) - goto out; - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) - goto out; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - goto out; - } - } - cur = gang[ret - 1]->start + gang[ret - 1]->len; + xa_for_each_start(&fs_info->extent_buffers, index, eb, + page_start >> fs_info->sectorsize_bits) { + if (in_range(eb->start, page_start, PAGE_SIZE)) + return eb; + else if (eb->start >= page_start + PAGE_SIZE) + /* Already beyond page end */ + return NULL; } -out: - return found; + return NULL; } static int try_release_subpage_extent_buffer(struct page *page) @@ -7354,7 +7420,7 @@ int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0399cf8e3c32..23d4103c8831 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,15 +7,9 @@ #include <linux/refcount.h> #include <linux/fiemap.h> #include <linux/btrfs_tree.h> +#include "compression.h" #include "ulist.h" -/* - * flags for bio submission. 
The high bits indicate the compression - * type for this bio - */ -#define EXTENT_BIO_COMPRESSED 1 -#define EXTENT_BIO_FLAG_SHIFT 16 - enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, @@ -32,7 +26,6 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, - EXTENT_BUFFER_ZONE_FINISH, }; /* these are flags for __process_pages_contig */ @@ -71,9 +64,9 @@ struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; -typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, +typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, int mirror_num, - unsigned long bio_flags); + enum btrfs_compression_type compress_type); typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, struct bio *bio, u64 dio_file_offset); @@ -103,22 +96,11 @@ struct extent_buffer { }; /* - * Structure to record info about the bio being assembled, and other info like - * how many bytes are there before stripe/ordered extent boundary. - */ -struct btrfs_bio_ctrl { - struct bio *bio; - unsigned long bio_flags; - u32 len_to_stripe_boundary; - u32 len_to_oe_boundary; -}; - -/* * Structure to record how many bytes and which ranges are set/cleared */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - unsigned int bytes_changed; + u64 bytes_changed; /* Changed ranges */ struct ulist range_changed; @@ -158,17 +140,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -static inline void extent_set_compress_type(unsigned long *bio_flags, - int compress_type) -{ - *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; -} - -static inline int extent_compress_type(unsigned long bio_flags) -{ - return bio_flags >> EXTENT_BIO_FLAG_SHIFT; -} - struct extent_map_tree; typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, @@ -178,11 +149,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags); -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - unsigned int read_flags, u64 *prev_em_start); +int btrfs_read_folio(struct file *file, struct folio *folio); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, @@ -277,8 +244,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); + +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct bio *bio); +struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); @@ -297,7 +266,7 @@ struct io_failure_record { u64 start; u64 len; u64 logical; - unsigned long bio_flags; + enum btrfs_compression_type compress_type; int this_mirror; int failed_mirror; }; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index c28ceddefae4..6fee14ce2e6b 100644 --- 
a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -492,6 +492,8 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, */ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { + lockdep_assert_held_write(&tree->lock); + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) @@ -506,6 +508,8 @@ void replace_extent_mapping(struct extent_map_tree *tree, struct extent_map *new, int modified) { + lockdep_assert_held_write(&tree->lock); + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); ASSERT(extent_map_in_tree(cur)); if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90c5c38836ab..c828f971a346 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -305,7 +305,7 @@ found: read_extent_buffer(path->nodes[0], dst, (unsigned long)item, ret * csum_size); out: - if (ret == -ENOENT) + if (ret == -ENOENT || ret == -EFBIG) ret = 0; return ret; } @@ -368,6 +368,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_bio *bbio = NULL; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; @@ -377,6 +378,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst u8 *csum; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; + blk_status_t ret = BLK_STS_OK; if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) @@ -400,7 +402,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return BLK_STS_RESOURCE; if (!dst) { - struct btrfs_bio *bbio = btrfs_bio(bio); + bbio = btrfs_bio(bio); if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); @@ -456,21 +458,27 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); - if (count <= 0) { - /* - * Either we hit a critical error or we didn't find - * the csum. - * Either way, we put zero into the csums dst, and skip - * to the next sector. - */ + if (count < 0) { + ret = errno_to_blk_status(count); + if (bbio) + btrfs_bio_free_csum(bbio); + break; + } + + /* + * We didn't find a csum for this range. We need to make sure + * we complain loudly about this, because we are not NODATASUM. + * + * However for the DATA_RELOC inode we could potentially be + * relocating data extents for a NODATASUM inode, so the inode + * itself won't be marked with NODATASUM, but the extent we're + * copying is in fact NODATASUM. If we don't find a csum we + * assume this is the case. + */ + if (count == 0) { memset(csum_dst, 0, csum_size); count = 1; - /* - * For data reloc inode, we need to mark the range - * NODATASUM so that balance won't report false csum - * error. 
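btrfs_lookup_bio_sums() now tells a failed csum-tree lookup apart from a csum that simply is not stored: a negative count is a hard error (converted with errno_to_blk_status() after freeing any inline csum buffer), while zero is tolerated and zero-filled, because the DATA_RELOC tree can legitimately be copying extents that really are NODATASUM. The contract, condensed from the hunk above (fragment; variables as in that function):

/* count < 0: hard failure, propagate and stop the loop. */
if (count < 0) {
        ret = errno_to_blk_status(count);
        if (bbio)
                btrfs_bio_free_csum(bbio);
        break;
}
/* count == 0: no csum stored; zero-fill and move past one block. */
if (count == 0) {
        memset(csum_dst, 0, csum_size);
        count = 1;
}
/* count > 0: that many blocks of csums were copied into csum_dst. */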
- */ if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset; @@ -491,7 +499,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst } btrfs_free_path(path); - return BLK_STS_OK; + return ret; } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, @@ -612,32 +620,33 @@ fail: return ret; } -/* - * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio +/** + * Calculate checksums of the data contained inside a bio + * * @inode: Owner of the data inside the bio * @bio: Contains the data to be checksummed - * @file_start: offset in file this bio begins to describe - * @contig: Boolean. If true/1 means all bio vecs in this bio are - * contiguous and they begin at @file_start in the file. False/0 - * means this bio can contain potentially discontiguous bio vecs - * so the logical offset of each should be calculated separately. + * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the + * file offsets are determined from the page offsets in the bio. + * Otherwise, this is the starting file offset of the bio vecs in + * @bio, which must be contiguous. + * @one_ordered: If true, @bio only refers to one ordered extent. */ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 file_start, int contig) + u64 offset, bool one_ordered) { struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; + const bool use_page_offsets = (offset == (u64)-1); char *data; struct bvec_iter iter; struct bio_vec bvec; int index; - int nr_sectors; + unsigned int blockcount; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; int i; - u64 offset; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -651,18 +660,13 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - if (contig) - offset = file_start; - else - offset = 0; /* shut up gcc */ - sums->bytenr = bio->bi_iter.bi_sector << 9; index = 0; shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (!contig) + if (use_page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; if (!ordered) { @@ -681,13 +685,14 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, } } - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, + blockcount = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len + fs_info->sectorsize - 1); - for (i = 0; i < nr_sectors; i++) { - if (offset >= ordered->file_offset + ordered->num_bytes || - offset < ordered->file_offset) { + for (i = 0; i < blockcount; i++) { + if (!one_ordered && + !in_range(offset, ordered->file_offset, + ordered->num_bytes)) { unsigned long bytes_left; sums->len = this_sum_bytes; @@ -1211,6 +1216,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, extent_start = key.offset; extent_end = btrfs_file_extent_end(path); em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + em->generation = btrfs_file_extent_generation(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { em->start = extent_start; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a0179cc62913..9dfde1af8a64 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -691,7 +691,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, int modify_tree = -1; int update_refs; int found = 0; - int 
leafs_visited = 0; struct btrfs_path *path = args->path; args->bytes_found = 0; @@ -729,7 +728,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, path->slots[0]--; } ret = 0; - leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -741,7 +739,6 @@ next_slot: ret = 0; break; } - leafs_visited++; leaf = path->nodes[0]; recow = 1; } @@ -987,7 +984,7 @@ delete_extent_item: * which case it unlocked our path, so check path->locks[0] matches a * write lock. */ - if (!ret && args->replace_extent && leafs_visited == 1 && + if (!ret && args->replace_extent && path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + args->extent_item_size) { @@ -1310,11 +1307,12 @@ static int prepare_uptodate_page(struct inode *inode, struct page *page, u64 pos, bool force_uptodate) { + struct folio *folio = page_folio(page); int ret = 0; if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && !PageUptodate(page)) { - ret = btrfs_readpage(NULL, page); + ret = btrfs_read_folio(NULL, folio); if (ret) return ret; lock_page(page); @@ -1324,8 +1322,8 @@ static int prepare_uptodate_page(struct inode *inode, } /* - * Since btrfs_readpage() will unlock the page before it - * returns, there is a window where btrfs_releasepage() can be + * Since btrfs_read_folio() will unlock the folio before it + * returns, there is a window where btrfs_release_folio() can be * called to release the page. Here we check both inode * mapping and PagePrivate() to make sure the page was not * released. @@ -1463,8 +1461,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; } -static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait) +/* + * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) + * + * @pos: File offset. + * @write_bytes: The length to write, will be updated to the nocow writeable + * range. + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks. + * + * Return: + * > 0 If we can nocow, and updates @write_bytes. + * 0 If we can't do a nocow write. + * -EAGAIN If we can't do a nocow write because snapshoting of the inode's + * root is in progress. + * < 0 If an error happened. + * + * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. 
+ */ +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1475,7 +1492,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return 0; - if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock)) + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); @@ -1483,71 +1500,21 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, fs_info->sectorsize) - 1; num_bytes = lockend - lockstart + 1; - if (nowait) { - struct btrfs_ordered_extent *ordered; - - if (!try_lock_extent(&inode->io_tree, lockstart, lockend)) - return -EAGAIN; - - ordered = btrfs_lookup_ordered_range(inode, lockstart, - num_bytes); - if (ordered) { - btrfs_put_ordered_extent(ordered); - ret = -EAGAIN; - goto out_unlock; - } - } else { - btrfs_lock_and_flush_ordered_range(inode, lockstart, - lockend, NULL); - } - + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, NULL, NULL, NULL, false); if (ret <= 0) { ret = 0; - if (!nowait) - btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_drew_write_unlock(&root->snapshot_lock); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); } -out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend); return ret; } -static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) -{ - return check_can_nocow(inode, pos, write_bytes, true); -} - -/* - * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) - * - * @pos: File offset - * @write_bytes: The length to write, will be updated to the nocow writeable - * range - * - * This function will flush ordered extents in the range to ensure proper - * nocow checks. - * - * Return: - * >0 and update @write_bytes if we can do nocow write - * 0 if we can't do nocow write - * -EAGAIN if we can't get the needed lock or there are ordered extents - * for * (nowait == true) case - * <0 if other error happened - * - * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock(). - */ -int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) -{ - return check_can_nocow(inode, pos, write_bytes, false); -} - void btrfs_check_nocow_unlock(struct btrfs_inode *inode) { btrfs_drew_write_unlock(&inode->root->snapshot_lock); @@ -1582,20 +1549,15 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, loff_t oldsize; loff_t start_pos; - if (iocb->ki_flags & IOCB_NOWAIT) { - size_t nocow_bytes = count; - - /* We will allocate space in case nodatacow is not set, so bail */ - if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0) - return -EAGAIN; - /* - * There are holes in the range or parts of the range that must - * be COWed (shared extents, RO block groups, etc), so just bail - * out. - */ - if (nocow_bytes < count) - return -EAGAIN; - } + /* + * Quickly bail out on NOWAIT writes if we don't have the nodatacow or + * prealloc flags, as without those flags we always have to COW. We will + * later check if we can really COW into the target range (using + * can_nocow_extent() at btrfs_get_blocks_direct_write()). 
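After dropping the nowait variant, btrfs_check_nocow_lock() always blocks and, when it returns positive, leaves the root's snapshot drew lock held. Expected call shape (sketch; do_nocow_write() is a stand-in, not a real function):

/* Sketch: pairing btrfs_check_nocow_lock() with its unlock. */
size_t write_bytes = count;
int ret = btrfs_check_nocow_lock(BTRFS_I(inode), pos, &write_bytes);

if (ret > 0) {
        /* NOCOW is safe for up to write_bytes; snapshot lock is held. */
        do_nocow_write(inode, pos, write_bytes);        /* stand-in */
        btrfs_check_nocow_unlock(BTRFS_I(inode));
} else if (ret == -EAGAIN) {
        /* Snapshot creation in progress: fall back to COW. */
} else if (ret < 0) {
        /* Hard error. */
}       /* ret == 0: the range must be COWed. */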
+ */ + if ((iocb->ki_flags & IOCB_NOWAIT) && + !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + return -EAGAIN; current->backing_dev_info = inode_to_bdi(inode); ret = file_remove_privs(file); @@ -1722,7 +1684,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - reserve_bytes); + reserve_bytes, + reserve_bytes, false); if (ret) { if (!only_release_metadata) btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -1967,8 +1930,7 @@ relock: */ again: from->nofault = true; - err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, written); + err = btrfs_dio_rw(iocb, from, written); from->nofault = false; /* No increment (+=) because iomap returns a cumulative value. */ @@ -2039,12 +2001,43 @@ out: return err < 0 ? err : written; } -static ssize_t btrfs_file_write_iter(struct kiocb *iocb, - struct iov_iter *from) +static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t count; + ssize_t ret; + + btrfs_inode_lock(inode, 0); + count = encoded->len; + ret = generic_write_checks_count(iocb, &count); + if (ret == 0 && count != encoded->len) { + /* + * The write got truncated by generic_write_checks_count(). We + * can't do a partial encoded write. + */ + ret = -EFBIG; + } + if (ret || encoded->len == 0) + goto out; + + ret = btrfs_write_check(iocb, from, encoded->len); + if (ret < 0) + goto out; + + ret = btrfs_do_encoded_write(iocb, from, encoded); +out: + btrfs_inode_unlock(inode, 0); + return ret; +} + +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) { struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); - ssize_t num_written = 0; + ssize_t num_written, num_sync; const bool sync = iocb->ki_flags & IOCB_DSYNC; /* @@ -2055,22 +2048,28 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; - if (!(iocb->ki_flags & IOCB_DIRECT) && - (iocb->ki_flags & IOCB_NOWAIT)) + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EOPNOTSUPP; if (sync) atomic_inc(&inode->sync_writers); - if (iocb->ki_flags & IOCB_DIRECT) - num_written = btrfs_direct_write(iocb, from); - else - num_written = btrfs_buffered_write(iocb, from); + if (encoded) { + num_written = btrfs_encoded_write(iocb, from, encoded); + num_sync = encoded->len; + } else if (iocb->ki_flags & IOCB_DIRECT) { + num_written = num_sync = btrfs_direct_write(iocb, from); + } else { + num_written = num_sync = btrfs_buffered_write(iocb, from); + } btrfs_set_inode_last_sub_trans(inode); - if (num_written > 0) - num_written = generic_write_sync(iocb, num_written); + if (num_sync > 0) { + num_sync = generic_write_sync(iocb, num_sync); + if (num_sync < 0) + num_written = num_sync; + } if (sync) atomic_dec(&inode->sync_writers); @@ -2079,6 +2078,11 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, return num_written; } +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + return btrfs_do_write_iter(iocb, from, NULL); +} + int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; @@ -2319,25 +2323,62 @@ int 
btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - if (ret != BTRFS_NO_LOG_SYNC) { + if (ret == BTRFS_NO_LOG_SYNC) { + ret = btrfs_end_transaction(trans); + goto out; + } + + /* We successfully logged the inode, attempt to sync the log. */ + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { - ret = btrfs_sync_log(trans, root, &ctx); - if (!ret) { - ret = btrfs_end_transaction(trans); - goto out; - } - } - if (!full_sync) { - ret = btrfs_wait_ordered_range(inode, start, len); - if (ret) { - btrfs_end_transaction(trans); - goto out; - } + ret = btrfs_end_transaction(trans); + goto out; } - ret = btrfs_commit_transaction(trans); - } else { + } + + /* + * At this point we need to commit the transaction because we had + * btrfs_need_log_full_commit() or some other error. + * + * If we didn't do a full sync we have to stop the trans handle, wait on + * the ordered extents, start it again and commit the transaction. If + * we attempt to wait on the ordered extents here we could deadlock with + * something like fallocate() that is holding the extent lock trying to + * start a transaction while some other thread is trying to commit the + * transaction while we (fsync) are currently holding the transaction + * open. + */ + if (!full_sync) { ret = btrfs_end_transaction(trans); + if (ret) + goto out; + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) + goto out; + + /* + * This is safe to use here because we're only interested in + * making sure the transaction that had the ordered extents is + * committed. We aren't waiting on anything past this point, + * we're purely getting the transaction and committing it. + */ + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + + /* + * We committed the transaction and there's no currently + * running transaction, this means everything we care + * about made it to disk and we are done. 
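The restructured fsync tail never waits on ordered extents while holding a transaction handle, closing the deadlock against e.g. fallocate() holding the extent lock while trying to start a transaction. The !full_sync path, condensed (fragment; error propagation to the out label elided):

/* Sketch: end handle, wait, re-attach, then commit. */
ret = btrfs_end_transaction(trans);             /* drop the handle first */
if (!ret)
        ret = btrfs_wait_ordered_range(inode, start, len);
if (!ret) {
        trans = btrfs_attach_transaction_barrier(root);
        if (IS_ERR(trans))
                /* -ENOENT: nothing running, already durable, so done. */
                ret = PTR_ERR(trans) == -ENOENT ? 0 : PTR_ERR(trans);
        else
                ret = btrfs_commit_transaction(trans);
}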
+ */ + if (ret == -ENOENT) + ret = 0; + goto out; + } } + + ret = btrfs_commit_transaction(trans); out: ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); @@ -2361,7 +2402,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) { struct address_space *mapping = filp->f_mapping; - if (!mapping->a_ops->readpage) + if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(filp); @@ -2474,7 +2515,7 @@ out: hole_em = alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_cache(inode, offset, end - 1, 0); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); } else { hole_em->start = offset; hole_em->len = end - offset; @@ -2495,8 +2536,7 @@ out: } while (ret == -EEXIST); free_extent_map(hole_em); if (ret) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); } return 0; @@ -2531,10 +2571,10 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) return ret; } -static int btrfs_punch_hole_lock_range(struct inode *inode, - const u64 lockstart, - const u64 lockend, - struct extent_state **cached_state) +static void btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, + const u64 lockend, + struct extent_state **cached_state) { /* * For subpage case, if the range is not at page boundary, we could @@ -2548,40 +2588,29 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; while (1) { - struct btrfs_ordered_extent *ordered; - int ret; - truncate_pagecache_range(inode, lockstart, lockend); lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), - lockend); - /* - * We need to make sure we have no ordered extents in this range - * and nobody raced in and read a page in this range, if we did - * we need to try again. + * We can't have ordered extents in the range, nor dirty/writeback + * pages, because we have locked the inode's VFS lock in exclusive + * mode, we have locked the inode's i_mmap_lock in exclusive mode, + * we have flushed all delalloc in the range and we have waited + * for any ordered extents in the range to complete. + * We can race with anyone reading pages from this range, so after + * locking the range check if we have pages in the range, and if + * we do, unlock the range and retry. 
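Because all writers are excluded by the time this runs, the loop only has to defend against readers re-inserting pages after the truncate. Condensed (fragment; variables as in the function below):

for (;;) {
        truncate_pagecache_range(inode, lockstart, lockend);
        lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         cached_state);
        if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
                                    page_lockend))
                break;          /* range is clean and locked */
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             cached_state);
}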
*/ - if ((!ordered || - (ordered->file_offset + ordered->num_bytes <= lockstart || - ordered->file_offset > lockend)) && - !filemap_range_has_page(inode->i_mapping, - page_lockstart, page_lockend)) { - if (ordered) - btrfs_put_ordered_extent(ordered); + if (!filemap_range_has_page(inode->i_mapping, page_lockstart, + page_lockend)) break; - } - if (ordered) - btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ret = btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (ret) - return ret; } - return 0; + + btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); } static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, @@ -2727,7 +2756,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); + if (WARN_ON(ret)) + goto out_trans; trans->block_rsv = rsv; cur_offset = start; @@ -2811,6 +2841,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, extent_info->file_offset += replace_len; } + /* + * We are releasing our handle on the transaction, balance the + * dirty pages of the btree inode and flush delayed items, and + * then get a new transaction handle, which may now point to a + * new transaction in case someone else may have committed the + * transaction we used to replace/drop file extent items. So + * bump the inode's iversion and update mtime and ctime except + * if we are called from a dedupe context. This is because a + * power failure/crash may happen after the transaction is + * committed and before we finish replacing/dropping all the + * file extent items we need. + */ + inode_inc_iversion(&inode->vfs_inode); + + if (!extent_info || extent_info->update_times) { + inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; + } + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -2827,7 +2876,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ + if (WARN_ON(ret)) + break; trans->block_rsv = rsv; cur_offset = drop_args.drop_end; @@ -2850,7 +2900,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, * maps for the replacement extents (or holes). 
*/ if (extent_info && !extent_info->is_new_extent) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); if (ret) goto out_trans; @@ -2918,8 +2968,9 @@ out: return ret; } -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; @@ -2936,11 +2987,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool truncated_block = false; bool updated_inode = false; + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) - return ret; + goto out_only_mutex; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); ino_size = round_up(inode->i_size, fs_info->sectorsize); ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) @@ -2951,6 +3003,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } + ret = file_modified(file); + if (ret) + goto out_only_mutex; + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; @@ -3028,10 +3084,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, - &cached_state); - if (ret) - goto out_only_mutex; + btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); path = btrfs_alloc_path(); if (!path) { @@ -3193,8 +3246,6 @@ static int btrfs_zero_range(struct inode *inode, u64 bytes_to_reserve = 0; bool space_reserved = false; - inode_dio_wait(inode); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { @@ -3324,10 +3375,8 @@ reserve_space: if (ret < 0) goto out; space_reserved = true; - ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, - &cached_state); - if (ret) - goto out; + btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { @@ -3373,6 +3422,9 @@ static long btrfs_fallocate(struct file *file, int mode, u64 alloc_hint = 0; u64 locked_end; u64 actual_end = 0; + u64 data_space_needed = 0; + u64 data_space_reserved = 0; + u64 qgroup_reserved = 0; struct extent_map *em; int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; @@ -3391,19 +3443,7 @@ static long btrfs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - return btrfs_punch_hole(inode, offset, len); - - /* - * Only trigger disk allocation, don't trigger qgroup reserve - * - * For qgroup space, it will be checked later. 
- */ - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), - alloc_end - alloc_start); - if (ret < 0) - return ret; - } + return btrfs_punch_hole(file, offset, len); btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); @@ -3413,6 +3453,10 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } + ret = file_modified(file); + if (ret) + goto out; + /* * TODO: Move these two operations after we have checked * accurate reserved space, or fallocate can still fail but @@ -3437,8 +3481,12 @@ static long btrfs_fallocate(struct file *file, int mode, } /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. + * We have locked the inode at the VFS level (in exclusive mode) and we + * have locked the i_mmap_lock lock (in exclusive mode). Now before + * locking the file range, flush all delalloc in the range and wait for + * all ordered extents in the range to complete. After this we can lock + * the file range and, due to the previous locking we did, we know there + * can't be more delalloc or ordered extents in the range. */ ret = btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); @@ -3452,38 +3500,10 @@ static long btrfs_fallocate(struct file *file, int mode, } locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), - locked_end); - - if (ordered && - ordered->file_offset + ordered->num_bytes > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - ret = btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - if (ret) - goto out; - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } + btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); @@ -3500,48 +3520,64 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = add_falloc_range(&reserve_list, cur_offset, - last_byte - cur_offset); + const u64 range_len = last_byte - cur_offset; + + ret = add_falloc_range(&reserve_list, cur_offset, range_len); if (ret < 0) { free_extent_map(em); break; } ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), - &data_reserved, cur_offset, - last_byte - cur_offset); + &data_reserved, cur_offset, range_len); if (ret < 0) { - cur_offset = last_byte; free_extent_map(em); break; } - } else { - /* - * Do not need to reserve unwritten extent for this - * range, free reserved data space first, otherwise - * it'll result in false ENOSPC error.
- */ - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, cur_offset, - last_byte - cur_offset); + qgroup_reserved += range_len; + data_space_needed += range_len; } free_extent_map(em); cur_offset = last_byte; } + if (!ret && data_space_needed > 0) { + /* + * We are safe to reserve space here as we can't have delalloc + * in the range, see above. + */ + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + data_space_needed); + if (!ret) + data_space_reserved = data_space_needed; + } + /* * If ret is still 0, means we're OK to fallocate. * Or just cleanup the list and exit. */ list_for_each_entry_safe(range, tmp, &reserve_list, list) { - if (!ret) + if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, range->len, i_blocksize(inode), offset + len, &alloc_hint); - else + /* + * btrfs_prealloc_file_range() releases space even + * if it returns an error. + */ + data_space_reserved -= range->len; + qgroup_reserved -= range->len; + } else if (data_space_reserved > 0) { btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, range->start, - range->len); + data_reserved, range->start, + range->len); + data_space_reserved -= range->len; + qgroup_reserved -= range->len; + } else if (qgroup_reserved > 0) { + btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, + range->start, range->len); + qgroup_reserved -= range->len; + } list_del(&range->list); kfree(range); } @@ -3558,10 +3594,6 @@ out_unlock: &cached_state); out: btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - /* Let go of our reservation. */ - if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) - btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, - cur_offset, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; } @@ -3719,8 +3751,7 @@ again: */ pagefault_disable(); to->nofault = true; - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, read); + ret = btrfs_dio_rw(iocb, to, read); to->nofault = false; pagefault_enable(); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 01a408db5683..b1ae3ba2ca2c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -465,7 +465,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) io_ctl->pages[i] = page; if (uptodate && !PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, @@ -2630,16 +2630,19 @@ out: static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 bytenr, u64 size, bool used) { - struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *sinfo = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; - const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); + int bg_reclaim_threshold = 0; bool initial = (size == block_group->length); u64 reclaimable_unusable; WARN_ON(!initial && offset + size > block_group->zone_capacity); + if (!initial) + bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); + spin_lock(&ctl->tree_lock); if (!used) to_free = size; @@ -4069,7 +4072,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, btrfs_info(fs_info, "cleaning free space cache v1"); - node = rb_first(&fs_info->block_group_cache_tree); + node = rb_first_cached(&fs_info->block_group_cache_tree); 
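	/* Walk every block group and drop its v1 free space cache inode. */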
while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = btrfs_remove_free_space_inode(trans, NULL, block_group); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 655aad0f9e1c..1bf89aa67216 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -25,6 +25,8 @@ static struct btrfs_root *btrfs_free_space_root( .offset = 0, }; + if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2)) + key.offset = block_group->global_root_id; return btrfs_global_root(block_group->fs_info, &key); } @@ -1176,7 +1178,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) goto abort; } - node = rb_first(&fs_info->block_group_cache_tree); + node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5bbea5ec31fc..05e0c4a5affd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,6 +64,39 @@ struct btrfs_iget_args { struct btrfs_dio_data { ssize_t submitted; struct extent_changeset *data_reserved; + bool data_space_reserved; + bool nocow_done; +}; + +struct btrfs_dio_private { + struct inode *inode; + + /* + * Since DIO can use anonymous page, we cannot use page_offset() to + * grab the file offset, thus need a dedicated member for file offset. + */ + u64 file_offset; + /* Used for bio::bi_size */ + u32 bytes; + + /* + * References to this structure. There is one reference per in-flight + * bio plus one while we're still setting up. + */ + refcount_t refs; + + /* Array of checksums */ + u8 *csums; + + /* This must be last */ + struct bio bio; +}; + +static struct bio_set btrfs_dio_bioset; + +struct btrfs_rename_ctx { + /* Output field. Stores the index number of the old directory entry. 
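+	 * Filled in by __btrfs_unlink_inode() when the old name is removed,
+	 * and later used when the rename logs the new name via
+	 * btrfs_log_new_name().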
*/ + u64 index; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -217,15 +250,25 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, static int btrfs_dirty_inode(struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) + struct btrfs_new_inode_args *args) { int err; - err = btrfs_init_acl(trans, inode, dir); - if (!err) - err = btrfs_xattr_security_init(trans, inode, dir, qstr); - return err; + if (args->default_acl) { + err = __btrfs_set_acl(trans, args->inode, args->default_acl, + ACL_TYPE_DEFAULT); + if (err) + return err; + } + if (args->acl) { + err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); + if (err) + return err; + } + if (!args->default_acl && !args->acl) + cache_no_acl(args->inode); + return btrfs_xattr_security_init(trans, args->inode, args->dir, + &args->dentry->d_name); } /* @@ -234,12 +277,14 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, - struct btrfs_path *path, bool extent_inserted, - struct btrfs_root *root, struct inode *inode, - u64 start, size_t size, size_t compressed_size, + struct btrfs_path *path, + struct btrfs_inode *inode, bool extent_inserted, + size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages) + struct page **compressed_pages, + bool update_i_size) { + struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; @@ -247,7 +292,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *ei; int ret; size_t cur_size = size; - unsigned long offset; + u64 i_size; ASSERT((compressed_size > 0 && compressed_pages) || (compressed_size == 0 && !compressed_pages)); @@ -259,8 +304,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; size_t datasize; - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = start; + key.objectid = btrfs_ino(inode); + key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); @@ -298,12 +343,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->i_mapping, - start >> PAGE_SHIFT); + page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = offset_in_page(start); - write_extent_buffer(leaf, kaddr + offset, ptr, size); + write_extent_buffer(leaf, kaddr, ptr, size); kunmap_atomic(kaddr); put_page(page); } @@ -314,21 +357,25 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, * We align size to sectorsize for inline extents just for simplicity * sake. */ - size = ALIGN(size, root->fs_info->sectorsize); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size); + ret = btrfs_inode_set_file_extent_range(inode, 0, + ALIGN(size, root->fs_info->sectorsize)); if (ret) goto fail; /* - * we're an inline extent, so nobody can - * extend the file past i_size without locking - * a page we already have locked. + * We're an inline extent, so nobody can extend the file past i_size + * without locking a page we already have locked. * - * We must do any isize and inode updates - * before we unlock the pages. 
Otherwise we - * could end up racing with unlink. + * We must do any i_size and inode updates before we unlock the pages. + * Otherwise we could end up racing with unlink. */ - BTRFS_I(inode)->disk_i_size = inode->i_size; + i_size = i_size_read(&inode->vfs_inode); + if (update_i_size && size > i_size) { + i_size_write(&inode->vfs_inode, size); + i_size = size; + } + inode->disk_i_size = i_size; + fail: return ret; } @@ -339,35 +386,31 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, - u64 end, size_t compressed_size, +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, + size_t compressed_size, int compress_type, - struct page **compressed_pages) + struct page **compressed_pages, + bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - u64 isize = i_size_read(&inode->vfs_inode); - u64 actual_end = min(end + 1, isize); - u64 inline_len = actual_end - start; - u64 aligned_end = ALIGN(end, fs_info->sectorsize); - u64 data_len = inline_len; + u64 data_len = (compressed_size ?: size); int ret; struct btrfs_path *path; - if (compressed_size) - data_len = compressed_size; - - if (start > 0 || - actual_end > fs_info->sectorsize || + /* + * We can create an inline extent if it ends at or beyond the current + * i_size, is no larger than a sector (decompressed), and the (possibly + * compressed) data fits in a leaf and the configured maximum inline + * size. + */ + if (size < i_size_read(&inode->vfs_inode) || + size > fs_info->sectorsize || data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || - (!compressed_size && - (actual_end & (fs_info->sectorsize - 1)) == 0) || - end + 1 < isize || - data_len > fs_info->max_inline) { + data_len > fs_info->max_inline) return 1; - } path = btrfs_alloc_path(); if (!path) @@ -381,30 +424,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, trans->block_rsv = &inode->block_rsv; drop_args.path = path; - drop_args.start = start; - drop_args.end = aligned_end; + drop_args.start = 0; + drop_args.end = fs_info->sectorsize; drop_args.drop_cache = true; drop_args.replace_extent = true; - - if (compressed_size && compressed_pages) - drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( - compressed_size); - else - drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( - inline_len); - + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - if (isize > actual_end) - inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, path, drop_args.extent_inserted, - root, &inode->vfs_inode, start, - inline_len, compressed_size, - compress_type, compressed_pages); + ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, + size, compressed_size, compress_type, + compressed_pages, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -413,7 +446,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } - btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); + btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, inode); if 
(ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); @@ -423,7 +456,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); out: /* * Don't forget to free the reserved space, as for inlined extent @@ -486,17 +519,6 @@ static noinline int add_async_extent(struct async_chunk *cow, } /* - * Check if the inode has flags compatible with compression - */ -static inline bool inode_can_compress(struct btrfs_inode *inode) -{ - if (inode->flags & BTRFS_INODE_NODATACOW || - inode->flags & BTRFS_INODE_NODATASUM) - return false; - return true; -} - -/* * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. */ @@ -505,7 +527,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, { struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (!inode_can_compress(inode)) { + if (!btrfs_inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", btrfs_ino(inode)); @@ -624,7 +646,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0); nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED / PAGE_SIZE); @@ -735,14 +756,15 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(BTRFS_I(inode), start, end, + ret = cow_file_range_inline(BTRFS_I(inode), actual_end, 0, BTRFS_COMPRESS_NONE, - NULL); + NULL, false); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(BTRFS_I(inode), start, end, + ret = cow_file_range_inline(BTRFS_I(inode), actual_end, total_compressed, - compress_type, pages); + compress_type, pages, + false); } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | @@ -981,11 +1003,14 @@ static int submit_one_async_extent(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */ - ins.objectid, /* disk_bytenr */ - async_extent->ram_size, /* num_bytes */ - ins.offset, /* disk_num_bytes */ - async_extent->compress_type); + ret = btrfs_add_ordered_extent(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + async_extent->ram_size, /* ram_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* disk_num_bytes */ + 0, /* offset */ + 1 << BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); if (ret) { btrfs_drop_extent_cache(inode, start, end, 0); goto out_free_reserve; @@ -1003,7 +1028,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, async_extent->pages, /* compressed_pages */ async_extent->nr_pages, async_chunk->write_flags, - async_chunk->blkcg_css)) { + async_chunk->blkcg_css, true)) { const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; @@ -1130,7 +1155,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, int ret = 0; if (btrfs_is_free_space_inode(inode)) { - WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; } @@ -1152,9 +1176,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * So here we skip inline extent creation completely. 
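	 * (Hence the check below: the range must start at offset 0 and the
	 * sector size must match the page size.)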
*/ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { + u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), + end + 1); + /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, start, end, 0, - BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, false); if (ret == 0) { /* * We use DO_ACCOUNTING here because we need the @@ -1234,9 +1261,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ram_size, cur_alloc_size, - BTRFS_ORDERED_REGULAR); + ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size, + ins.objectid, cur_alloc_size, 0, + 1 << BTRFS_ORDERED_REGULAR, + BTRFS_COMPRESS_NONE); if (ret) goto out_drop_extent_cache; @@ -1617,6 +1645,141 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, nr_written, 1); } +struct can_nocow_file_extent_args { + /* Input fields. */ + + /* Start file offset of the range we want to NOCOW. */ + u64 start; + /* End file offset (inclusive) of the range we want to NOCOW. */ + u64 end; + bool writeback_path; + bool strict; + /* + * Free the path passed to can_nocow_file_extent() once it's not needed + * anymore. + */ + bool free_path; + + /* Output fields. Only set when can_nocow_file_extent() returns 1. */ + + u64 disk_bytenr; + u64 disk_num_bytes; + u64 extent_offset; + /* Number of bytes that can be written to in NOCOW mode. */ + u64 num_bytes; +}; + +/* + * Check if we can NOCOW the file extent that the path points to. + * This function may return with the path released, so the caller should check + * if path->nodes[0] is NULL or not if it needs to use the path afterwards. + * + * Returns: < 0 on error + * 0 if we can not NOCOW + * 1 if we can NOCOW + */ +static int can_nocow_file_extent(struct btrfs_path *path, + struct btrfs_key *key, + struct btrfs_inode *inode, + struct can_nocow_file_extent_args *args) +{ + const bool is_freespace_inode = btrfs_is_free_space_inode(inode); + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 extent_type; + int can_nocow = 0; + int ret = 0; + + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + goto out; + + /* Can't access these fields unless we know it's not an inline extent. */ + args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + args->extent_offset = btrfs_file_extent_offset(leaf, fi); + + if (!(inode->flags & BTRFS_INODE_NODATACOW) && + extent_type == BTRFS_FILE_EXTENT_REG) + goto out; + + /* + * If the extent was created before the generation where the last snapshot + * for its subvolume was created, then this implies the extent is shared, + * hence we must COW. + */ + if (!args->strict && + btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + /* An explicit hole, must COW. */ + if (args->disk_bytenr == 0) + goto out; + + /* Compressed/encrypted/encoded extents must be COWed. 
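+	 * A NOCOW write would have to overwrite the extent's data in place,
+	 * which these encodings do not allow.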
 */ + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out; + + extent_end = btrfs_file_extent_end(path); + + /* + * The following checks can be expensive, as they need to take other + * locks and do btree or rbtree searches, so release the path to avoid + * blocking other tasks for too long. + */ + btrfs_release_path(path); + + ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), + key->offset - args->extent_offset, + args->disk_bytenr, false, path); + WARN_ON_ONCE(ret > 0 && is_freespace_inode); + if (ret != 0) + goto out; + + if (args->free_path) { + /* + * We don't need the path anymore, plus through the + * csum_exist_in_range() call below we will end up allocating + * another path. So free the path to avoid unnecessary extra + * memory usage. + */ + btrfs_free_path(path); + path = NULL; + } + + /* If there are pending snapshots for this root, we must COW. */ + if (args->writeback_path && !is_freespace_inode && + atomic_read(&root->snapshot_force_cow)) + goto out; + + args->disk_bytenr += args->extent_offset; + args->disk_bytenr += args->start - key->offset; + args->num_bytes = min(args->end + 1, extent_end) - args->start; + + /* + * Force COW if csums exist in the range. This ensures that csums for a + * given extent are either valid or do not exist. + */ + ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes); + WARN_ON_ONCE(ret > 0 && is_freespace_inode); + if (ret != 0) + goto out; + + can_nocow = 1; + out: + if (args->free_path && path) + btrfs_free_path(path); + + return ret < 0 ? ret : can_nocow; +} + /* * Called for NOCOW writeback. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -1637,11 +1800,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 cur_offset = start; int ret; bool check_prev = true; - const bool freespace_inode = btrfs_is_free_space_inode(inode); u64 ino = btrfs_ino(inode); + struct btrfs_block_group *bg; bool nocow = false; - u64 disk_bytenr = 0; - const bool force = inode->flags & BTRFS_INODE_NODATACOW; + struct can_nocow_file_extent_args nocow_args = { 0 }; path = btrfs_alloc_path(); if (!path) { @@ -1654,15 +1816,16 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, return -ENOMEM; } + nocow_args.end = end; + nocow_args.writeback_path = true; + while (1) { struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; u64 extent_end; - u64 extent_offset; - u64 num_bytes = 0; - u64 disk_num_bytes; u64 ram_bytes; + u64 nocow_end; int extent_type; nocow = false; @@ -1738,116 +1901,38 @@ next_slot: fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); - + /* If this is triggered then we have a memory corruption. */ + ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); + if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { + ret = -EUCLEAN; + goto error; + } ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); - if (extent_type == BTRFS_FILE_EXTENT_REG || - extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - disk_num_bytes = - btrfs_file_extent_disk_num_bytes(leaf, fi); - /* - * If the extent we got ends before our current offset, - * skip to the next extent.
- */ - if (extent_end <= cur_offset) { - path->slots[0]++; - goto next_slot; - } - /* Skip holes */ - if (disk_bytenr == 0) - goto out_check; - /* Skip compressed/encrypted/encoded extents */ - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - goto out_check; - /* - * If extent is created before the last volume's snapshot - * this implies the extent is shared, hence we can't do - * nocow. This is the same check as in - * btrfs_cross_ref_exist but without calling - * btrfs_search_slot. - */ - if (!freespace_inode && - btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item)) - goto out_check; - if (extent_type == BTRFS_FILE_EXTENT_REG && !force) - goto out_check; + extent_end = btrfs_file_extent_end(path); - /* - * The following checks can be expensive, as they need to - * take other locks and do btree or rbtree searches, so - * release the path to avoid blocking other tasks for too - * long. - */ - btrfs_release_path(path); + /* + * If the extent we got ends before our current offset, skip to + * the next extent. + */ + if (extent_end <= cur_offset) { + path->slots[0]++; + goto next_slot; + } - ret = btrfs_cross_ref_exist(root, ino, - found_key.offset - - extent_offset, disk_bytenr, false); - if (ret) { - /* - * ret could be -EIO if the above fails to read - * metadata. - */ - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; - goto error; - } + nocow_args.start = cur_offset; + ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); + if (ret < 0) { + if (cow_start != (u64)-1) + cur_offset = cow_start; + goto error; + } else if (ret == 0) { + goto out_check; + } - WARN_ON_ONCE(freespace_inode); - goto out_check; - } - disk_bytenr += extent_offset; - disk_bytenr += cur_offset - found_key.offset; - num_bytes = min(end + 1, extent_end) - cur_offset; - /* - * If there are pending snapshots for this root, we - * fall into common COW way - */ - if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) - goto out_check; - /* - * force cow if csum exists in the range. - * this ensure that csum for a given extent are - * either valid or do not exist. - */ - ret = csum_exist_in_range(fs_info, disk_bytenr, - num_bytes); - if (ret) { - /* - * ret could be -EIO if the above fails to read - * metadata. 
- */ - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; - goto error; - } - WARN_ON_ONCE(freespace_inode); - goto out_check; - } - /* If the extent's block group is RO, we must COW */ - if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) - goto out_check; + ret = 0; + bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (bg) nocow = true; - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = found_key.offset + ram_bytes; - extent_end = ALIGN(extent_end, fs_info->sectorsize); - /* Skip extents outside of our requested range */ - if (extent_end <= start) { - path->slots[0]++; - goto next_slot; - } - } else { - /* If this triggers then we have a memory corruption */ - BUG(); - } out_check: /* * If nocow is false then record the beginning of the range @@ -1879,15 +1964,17 @@ out_check: cow_start = (u64)-1; } + nocow_end = cur_offset + nocow_args.num_bytes - 1; + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 orig_start = found_key.offset - extent_offset; + u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; - em = create_io_em(inode, cur_offset, num_bytes, + em = create_io_em(inode, cur_offset, nocow_args.num_bytes, orig_start, - disk_bytenr, /* block_start */ - num_bytes, /* block_len */ - disk_num_bytes, /* orig_block_len */ + nocow_args.disk_bytenr, /* block_start */ + nocow_args.num_bytes, /* block_len */ + nocow_args.disk_num_bytes, /* orig_block_len */ ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { @@ -1895,28 +1982,35 @@ out_check: goto error; } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, cur_offset, - disk_bytenr, num_bytes, - num_bytes, - BTRFS_ORDERED_PREALLOC); + ret = btrfs_add_ordered_extent(inode, + cur_offset, nocow_args.num_bytes, + nocow_args.num_bytes, + nocow_args.disk_bytenr, + nocow_args.num_bytes, 0, + 1 << BTRFS_ORDERED_PREALLOC, + BTRFS_COMPRESS_NONE); if (ret) { btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + num_bytes - 1, - 0); + nocow_end, 0); goto error; } } else { ret = btrfs_add_ordered_extent(inode, cur_offset, - disk_bytenr, num_bytes, - num_bytes, - BTRFS_ORDERED_NOCOW); + nocow_args.num_bytes, + nocow_args.num_bytes, + nocow_args.disk_bytenr, + nocow_args.num_bytes, + 0, + 1 << BTRFS_ORDERED_NOCOW, + BTRFS_COMPRESS_NONE); if (ret) goto error; } - if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); - nocow = false; + if (nocow) { + btrfs_dec_nocow_writers(bg); + nocow = false; + } if (btrfs_is_data_reloc_root(root)) /* @@ -1925,10 +2019,9 @@ out_check: * from freeing metadata of created ordered extent. */ ret = btrfs_reloc_clone_csums(inode, cur_offset, - num_bytes); + nocow_args.num_bytes); - extent_clear_unlock_delalloc(inode, cur_offset, - cur_offset + num_bytes - 1, + extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, @@ -1961,7 +2054,7 @@ out_check: error: if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); + btrfs_dec_nocow_writers(bg); if (ret && cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, @@ -2012,11 +2105,10 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page * to use run_delalloc_nocow() here, like for regular * preallocated inodes. 
*/ - ASSERT(!zoned || - (zoned && btrfs_is_data_reloc_root(inode->root))); + ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); - } else if (!inode_can_compress(inode) || + } else if (!btrfs_inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { if (zoned) ret = run_delalloc_zoned(inode, locked_page, start, end, @@ -2310,7 +2402,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); + return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); } /* @@ -2506,9 +2598,8 @@ out: * * c-3) otherwise: async submit */ -blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) - +void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2537,11 +2628,14 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (ret) goto out; - if (bio_flags & EXTENT_BIO_COMPRESSED) { - ret = btrfs_submit_compressed_read(inode, bio, - mirror_num, - bio_flags); - goto out; + if (compress_type != BTRFS_COMPRESS_NONE) { + /* + * btrfs_submit_compressed_read will handle completing + * the bio if there were any errors, so just return + * here. + */ + btrfs_submit_compressed_read(inode, bio, mirror_num); + return; } else { /* * Lookup bio sums does extra checks around whether we @@ -2558,11 +2652,11 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (btrfs_is_data_reloc_root(root)) goto mapit; /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, + ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); if (ret) goto out; } @@ -2575,7 +2669,6 @@ out: bio->bi_status = ret; bio_endio(bio); } - return ret; } /* @@ -2870,6 +2963,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); + u64 offset = btrfs_stack_file_extent_offset(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); struct btrfs_drop_extents_args drop_args = { 0 }; @@ -2944,7 +3038,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, goto out; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), - file_pos, qgroup_reserved, &ins); + file_pos - offset, + qgroup_reserved, &ins); out: btrfs_free_path(path); @@ -2970,20 +3065,20 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; - u64 logical_len; bool update_inode_bytes; + u64 num_bytes = oe->num_bytes; + u64 ram_bytes = oe->ram_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); 
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); + btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) - logical_len = oe->truncated_len; - else - logical_len = oe->num_bytes; - btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); - btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); + num_bytes = ram_bytes = oe->truncated_len; + btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ @@ -2994,6 +3089,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * except if the ordered extent was truncated. */ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || + test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), @@ -3028,7 +3124,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && - !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); @@ -3098,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->file_offset + logical_len); + btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); @@ -3262,11 +3361,11 @@ static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, shash->tfm = fs_info->csum_shash; crypto_shash_digest(shash, kaddr + pgoff, len, csum); + kunmap_atomic(kaddr); if (memcmp(csum, csum_expected, csum_size)) goto zeroit; - kunmap_atomic(kaddr); return 0; zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, @@ -3274,9 +3373,7 @@ zeroit: if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); - memset(kaddr + pgoff, 1, len); - flush_dcache_page(page); - kunmap_atomic(kaddr); + memzero_page(page, pgoff, len); return -EIO; } @@ -3481,6 +3578,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; + /* Bail out if the cleanup is already running. */ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; @@ -3563,17 +3661,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into - * fs_info->fs_roots_radix. So here we can find if an + * fs_info->fs_roots. So here we can find if an * orphan item corresponds to a deleted root by looking - * up the root from that radix tree. + * up the root from that xarray. 
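+	 * (xa_load() returns NULL when nothing is stored at that index, in
+	 * which case the orphan does not belong to a deleted root.)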
 */ - spin_lock(&fs_info->fs_roots_radix_lock); - dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)found_key.objectid); + spin_lock(&fs_info->fs_roots_lock); + dead_root = xa_load(&fs_info->fs_roots, + (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); if (is_dead_root) { /* prevent this orphan from being found again */ @@ -3813,7 +3911,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in delayed_nodes_tree. + * in the delayed_nodes xarray. */ if (BTRFS_I(inode)->last_trans == fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, @@ -4062,7 +4160,8 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len) + const char *name, int name_len, + struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4118,15 +4217,27 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } skip_backref: + if (rename_ctx) + rename_ctx->index = index; + ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto err; } - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); + /* + * If we are in a rename context, we don't need to update anything in the + * log. That will be done later during the rename by btrfs_log_new_name(). + * Besides that, doing it here would only cause extra unnecessary btree
+ */ + if (!rename_ctx) { + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, + index); + } /* * If we have a pending delayed iput we could end up with the final iput @@ -4158,7 +4269,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, const char *name, int name_len) { int ret; - ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); + ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode->root, inode); @@ -4184,8 +4295,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) * 1 for the dir index * 1 for the inode ref * 1 for the inode + * 1 for the parent inode */ - return btrfs_start_transaction_fallback_global_rsv(root, 5); + return btrfs_start_transaction_fallback_global_rsv(root, 6); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -4460,6 +4572,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); return -EPERM; } + if (atomic_read(&dest->nr_swapfiles)) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu with active swapfile", + root->root_key.objectid); + return -EPERM; + } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); @@ -4565,14 +4684,21 @@ out_up_write: static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; int err = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { + if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } return btrfs_delete_subvolume(dir, dentry); + } trans = __unlink_start_trans(dir); if (IS_ERR(trans)) @@ -4611,7 +4737,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); + btrfs_btree_balance_dirty(fs_info); return err; } @@ -4664,7 +4790,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } } - ret = btrfs_delalloc_reserve_metadata(inode, blocksize); + ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); if (ret < 0) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, data_reserved, @@ -4685,7 +4811,7 @@ again: goto out_unlock; if (!PageUptodate(page)) { - ret = btrfs_readpage(NULL, page); + ret = btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != mapping) { unlock_page(page); @@ -4876,8 +5002,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); goto next; } hole_em->start = cur_offset; @@ -5046,16 +5171,17 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr } /* - * While truncating the inode pages during eviction, we get the VFS calling - * btrfs_invalidatepage() against each 
page of the inode. This is slow because - * the calls to btrfs_invalidatepage() result in a huge amount of calls to - * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting - * extent_state structures over and over, wasting lots of time. + * While truncating the inode pages during eviction, we get the VFS + * calling btrfs_invalidate_folio() against each folio of the inode. This + * is slow because the calls to btrfs_invalidate_folio() result in a + * huge amount of calls to lock_extent_bits() and clear_extent_bit(), + * which keep merging and splitting extent_state structures over and over, + * wasting lots of time. * - * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all - * those expensive operations on a per page basis and do only the ordered io - * finishing, while we release here the extent_map and extent_state structures, - * without the excessive merging and splitting. + * Therefore if the inode is being evicted, let btrfs_invalidate_folio() + * skip all those expensive operations on a per folio basis and do only + * the ordered io finishing, while we release here the extent_map and + * extent_state structures, without the excessive merging and splitting. */ static void evict_inode_truncate_pages(struct inode *inode) { @@ -5121,7 +5247,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * If still has DELALLOC flag, the extent didn't reach disk, * and its reserved space won't be freed by delayed_ref. * So we need to free its reserved space here. - * (Refer to comment in btrfs_invalidatepage, case 2) + * (Refer to comment in btrfs_invalidate_folio, case 2) * * Note, end is the bytenr of last byte, so we need + 1 here. */ @@ -5584,21 +5710,17 @@ static struct inode *new_simple_dir(struct super_block *s, return inode; } +static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); +static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); +static_assert(BTRFS_FT_DIR == FT_DIR); +static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); +static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); +static_assert(BTRFS_FT_FIFO == FT_FIFO); +static_assert(BTRFS_FT_SOCK == FT_SOCK); +static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); + static inline u8 btrfs_inode_type(struct inode *inode) { - /* - * Compile-time asserts that generic FT_* types still match - * BTRFS_FT_* types - */ - BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); - BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); - BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); - BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); - BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); - BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); - BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); - BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); - return fs_umode_to_ftype(inode->i_mode); } @@ -5755,8 +5877,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct list_head ins_list; struct list_head del_list; int ret; - struct extent_buffer *leaf; - int slot; char *name_ptr; int name_len; int entries = 0; @@ -5783,35 +5903,19 @@ again: key.offset = ctx->pos; key.objectid = btrfs_ino(BTRFS_I(inode)); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, ret) { struct dir_entry *entry; - - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto err; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &found_key, slot); + struct extent_buffer *leaf = 
path->nodes[0]; if (found_key.objectid != key.objectid) break; if (found_key.type != BTRFS_DIR_INDEX_KEY) break; if (found_key.offset < ctx->pos) - goto next; + continue; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) - goto next; - di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + continue; + di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); name_len = btrfs_dir_name_len(leaf, di); if ((total_len + sizeof(struct dir_entry) + name_len) >= PAGE_SIZE) { @@ -5838,9 +5942,11 @@ again: entries++; addr += sizeof(struct dir_entry) + name_len; total_len += sizeof(struct dir_entry) + name_len; -next: - path->slots[0]++; } + /* Catch error encountered during iteration */ + if (ret < 0) + goto err; + btrfs_release_path(path); ret = btrfs_filldir(private->filldir_buf, entries, ctx); @@ -5971,14 +6077,8 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) goto out; ret = 0; - /* - * MAGIC NUMBER EXPLANATION: - * since we search a directory based on f_pos we have to start at 2 - * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody - * else has to start at 2 - */ if (path->slots[0] == 0) { - inode->index_cnt = 2; + inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } @@ -5989,7 +6089,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { - inode->index_cnt = 2; + inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } @@ -6034,6 +6134,57 @@ static int btrfs_insert_inode_locked(struct inode *inode) btrfs_find_actor, &args); } +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items) +{ + struct inode *dir = args->dir; + struct inode *inode = args->inode; + int ret; + + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); + if (ret) + return ret; + + /* 1 to add inode item */ + *trans_num_items = 1; + /* 1 to add compression property */ + if (BTRFS_I(dir)->prop_compress) + (*trans_num_items)++; + /* 1 to add default ACL xattr */ + if (args->default_acl) + (*trans_num_items)++; + /* 1 to add access ACL xattr */ + if (args->acl) + (*trans_num_items)++; +#ifdef CONFIG_SECURITY + /* 1 to add LSM xattr */ + if (dir->i_security) + (*trans_num_items)++; +#endif + if (args->orphan) { + /* 1 to add orphan item */ + (*trans_num_items)++; + } else { + /* + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item + * + * No need for 1 unit for the inode ref item because it is + * inserted in a batch together with the inode item at + * btrfs_create_new_inode(). + */ + *trans_num_items += 3; + } + return 0; +} + +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) +{ + posix_acl_release(args->acl); + posix_acl_release(args->default_acl); +} + /* * Inherit flags from the parent inode. 
* @@ -6043,9 +6194,6 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { unsigned int flags; - if (!dir) - return; - flags = BTRFS_I(dir)->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { @@ -6065,82 +6213,92 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) btrfs_sync_inode_flags_to_i_flags(inode); } -static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct user_namespace *mnt_userns, - struct inode *dir, - const char *name, int name_len, - u64 ref_objectid, u64 objectid, - umode_t mode, u64 *index) +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct inode *dir = args->dir; + struct inode *inode = args->inode; + const char *name = args->orphan ? NULL : args->dentry->d_name.name; + int name_len = args->orphan ? 0 : args->dentry->d_name.len; + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_root *root; struct btrfs_inode_item *inode_item; struct btrfs_key *location; struct btrfs_path *path; + u64 objectid; struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; struct btrfs_item_batch batch; unsigned long ptr; - unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); - - nofs_flag = memalloc_nofs_save(); - inode = new_inode(fs_info->sb); - memalloc_nofs_restore(nofs_flag); - if (!inode) { - btrfs_free_path(path); - return ERR_PTR(-ENOMEM); - } + return -ENOMEM; - /* - * O_TMPFILE, set link count to 0, so that after this point, - * we fill in an inode item with the correct link count. - */ - if (!name) - set_nlink(inode, 0); + if (!args->subvol) + BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); + root = BTRFS_I(inode)->root; - /* - * we have to initialize this early, so we can reclaim the inode - * number if we fail afterwards in this function. - */ + ret = btrfs_get_free_objectid(root, &objectid); + if (ret) + goto out; inode->i_ino = objectid; - if (dir && name) { + if (args->orphan) { + /* + * O_TMPFILE, set link count to 0, so that after this point, we + * fill in an inode item with the correct link count. + */ + set_nlink(inode, 0); + } else { trace_btrfs_inode_request(dir); - ret = btrfs_set_inode_index(BTRFS_I(dir), index); - if (ret) { - btrfs_free_path(path); - iput(inode); - return ERR_PTR(ret); - } - } else if (dir) { - *index = 0; + ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); + if (ret) + goto out; } - /* - * index_cnt is ignored for everything but a dir, - * btrfs_set_inode_index_count has an explanation for the magic - * number - */ - BTRFS_I(inode)->index_cnt = 2; - BTRFS_I(inode)->dir_index = *index; - BTRFS_I(inode)->root = btrfs_grab_root(root); + /* index_cnt is ignored for everything but a dir. */ + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; /* + * Subvolumes don't inherit flags from their parent directory. + * Originally this was probably by accident, but we probably can't + * change it now without compatibility issues. 
+ */ + if (!args->subvol) + btrfs_inherit_iflags(inode, dir); + + if (S_ISREG(inode->i_mode)) { + if (btrfs_test_opt(fs_info, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(fs_info, NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; + } + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + location->type = BTRFS_INODE_ITEM_KEY; + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) { + if (!args->orphan) + BTRFS_I(dir)->index_cnt--; + goto out; + } + + /* * We could have gotten an inode number from somebody who was fsynced * and then removed in this same transaction, so let's just set full * sync since it will be a full sync anyway and this will blow away the * old info in the log. */ - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); key[0].objectid = objectid; key[0].type = BTRFS_INODE_ITEM_KEY; @@ -6148,7 +6306,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[0] = sizeof(struct btrfs_inode_item); - if (name) { + if (!args->orphan) { /* * Start new inodes with an inode_ref. This is slightly more * efficient for small numbers of hard links since they will @@ -6157,64 +6315,95 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ key[1].objectid = objectid; key[1].type = BTRFS_INODE_REF_KEY; - key[1].offset = ref_objectid; - - sizes[1] = name_len + sizeof(*ref); - } - - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - location->type = BTRFS_INODE_ITEM_KEY; - - ret = btrfs_insert_inode_locked(inode); - if (ret < 0) { - iput(inode); - goto fail; + if (args->subvol) { + key[1].offset = objectid; + sizes[1] = 2 + sizeof(*ref); + } else { + key[1].offset = btrfs_ino(BTRFS_I(dir)); + sizes[1] = name_len + sizeof(*ref); + } } batch.keys = &key[0]; batch.data_sizes = &sizes[0]; - batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); - batch.nr = name ? 2 : 1; + batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); + batch.nr = args->orphan ? 1 : 2; ret = btrfs_insert_empty_items(trans, root, path, &batch); - if (ret != 0) - goto fail_unlock; - - inode_init_owner(mnt_userns, inode, dir, mode); - inode_set_bytes(inode, 0); + if (ret != 0) { + btrfs_abort_transaction(trans, ret); + goto discard; + } inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; + /* + * We're going to fill the inode item now, so at this point the inode + * must be fully initialized. 
+ */ + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); - if (name) { + if (!args->orphan) { ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); ptr = (unsigned long)(ref + 1); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + if (args->subvol) { + btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2); + btrfs_set_inode_ref_index(path->nodes[0], ref, 0); + write_extent_buffer(path->nodes[0], "..", ptr, 2); + } else { + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, + BTRFS_I(inode)->dir_index); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + } } btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); + btrfs_release_path(path); - btrfs_inherit_iflags(inode, dir); + if (args->subvol) { + struct inode *parent; - if (S_ISREG(mode)) { - if (btrfs_test_opt(fs_info, NODATASUM)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(fs_info, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM; + /* + * Subvolumes inherit properties from their parent subvolume, + * not the directory they were created in. + */ + parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, + BTRFS_I(dir)->root); + if (IS_ERR(parent)) { + ret = PTR_ERR(parent); + } else { + ret = btrfs_inode_inherit_props(trans, inode, parent); + iput(parent); + } + } else { + ret = btrfs_inode_inherit_props(trans, inode, dir); + } + if (ret) { + btrfs_err(fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, + ret); + } + + /* + * Subvolumes don't inherit ACLs or get passed to the LSM. This is + * probably a bug. + */ + if (!args->subvol) { + ret = btrfs_init_inode_security(trans, args); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } inode_tree_add(inode); @@ -6224,21 +6413,30 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_update_root_times(trans, root); - ret = btrfs_inode_inherit_props(trans, inode, dir); - if (ret) - btrfs_err(fs_info, - "error inheriting props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); + if (args->orphan) { + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + } else { + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, + name_len, 0, BTRFS_I(inode)->dir_index); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } - return inode; + ret = 0; + goto out; -fail_unlock: +discard: + /* + * discard_new_inode() calls iput(), but the caller owns the reference + * to the inode. 
+ */ + ihold(inode); discard_new_inode(inode); -fail: - if (dir && name) - BTRFS_I(dir)->index_cnt--; +out: btrfs_free_path(path); - return ERR_PTR(ret); + return ret; } /* @@ -6330,147 +6528,71 @@ fail_dir_item: return ret; } -static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct btrfs_inode *dir, struct dentry *dentry, - struct btrfs_inode *inode, int backref, u64 index) -{ - int err = btrfs_add_link(trans, dir, inode, - dentry->d_name.name, dentry->d_name.len, - backref, index); - if (err > 0) - err = -EEXIST; - return err; -} - -static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t rdev) +static int btrfs_create_common(struct inode *dir, struct dentry *dentry, + struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .inode = inode, + }; + unsigned int trans_num_items; + struct btrfs_trans_handle *trans; int err; - u64 objectid; - u64 index = 0; - /* - * 2 for inode item and ref - * 2 for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - err = btrfs_get_free_objectid(root, &objectid); + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) - goto out_unlock; + goto out_inode; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_new_inode_args; } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. 
- */ - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 0, index); - if (err) - goto out_unlock; - - btrfs_update_inode(trans, root, BTRFS_I(inode)); - d_instantiate_new(dentry, inode); + err = btrfs_create_new_inode(trans, &new_inode_args); + if (!err) + d_instantiate_new(dentry, inode); -out_unlock: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (err) + iput(inode); return err; } -static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, bool excl) +static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; - int err; - u64 objectid; - u64 index = 0; + struct inode *inode; - /* - * 2 for inode item and ref - * 2 for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + return btrfs_create_common(dir, dentry, inode); +} - err = btrfs_get_free_objectid(root, &objectid); - if (err) - goto out_unlock; +static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + struct inode *inode; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; - } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. 
- */ + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (err) - goto out_unlock; - - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 0, index); - if (err) - goto out_unlock; - - d_instantiate_new(dentry, inode); - -out_unlock: - btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } - btrfs_btree_balance_dirty(fs_info); - return err; + return btrfs_create_common(dir, dentry, inode); } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, @@ -6516,8 +6638,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 1, index); + err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + dentry->d_name.name, dentry->d_name.len, 1, index); if (err) { drop_inode = 1; @@ -6537,7 +6659,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); + btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); } fail: @@ -6554,66 +6676,15 @@ fail: static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - int err = 0; - u64 objectid = 0; - u64 index = 0; - - /* - * 2 items for inode and ref - * 2 items for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - err = btrfs_get_free_objectid(root, &objectid); - if (err) - goto out_fail; - - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, - S_IFDIR | mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_fail; - } + struct inode *inode; - /* these must be set before we unlock the inode */ + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_fail; - - btrfs_i_size_write(BTRFS_I(inode), 0); - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (err) - goto out_fail; - - err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - dentry->d_name.name, - dentry->d_name.len, 0, index); - if (err) - goto out_fail; - - d_instantiate_new(dentry, inode); - -out_fail: - btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } - btrfs_btree_balance_dirty(fs_info); - return err; + return btrfs_create_common(dir, dentry, inode); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -7040,8 +7111,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, if (IS_ERR(em)) goto out; 
} - ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, - block_len, type); + ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, + block_len, 0, + (1 << type) | + (1 << BTRFS_ORDERED_DIRECT), + BTRFS_COMPRESS_NONE); if (ret) { if (em) { free_extent_map(em); @@ -7119,6 +7193,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *ram_bytes, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; struct extent_buffer *leaf; @@ -7126,13 +7201,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 disk_bytenr; - u64 backref_offset; - u64 extent_end; - u64 num_bytes; - int slot; int found_type; - bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); path = btrfs_alloc_path(); if (!path) @@ -7143,18 +7212,17 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (ret < 0) goto out; - slot = path->slots[0]; if (ret == 1) { - if (slot == 0) { + if (path->slots[0] == 0) { /* can't find the item, must cow */ ret = 0; goto out; } - slot--; + path->slots[0]--; } ret = 0; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != btrfs_ino(BTRFS_I(inode)) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ @@ -7166,55 +7234,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, goto out; } - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, fi); - if (found_type != BTRFS_FILE_EXTENT_REG && - found_type != BTRFS_FILE_EXTENT_PREALLOC) { - /* not a regular extent, must cow */ + if (btrfs_file_extent_end(path) <= offset) goto out; - } - if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) - goto out; + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (ram_bytes) + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end <= offset) - goto out; + nocow_args.start = offset; + nocow_args.end = offset + *len - 1; + nocow_args.strict = strict; + nocow_args.free_path = true; - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (disk_bytenr == 0) - goto out; + ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + /* can_nocow_file_extent() has freed the path. */ + path = NULL; - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - goto out; - - /* - * Do the same check as in btrfs_cross_ref_exist but without the - * unnecessary search. - */ - if (!strict && - (btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item))) + if (ret != 1) { + /* Treat errors as not being able to NOCOW. 
*/ + ret = 0; goto out; - - backref_offset = btrfs_file_extent_offset(leaf, fi); - - if (orig_start) { - *orig_start = key.offset - backref_offset; - *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } - if (btrfs_extent_readonly(fs_info, disk_bytenr)) + ret = 0; + if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) goto out; - num_bytes = min(offset + *len, extent_end) - offset; - if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; - range_end = round_up(offset + num_bytes, + range_end = round_up(offset + nocow_args.num_bytes, root->fs_info->sectorsize) - 1; ret = test_range_bit(io_tree, offset, range_end, EXTENT_DELALLOC, 0, NULL); @@ -7224,36 +7275,12 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } } - btrfs_release_path(path); - - /* - * look for other files referencing this extent, if we - * find any we must cow - */ - - ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), - key.offset - backref_offset, disk_bytenr, - strict); - if (ret) { - ret = 0; - goto out; - } + if (orig_start) + *orig_start = key.offset - nocow_args.extent_offset; + if (orig_block_len) + *orig_block_len = nocow_args.disk_num_bytes; - /* - * adjust disk_bytenr and num_bytes to cover just the bytes - * in this extent we are about to write. If there - * are any csums in that range we have to cow in order - * to keep the csums correct - */ - disk_bytenr += backref_offset; - disk_bytenr += offset - key.offset; - if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) - goto out; - /* - * all of the above have passed, it is safe to overwrite this extent - * without cow - */ - *len = num_bytes; + *len = nocow_args.num_bytes; ret = 1; out: btrfs_free_path(path); @@ -7261,14 +7288,22 @@ out: } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, bool writing) + struct extent_state **cached_state, + unsigned int iomap_flags) { + const bool writing = (iomap_flags & IOMAP_WRITE); + const bool nowait = (iomap_flags & IOMAP_NOWAIT); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; int ret = 0; while (1) { - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + if (nowait) { + if (!try_lock_extent(io_tree, lockstart, lockend)) + return -EAGAIN; + } else { + lock_extent_bits(io_tree, lockstart, lockend, cached_state); + } /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure there's no ordered @@ -7289,10 +7324,14 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, lockstart, lockend))) break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + unlock_extent_cached(io_tree, lockstart, lockend, cached_state); if (ordered) { + if (nowait) { + btrfs_put_ordered_extent(ordered); + ret = -EAGAIN; + break; + } /* * If we are doing a DIO read and the ordered extent we * found is for a buffered write, we can not wait for it @@ -7312,7 +7351,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) btrfs_start_ordered_extent(ordered, 1); else - ret = -ENOTBLK; + ret = nowait ? 
-EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); } else { /* @@ -7328,7 +7367,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * ordered extent to complete while holding a lock on * that page. */ - ret = -ENOTBLK; + ret = nowait ? -EAGAIN : -ENOTBLK; } if (ret) @@ -7402,14 +7441,18 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, static int btrfs_get_blocks_direct_write(struct extent_map **map, struct inode *inode, struct btrfs_dio_data *dio_data, - u64 start, u64 len) + u64 start, u64 len, + unsigned int iomap_flags) { + const bool nowait = (iomap_flags & IOMAP_NOWAIT); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = *map; int type; u64 block_start, orig_start, orig_block_len, ram_bytes; + struct btrfs_block_group *bg; bool can_nocow = false; bool space_reserved = false; + u64 prev_len; int ret = 0; /* @@ -7432,21 +7475,27 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) - can_nocow = true; + &orig_block_len, &ram_bytes, false) == 1) { + bg = btrfs_inc_nocow_writers(fs_info, block_start); + if (bg) + can_nocow = true; + } } + prev_len = len; if (can_nocow) { struct extent_map *em2; /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + nowait); if (ret < 0) { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; - btrfs_dec_nocow_writers(fs_info, block_start); + btrfs_dec_nocow_writers(bg); + if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) + ret = -EAGAIN; goto out; } space_reserved = true; @@ -7455,7 +7504,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, orig_start, block_start, len, orig_block_len, ram_bytes, type); - btrfs_dec_nocow_writers(fs_info, block_start); + btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); *map = em = em2; @@ -7465,17 +7514,29 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, ret = PTR_ERR(em2); goto out; } - } else { - const u64 prev_len = len; + dio_data->nocow_done = true; + } else { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; - /* We have to COW, so need to reserve metadata and data space. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, len); + if (nowait) + return -EAGAIN; + + /* + * If we could not allocate data space before locking the file + * range and we can't do a NOCOW write, then we have to fail. + */ + if (!dio_data->data_space_reserved) + return -ENOSPC; + + /* + * We have to COW and we have already reserved data space before, + * so now we reserve only metadata. 
+ */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + false); if (ret < 0) goto out; space_reserved = true; @@ -7488,17 +7549,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, *map = em; len = min(len, em->len - (start - em->start)); if (len < prev_len) - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, - start + len, prev_len - len, - true); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + prev_len - len, true); } /* * We have created our ordered extent, so we can now release our reservation * for an outstanding extent. */ - btrfs_delalloc_release_extents(BTRFS_I(inode), len); + btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); /* * Need to update the i_size under the extent lock so buffered @@ -7509,15 +7568,7 @@ out: if (ret && space_reserved) { btrfs_delalloc_release_extents(BTRFS_I(inode), len); - if (can_nocow) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); - } else { - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, - start, len, true); - extent_changeset_free(dio_data->data_reserved); - dio_data->data_reserved = NULL; - } + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); } return ret; } @@ -7526,14 +7577,16 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; - struct btrfs_dio_data *dio_data = NULL; + struct btrfs_dio_data *dio_data = iter->private; u64 lockstart, lockend; const bool write = !!(flags & IOMAP_WRITE); int ret = 0; u64 len = length; + const u64 data_alloc_len = length; bool unlock_extents = false; if (!write) @@ -7543,34 +7596,67 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, lockend = start + len - 1; /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so we - * need to flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk + * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't + * enough if we've written compressed pages to this area, so we need to + * flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk - the first flush only starts + * compression on the data, while keeping the pages locked, so by the + * time the second flush returns we know bios for the compressed pages + * were submitted and finished, and the pages are no longer under writeback. + * + * If we have a NOWAIT request and we have any pages in the range that + * are locked, likely due to compression still in progress, we don't want + * to block on page locks. We also don't want to block on pages marked as + * dirty or under writeback (same as for the non-compression case). + * iomap_dio_rw() did the same check, but after that and before we got + * here, mmap'ed writes may have happened or buffered reads started + * (readpage() and readahead(), which lock pages), as we haven't locked + * the file range yet. 
*/ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - if (ret) - return ret; + if (flags & IOMAP_NOWAIT) { + if (filemap_range_needs_writeback(inode->i_mapping, + lockstart, lockend)) + return -EAGAIN; + } else { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; + } } - dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); - if (!dio_data) - return -ENOMEM; - - iomap->private = dio_data; + memset(dio_data, 0, sizeof(*dio_data)); + /* + * We always try to allocate data space and must do it before locking + * the file range, to avoid deadlocks with concurrent writes to the same + * range if the range has several extents and the writes don't expand the + * current i_size (the inode lock is taken in shared mode). If we fail to + * allocate data space here we continue and later, after locking the + * file range, we fail with ENOSPC only if we figure out we can not do a + * NOCOW write. + */ + if (write && !(flags & IOMAP_NOWAIT)) { + ret = btrfs_check_data_free_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, data_alloc_len); + if (!ret) + dio_data->data_space_reserved = true; + else if (ret && !(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + goto err; + } /* * If this errors out it's because we couldn't invalidate pagecache for - * this range and we need to fallback to buffered. + * this range and we need to fallback to buffered IO, or we are doing a + * NOWAIT read/write and we need to block. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { - ret = -ENOTBLK; + ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); + if (ret < 0) goto err; - } em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { @@ -7630,12 +7716,30 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, len); + start, len, flags); if (ret < 0) goto unlock_err; unlock_extents = true; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); + if (dio_data->data_space_reserved) { + u64 release_offset; + u64 release_len = 0; + + if (dio_data->nocow_done) { + release_offset = start; + release_len = data_alloc_len; + } else if (len < data_alloc_len) { + release_offset = start + len; + release_len = data_alloc_len - len; + } + + if (release_len > 0) + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + release_offset, + release_len); + } } else { /* * We need to unlock only the end area that we aren't using. 
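Concretely, for the release logic above: with data_alloc_len = 1 MiB of data space reserved up front, a write that ends up NOCOW (nocow_done set) returns the full 1 MiB, since the NOCOW path reserved only metadata; if instead a COW extent of 256 KiB is mapped, only the unused tail is returned (release_offset = start + 256 KiB, release_len = 768 KiB) and the reservation for the mapped part is kept until the ordered extent consumes it.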
@@ -7680,7 +7784,12 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - kfree(dio_data); + if (dio_data->data_space_reserved) { + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + start, data_alloc_len); + extent_changeset_free(dio_data->data_reserved); + } return ret; } @@ -7688,15 +7797,16 @@ err: static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { - int ret = 0; - struct btrfs_dio_data *dio_data = iomap->private; + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct btrfs_dio_data *dio_data = iter->private; size_t submitted = dio_data->submitted; const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); - goto out; + return 0; } if (submitted < length) { @@ -7713,10 +7823,6 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (write) extent_changeset_free(dio_data->data_reserved); -out: - kfree(dio_data); - iomap->private = NULL; - return ret; } @@ -7729,40 +7835,36 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (!refcount_dec_and_test(&dip->refs)) return; - if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { + if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), dip->file_offset, dip->bytes, - !dip->dio_bio->bi_status); + !dip->bio.bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1); } - bio_endio(dip->dio_bio); - kfree(dip); + kfree(dip->csums); + bio_endio(&dip->bio); } -static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + enum btrfs_compression_type compress_type) { struct btrfs_dio_private *dip = bio->bi_private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; BUG_ON(bio_op(bio) == REQ_OP_WRITE); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - return ret; + if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA)) + return; refcount_inc(&dip->refs); - ret = btrfs_map_bio(fs_info, bio, mirror_num); - if (ret) + if (btrfs_map_bio(fs_info, bio, mirror_num)) refcount_dec(&dip->refs); - return ret; } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, @@ -7777,8 +7879,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - const u64 orig_file_offset = dip->file_offset; - u64 start = orig_file_offset; u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; @@ -7788,6 +7888,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); pgoff = bvec.bv_offset; for (i = 0; i < nr_sectors; i++) { + u64 start = bbio->file_offset + bio_offset; + ASSERT(pgoff < PAGE_SIZE); if (uptodate && (!csum || !check_data_csum(inode, bbio, @@ -7800,17 +7902,13 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, } else { int ret; - ASSERT((start - orig_file_offset) < UINT_MAX); - ret = btrfs_repair_one_sector(inode, - 
&bbio->bio, - start - orig_file_offset, - bvec.bv_page, pgoff, + ret = btrfs_repair_one_sector(inode, &bbio->bio, + bio_offset, bvec.bv_page, pgoff, start, bbio->mirror_num, submit_dio_repair_bio); if (ret) err = errno_to_blk_status(ret); } - start += sectorsize; ASSERT(bio_offset + sectorsize > bio_offset); bio_offset += sectorsize; pgoff += sectorsize; @@ -7831,12 +7929,13 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1); + return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); } static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t err = bio->bi_status; if (err) @@ -7847,12 +7946,12 @@ static void btrfs_end_dio_bio(struct bio *bio) bio->bi_iter.bi_size, err); if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err); + err = btrfs_check_read_dio_bio(dip, bbio, !err); if (err) - dip->dio_bio->bi_status = err; + dip->bio.bi_status = err; - btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio); + btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@ -7880,7 +7979,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, goto map; if (write && async_submit) { - ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, + ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset, btrfs_submit_bio_start_direct_io); goto err; } else if (write) { @@ -7888,7 +7987,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); if (ret) goto err; } else { @@ -7905,50 +8004,16 @@ err: return ret; } -/* - * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked - * or ordered extents whether or not we submit any bios. 
- */ -static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, - struct inode *inode, - loff_t file_offset) -{ - const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - size_t dip_size; - struct btrfs_dio_private *dip; - - dip_size = sizeof(*dip); - if (!write && csum) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - size_t nblocks; - - nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; - dip_size += fs_info->csum_size * nblocks; - } - - dip = kzalloc(dip_size, GFP_NOFS); - if (!dip) - return NULL; - - dip->inode = inode; - dip->file_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; - dip->dio_bio = dio_bio; - refcount_set(&dip->refs, 1); - return dip; -} - static void btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { + struct btrfs_dio_private *dip = + container_of(dio_bio, struct btrfs_dio_private, bio); struct inode *inode = iter->inode; const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); - struct btrfs_dio_private *dip; struct bio *bio; u64 start_sector; int async_submit = 0; @@ -7959,27 +8024,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, int ret; blk_status_t status; struct btrfs_io_geometry geom; - struct btrfs_dio_data *dio_data = iter->iomap.private; + struct btrfs_dio_data *dio_data = iter->private; struct extent_map *em = NULL; - dip = btrfs_create_dio_private(dio_bio, inode, file_offset); - if (!dip) { - if (!write) { - unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, - file_offset + dio_bio->bi_iter.bi_size - 1); - } - dio_bio->bi_status = BLK_STS_RESOURCE; - bio_endio(dio_bio); - return; - } + dip->inode = inode; + dip->file_offset = file_offset; + dip->bytes = dio_bio->bi_iter.bi_size; + refcount_set(&dip->refs, 1); + dip->csums = NULL; + + if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + unsigned int nr_sectors = + (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - if (!write) { /* * Load the csums up front to reduce csum tree searches and * contention when submitting bios. - * - * If we have csums disabled this will do nothing. 
*/ + status = BLK_STS_RESOURCE; + dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); + if (!dip->csums) + goto out_err; + status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); if (status != BLK_STS_OK) goto out_err; @@ -8013,6 +8079,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; + btrfs_bio(bio)->file_offset = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, @@ -8068,19 +8135,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, out_err_em: free_extent_map(em); out_err: - dip->dio_bio->bi_status = status; + dio_bio->bi_status = status; btrfs_dio_private_put(dip); } -const struct iomap_ops btrfs_dio_iomap_ops = { +static const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, }; -const struct iomap_dio_ops btrfs_dio_ops = { +static const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, + .bio_set = &btrfs_dio_bioset, }; +ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) +{ + struct btrfs_dio_data data; + + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +} + static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -8093,22 +8169,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } -int btrfs_readpage(struct file *file, struct page *page) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; - int ret; - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); - if (bio_ctrl.bio) - ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); - return ret; -} - static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -8146,8 +8206,8 @@ static void btrfs_readahead(struct readahead_control *rac) } /* - * For releasepage() and invalidatepage() we have a race window where - * end_page_writeback() is called but the subpage spinlock is not yet released. + * For release_folio() and invalidate_folio() we have a race window where + * folio_end_writeback() is called but the subpage spinlock is not yet released. * If we continue to release/invalidate the page, we could cause use-after-free * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. 
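Stepping back to the btrfs_dio_rw() wrapper added above: it keeps the btrfs_dio_data on the caller's stack and hands it to iomap_dio_rw() as the private pointer, which btrfs_dio_iomap_begin() later recovers via container_of() on the iomap_iter. A caller would look roughly like this — a sketch only; the real read/write paths live in fs/btrfs/file.c and additionally handle NOWAIT, extent checks, and partial-progress retries:

static ssize_t btrfs_dio_read_sketch(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	/* Without BTRFS_ILOCK_TRY this lock cannot fail. */
	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
	/* done_before is 0 here: nothing was served from the page cache yet. */
	ret = btrfs_dio_rw(iocb, to, 0);
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	return ret;
}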
@@ -8157,7 +8217,7 @@ static void wait_subpage_spinlock(struct page *page) struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->private); @@ -8178,22 +8238,22 @@ static void wait_subpage_spinlock(struct page *page) spin_unlock_irq(&subpage->lock); } -static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - int ret = try_release_extent_mapping(page, gfp_flags); + int ret = try_release_extent_mapping(&folio->page, gfp_flags); if (ret == 1) { - wait_subpage_spinlock(page); - clear_page_extent_mapped(page); + wait_subpage_spinlock(&folio->page); + clear_page_extent_mapped(&folio->page); } return ret; } -static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (PageWriteback(page) || PageDirty(page)) - return 0; - return __btrfs_releasepage(page, gfp_flags); + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return false; + return __btrfs_release_folio(folio, gfp_flags); } #ifdef CONFIG_MIGRATION @@ -8223,48 +8283,48 @@ static int btrfs_migratepage(struct address_space *mapping, } #endif -static void btrfs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void btrfs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; - u64 page_start = page_offset(page); - u64 page_end = page_start + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = page_start + folio_size(folio) - 1; u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* - * We have page locked so no new ordered extent can be created on this - * page, nor bio can be submitted for this page. + * We have folio locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this folio. * - * But already submitted bio can still be finished on this page. - * Furthermore, endio function won't skip page which has Ordered + * But already submitted bio can still be finished on this folio. + * Furthermore, endio function won't skip folio which has Ordered * (Private2) already cleared, so it's possible for endio and - * invalidatepage to do the same ordered extent accounting twice - * on one page. + * invalidate_folio to do the same ordered extent accounting twice + * on one folio. * * So here we wait for any submitted bios to finish, so that we won't - * do double ordered extent accounting on the same page. + * do double ordered extent accounting on the same folio. */ - wait_on_page_writeback(page); - wait_subpage_spinlock(page); + folio_wait_writeback(folio); + wait_subpage_spinlock(&folio->page); /* * For subpage case, we have call sites like * btrfs_punch_hole_lock_range() which passes range not aligned to * sectorsize. 
- * If the range doesn't cover the full page, we don't need to and - * shouldn't clear page extent mapped, as page->private can still + * If the range doesn't cover the full folio, we don't need to and + * shouldn't clear page extent mapped, as folio->private can still * record subpage dirty bits for other parts of the range. * - * For cases that can invalidate the full even the range doesn't - * cover the full page, like invalidating the last page, we're + * For cases that invalidate the full folio even if the range doesn't + * cover the full folio, like invalidating the last folio, we're * still safe to wait for ordered extent to finish. */ - if (!(offset == 0 && length == PAGE_SIZE)) { - btrfs_releasepage(page, GFP_NOFS); + if (!(offset == 0 && length == folio_size(folio))) { + btrfs_release_folio(folio, GFP_NOFS); return; } @@ -8305,7 +8365,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; - if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { + if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. @@ -8315,7 +8375,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, delete_states = false; goto next; } - btrfs_page_clear_ordered(fs_info, page, cur, range_len); + btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); /* * IO on this page will never be started, so we need to account @@ -8385,11 +8445,11 @@ next: * should not have Ordered (Private2) anymore, or the above iteration * did something wrong. */ - ASSERT(!PageOrdered(page)); - btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE); + ASSERT(!folio_test_ordered(folio)); + btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); if (!inode_evicting) - __btrfs_releasepage(page, GFP_NOFS); - clear_page_extent_mapped(page); + __btrfs_release_folio(folio, GFP_NOFS); + clear_page_extent_mapped(&folio->page); } /* @@ -8734,51 +8794,28 @@ out: * extents beyond i_size to drop. */ if (control.extents_found > 0) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); return ret; } -/* - * create a new subvolume directory/inode (helper for the ioctl). 
- */ -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - struct btrfs_root *parent_root, - struct user_namespace *mnt_userns) +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir) { struct inode *inode; - int err; - u64 index = 0; - u64 ino; - - err = btrfs_get_free_objectid(new_root, &ino); - if (err < 0) - return err; - inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, - ino, ino, - S_IFDIR | (~current_umask() & S_IRWXUGO), - &index); - if (IS_ERR(inode)) - return PTR_ERR(inode); - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; - - set_nlink(inode, 1); - btrfs_i_size_write(BTRFS_I(inode), 0); - unlock_new_inode(inode); - - err = btrfs_subvol_inherit_props(trans, new_root, parent_root); - if (err) - btrfs_err(new_root->fs_info, - "error inheriting subvolume %llu properties: %d", - new_root->root_key.objectid, err); - - err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); - - iput(inode); - return err; + inode = new_inode(dir->i_sb); + if (inode) { + /* + * Subvolumes don't inherit the sgid bit or the parent's gid if + * the parent's sgid bit is set. This is probably a bug. + */ + inode_init_owner(mnt_userns, inode, NULL, + S_IFDIR | (~current_umask() & S_IRWXUGO)); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + } + return inode; } struct inode *btrfs_alloc_inode(struct super_block *sb) @@ -8787,7 +8824,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_inode *ei; struct inode *inode; - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; @@ -8918,7 +8955,7 @@ int btrfs_drop_inode(struct inode *inode) static void init_once(void *foo) { - struct btrfs_inode *ei = (struct btrfs_inode *) foo; + struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); } @@ -8930,6 +8967,7 @@ void __cold btrfs_destroy_cachep(void) * destroy cache. */ rcu_barrier(); + bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); @@ -8970,6 +9008,11 @@ int __init btrfs_init_cachep(void) if (!btrfs_free_space_bitmap_cachep) goto fail; + if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_dio_private, bio), + BIOSET_NEED_BVECS)) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -9025,19 +9068,20 @@ static int btrfs_rename_exchange(struct inode *old_dir, { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; + unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); + struct btrfs_rename_ctx old_rename_ctx; + struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; int ret2; - bool root_log_pinned = false; - bool dest_log_pinned = false; bool need_abort = false; /* @@ -9056,14 +9100,37 @@ static int btrfs_rename_exchange(struct inode *old_dir, down_read(&fs_info->subvol_sem); /* - * We want to reserve the absolute worst case amount of items. 
So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal - * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items - * should cover the worst case number of items we'll modify. + * For each inode: + * 1 to remove old dir item + * 1 to remove old dir index + * 1 to add new dir item + * 1 to add new dir index + * 1 to update parent inode + * + * If the parents are the same, we only need to account for one */ - trans = btrfs_start_transaction(root, 12); + trans_num_items = (old_dir == new_dir ? 9 : 10); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* + * 1 to remove old root ref + * 1 to remove old root backref + * 1 to add new root ref + * 1 to add new root backref + */ + trans_num_items += 4; + } else { + /* + * 1 to update inode item + * 1 to remove old inode ref + * 1 to add new inode ref + */ + trans_num_items += 3; + } + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + trans_num_items += 4; + else + trans_num_items += 3; + trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; @@ -9140,29 +9207,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode), 1); } - /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. - * - * We pin the logs even if at this precise moment none of the inodes was - * logged before. This is because right after we checked for that, some - * other task fsyncing some other inode not involved with this rename - * operation could log that one of our inodes exists. - * - * We don't need to pin the logs before the above calls to - * btrfs_insert_inode_ref(), since those don't ever need to change a log. 
- */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_pin_log_trans(root); - root_log_pinned = true; - } - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_pin_log_trans(dest); - dest_log_pinned = true; - } - /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); @@ -9170,7 +9214,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, - old_dentry->d_name.len); + old_dentry->d_name.len, + &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9186,7 +9231,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, - new_dentry->d_name.len); + new_dentry->d_name.len, + &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9216,46 +9262,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (new_inode->i_nlink == 1) BTRFS_I(new_inode)->dir_index = new_idx; - if (root_log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), - new_dentry->d_parent); - btrfs_end_log_trans(root); - root_log_pinned = false; - } - if (dest_log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), - old_dentry->d_parent); - btrfs_end_log_trans(dest); - dest_log_pinned = false; - } -out_fail: /* - * If we have pinned a log and an error happened, we unpin tasks - * trying to sync the log and force them to fallback to a transaction - * commit if the log currently contains any of the inodes involved in - * this rename operation (to ensure we do not persist a log with an - * inconsistent state for any of these inodes or leading to any - * inconsistencies when replayed). If the transaction was aborted, the - * abortion reason is propagated to userspace when attempting to commit - * the transaction. If the log does not contain any of these inodes, we - * allow the tasks to sync it. + * Now pin the logs of the roots. We do it to ensure that no other task + * can sync the logs while we are in progress with the rename, because + * that could result in an inconsistency in case any of the inodes that + * are part of this rename operation were logged before. */ - if (ret && (root_log_pinned || dest_log_pinned)) { - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) - btrfs_set_log_full_commit(trans); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(dest); - if (root_log_pinned) { - btrfs_end_log_trans(root); - root_log_pinned = false; - } - if (dest_log_pinned) { - btrfs_end_log_trans(dest); - dest_log_pinned = false; - } - } + /* Do the log updates for all inodes. */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), + old_rename_ctx.index, new_dentry->d_parent); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), + new_rename_ctx.index, old_dentry->d_parent); + + /* Now unpin the logs. 
*/ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(dest); +out_fail: ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -9266,56 +9297,19 @@ out_notrans: return ret; } -static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct user_namespace *mnt_userns, - struct inode *dir, - struct dentry *dentry) +static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, + struct inode *dir) { - int ret; struct inode *inode; - u64 objectid; - u64 index; - - ret = btrfs_get_free_objectid(root, &objectid); - if (ret) - return ret; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, - dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), - objectid, - S_IFCHR | WHITEOUT_MODE, - &index); - - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - return ret; + inode = new_inode(dir->i_sb); + if (inode) { + inode_init_owner(mnt_userns, inode, dir, + S_IFCHR | WHITEOUT_MODE); + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); } - - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, - WHITEOUT_DEV); - - ret = btrfs_init_inode_security(trans, inode, dir, - &dentry->d_name); - if (ret) - goto out; - - ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, - BTRFS_I(inode), 0, index); - if (ret) - goto out; - - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); -out: - unlock_new_inode(inode); - if (ret) - inode_dec_link_count(inode); - iput(inode); - - return ret; + return inode; } static int btrfs_rename(struct user_namespace *mnt_userns, @@ -9324,17 +9318,21 @@ static int btrfs_rename(struct user_namespace *mnt_userns, unsigned int flags) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_new_inode_args whiteout_args = { + .dir = old_dir, + .dentry = old_dentry, + }; struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); + struct btrfs_rename_ctx rename_ctx; u64 index = 0; int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); - bool log_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9378,23 +9376,56 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); - /* close the racy window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + if (flags & RENAME_WHITEOUT) { + whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); + if (!whiteout_args.inode) + return -ENOMEM; + ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); + if (ret) + goto out_whiteout_inode; + } else { + /* 1 to update the old parent inode. 
*/ + trans_num_items = 1; + } + + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* Close the race window with snapshot create/destroy ioctl */ down_read(&fs_info->subvol_sem); + /* + * 1 to remove old root ref + * 1 to remove old root backref + * 1 to add new root ref + * 1 to add new root backref + */ + trans_num_items += 4; + } else { + /* + * 1 to update inode + * 1 to remove old inode ref + * 1 to add new inode ref + */ + trans_num_items += 3; + } /* - * We want to reserve the absolute worst case amount of items. So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume they are normal - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items - * should cover the worst case number of items we'll modify. - * If our rename has the whiteout flag, we need more 5 units for the - * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item - * when selinux is enabled). + * 1 to remove old dir item + * 1 to remove old dir index + * 1 to add new dir item + * 1 to add new dir index */ - trans_num_items = 11; - if (flags & RENAME_WHITEOUT) + trans_num_items += 4; + /* 1 to update new parent inode if it's not the same as the old parent */ + if (new_dir != old_dir) + trans_num_items++; + if (new_inode) { + /* + * 1 to update inode + * 1 to remove inode ref + * 1 to remove dir item + * 1 to remove dir index + * 1 to possibly add orphan item + */ trans_num_items += 5; + } trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -9439,29 +9470,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { - /* - * Now pin the log. We do it to ensure that no other task can - * sync the log while we are in progress with the rename, as - * that could result in an inconsistency in case any of the - * inodes that are part of this rename operation were logged - * before. - * - * We pin the log even if at this precise moment none of the - * inodes was logged before. This is because right after we - * checked for that, some other task fsyncing some other inode - * not involved with this rename operation could log that one of - * our inodes exists. - * - * We don't need to pin the logs before the above call to - * btrfs_insert_inode_ref(), since that does not need to change - * a log. 
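
The item accounting above is easy to cross-check outside the kernel. A
standalone mirror of the counts in this hunk (base is 1, or the count
returned by btrfs_new_inode_prepare() in the RENAME_WHITEOUT case):

#include <stdbool.h>

/* Mirrors the trans_num_items accounting in btrfs_rename() above. */
static unsigned int rename_num_items(unsigned int base, bool src_is_subvol,
                                     bool cross_dir, bool target_exists)
{
        unsigned int n = base;

        /* Subvolume source: old/new root ref and backref (4 items);
         * regular inode: inode update plus old/new inode refs (3). */
        n += src_is_subvol ? 4 : 3;

        /* Old dir item, old dir index, new dir item, new dir index. */
        n += 4;

        /* New parent inode update, when it differs from the old parent. */
        if (cross_dir)
                n++;

        /* Unlinked target: inode update, inode ref, dir item, dir index,
         * and possibly an orphan item. */
        if (target_exists)
                n += 5;

        return n;
}
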
- */ - btrfs_pin_log_trans(root); - log_pinned = true; ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, - old_dentry->d_name.len); + old_dentry->d_name.len, + &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9503,51 +9516,32 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), - new_dentry->d_parent); - btrfs_end_log_trans(root); - log_pinned = false; - } + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), + rename_ctx.index, new_dentry->d_parent); if (flags & RENAME_WHITEOUT) { - ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, - old_dir, old_dentry); - + ret = btrfs_create_new_inode(trans, &whiteout_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; + } else { + unlock_new_inode(whiteout_args.inode); + iput(whiteout_args.inode); + whiteout_args.inode = NULL; } } out_fail: - /* - * If we have pinned the log and an error happened, we unpin tasks - * trying to sync the log and force them to fallback to a transaction - * commit if the log currently contains any of the inodes involved in - * this rename operation (to ensure we do not persist a log with an - * inconsistent state for any of these inodes or leading to any - * inconsistencies when replayed). If the transaction was aborted, the - * abortion reason is propagated to userspace when attempting to commit - * the transaction. If the log does not contain any of these inodes, we - * allow the tasks to sync it. - */ - if (ret && log_pinned) { - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - (new_inode && - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(trans); - - btrfs_end_log_trans(root); - log_pinned = false; - } ret2 = btrfs_end_transaction(trans); ret = ret ? 
ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - + if (flags & RENAME_WHITEOUT) + btrfs_new_inode_args_destroy(&whiteout_args); +out_whiteout_inode: + if (flags & RENAME_WHITEOUT) + iput(whiteout_args.inode); return ret; } @@ -9766,10 +9760,13 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct btrfs_key key; - struct inode *inode = NULL; + struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + }; + unsigned int trans_num_items; int err; - u64 objectid; - u64 index = 0; int name_len; int datasize; unsigned long ptr; @@ -9780,49 +9777,40 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return -ENAMETOOLONG; - /* - * 2 items for inode item and ref - * 2 items for dir items - * 1 item for updating parent inode item - * 1 item for the inline extent item - * 1 item for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 7); - if (IS_ERR(trans)) - return PTR_ERR(trans); + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO); + inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &btrfs_aops; + btrfs_i_size_write(BTRFS_I(inode), name_len); + inode_set_bytes(inode, name_len); - err = btrfs_get_free_objectid(root, &objectid); + new_inode_args.inode = inode; + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) - goto out_unlock; + goto out_inode; + /* 1 additional item for the inline extent */ + trans_num_items++; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, - S_IFLNK | S_IRWXUGO, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_new_inode_args; } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. 
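
The error handling in the reworked creation paths follows the classic
goto-unwind ladder: each label releases exactly what was acquired before
the jump, and the inode is only dropped on failure. A compilable toy
version of that shape, with malloc()/free() standing in for the kernel
objects (nothing here is the btrfs API):

#include <errno.h>
#include <stdlib.h>

static int create_with_unwind(void **inode_out)
{
        int ret;
        void *inode = malloc(64);       /* new_inode() analogue */
        void *args, *trans;

        if (!inode)
                return -ENOMEM;

        args = malloc(64);              /* btrfs_new_inode_prepare() analogue */
        if (!args) {
                ret = -ENOMEM;
                goto out_inode;
        }

        trans = malloc(64);             /* btrfs_start_transaction() analogue */
        if (!trans) {
                ret = -ENOMEM;
                goto out_args;
        }

        ret = 0;                        /* btrfs_create_new_inode() analogue */
        if (ret == 0)
                *inode_out = inode;     /* handed over, like d_instantiate_new() */
        free(trans);                    /* the transaction always ends */
out_args:
        free(args);
out_inode:
        if (ret)
                free(inode);            /* iput() analogue, failure path only */
        return ret;
}
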
- */ - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - inode->i_mapping->a_ops = &btrfs_aops; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + err = btrfs_create_new_inode(trans, &new_inode_args); if (err) - goto out_unlock; + goto out; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - goto out_unlock; + btrfs_abort_transaction(trans, err); + discard_new_inode(inode); + inode = NULL; + goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = 0; @@ -9831,8 +9819,11 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { + btrfs_abort_transaction(trans, err); btrfs_free_path(path); - goto out_unlock; + discard_new_inode(inode); + inode = NULL; + goto out; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -9850,31 +9841,16 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode_set_bytes(inode, name_len); - btrfs_i_size_write(BTRFS_I(inode), name_len); - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - /* - * Last step, add directory indexes for our symlink inode. This is the - * last step to avoid extra cleanup of these indexes if an error happens - * elsewhere above. - */ - if (!err) - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, - BTRFS_I(inode), 0, index); - if (err) - goto out_unlock; - d_instantiate_new(dentry, inode); - -out_unlock: + err = 0; +out: btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } btrfs_btree_balance_dirty(fs_info); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (err) + iput(inode); return err; } @@ -9923,6 +9899,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; + extent_info.update_times = true; extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; @@ -10021,8 +9998,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em = alloc_extent_map(); if (!em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); goto next; } @@ -10104,11 +10080,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static int btrfs_set_page_dirty(struct page *page) -{ - return __set_page_dirty_nobuffers(page); -} - static int btrfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { @@ -10131,62 +10102,58 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; - u64 objectid; - u64 index; - int ret = 0; - - /* - * 5 units required for adding orphan entry - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_get_free_objectid(root, &objectid); - if (ret) - goto out; - - inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - inode = NULL; - goto 
out; - } + struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .orphan = true, + }; + unsigned int trans_num_items; + int ret; + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; - inode->i_mapping->a_ops = &btrfs_aops; - ret = btrfs_init_inode_security(trans, inode, dir, NULL); + new_inode_args.inode = inode; + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (ret) - goto out; + goto out_inode; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (ret) - goto out; - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) - goto out; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_new_inode_args; + } + + ret = btrfs_create_new_inode(trans, &new_inode_args); /* - * We set number of links to 0 in btrfs_new_inode(), and here we set - * it to 1 because d_tmpfile() will issue a warning if the count is 0, - * through: + * We set number of links to 0 in btrfs_create_new_inode(), and here we + * set it to 1 because d_tmpfile() will issue a warning if the count is + * 0, through: * * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); - d_tmpfile(dentry, inode); - unlock_new_inode(inode); - mark_inode_dirty(inode); -out: + + if (!ret) { + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + mark_inode_dirty(inode); + } + btrfs_end_transaction(trans); - if (ret && inode) - discard_new_inode(inode); btrfs_btree_balance_dirty(fs_info); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (ret) + iput(inode); return ret; } @@ -10210,6 +10177,746 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) } } +static int btrfs_encoded_io_compression_from_extent( + struct btrfs_fs_info *fs_info, + int compress_type) +{ + switch (compress_type) { + case BTRFS_COMPRESS_NONE: + return BTRFS_ENCODED_IO_COMPRESSION_NONE; + case BTRFS_COMPRESS_ZLIB: + return BTRFS_ENCODED_IO_COMPRESSION_ZLIB; + case BTRFS_COMPRESS_LZO: + /* + * The LZO format depends on the sector size. 64K is the maximum + * sector size that we support. + */ + if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) + return -EINVAL; + return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + + (fs_info->sectorsize_bits - 12); + case BTRFS_COMPRESS_ZSTD: + return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; + default: + return -EUCLEAN; + } +} + +static ssize_t btrfs_encoded_read_inline( + struct kiocb *iocb, + struct iov_iter *iter, u64 start, + u64 lockend, + struct extent_state **cached_state, + u64 extent_start, size_t count, + struct btrfs_ioctl_encoded_io_args *encoded, + bool *unlocked) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *item; + u64 ram_bytes; + unsigned long ptr; + void *tmp; + ssize_t ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + extent_start, 0); + if (ret) { + if (ret > 0) { + /* The extent item disappeared? 
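
The LZO branch of btrfs_encoded_io_compression_from_extent() above folds
the sector size into the returned constant: the LZO_4K..LZO_64K values are
consecutive and indexed by sectorsize_bits - 12. A self-contained check of
that arithmetic (the enum values are illustrative placeholders with the
same relative layout as the UAPI constants):

#include <stdint.h>
#include <stdio.h>

enum { ENC_NONE, ENC_ZLIB, ENC_LZO_4K, ENC_LZO_8K, ENC_LZO_16K,
       ENC_LZO_32K, ENC_LZO_64K, ENC_ZSTD };

static int lzo_encoding(uint32_t sectorsize)
{
        uint32_t bits = 0;

        if (sectorsize < 4096 || sectorsize > 65536)
                return -1;              /* -EINVAL in the kernel */
        while ((1u << bits) < sectorsize)
                bits++;
        return ENC_LZO_4K + (bits - 12);
}

int main(void)
{
        /* 4096 -> ENC_LZO_4K, 8192 -> ENC_LZO_8K, ... 65536 -> ENC_LZO_64K */
        for (uint32_t s = 4096; s <= 65536; s *= 2)
                printf("sectorsize %6u -> %d\n", s, lzo_encoding(s));
        return 0;
}
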
*/ + ret = -EIO; + } + goto out; + } + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); + ptr = btrfs_file_extent_inline_start(item); + + encoded->len = min_t(u64, extent_start + ram_bytes, + inode->vfs_inode.i_size) - iocb->ki_pos; + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, item)); + if (ret < 0) + goto out; + encoded->compression = ret; + if (encoded->compression) { + size_t inline_size; + + inline_size = btrfs_file_extent_inline_item_len(leaf, + path->slots[0]); + if (inline_size > count) { + ret = -ENOBUFS; + goto out; + } + count = inline_size; + encoded->unencoded_len = ram_bytes; + encoded->unencoded_offset = iocb->ki_pos - extent_start; + } else { + count = min_t(u64, count, encoded->len); + encoded->len = count; + encoded->unencoded_len = count; + ptr += iocb->ki_pos - extent_start; + } + + tmp = kmalloc(count, GFP_NOFS); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(leaf, tmp, ptr, count); + btrfs_release_path(path); + unlock_extent_cached(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + *unlocked = true; + + ret = copy_to_iter(tmp, count, iter); + if (ret != count) + ret = -EFAULT; + kfree(tmp); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_encoded_read_private { + struct btrfs_inode *inode; + u64 file_offset; + wait_queue_head_t wait; + atomic_t pending; + blk_status_t status; + bool skip_csum; +}; + +static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, + struct bio *bio, int mirror_num) +{ + struct btrfs_encoded_read_private *priv = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + blk_status_t ret; + + if (!priv->skip_csum) { + ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); + if (ret) + return ret; + } + + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); + if (ret) { + btrfs_bio_free_csum(bbio); + return ret; + } + + atomic_inc(&priv->pending); + ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (ret) { + atomic_dec(&priv->pending); + btrfs_bio_free_csum(bbio); + } + return ret; +} + +static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) +{ + const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); + struct btrfs_encoded_read_private *priv = bbio->bio.bi_private; + struct btrfs_inode *inode = priv->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + u64 start = priv->file_offset; + u32 bio_offset = 0; + + if (priv->skip_csum || !uptodate) + return bbio->bio.bi_status; + + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + unsigned int i, nr_sectors, pgoff; + + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + pgoff = bvec->bv_offset; + for (i = 0; i < nr_sectors; i++) { + ASSERT(pgoff < PAGE_SIZE); + if (check_data_csum(&inode->vfs_inode, bbio, bio_offset, + bvec->bv_page, pgoff, start)) + return BLK_STS_IOERR; + start += sectorsize; + bio_offset += sectorsize; + pgoff += sectorsize; + } + } + return BLK_STS_OK; +} + +static void btrfs_encoded_read_endio(struct bio *bio) +{ + struct btrfs_encoded_read_private *priv = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + blk_status_t status; + + status = btrfs_encoded_read_verify_csum(bbio); + if (status) { + /* 
+ * The memory barrier implied by the atomic_dec_return() here + * pairs with the memory barrier implied by the + * atomic_dec_return() or io_wait_event() in + * btrfs_encoded_read_regular_fill_pages() to ensure that this + * write is observed before the load of status in + * btrfs_encoded_read_regular_fill_pages(). + */ + WRITE_ONCE(priv->status, status); + } + if (!atomic_dec_return(&priv->pending)) + wake_up(&priv->wait); + btrfs_bio_free_csum(bbio); + bio_put(bio); +} + +static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, + u64 disk_bytenr, + u64 disk_io_size, + struct page **pages) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_encoded_read_private priv = { + .inode = inode, + .file_offset = file_offset, + .pending = ATOMIC_INIT(1), + .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), + }; + unsigned long i = 0; + u64 cur = 0; + int ret; + + init_waitqueue_head(&priv.wait); + /* + * Submit bios for the extent, splitting due to bio or stripe limits as + * necessary. + */ + while (cur < disk_io_size) { + struct extent_map *em; + struct btrfs_io_geometry geom; + struct bio *bio = NULL; + u64 remaining; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, + disk_io_size - cur); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + } else { + ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, + disk_bytenr + cur, &geom); + free_extent_map(em); + } + if (ret) { + WRITE_ONCE(priv.status, errno_to_blk_status(ret)); + break; + } + remaining = min(geom.len, disk_io_size - cur); + while (bio || remaining) { + size_t bytes = min_t(u64, remaining, PAGE_SIZE); + + if (!bio) { + bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio->bi_iter.bi_sector = + (disk_bytenr + cur) >> SECTOR_SHIFT; + bio->bi_end_io = btrfs_encoded_read_endio; + bio->bi_private = &priv; + bio->bi_opf = REQ_OP_READ; + } + + if (!bytes || + bio_add_page(bio, pages[i], bytes, 0) < bytes) { + blk_status_t status; + + status = submit_encoded_read_bio(inode, bio, 0); + if (status) { + WRITE_ONCE(priv.status, status); + bio_put(bio); + goto out; + } + bio = NULL; + continue; + } + + i++; + cur += bytes; + remaining -= bytes; + } + } + +out: + if (atomic_dec_return(&priv.pending)) + io_wait_event(priv.wait, !atomic_read(&priv.pending)); + /* See btrfs_encoded_read_endio() for ordering. 
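
The pending counter in btrfs_encoded_read_regular_fill_pages() is
initialized to 1: the submitter holds one reference across the whole
submission loop and each in-flight bio holds another, so the waiter can
only be released after the last completion and the submitter's own drop.
A compact model of that bias pattern in C11 atomics (no real I/O; the
wait primitive itself is elided, and note that C11 fetch_sub returns the
old value where the kernel's atomic_dec_return() returns the new one):

#include <stdatomic.h>
#include <stdbool.h>

struct completion_ctx { atomic_int pending; };  /* starts at 1, not 0 */

/* Taken before each submission, like the atomic_inc() before btrfs_map_bio(). */
static void take_ref(struct completion_ctx *c)
{
        atomic_fetch_add(&c->pending, 1);
}

/* End-io side: returns true when this was the last bio, i.e. time to wake. */
static bool drop_ref_endio(struct completion_ctx *c)
{
        return atomic_fetch_sub(&c->pending, 1) - 1 == 0;
}

/* Submitter side: drop the bias; if bios remain in flight, it must wait. */
static bool drop_bias_must_wait(struct completion_ctx *c)
{
        return atomic_fetch_sub(&c->pending, 1) - 1 != 0;
}
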
*/ + return blk_status_to_errno(READ_ONCE(priv.status)); +} + +static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, + struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + bool *unlocked) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + unsigned long nr_pages, i; + u64 cur; + size_t page_offset; + ssize_t ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + ret = btrfs_alloc_page_array(nr_pages, pages); + if (ret) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, + disk_io_size, pages); + if (ret) + goto out; + + unlock_extent_cached(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + *unlocked = true; + + if (compressed) { + i = 0; + page_offset = 0; + } else { + i = (iocb->ki_pos - start) >> PAGE_SHIFT; + page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); + } + cur = 0; + while (cur < count) { + size_t bytes = min_t(size_t, count - cur, + PAGE_SIZE - page_offset); + + if (copy_page_to_iter(pages[i], page_offset, bytes, + iter) != bytes) { + ret = -EFAULT; + goto out; + } + i++; + cur += bytes; + page_offset = 0; + } + ret = count; +out: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + return ret; +} + +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + ssize_t ret; + size_t count = iov_iter_count(iter); + u64 start, lockend, disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; + struct extent_map *em; + bool unlocked = false; + + file_accessed(iocb->ki_filp); + + btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + + if (iocb->ki_pos >= inode->vfs_inode.i_size) { + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + return 0; + } + start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); + /* + * We don't know how long the extent containing iocb->ki_pos is, but if + * it's compressed we know that it won't be longer than this. + */ + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + lock_extent_bits(io_tree, start, lockend, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(io_tree, start, lockend, &cached_state); + cond_resched(); + } + + em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock_extent; + } + + if (em->block_start == EXTENT_MAP_INLINE) { + u64 extent_start = em->start; + + /* + * For inline extents we get everything we need out of the + * extent item. 
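
The copy-out loop in btrfs_encoded_read_regular() starts mid-page for
uncompressed reads: the page index and in-page offset are derived from
the gap between the requested position and the sector-aligned start of
the locked range. The same math, runnable on its own (a 4K page size is
assumed for the illustration):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE  4096ull
#define PAGE_SHIFT 12

static void walk_copy_chunks(uint64_t pos, uint64_t start, uint64_t count)
{
        uint64_t i = (pos - start) >> PAGE_SHIFT;
        uint64_t page_offset = (pos - start) & (PAGE_SIZE - 1);
        uint64_t cur = 0;

        while (cur < count) {
                uint64_t bytes = count - cur < PAGE_SIZE - page_offset ?
                                 count - cur : PAGE_SIZE - page_offset;

                printf("page %llu offset %llu len %llu\n",
                       (unsigned long long)i,
                       (unsigned long long)page_offset,
                       (unsigned long long)bytes);
                i++;
                cur += bytes;
                page_offset = 0;        /* only the first page starts mid-page */
        }
}

int main(void)
{
        walk_copy_chunks(5000 /* ki_pos */, 4096 /* aligned start */, 9000);
        return 0;
}
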
+ */ + free_extent_map(em); + em = NULL; + ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, + &cached_state, extent_start, + count, encoded, &unlocked); + goto out; + } + + /* + * We only want to return up to EOF even if the extent extends beyond + * that. + */ + encoded->len = min_t(u64, extent_map_end(em), + inode->vfs_inode.i_size) - iocb->ki_pos; + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + disk_bytenr = EXTENT_MAP_HOLE; + count = min_t(u64, count, encoded->len); + encoded->len = count; + encoded->unencoded_len = count; + } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + disk_bytenr = em->block_start; + /* + * Bail if the buffer isn't large enough to return the whole + * compressed extent. + */ + if (em->block_len > count) { + ret = -ENOBUFS; + goto out_em; + } + disk_io_size = count = em->block_len; + encoded->unencoded_len = em->ram_bytes; + encoded->unencoded_offset = iocb->ki_pos - em->orig_start; + ret = btrfs_encoded_io_compression_from_extent(fs_info, + em->compress_type); + if (ret < 0) + goto out_em; + encoded->compression = ret; + } else { + disk_bytenr = em->block_start + (start - em->start); + if (encoded->len > count) + encoded->len = count; + /* + * Don't read beyond what we locked. This also limits the page + * allocations that we'll do. + */ + disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + disk_io_size - iocb->ki_pos; + encoded->len = count; + encoded->unencoded_len = count; + disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + } + free_extent_map(em); + em = NULL; + + if (disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent_cached(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + unlocked = true; + ret = iov_iter_zero(count, iter); + if (ret != count) + ret = -EFAULT; + } else { + ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + encoded->compression, + &unlocked); + } + +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; +out_em: + free_extent_map(em); +out_unlock_extent: + if (!unlocked) + unlock_extent_cached(io_tree, start, lockend, &cached_state); +out_unlock_inode: + if (!unlocked) + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + return ret; +} + +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_changeset *data_reserved = NULL; + struct extent_state *cached_state = NULL; + int compression; + size_t orig_count; + u64 start, end; + u64 num_bytes, ram_bytes, disk_num_bytes; + unsigned long nr_pages, i; + struct page **pages; + struct btrfs_key ins; + bool extent_reserved = false; + struct extent_map *em; + ssize_t ret; + + switch (encoded->compression) { + case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: + compression = BTRFS_COMPRESS_ZLIB; + break; + case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: + compression = BTRFS_COMPRESS_ZSTD; + break; + case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: + /* The sector size must match for LZO. 
*/ + if (encoded->compression - + BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != + fs_info->sectorsize_bits) + return -EINVAL; + compression = BTRFS_COMPRESS_LZO; + break; + default: + return -EINVAL; + } + if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) + return -EINVAL; + + orig_count = iov_iter_count(from); + + /* The extent size must be sane. */ + if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || + orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) + return -EINVAL; + + /* + * The compressed data must be smaller than the decompressed data. + * + * It's of course possible for data to compress to larger or the same + * size, but the buffered I/O path falls back to no compression for such + * data, and we don't want to break any assumptions by creating these + * extents. + * + * Note that this is less strict than the current check we have that the + * compressed data must be at least one sector smaller than the + * decompressed data. We only want to enforce the weaker requirement + * from old kernels that it is at least one byte smaller. + */ + if (orig_count >= encoded->unencoded_len) + return -EINVAL; + + /* The extent must start on a sector boundary. */ + start = iocb->ki_pos; + if (!IS_ALIGNED(start, fs_info->sectorsize)) + return -EINVAL; + + /* + * The extent must end on a sector boundary. However, we allow a write + * which ends at or extends i_size to have an unaligned length; we round + * up the extent size and set i_size to the unaligned end. + */ + if (start + encoded->len < inode->vfs_inode.i_size && + !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) + return -EINVAL; + + /* Finally, the offset in the unencoded data must be sector-aligned. */ + if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) + return -EINVAL; + + num_bytes = ALIGN(encoded->len, fs_info->sectorsize); + ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); + end = start + num_bytes - 1; + + /* + * If the extent cannot be inline, the compressed data on disk must be + * sector-aligned. For convenience, we extend it with zeroes if it + * isn't. 
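
The argument checks above are all simple alignment and bound tests;
restated as one standalone predicate they are easier to reason about
(constants are illustrative; in the kernel both limits are 128K and the
sector size comes from fs_info):

#include <stdbool.h>
#include <stdint.h>

#define SECTORSIZE        4096ull
#define MAX_UNCOMPRESSED  (128ull * 1024)
#define MAX_COMPRESSED    (128ull * 1024)

#define IS_ALIGNED_TO(x, a) (((x) & ((a) - 1)) == 0)

static bool encoded_write_args_valid(uint64_t start, uint64_t len,
                                     uint64_t unencoded_len,
                                     uint64_t unencoded_offset,
                                     uint64_t compressed_count,
                                     uint64_t i_size)
{
        if (unencoded_len > MAX_UNCOMPRESSED ||
            compressed_count == 0 || compressed_count > MAX_COMPRESSED)
                return false;
        /* Compressed payload must be strictly smaller than the plain data. */
        if (compressed_count >= unencoded_len)
                return false;
        if (!IS_ALIGNED_TO(start, SECTORSIZE))
                return false;
        /* An unaligned tail is only allowed at or beyond EOF. */
        if (start + len < i_size && !IS_ALIGNED_TO(start + len, SECTORSIZE))
                return false;
        return IS_ALIGNED_TO(unencoded_offset, SECTORSIZE);
}
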
+ */ + disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); + nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!pages) + return -ENOMEM; + for (i = 0; i < nr_pages; i++) { + size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); + char *kaddr; + + pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); + if (!pages[i]) { + ret = -ENOMEM; + goto out_pages; + } + kaddr = kmap(pages[i]); + if (copy_from_iter(kaddr, bytes, from) != bytes) { + kunmap(pages[i]); + ret = -EFAULT; + goto out_pages; + } + if (bytes < PAGE_SIZE) + memset(kaddr + bytes, 0, PAGE_SIZE - bytes); + kunmap(pages[i]); + } + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); + if (ret) + goto out_pages; + ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + if (ret) + goto out_pages; + lock_extent_bits(io_tree, start, end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); + if (!ordered && + !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) + break; + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(io_tree, start, end, &cached_state); + cond_resched(); + } + + /* + * We don't use the higher-level delalloc space functions because our + * num_bytes and disk_num_bytes are different. + */ + ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); + if (ret) + goto out_unlock; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); + if (ret) + goto out_free_data_space; + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, + false); + if (ret) + goto out_qgroup_free_data; + + /* Try an inline extent first. 
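
The copy-in loop above rounds the compressed payload up to a sector
boundary and zero-fills the slack in the last page. A freestanding
version of that fill-and-pad step using plain heap pages (it assumes
nr_pages covers len, as the ALIGN/DIV_ROUND_UP pair guarantees above):

#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096u

static unsigned char **fill_and_pad(const unsigned char *src, size_t len,
                                    size_t nr_pages)
{
        unsigned char **pages = calloc(nr_pages, sizeof(*pages));
        size_t i;

        if (!pages)
                return NULL;
        for (i = 0; i < nr_pages; i++) {
                size_t bytes = len < PAGE_SIZE ? len : PAGE_SIZE;

                pages[i] = malloc(PAGE_SIZE);
                if (!pages[i]) {
                        while (i--)
                                free(pages[i]);
                        free(pages);
                        return NULL;
                }
                memcpy(pages[i], src, bytes);
                if (bytes < PAGE_SIZE)  /* zero the padding, as the hunk does */
                        memset(pages[i] + bytes, 0, PAGE_SIZE - bytes);
                src += bytes;
                len -= bytes;
        }
        return pages;
}
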
*/ + if (start == 0 && encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0) { + ret = cow_file_range_inline(inode, encoded->len, orig_count, + compression, pages, true); + if (ret <= 0) { + if (ret == 0) + ret = orig_count; + goto out_delalloc_release; + } + } + + ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, + disk_num_bytes, 0, 0, &ins, 1, 1); + if (ret) + goto out_delalloc_release; + extent_reserved = true; + + em = create_io_em(inode, start, num_bytes, + start - encoded->unencoded_offset, ins.objectid, + ins.offset, ins.offset, ram_bytes, compression, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_free_reserved; + } + free_extent_map(em); + + ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, + ins.objectid, ins.offset, + encoded->unencoded_offset, + (1 << BTRFS_ORDERED_ENCODED) | + (1 << BTRFS_ORDERED_COMPRESSED), + compression); + if (ret) { + btrfs_drop_extent_cache(inode, start, end, 0); + goto out_free_reserved; + } + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + if (start + encoded->len > inode->vfs_inode.i_size) + i_size_write(&inode->vfs_inode, start + encoded->len); + + unlock_extent_cached(io_tree, start, end, &cached_state); + + btrfs_delalloc_release_extents(inode, num_bytes); + + if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, + ins.offset, pages, nr_pages, 0, NULL, + false)) { + btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); + ret = -EIO; + goto out_pages; + } + ret = orig_count; + goto out; + +out_free_reserved: + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); +out_delalloc_release: + btrfs_delalloc_release_extents(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); +out_qgroup_free_data: + if (ret < 0) + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); +out_free_data_space: + /* + * If btrfs_reserve_extent() succeeded, then we already decremented + * bytes_may_use. + */ + if (!extent_reserved) + btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); +out_unlock: + unlock_extent_cached(io_tree, start, end, &cached_state); +out_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kvfree(pages); +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; + return ret; +} + #ifdef CONFIG_SWAP /* * Add an entry indicating a block group or device which is pinned by a @@ -10418,8 +11125,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. + * + * It is possible that subvolume is marked for deletion but still not + * removed yet. To prevent this race, we check the root status before + * activating the swapfile. 
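
The hunk that follows closes a classic check-then-use race: both the
"is this root being deleted" test and the swapfile counter increment
must happen under the same lock, or a concurrent subvolume deletion
could slip between the two steps. Schematically, with a pthread mutex
in place of root_item_lock (hypothetical structure, not the kernel one):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct root_state {
        pthread_mutex_t lock;           /* root->root_item_lock analogue */
        bool dead;                      /* btrfs_root_dead() analogue */
        unsigned int nr_swapfiles;
};

static int activate_swapfile(struct root_state *root)
{
        pthread_mutex_lock(&root->lock);
        if (root->dead) {
                pthread_mutex_unlock(&root->lock);
                return -EPERM;          /* subvolume is being deleted */
        }
        root->nr_swapfiles++;           /* deletion now sees the pin */
        pthread_mutex_unlock(&root->lock);
        return 0;
}
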
*/ + spin_lock(&root->root_item_lock); + if (btrfs_root_dead(root)) { + spin_unlock(&root->root_item_lock); + + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because subvolume %llu is being deleted", + root->root_key.objectid); + return -EPERM; + } atomic_inc(&root->nr_swapfiles); + spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); @@ -10613,6 +11335,41 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, spin_unlock(&inode->lock); } +/** + * Verify that there are no ordered extents for a given file range. + * + * @inode: The target inode. + * @start: Start offset of the file range, should be sector size aligned. + * @end: End offset (inclusive) of the file range, its value +1 should be + * sector size aligned. + * + * This should typically be used for cases where we locked an inode's VFS lock in + * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode, + * we have flushed all delalloc in the range, we have waited for all ordered + * extents in the range to complete and finally we have locked the file range in + * the inode's io_tree. + */ +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = inode->root; + struct btrfs_ordered_extent *ordered; + + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); + if (ordered) { + btrfs_err(root->fs_info, +"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", + start, end, btrfs_ino(inode), root->root_key.objectid, + ordered->file_offset, + ordered->file_offset + ordered->num_bytes - 1); + btrfs_put_ordered_extent(ordered); + } + + ASSERT(ordered == NULL); +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10661,17 +11418,17 @@ static const struct file_operations btrfs_dir_file_operations = { * For now we're avoiding this by dropping bmap. 
*/ static const struct address_space_operations btrfs_aops = { - .readpage = btrfs_readpage, + .read_folio = btrfs_read_folio, .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, - .invalidatepage = btrfs_invalidatepage, - .releasepage = btrfs_releasepage, + .invalidate_folio = btrfs_invalidate_folio, + .release_folio = btrfs_release_folio, #ifdef CONFIG_MIGRATION .migratepage = btrfs_migratepage, #endif - .set_page_dirty = btrfs_set_page_dirty, + .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8d47ec5fc4f4..0f79af919bc4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -28,6 +28,7 @@ #include <linux/iversion.h> #include <linux/fileattr.h> #include <linux/fsverity.h> +#include <linux/sched/xacct.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -88,6 +89,24 @@ struct btrfs_ioctl_send_args_32 { #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ struct btrfs_ioctl_send_args_32) + +struct btrfs_ioctl_encoded_io_args_32 { + compat_uptr_t iov; + compat_ulong_t iovcnt; + __s64 offset; + __u64 flags; + __u64 len; + __u64 unencoded_len; + __u64 unencoded_offset; + __u32 compression; + __u32 encryption; + __u8 reserved[64]; +}; + +#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) +#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) #endif /* Mask out flags that are inappropriate for the given type of inode. */ @@ -440,10 +459,8 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, } } -static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) { - struct inode *inode = file_inode(file); - return put_user(inode->i_generation, arg); } @@ -451,7 +468,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_device *device; - struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; @@ -481,14 +497,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { - if (!device->bdev) + if (!device->bdev || !bdev_max_discard_sectors(device->bdev)) continue; - q = bdev_get_queue(device->bdev); - if (blk_queue_discard(q)) { - num_devices++; - minlen = min_t(u64, q->limits.discard_granularity, - minlen); - } + num_devices++; + minlen = min_t(u64, bdev_discard_granularity(device->bdev), + minlen); } rcu_read_unlock(); @@ -527,9 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid) return 1; } +/* + * Calculate the number of transaction items to reserve for creating a subvolume + * or snapshot, not including the inode, directory entries, or parent directory. + */ +static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +{ + /* + * 1 to add root block + * 1 to add root item + * 1 to add root ref + * 1 to add root backref + * 1 to add UUID item + * 1 to add qgroup info + * 1 to add qgroup limit + * + * Ideally the last two would only be accounted if qgroups are enabled, + * but that can change between now and the time we would insert them. 
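
A quick cross-check of these counts (plain C; inherited_qgroups stands
for inherit->num_qgroups, and the +3 for snapshots matches the dir item,
dir index and parent inode update listed in create_snapshot() below):

/* Mirrors create_subvol_num_items(): 7 base items plus two qgroup
 * relation items per inherited qgroup. */
static unsigned int subvol_num_items(unsigned int inherited_qgroups)
{
        return 7 + 2 * inherited_qgroups;
}

static unsigned int snapshot_num_items(unsigned int inherited_qgroups)
{
        return subvol_num_items(inherited_qgroups) + 3;
}
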
+ */ + unsigned int num_items = 7; + + if (inherit) { + /* 2 to add qgroup relations for each inherited qgroup */ + num_items += 2 * inherit->num_qgroups; + } + return num_items; +} + static noinline int create_subvol(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, - const char *name, int namelen, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -542,11 +581,15 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec64 cur_time = current_time(dir); - struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .subvol = true, + }; + unsigned int trans_num_items; int ret; - dev_t anon_dev = 0; + dev_t anon_dev; u64 objectid; - u64 index = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -554,11 +597,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) - goto fail_free; - - ret = get_anon_bdev(&anon_dev); - if (ret < 0) - goto fail_free; + goto out_root_item; /* * Don't create subvolume whose level is not zero. Or qgroup will be @@ -566,36 +605,47 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, */ if (btrfs_qgroup_level(objectid)) { ret = -ENOSPC; - goto fail_free; + goto out_root_item; } + ret = get_anon_bdev(&anon_dev); + if (ret < 0) + goto out_root_item; + + new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir); + if (!new_inode_args.inode) { + ret = -ENOMEM; + goto out_anon_dev; + } + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) + goto out_inode; + trans_num_items += create_subvol_num_items(inherit); + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); - /* - * The same as the snapshot creation, please see the comment - * of create_snapshot(). - */ - ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + trans_num_items, false); if (ret) - goto fail_free; + goto out_new_inode_args; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv); - goto fail_free; + goto out_new_inode_args; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); if (ret) - goto fail; + goto out; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); - goto fail; + goto out; } btrfs_mark_buffer_dirty(leaf); @@ -650,75 +700,46 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); - goto fail; + goto out; } free_extent_buffer(leaf); leaf = NULL; - key.offset = (u64)-1; new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); if (IS_ERR(new_root)) { - free_anon_bdev(anon_dev); ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - /* Freeing will be done in btrfs_put_root() of new_root */ + /* anon_dev is owned by new_root now. */ anon_dev = 0; + BTRFS_I(new_inode_args.inode)->root = new_root; + /* ... and new_root is owned by new_inode_args.inode now. 
*/ ret = btrfs_record_root_in_trans(trans, new_root); if (ret) { - btrfs_put_root(new_root); - btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns); - btrfs_put_root(new_root); - if (ret) { - /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, ret); - goto fail; - } - - /* - * insert the directory item - */ - ret = btrfs_set_inode_index(BTRFS_I(dir), &index); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, - BTRFS_FT_DIR, index); - if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + ret = btrfs_uuid_tree_add(trans, root_item->uuid, + BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, - btrfs_ino(BTRFS_I(dir)), index, name, namelen); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - ret = btrfs_uuid_tree_add(trans, root_item->uuid, - BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) - btrfs_abort_transaction(trans, ret); + d_instantiate_new(dentry, new_inode_args.inode); + new_inode_args.inode = NULL; -fail: - kfree(root_item); +out: trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); @@ -727,18 +748,14 @@ fail: btrfs_end_transaction(trans); else ret = btrfs_commit_transaction(trans); - - if (!ret) { - inode = btrfs_lookup_dentry(dir, dentry); - if (IS_ERR(inode)) - return PTR_ERR(inode); - d_instantiate(dentry, inode); - } - return ret; - -fail_free: +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + iput(new_inode_args.inode); +out_anon_dev: if (anon_dev) free_anon_bdev(anon_dev); +out_root_item: kfree(root_item); return ret; } @@ -750,9 +767,17 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; + unsigned int trans_num_items; struct btrfs_trans_handle *trans; int ret; + /* We do not support snapshotting right now. 
*/ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_warn(fs_info, + "extent tree v2 doesn't support snapshotting yet"); + return -EOPNOTSUPP; + } + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; @@ -780,16 +805,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* - * 1 - parent dir inode - * 2 - dir entries - * 1 - root item - * 2 - root ref/backref - * 1 - root of snapshot - * 1 - UUID item + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item */ + trans_num_items = create_subvol_num_items(inherit) + 3; ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, 8, - false); + &pending_snapshot->block_rsv, + trans_num_items, false); if (ret) goto free_pending; @@ -959,7 +982,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit); + error = create_subvol(mnt_userns, dir, dentry, inherit); if (!error) fsnotify_mkdir(dir, dentry); @@ -1215,7 +1238,7 @@ static u32 get_extent_max_capacity(const struct extent_map *em) } static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, - bool locked) + u32 extent_thresh, u64 newer_than, bool locked) { struct extent_map *next; bool ret = false; @@ -1225,11 +1248,12 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, return false; /* - * We want to check if the next extent can be merged with the current - * one, which can be an extent created in a past generation, so we pass - * a minimum generation of 0 to defrag_lookup_extent(). + * Here we need to pass @newer_then when checking the next extent, or + * we will hit a case we mark current extent for defrag, but the next + * one will not be a target. + * This will just cause extra IO without really reducing the fragments. */ - next = defrag_lookup_extent(inode, em->start + em->len, 0, locked); + next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) goto out; @@ -1241,6 +1265,13 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, */ if (next->len >= get_extent_max_capacity(em)) goto out; + /* Skip older extent */ + if (next->generation < newer_than) + goto out; + /* Also check extent size */ + if (next->len >= extent_thresh) + goto out; + ret = true; out: free_extent_map(next); @@ -1327,7 +1358,7 @@ again: * make it uptodate. */ if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != mapping || !PagePrivate(page)) { unlock_page(page); @@ -1385,8 +1416,19 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (!em) break; - /* Skip hole/inline/preallocated extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE || + /* + * If the file extent is an inlined one, we may still want to + * defrag it (fallthrough) if it will cause a regular extent. + * This is for users who want to convert inline extents to + * regular ones through max_inline= mount option. 
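
The next-extent test in the defrag rework now applies the same filters
as the current extent, so an extent is only kept as a target when its
neighbour is itself defraggable. The filters, restated as a standalone
predicate over a simplified extent descriptor (field names are
illustrative, not the extent_map layout):

#include <stdbool.h>
#include <stdint.h>

struct ext {
        uint64_t len;
        uint64_t generation;
        bool     is_hole_or_inline;
};

static bool next_extent_mergeable(const struct ext *next, uint64_t newer_than,
                                  uint32_t extent_thresh, uint64_t max_capacity)
{
        if (!next || next->is_hole_or_inline)
                return false;
        if (next->len >= max_capacity)          /* already at max size */
                return false;
        if (next->generation < newer_than)      /* too old to be a target */
                return false;
        if (next->len >= extent_thresh)         /* already large enough */
                return false;
        return true;
}
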
+ */ + if (em->block_start == EXTENT_MAP_INLINE && + em->len <= inode->root->fs_info->max_inline) + goto next; + + /* Skip hole/delalloc/preallocated extents */ + if (em->block_start == EXTENT_MAP_HOLE || + em->block_start == EXTENT_MAP_DELALLOC || test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) goto next; @@ -1445,8 +1487,17 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (em->len >= get_extent_max_capacity(em)) goto next; + /* + * Normally there are no more extents after an inline one, thus + * @next_mergeable will normally be false and not defragged. + * So if an inline extent passed all above checks, just add it + * for defrag, and be converted to regular extents. + */ + if (em->block_start == EXTENT_MAP_INLINE) + goto add; + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, - locked); + extent_thresh, newer_than, locked); if (!next_mergeable) { struct defrag_target_range *last; @@ -1522,6 +1573,7 @@ next: } #define CLUSTER_SIZE (SZ_256K) +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); /* * Defrag one contiguous target range. @@ -1667,7 +1719,6 @@ static int defrag_one_cluster(struct btrfs_inode *inode, LIST_HEAD(target_list); int ret; - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, false, &target_list, NULL); @@ -1810,9 +1861,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 last_scanned = cur; u64 cluster_end; - /* The cluster size 256K should always be page aligned */ - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); - if (btrfs_defrag_cancelled(fs_info)) { ret = -EAGAIN; break; @@ -2229,10 +2277,9 @@ free_args: return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct file *file, +static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -2537,7 +2584,12 @@ static noinline int search_ioctl(struct inode *inode, while (1) { ret = -EFAULT; - if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset)) + /* + * Ensure that the whole user buffer is faulted in at sub-page + * granularity, otherwise the loop may live-lock. 
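
The sub-page fault-in matters because the retry loop only makes progress
if the fault-in step probes at the same granularity at which the copy can
fault (sub-page faults are possible with, e.g., arm64 memory tagging);
otherwise fault-in "succeeds", the copy faults, and the loop spins
forever. A schematic of the loop shape with trivial stub helpers (the
stubs are placeholders, not kernel functions):

#include <stdbool.h>
#include <stddef.h>

static bool fault_in_subpage(void *buf, size_t len)
{
        (void)buf; (void)len;
        return true;            /* pretend the buffer is now writable */
}

static int copy_results(void *buf, size_t len)
{
        (void)buf; (void)len;
        return 0;               /* would return -1 on a sub-page fault */
}

static int search_loop(void *buf, size_t len)
{
        for (;;) {
                if (!fault_in_subpage(buf, len))
                        return -1;              /* -EFAULT */
                if (copy_results(buf, len) == 0)
                        return 0;
                /* copy faulted: fault the buffer in again and retry */
        }
}
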
+ */ + if (fault_in_subpage_writeable(ubuf + sk_offset, + *buf_size - sk_offset)) break; ret = btrfs_search_forward(root, &key, path, sk->min_transid); @@ -2562,26 +2614,22 @@ err: return ret; } -static noinline int btrfs_ioctl_tree_search(struct file *file, - void __user *argp) +static noinline int btrfs_ioctl_tree_search(struct inode *inode, + void __user *argp) { - struct btrfs_ioctl_search_args __user *uargs; + struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; - struct inode *inode; int ret; size_t buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - uargs = (struct btrfs_ioctl_search_args __user *)argp; - if (copy_from_user(&sk, &uargs->key, sizeof(sk))) return -EFAULT; buf_size = sizeof(uargs->buf); - inode = file_inode(file); ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); /* @@ -2596,12 +2644,11 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct file *file, +static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, void __user *argp) { - struct btrfs_ioctl_search_args_v2 __user *uarg; + struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; - struct inode *inode; int ret; size_t buf_size; const size_t buf_limit = SZ_16M; @@ -2610,7 +2657,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, return -EPERM; /* copy search header and buffer size */ - uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; if (copy_from_user(&args, uarg, sizeof(args))) return -EFAULT; @@ -2620,7 +2666,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, if (buf_size > buf_limit) buf_size = buf_limit; - inode = file_inode(file); ret = search_ioctl(inode, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) @@ -2871,25 +2916,22 @@ out: return ret; } -static noinline int btrfs_ioctl_ino_lookup(struct file *file, +static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_ino_lookup_args *args; - struct inode *inode; int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) return PTR_ERR(args); - inode = file_inode(file); - /* * Unprivileged query to obtain the containing subvolume root id. The * path is reset so it's consistent with btrfs_search_path_in_tree. 
*/ if (args->treeid == 0) - args->treeid = BTRFS_I(inode)->root->root_key.objectid; + args->treeid = root->root_key.objectid; if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; @@ -2901,7 +2943,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, goto out; } - ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, + ret = btrfs_search_path_in_tree(root->fs_info, args->treeid, args->objectid, args->name); @@ -2957,7 +2999,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) } /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ -static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) +static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) { struct btrfs_ioctl_get_subvol_info_args *subvol_info; struct btrfs_fs_info *fs_info; @@ -2969,7 +3011,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) struct extent_buffer *leaf; unsigned long item_off; unsigned long item_len; - struct inode *inode; int slot; int ret = 0; @@ -2983,7 +3024,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) return -ENOMEM; } - inode = file_inode(file); fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ @@ -3077,15 +3117,14 @@ out_free: * Return ROOT_REF information of the subvolume containing this inode * except the subvolume name. */ -static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) +static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, + void __user *argp) { struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; struct btrfs_root_ref *rref; - struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; - struct inode *inode; u64 objectid; int slot; int ret; @@ -3101,15 +3140,13 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) return PTR_ERR(rootrefs); } - inode = file_inode(file); - root = BTRFS_I(inode)->root->fs_info->tree_root; - objectid = BTRFS_I(inode)->root->root_key.objectid; - + objectid = root->root_key.objectid; key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; found = 0; + root = root->fs_info->tree_root; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { goto out; @@ -3189,6 +3226,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int err = 0; bool destroy_parent = false; + /* We don't support snapshots with extent tree v2 yet. 
*/ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } + if (destroy_v2) { vol_args2 = memdup_user(arg, sizeof(*vol_args2)); if (IS_ERR(vol_args2)) @@ -3464,6 +3508,11 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device add not supported on extent tree v2 yet"); + return -EINVAL; + } + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; @@ -3989,6 +4038,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + return -EINVAL; + } + sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); @@ -4088,6 +4142,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device replace not supported on extent tree v2 yet"); + return -EINVAL; + } + p = memdup_user(arg, sizeof(*p)); if (IS_ERR(p)) return PTR_ERR(p); @@ -4305,10 +4364,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bool need_unlock; /* for mut. excl. ops lock */ int ret; - if (!arg) - btrfs_warn(fs_info, - "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18"); - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4316,6 +4371,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + bargs = NULL; + goto out; + } + again: if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { mutex_lock(&fs_info->balance_mutex); @@ -4363,59 +4425,42 @@ again: } locked: - - if (arg) { - bargs = memdup_user(arg, sizeof(*bargs)); - if (IS_ERR(bargs)) { - ret = PTR_ERR(bargs); + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; goto out_unlock; } - if (bargs->flags & BTRFS_BALANCE_RESUME) { - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out_bargs; - } + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); - bctl = fs_info->balance_ctl; - spin_lock(&fs_info->balance_lock); - bctl->flags |= BTRFS_BALANCE_RESUME; - spin_unlock(&fs_info->balance_lock); - btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); + goto do_balance; + } - goto do_balance; - } - } else { - bargs = NULL; + if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { + ret = -EINVAL; + goto out_unlock; } if (fs_info->balance_ctl) { ret = -EINPROGRESS; - goto out_bargs; + goto out_unlock; } bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; - goto out_bargs; - } - - if (arg) { - memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); - memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); - memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); - - bctl->flags = bargs->flags; - } else { - /* balance everything - no filters */ - bctl->flags |= BTRFS_BALANCE_TYPE_MASK; + goto out_unlock; } - if 
(bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { - ret = -EINVAL; - goto out_bctl; - } + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + bctl->flags = bargs->flags; do_balance: /* * Ownership of bctl and exclusive operation goes to btrfs_balance. @@ -4428,21 +4473,19 @@ do_balance: ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; - if ((ret == 0 || ret == -ECANCELED) && arg) { + if (ret == 0 || ret == -ECANCELED) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } -out_bctl: kfree(bctl); -out_bargs: - kfree(bargs); out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); + kfree(bargs); return ret; } @@ -5149,7 +5192,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -5179,11 +5222,194 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(file, arg); + ret = btrfs_ioctl_send(inode, arg); kfree(arg); return ret; } +static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, + bool compat) +{ + struct btrfs_ioctl_encoded_io_args args = { 0 }; + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, + flags); + size_t copy_end; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, + flags); + if (copy_from_user(&args32, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + if (copy_from_user(&args, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + if (args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_iov; + } + pos = args.offset; + ret = rw_verify_area(READ, file, &pos, args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + ret = btrfs_encoded_read(&kiocb, &iter, &args); + if (ret >= 0) { + fsnotify_access(file); + if (copy_to_user(argp + copy_end, + (char *)&args + copy_end_kernel, + sizeof(args) - copy_end_kernel)) + ret = -EFAULT; + } + +out_iov: + kfree(iov); +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) +{ + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; 
+ goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, argp, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; + args.len = args32.len; + args.unencoded_len = args32.unencoded_len; + args.unencoded_offset = args32.unencoded_offset; + args.compression = args32.compression; + args.encryption = args32.encryption; + memcpy(args.reserved, args32.reserved, sizeof(args.reserved)); +#else + return -ENOTTY; +#endif + } else { + if (copy_from_user(&args, argp, sizeof(args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (args.flags != 0) + goto out_acct; + if (memchr_inv(args.reserved, 0, sizeof(args.reserved))) + goto out_acct; + if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (args.unencoded_offset > args.unencoded_len) + goto out_acct; + if (args.len > args.unencoded_len - args.unencoded_offset) + goto out_acct; + + ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + file_start_write(file); + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_end_write; + } + pos = args.offset; + ret = rw_verify_area(WRITE, file, &pos, args.len); + if (ret < 0) + goto out_end_write; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0); + if (ret) + goto out_end_write; + kiocb.ki_pos = pos; + + ret = btrfs_do_write_iter(&kiocb, &iter, &args); + if (ret > 0) + fsnotify_modify(file); + +out_end_write: + file_end_write(file); + kfree(iov); +out_acct: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -5194,7 +5420,7 @@ long btrfs_ioctl(struct file *file, unsigned int switch (cmd) { case FS_IOC_GETVERSION: - return btrfs_ioctl_getversion(file, argp); + return btrfs_ioctl_getversion(inode, argp); case FS_IOC_GETFSLABEL: return btrfs_ioctl_get_fslabel(fs_info, argp); case FS_IOC_SETFSLABEL: @@ -5214,7 +5440,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(file, argp); + return btrfs_ioctl_subvol_getflags(inode, argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -5235,14 +5461,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_fs_info(fs_info, argp); case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); - case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(file, argp); + return btrfs_ioctl_tree_search(inode, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(file, argp); + return btrfs_ioctl_tree_search_v2(inode, argp); case BTRFS_IOC_INO_LOOKUP: - return btrfs_ioctl_ino_lookup(file, argp); + return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: @@ -5289,10 +5513,10 @@ long 
btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(file, argp, false); + return _btrfs_ioctl_send(inode, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(file, argp, true); + return _btrfs_ioctl_send(inode, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); @@ -5319,15 +5543,25 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); case BTRFS_IOC_GET_SUBVOL_INFO: - return btrfs_ioctl_get_subvol_info(file, argp); + return btrfs_ioctl_get_subvol_info(inode, argp); case BTRFS_IOC_GET_SUBVOL_ROOTREF: - return btrfs_ioctl_get_subvol_rootref(file, argp); + return btrfs_ioctl_get_subvol_rootref(root, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); case FS_IOC_ENABLE_VERITY: return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case BTRFS_IOC_ENCODED_READ: + return btrfs_ioctl_encoded_read(file, argp, false); + case BTRFS_IOC_ENCODED_WRITE: + return btrfs_ioctl_encoded_write(file, argp, false); +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: + return btrfs_ioctl_encoded_read(file, argp, true); + case BTRFS_IOC_ENCODED_WRITE_32: + return btrfs_ioctl_encoded_write(file, argp, true); +#endif } return -ENOTTY; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 313d9d685adb..33461b4f9c8b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne start_ns = ktime_get_ns(); down_read_nested(&eb->lock, nest); - eb->lock_owner = current->pid; trace_btrfs_tree_read_lock(eb, start_ns); } @@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { - eb->lock_owner = current->pid; trace_btrfs_try_tree_read_lock(eb); return 1; } @@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb) { trace_btrfs_tree_read_unlock(eb); - eb->lock_owner = 0; up_read(&eb->lock); } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index e6e28a9c7987..430ad36b8b08 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -55,6 +55,9 @@ * 0x1000 | SegHdr N+1| Data payload N+1 ... 
| */ +#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) +#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) + struct workspace { void *mem; void *buf; /* where decompressed data goes */ @@ -83,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level) return ERR_PTR(-ENOMEM); workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); - workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); + workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL); + workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -380,7 +383,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) kunmap(cur_page); cur_in += LZO_LEN; - if (seg_len > lzo1x_worst_compress(PAGE_SIZE)) { + if (seg_len > WORKSPACE_CBUF_LENGTH) { /* * seg_len shouldn't be larger than we have allocated * for workspace->cbuf @@ -433,7 +436,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, struct workspace *workspace = list_entry(ws, struct workspace, list); size_t in_len; size_t out_len; - size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); + size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; char *kaddr; unsigned long bytes; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6b51fd2ec5ac..1957b14b329a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -143,16 +143,28 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/* - * Allocate and add a new ordered_extent into the per-inode tree. +/** + * Add an ordered extent to the per-inode tree. + * + * @inode: Inode that this extent is for. + * @file_offset: Logical offset in file where the extent starts. + * @num_bytes: Logical length of extent in file. + * @ram_bytes: Full length of unencoded data. + * @disk_bytenr: Offset of extent on disk. + * @disk_num_bytes: Size of extent on disk. + * @offset: Offset into unencoded data where file data starts. + * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @compress_type: Compression algorithm used for data. * - * The tree is given a single reference on the ordered extent that was - * inserted. + * Most of these parameters correspond to &struct btrfs_file_extent_item. The + * tree is given a single reference on the ordered extent that was inserted. + * + * Return: 0 or -ENOMEM. 
*/ -static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, int dio, - int compress_type) +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned flags, + int compress_type) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -161,7 +173,8 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset struct btrfs_ordered_extent *entry; int ret; - if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) { + if (flags & + ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); if (ret < 0) @@ -181,9 +194,11 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset return -ENOMEM; entry->file_offset = file_offset; - entry->disk_bytenr = disk_bytenr; entry->num_bytes = num_bytes; + entry->ram_bytes = ram_bytes; + entry->disk_bytenr = disk_bytenr; entry->disk_num_bytes = disk_num_bytes; + entry->offset = offset; entry->bytes_left = num_bytes; entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; @@ -191,18 +206,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->qgroup_rsv = ret; entry->physical = (u64)-1; - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC || - type == BTRFS_ORDERED_COMPRESSED); - set_bit(type, &entry->flags); + ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); + entry->flags = flags; percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, fs_info->delalloc_batch); - if (dio) - set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); - /* one ref for the tree */ refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -247,41 +256,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset return 0; } -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, - int type) -{ - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 0, - BTRFS_COMPRESS_NONE); -} - -int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type) -{ - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 1, - BTRFS_COMPRESS_NONE); -} - -int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int compress_type) -{ - ASSERT(compress_type != BTRFS_COMPRESS_NONE); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, - BTRFS_ORDERED_COMPRESSED, 0, - compress_type); -} - /* * Add a struct btrfs_ordered_sum into the list of checksums to be inserted * when an ordered extent is finished. 
If the list covers more than one @@ -548,9 +522,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); - if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, - false); + if (root != fs_info->tree_root) { + u64 release; + + if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags)) + release = entry->disk_num_bytes; + else + release = entry->num_bytes; + btrfs_delalloc_release_metadata(btrfs_inode, release, false); + } percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); @@ -1052,42 +1032,18 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; u64 file_offset = ordered->file_offset + pos; u64 disk_bytenr = ordered->disk_bytenr + pos; - u64 num_bytes = len; - u64 disk_num_bytes = len; - int type; - unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT); - int compress_type = ordered->compress_type; - unsigned long weight; - int ret; - - weight = hweight_long(flags_masked); - WARN_ON_ONCE(weight > 1); - if (!weight) - type = 0; - else - type = __ffs(flags_masked); + unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; /* - * The splitting extent is already counted and will be added again - * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid - * double counting. + * The splitting extent is already counted and will be added again in + * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting. */ - percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes, + percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); - if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { - WARN_ON_ONCE(1); - ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), - file_offset, disk_bytenr, num_bytes, - disk_num_bytes, compress_type); - } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset, - disk_bytenr, num_bytes, disk_num_bytes, type); - } else { - ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, - disk_bytenr, num_bytes, disk_num_bytes, type); - } - - return ret; + WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED)); + return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, + disk_bytenr, len, 0, flags, + ordered->compress_type); } int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4194e960ff61..ecad67a2c745 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -74,8 +74,18 @@ enum { BTRFS_ORDERED_LOGGED_CSUM, /* We wait for this extent to complete in the current transaction */ BTRFS_ORDERED_PENDING, + /* BTRFS_IOC_ENCODED_WRITE */ + BTRFS_ORDERED_ENCODED, }; +/* BTRFS_ORDERED_* flags that specify the type of the extent. */ +#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \ + (1UL << BTRFS_ORDERED_NOCOW) | \ + (1UL << BTRFS_ORDERED_PREALLOC) | \ + (1UL << BTRFS_ORDERED_COMPRESSED) | \ + (1UL << BTRFS_ORDERED_DIRECT) | \ + (1UL << BTRFS_ORDERED_ENCODED)) + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -84,9 +94,11 @@ struct btrfs_ordered_extent { * These fields directly correspond to the same fields in * btrfs_file_extent_item. 
*/ - u64 disk_bytenr; u64 num_bytes; + u64 ram_bytes; + u64 disk_bytenr; u64 disk_num_bytes; + u64 offset; /* number of bytes that still need writing */ u64 bytes_left; @@ -179,14 +191,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, - int type); -int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type); -int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int compress_type); + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned flags, + int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0775ae9f4419..dd8777872143 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -23,6 +23,7 @@ static const struct root_name_map root_map[] = { { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, }; @@ -391,9 +392,9 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) btrfs_header_owner(c), btrfs_node_ptr_generation(c, i), level - 1, &first_key); - if (IS_ERR(next)) { + if (IS_ERR(next)) continue; - } else if (!extent_buffer_uptodate(next)) { + if (!extent_buffer_uptodate(next)) { free_extent_buffer(next); continue; } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 1a6d2d5b4b33..a2ec8ecae8de 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -17,9 +17,11 @@ static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); struct prop_handler { struct hlist_node node; const char *xattr_name; - int (*validate)(const char *value, size_t len); + int (*validate)(const struct btrfs_inode *inode, const char *value, + size_t len); int (*apply)(struct inode *inode, const char *value, size_t len); const char *(*extract)(struct inode *inode); + bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -55,7 +57,8 @@ find_prop_handler(const char *name, return NULL; } -int btrfs_validate_prop(const char *name, const char *value, size_t value_len) +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len) { const struct prop_handler *handler; @@ -69,7 +72,29 @@ int btrfs_validate_prop(const char *name, const char *value, size_t value_len) if (value_len == 0) return 0; - return handler->validate(value, value_len); + return handler->validate(inode, value, value_len); +} + +/* + * Check if a property should be ignored (not set) for an inode. + * + * @inode: The target inode. + * @name: The property's name. + * + * The caller must be sure the given property name is valid, for example by + * having previously called btrfs_validate_prop(). 
+ * + * Returns: true if the property should be ignored for the given inode + * false if the property must not be ignored for the given inode + */ +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name) +{ + const struct prop_handler *handler; + + handler = find_prop_handler(name, NULL); + ASSERT(handler != NULL); + + return handler->ignore(inode); } int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, @@ -252,8 +277,12 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) return ret; } -static int prop_compression_validate(const char *value, size_t len) +static int prop_compression_validate(const struct btrfs_inode *inode, + const char *value, size_t len) { + if (!btrfs_inode_can_compress(inode)) + return -EINVAL; + if (!value) return 0; @@ -310,6 +339,22 @@ static int prop_compression_apply(struct inode *inode, const char *value, return 0; } +static bool prop_compression_ignore(const struct btrfs_inode *inode) +{ + /* + * Compression only has effect for regular files, and for directories + * we set it just to propagate it to new files created inside them. + * Everything else (symlinks, devices, sockets, fifos) is pointless as + * it will do nothing, so don't waste metadata space on a compression + * xattr for anything that is neither a file nor a directory. + */ + if (!S_ISREG(inode->vfs_inode.i_mode) && + !S_ISDIR(inode->vfs_inode.i_mode)) + return true; + + return false; +} + static const char *prop_compression_extract(struct inode *inode) { switch (BTRFS_I(inode)->prop_compress) { @@ -330,13 +375,13 @@ static struct prop_handler prop_handlers[] = { .validate = prop_compression_validate, .apply = prop_compression_apply, .extract = prop_compression_extract, + .ignore = prop_compression_ignore, .inheritable = 1 }, }; -static int inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - struct inode *parent) +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *parent) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -356,6 +401,9 @@ static int inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; + if (h->ignore(BTRFS_I(inode))) + continue; + value = h->extract(parent); if (!value) continue; @@ -364,7 +412,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. 
*/ - ret = h->validate(value, strlen(value)); + ret = h->validate(BTRFS_I(inode), value, strlen(value)); if (ret) continue; @@ -408,41 +456,6 @@ static int inherit_props(struct btrfs_trans_handle *trans, return 0; } -int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - struct inode *dir) -{ - if (!dir) - return 0; - - return inherit_props(trans, inode, dir); -} - -int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *parent_root) -{ - struct super_block *sb = root->fs_info->sb; - struct inode *parent_inode, *child_inode; - int ret; - - parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root); - if (IS_ERR(parent_inode)) - return PTR_ERR(parent_inode); - - child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root); - if (IS_ERR(child_inode)) { - iput(parent_inode); - return PTR_ERR(child_inode); - } - - ret = inherit_props(trans, child_inode, parent_inode); - iput(child_inode); - iput(parent_inode); - - return ret; -} - void __init btrfs_props_init(void) { int i; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 40b2c65b518c..ca9dd3df129b 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -13,7 +13,9 @@ void __init btrfs_props_init(void); int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, int flags); -int btrfs_validate_prop(const char *name, const char *value, size_t value_len); +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len); +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); @@ -21,8 +23,4 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); -int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *parent_root); - #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 30d42ea655ce..db723c0026bd 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -25,18 +25,6 @@ #include "sysfs.h" #include "tree-mod-log.h" -/* TODO XXX FIXME - * - subvol delete -> delete when ref goes to 0? delete limits also? - * - reorganize keys - * - compressed - * - sync - * - copy also limits on subvol creation - * - limit - * - caches for ulists - * - performance benchmarks - * - check all ioctl parameters - */ - /* * Helpers to access qgroup reservation * @@ -258,16 +246,19 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return 0; } -/* must be called with qgroup_lock held */ -static int add_relation_rb(struct btrfs_fs_info *fs_info, - u64 memberid, u64 parentid) +/* + * Add relation specified by two qgroups. + * + * Must be called with qgroup_lock held. + * + * Return: 0 on success + * -ENOENT if one of the qgroups is NULL + * <0 other errors + */ +static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) { - struct btrfs_qgroup *member; - struct btrfs_qgroup *parent; struct btrfs_qgroup_list *list; - member = find_qgroup_rb(fs_info, memberid); - parent = find_qgroup_rb(fs_info, parentid); if (!member || !parent) return -ENOENT; @@ -283,7 +274,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, return 0; } -/* must be called with qgroup_lock held */ +/* + * Add relation specified by two qgoup ids. + * + * Must be called with qgroup_lock held. 
+ * + * Return: 0 on success + * -ENOENT if one of the ids does not exist + * <0 other errors + */ +static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + + return __add_relation_rb(member, parent); +} + +/* Must be called with qgroup_lock held */ static int del_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) { @@ -948,6 +959,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) */ lockdep_assert_held_write(&fs_info->subvol_sem); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "qgroups are currently unsupported in extent tree v2"); + return -EINVAL; + } + mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out; @@ -1451,7 +1468,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, } spin_lock(&fs_info->qgroup_lock); - ret = add_relation_rb(fs_info, src, dst); + ret = __add_relation_rb(member, parent); if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); goto out; @@ -2273,7 +2290,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, return 0; if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL); + ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL); if (ret) goto out; } @@ -3268,7 +3285,8 @@ out: static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { return btrfs_fs_closing(fs_info) || - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || + !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3298,11 +3316,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = PTR_ERR(trans); break; } - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - err = -EINTR; - } else { - err = qgroup_rescan_leaf(trans, path); - } + + err = qgroup_rescan_leaf(trans, path); + if (err > 0) btrfs_commit_transaction(trans); else @@ -3316,7 +3332,7 @@ out: if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - } else if (err < 0) { + } else if (err < 0 || stopped) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3923,12 +3939,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, } int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce) + enum btrfs_qgroup_rsv_type type, bool enforce, + bool noflush) { int ret; ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); - if (ret <= 0 && ret != -EDQUOT) + if ((ret <= 0 && ret != -EDQUOT) || noflush) return ret; ret = try_flush_qgroup(root); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 880e9df0dac1..0c4dd2a9af96 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -364,19 +364,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce); + enum btrfs_qgroup_rsv_type type, bool enforce, + bool noflush); /* Reserve metadata space for pertrans 
and prealloc type */ static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, int num_bytes, bool enforce) { return __btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PERTRANS, enforce); + BTRFS_QGROUP_RSV_META_PERTRANS, + enforce, false); } static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, - int num_bytes, bool enforce) + int num_bytes, bool enforce, + bool noflush) { return __btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PREALLOC, enforce); + BTRFS_QGROUP_RSV_META_PREALLOC, + enforce, noflush); } void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0e239a4c3b26..a5b623ee6fac 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -52,6 +52,17 @@ struct btrfs_stripe_hash_table { struct btrfs_stripe_hash table[]; }; +/* + * A bvec like structure to present a sector inside a page. + * + * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. + */ +struct sector_ptr { + struct page *page; + unsigned int pgoff:24; + unsigned int uptodate:8; +}; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, @@ -77,7 +88,7 @@ struct btrfs_raid_bio { /* * for scheduling work in the helper threads */ - struct btrfs_work work; + struct work_struct work; /* * bio list and bio_list_lock are used @@ -101,15 +112,6 @@ struct btrfs_raid_bio { */ unsigned long flags; - /* size of each individual stripe on disk */ - int stripe_len; - - /* number of data stripes (no p/q) */ - int nr_data; - - int real_stripes; - - int stripe_npages; /* * set if we're doing a parity rebuild * for a read from higher up, which is handled @@ -118,18 +120,35 @@ struct btrfs_raid_bio { */ enum btrfs_rbio_ops operation; - /* first bad stripe */ - int faila; + /* Size of each individual stripe on disk */ + u32 stripe_len; - /* second bad stripe (for raid6 use) */ - int failb; + /* How many pages there are for the full stripe including P/Q */ + u16 nr_pages; - int scrubp; - /* - * number of pages needed to represent the full - * stripe - */ - int nr_pages; + /* How many sectors there are for the full stripe including P/Q */ + u16 nr_sectors; + + /* Number of data stripes (no p/q) */ + u8 nr_data; + + /* Numer of all stripes (including P/Q) */ + u8 real_stripes; + + /* How many pages there are for each stripe */ + u8 stripe_npages; + + /* How many sectors there are for each stripe */ + u8 stripe_nsectors; + + /* First bad stripe, -1 means no corruption */ + s8 faila; + + /* Second bad stripe (for RAID6 use) */ + s8 failb; + + /* Stripe number that we're scrubbing */ + u8 scrubp; /* * size of all the bios in the bio_list. This @@ -156,28 +175,29 @@ struct btrfs_raid_bio { */ struct page **stripe_pages; - /* - * pointers to the pages in the bio_list. Stored - * here for faster lookup - */ - struct page **bio_pages; + /* Pointers to the sectors in the bio_list, for faster lookup */ + struct sector_ptr *bio_sectors; /* - * bitmap to record which horizontal stripe has data + * For subpage support, we need to map each sector to above + * stripe_pages. 
*/ + struct sector_ptr *stripe_sectors; + + /* Bitmap to record which horizontal stripe has data */ unsigned long *dbitmap; /* allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; - /* allocated with stripe_npages-many bits for finish_*() calls */ + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ unsigned long *finish_pbitmap; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct btrfs_work *work); -static void read_rebuild_work(struct btrfs_work *work); +static void rmw_work(struct work_struct *work); +static void read_rebuild_work(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void __free_raid_bio(struct btrfs_raid_bio *rbio); @@ -186,12 +206,12 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); -static void scrub_parity_work(struct btrfs_work *work); +static void scrub_parity_work(struct work_struct *work); -static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) +static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { - btrfs_init_work(&rbio->work, work_func, NULL, NULL); - btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); + INIT_WORK(&rbio->work, work_func); + queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); } /* @@ -239,7 +259,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) /* * caching an rbio means to copy anything from the - * bio_pages array into the stripe_pages array. We + * bio_sectors array into the stripe_pages array. We * use the page uptodate bit in the stripe cache array * to indicate if it has valid data * @@ -255,12 +275,18 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) if (ret) return; - for (i = 0; i < rbio->nr_pages; i++) { - if (!rbio->bio_pages[i]) + for (i = 0; i < rbio->nr_sectors; i++) { + /* Some range not covered by bio (partial write), skip it */ + if (!rbio->bio_sectors[i].page) continue; - copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]); - SetPageUptodate(rbio->stripe_pages[i]); + ASSERT(rbio->stripe_sectors[i].page); + memcpy_page(rbio->stripe_sectors[i].page, + rbio->stripe_sectors[i].pgoff, + rbio->bio_sectors[i].page, + rbio->bio_sectors[i].pgoff, + rbio->bioc->fs_info->sectorsize); + rbio->stripe_sectors[i].uptodate = 1; } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } @@ -283,9 +309,50 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } +static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + ASSERT(page_nr < rbio->nr_pages); + + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; + i++) { + if (!rbio->stripe_sectors[i].uptodate) + return false; + } + return true; +} + /* - * stealing an rbio means taking all the uptodate pages from the stripe - * array in the source rbio and putting them into the destination rbio + * Update the stripe_sectors[] array to use correct page and pgoff + * + * Should be called every time any page pointer in stripes_pages[] got modified. 
+ */ +static void index_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + u32 offset; + int i; + + for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { + int page_index = offset >> PAGE_SHIFT; + + ASSERT(page_index < rbio->nr_pages); + rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; + rbio->stripe_sectors[i].pgoff = offset_in_page(offset); + } +} + +/* + * Stealing an rbio means taking all the uptodate pages from the stripe array + * in the source rbio and putting them into the destination rbio. + * + * This will also update the involved stripe_sectors[] which are referring to + * the old pages. */ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { @@ -298,9 +365,8 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) for (i = 0; i < dest->nr_pages; i++) { s = src->stripe_pages[i]; - if (!s || !PageUptodate(s)) { + if (!s || !full_page_sectors_uptodate(src, i)) continue; - } d = dest->stripe_pages[i]; if (d) @@ -309,6 +375,8 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) dest->stripe_pages[i] = s; src->stripe_pages[i] = NULL; } + index_stripe_sectors(dest); + index_stripe_sectors(src); } /* @@ -600,39 +668,39 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } -static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, - int index) +static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { - return stripe * rbio->stripe_npages + index; + ASSERT(stripe_nr < rbio->real_stripes); + ASSERT(sector_nr < rbio->stripe_nsectors); + + return stripe_nr * rbio->stripe_nsectors + sector_nr; } -/* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, - int index) +/* Return a sector from rbio->stripe_sectors, not from the bio list */ +static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { - return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; + return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, + sector_nr)]; } -/* - * helper to index into the pstripe - */ -static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +/* Grab a sector inside P stripe */ +static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) { - return rbio_stripe_page(rbio, rbio->nr_data, index); + return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); } -/* - * helper to index into the qstripe, returns null - * if there is no qstripe - */ -static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +/* Grab a sector inside Q stripe, return NULL if not RAID6 */ +static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; - return rbio_stripe_page(rbio, rbio->nr_data + 1, index); + return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); } /* @@ -911,47 +979,43 @@ static void raid_write_end_io(struct bio *bio) rbio_orig_end_io(rbio, err); } -/* - * the read/modify/write code wants to use the original bio for - * any pages it included, and then use the rbio for everything - * else. 
This function decides if a given index (stripe number) - * and page number in that stripe fall inside the original bio - * or the rbio. - * - * if you set bio_list_only, you'll get a NULL back for any ranges - * that are outside the bio_list +/** + * Get a sector pointer specified by its @stripe_nr and @sector_nr * - * This doesn't take any refs on anything, you get a bare page pointer - * and the caller must bump refs as required. + * @rbio: The raid bio + * @stripe_nr: Stripe number, valid range [0, real_stripe) + * @sector_nr: Sector number inside the stripe, + * valid range [0, stripe_nsectors) + * @bio_list_only: Whether to use sectors inside the bio list only. * - * You must call index_rbio_pages once before you can trust - * the answers from this function. + * The read/modify/write code wants to reuse the original bio page as much + * as possible, and only use stripe_sectors as fallback. */ -static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, - int index, int pagenr, int bio_list_only) +static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, + bool bio_list_only) { - int chunk_page; - struct page *p = NULL; + struct sector_ptr *sector; + int index; + + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); - chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + index = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(index >= 0 && index < rbio->nr_sectors); spin_lock_irq(&rbio->bio_list_lock); - p = rbio->bio_pages[chunk_page]; + sector = &rbio->bio_sectors[index]; + if (sector->page || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (!sector->page) + sector = NULL; + spin_unlock_irq(&rbio->bio_list_lock); + return sector; + } spin_unlock_irq(&rbio->bio_list_lock); - if (p || bio_list_only) - return p; - - return rbio->stripe_pages[chunk_page]; -} - -/* - * number of pages we need for the entire stripe across all the - * drives - */ -static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) -{ - return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; + return &rbio->stripe_sectors[index]; } /* @@ -960,22 +1024,28 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, struct btrfs_io_context *bioc, - u64 stripe_len) + u32 stripe_len) { + const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; + const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT; + const unsigned int num_pages = stripe_npages * real_stripes; + const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits; + const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; int nr_data = 0; - int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; - int num_pages = rbio_nr_pages(stripe_len, real_stripes); - int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; + ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); + /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + rbio = kzalloc(sizeof(*rbio) + sizeof(*rbio->stripe_pages) * num_pages + - sizeof(*rbio->bio_pages) * num_pages + + sizeof(*rbio->bio_sectors) * num_sectors + + sizeof(*rbio->stripe_sectors) * num_sectors + sizeof(*rbio->finish_pointers) * real_stripes + - sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) + - 
sizeof(*rbio->finish_pbitmap) * - BITS_TO_LONGS(stripe_npages), + sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) + + sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -988,8 +1058,10 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->bioc = bioc; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; + rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; + rbio->stripe_nsectors = stripe_nsectors; rbio->faila = -1; rbio->failb = -1; refcount_set(&rbio->refs, 1); @@ -997,8 +1069,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, atomic_set(&rbio->stripes_pending, 0); /* - * the stripe_pages, bio_pages, etc arrays point to the extra - * memory we allocated past the end of the rbio + * The stripe_pages, bio_sectors, etc arrays point to the extra memory + * we allocated past the end of the rbio. */ p = rbio + 1; #define CONSUME_ALLOC(ptr, count) do { \ @@ -1006,10 +1078,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ } while (0) CONSUME_ALLOC(rbio->stripe_pages, num_pages); - CONSUME_ALLOC(rbio->bio_pages, num_pages); + CONSUME_ALLOC(rbio->bio_sectors, num_sectors); + CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); CONSUME_ALLOC(rbio->finish_pointers, real_stripes); - CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages)); - CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); + CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors)); + CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors)); #undef CONSUME_ALLOC if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) @@ -1026,59 +1099,63 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, /* allocate pages for all the stripes in the bio, including parity */ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { - int i; - struct page *page; + int ret; - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; - } + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); + if (ret < 0) + return ret; + /* Mapping all sectors */ + index_stripe_sectors(rbio); return 0; } /* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { - int i; - struct page *page; + const int data_pages = rbio->nr_data * rbio->stripe_npages; + int ret; - i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); + ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, + rbio->stripe_pages + data_pages); + if (ret < 0) + return ret; - for (; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; - } + index_stripe_sectors(rbio); return 0; } /* - * add a single page from a specific stripe into our list of bios for IO - * this will try to merge into existing bios if possible, and returns - * zero if all went well. + * Add a single sector @sector into our list of bios for IO. + * + * Return 0 if everything went well. + * Return <0 for error. 
*/ -static int rbio_add_io_page(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct page *page, - int stripe_nr, - unsigned long page_index, - unsigned long bio_max_len) -{ +static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct sector_ptr *sector, + unsigned int stripe_nr, + unsigned int sector_nr, + unsigned long bio_max_len, + unsigned int opf) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio *last = bio_list->tail; int ret; struct bio *bio; struct btrfs_io_stripe *stripe; u64 disk_start; + /* + * Note: here stripe_nr has taken device replace into consideration, + * thus it can be larger than rbio->real_stripe. + * So here we check against bioc->num_stripes, not rbio->real_stripes. + */ + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + ASSERT(sector->page); + stripe = &rbio->bioc->stripes[stripe_nr]; - disk_start = stripe->physical + (page_index << PAGE_SHIFT); + disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) @@ -1095,20 +1172,20 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, page, PAGE_SIZE, 0); - if (ret == PAGE_SIZE) + ret = bio_add_page(last, sector->page, sectorsize, + sector->pgoff); + if (ret == sectorsize) return 0; } } /* put a new bio on the list */ - bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); - btrfs_bio(bio)->device = stripe->dev; - bio->bi_iter.bi_size = 0; - bio_set_dev(bio, stripe->dev->bdev); + bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL), + opf, GFP_NOFS); bio->bi_iter.bi_sector = disk_start >> 9; + bio->bi_private = rbio; - bio_add_page(bio, page, PAGE_SIZE, 0); + bio_add_page(bio, sector->page, sectorsize, sector->pgoff); bio_list_add(bio_list, bio); return 0; } @@ -1130,6 +1207,32 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) } } +static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct bio_vec bvec; + struct bvec_iter iter; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->raid_map[0]; + + if (bio_flagged(bio, BIO_CLONED)) + bio->bi_iter = btrfs_bio(bio)->iter; + + bio_for_each_segment(bvec, bio, iter) { + u32 bvec_offset; + + for (bvec_offset = 0; bvec_offset < bvec.bv_len; + bvec_offset += sectorsize, offset += sectorsize) { + int index = offset / sectorsize; + struct sector_ptr *sector = &rbio->bio_sectors[index]; + + sector->page = bvec.bv_page; + sector->pgoff = bvec.bv_offset + bvec_offset; + ASSERT(sector->pgoff < PAGE_SIZE); + } + } +} + /* * helper function to walk our bio list and populate the bio_pages array with * the result. 
This seems expensive, but it is faster than constantly @@ -1141,28 +1244,11 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; - u64 start; - unsigned long stripe_offset; - unsigned long page_index; spin_lock_irq(&rbio->bio_list_lock); - bio_list_for_each(bio, &rbio->bio_list) { - struct bio_vec bvec; - struct bvec_iter iter; - int i = 0; + bio_list_for_each(bio, &rbio->bio_list) + index_one_bio(rbio, bio); - start = bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->bioc->raid_map[0]; - page_index = stripe_offset >> PAGE_SHIFT; - - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_bio(bio)->iter; - - bio_for_each_segment(bvec, bio, iter) { - rbio->bio_pages[page_index + i] = bvec.bv_page; - i++; - } - } spin_unlock_irq(&rbio->bio_list_lock); } @@ -1177,10 +1263,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; + const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; int stripe; - int pagenr; + int sectornr; bool has_qstripe; struct bio_list bio_list; struct bio *bio; @@ -1224,35 +1311,37 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; - /* first collect one page from each data stripe */ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + + /* First collect one sector from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { - p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap_local_page(p); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; } - /* then add the parity stripe */ - p = rbio_pstripe_page(rbio, pagenr); - SetPageUptodate(p); - pointers[stripe++] = kmap_local_page(p); + /* Then add the parity stripe */ + sector = rbio_pstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; if (has_qstripe) { - /* - * raid6, add the qstripe and call the - * library function to fill in our p/q + * RAID6, add the qstripe and call the library function + * to fill in our p/q */ - p = rbio_qstripe_page(rbio, pagenr); - SetPageUptodate(p); - pointers[stripe++] = kmap_local_page(p); + sector = rbio_qstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + + sector->pgoff; - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ - copy_page(pointers[nr_data], pointers[0]); - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); + memcpy(pointers[nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, nr_data - 1, sectorsize); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); @@ -1264,18 +1353,20 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * everything else. 
*/ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) continue; } else { - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_page(rbio, &bio_list, - page, stripe, pagenr, rbio->stripe_len); + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, rbio->stripe_len, + REQ_OP_WRITE); if (ret) goto cleanup; } @@ -1288,19 +1379,21 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) if (!bioc->tgtdev_map[stripe]) continue; - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) continue; } else { - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_page(rbio, &bio_list, page, + ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->bioc->tgtdev_map[stripe], - pagenr, rbio->stripe_len); + sectornr, rbio->stripe_len, + REQ_OP_WRITE); if (ret) goto cleanup; } @@ -1311,9 +1404,7 @@ write_data: BUG_ON(atomic_read(&rbio->stripes_pending) == 0); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - bio->bi_opf = REQ_OP_WRITE; submit_bio(bio); } @@ -1417,18 +1508,48 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, } /* + * For subpage case, we can no longer set page Uptodate directly for + * stripe_pages[], thus we need to locate the sector. + */ +static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, + struct page *page, + unsigned int pgoff) +{ + int i; + + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector = &rbio->stripe_sectors[i]; + + if (sector->page == page && sector->pgoff == pgoff) + return sector; + } + return NULL; +} + +/* * this sets each page in the bio uptodate. 
It should only be used on private * rbio pages, nothing that comes in from the higher layers */ -static void set_bio_pages_uptodate(struct bio *bio) +static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) - SetPageUptodate(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, iter_all) { + struct sector_ptr *sector; + int pgoff; + + for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; + pgoff += sectorsize) { + sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); + ASSERT(sector); + if (sector) + sector->uptodate = 1; + } + } } /* @@ -1446,7 +1567,7 @@ static void raid_rmw_end_io(struct bio *bio) if (bio->bi_status) fail_bio_stripe(rbio, bio); else - set_bio_pages_uptodate(bio); + set_bio_pages_uptodate(rbio, bio); bio_put(bio); @@ -1478,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int pagenr; + int sectornr; int stripe; struct bio *bio; @@ -1496,28 +1617,30 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + /* - * we want to find all the pages missing from - * the rbio and read them from the disk. If - * page_in_rbio finds a page in the bio list - * we don't need to read it off the stripe. + * We want to find all the sectors missing from the + * rbio and read them from the disk. If + * sector_in_rbio() + * finds a page in the bio list we don't need to read + * it off the stripe. */ - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (page) + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) continue; - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); /* - * the bio cache may have handed us an uptodate - * page. If so, be happy and use it + * The bio cache may have handed us an uptodate page. + * If so, be happy and use it.
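Because a single bio_vec can now span several sectors of one page, the end-io path above steps through each segment in sectorsize increments and flips a per-sector uptodate flag instead of calling SetPageUptodate(). A userspace sketch of that walk; sector_ptr is trimmed down and lookup() stands in for find_stripe_sector():

    #include <stddef.h>
    #include <stdint.h>

    struct sector_ptr {
            void *page;             /* backing page */
            unsigned int pgoff;     /* byte offset of the sector inside the page */
            unsigned int uptodate:1;
    };

    /* Mark every sector covered by one (page, offset, length) segment uptodate. */
    static void mark_segment_uptodate(struct sector_ptr *(*lookup)(void *page,
                                                                   unsigned int pgoff),
                                      void *page, unsigned int bv_offset,
                                      unsigned int bv_len, uint32_t sectorsize)
    {
            for (unsigned int pgoff = bv_offset; pgoff < bv_offset + bv_len;
                 pgoff += sectorsize) {
                    struct sector_ptr *sector = lookup(page, pgoff);

                    if (sector) /* every segment should map to known stripe sectors */
                            sector->uptodate = 1;
            }
    }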
*/ - if (PageUptodate(page)) + if (sector->uptodate) continue; - ret = rbio_add_io_page(rbio, &bio_list, page, - stripe, pagenr, rbio->stripe_len); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, rbio->stripe_len, + REQ_OP_READ); if (ret) goto cleanup; } @@ -1540,9 +1663,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; - bio->bi_opf = REQ_OP_READ; btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); @@ -1624,7 +1745,7 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct btrfs_work work; + struct work_struct work; }; /* @@ -1692,7 +1813,7 @@ static void run_plug(struct btrfs_plug_cb *plug) * if the unplug comes from schedule, we have to push the * work off to a helper thread */ -static void unplug_work(struct btrfs_work *work) +static void unplug_work(struct work_struct *work) { struct btrfs_plug_cb *plug; plug = container_of(work, struct btrfs_plug_cb, work); @@ -1705,9 +1826,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, unplug_work, NULL, NULL); - btrfs_queue_work(plug->info->rmw_workers, - &plug->work); + INIT_WORK(&plug->work, unplug_work); + queue_work(plug->info->rmw_workers, &plug->work); return; } run_plug(plug); @@ -1716,8 +1836,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) /* * our main entry point for writes from the rest of the FS. */ -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len) +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; @@ -1772,14 +1891,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, */ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) { - int pagenr, stripe; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + int sectornr, stripe; void **pointers; void **unmap_array; int faila = -1, failb = -1; - struct page *page; blk_status_t err; int i; + /* + * This array stores the pointer for each sector, thus it has the extra + * pgoff value added from each sector + */ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { err = BLK_STS_RESOURCE; @@ -1808,43 +1931,44 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. 
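The unplug path above shows a conversion repeated across this series: the private btrfs_work/btrfs_queue_work() pair is replaced with the stock work_struct API, and the handler recovers its outer object via container_of(). A kernel-style sketch of the pattern; the struct and the do_unplug() worker are hypothetical, not from the patch:

    #include <linux/workqueue.h>

    struct my_plug {
            struct workqueue_struct *wq;
            struct work_struct work;
    };

    static void do_unplug(struct my_plug *plug); /* hypothetical worker */

    static void my_unplug_work(struct work_struct *work)
    {
            /* The embedded work item gives us back the containing object. */
            struct my_plug *plug = container_of(work, struct my_plug, work);

            do_unplug(plug);
    }

    static void my_defer_unplug(struct my_plug *plug)
    {
            INIT_WORK(&plug->work, my_unplug_work);
            queue_work(plug->wq, &plug->work);
    }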
*/ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(pagenr, rbio->dbitmap)) + !test_bit(sectornr, rbio->dbitmap)) continue; /* - * Setup our array of pointers with pages from each stripe + * Setup our array of pointers with sectors from each stripe * * NOTE: store a duplicate array of pointers to preserve the * pointer order */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* - * if we're rebuilding a read, we have to use + * If we're rebuilding a read, we have to use * pages from the bio list */ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && (stripe == faila || stripe == failb)) { - page = page_in_rbio(rbio, stripe, pagenr, 0); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); } else { - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); } - pointers[stripe] = kmap_local_page(page); + ASSERT(sector->page); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; unmap_array[stripe] = pointers[stripe]; } - /* all raid6 handling here */ + /* All raid6 handling here */ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* - * single failure, rebuild from parity raid5 - * style - */ + /* Single failure, rebuild from parity raid5 style */ if (failb < 0) { if (faila == rbio->nr_data) { /* @@ -1887,10 +2011,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { raid6_datap_recov(rbio->real_stripes, - PAGE_SIZE, faila, pointers); + sectorsize, faila, pointers); } else { raid6_2data_recov(rbio->real_stripes, - PAGE_SIZE, faila, failb, + sectorsize, faila, failb, pointers); } } else { @@ -1900,7 +2024,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) BUG_ON(failb != -1); pstripe: /* Copy parity block into failed block to start with */ - copy_page(pointers[faila], pointers[rbio->nr_data]); + memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); /* rearrange the pointer array */ p = pointers[faila]; @@ -1909,7 +2033,7 @@ pstripe: pointers[rbio->nr_data - 1] = p; /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); + run_xor(pointers, rbio->nr_data - 1, sectorsize); } /* if we're doing this rebuild as part of an rmw, go through * and set all of our private rbio pages in the @@ -1918,14 +2042,14 @@ pstripe: * other endio functions will fiddle the uptodate bits */ if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < rbio->stripe_npages; i++) { + for (i = 0; i < rbio->stripe_nsectors; i++) { if (faila != -1) { - page = rbio_stripe_page(rbio, faila, i); - SetPageUptodate(page); + sector = rbio_stripe_sector(rbio, faila, i); + sector->uptodate = 1; } if (failb != -1) { - page = rbio_stripe_page(rbio, failb, i); - SetPageUptodate(page); + sector = rbio_stripe_sector(rbio, failb, i); + sector->uptodate = 1; } } } @@ -1998,7 +2122,7 @@ static void raid_recover_end_io(struct bio *bio) if (bio->bi_status) fail_bio_stripe(rbio, bio); else - set_bio_pages_uptodate(bio); + set_bio_pages_uptodate(rbio, bio); bio_put(bio); if (!atomic_dec_and_test(&rbio->stripes_pending)) @@ -2023,7 +2147,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int pagenr; + int sectornr; int stripe; struct bio *bio; @@ -2046,20 +2170,20 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) continue; } - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; + 
for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; /* * the rmw code may have already read this * page in */ - p = rbio_stripe_page(rbio, stripe, pagenr); - if (PageUptodate(p)) + sector = rbio_stripe_sector(rbio, stripe, sectornr); + if (sector->uptodate) continue; - ret = rbio_add_io_page(rbio, &bio_list, - rbio_stripe_page(rbio, stripe, pagenr), - stripe, pagenr, rbio->stripe_len); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, rbio->stripe_len, + REQ_OP_READ); if (ret < 0) goto cleanup; } @@ -2086,9 +2210,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; - bio->bi_opf = REQ_OP_READ; btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); @@ -2115,7 +2237,7 @@ cleanup: * of the drive. */ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, int mirror_num, int generic_io) + u32 stripe_len, int mirror_num, int generic_io) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; @@ -2193,7 +2315,7 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, } -static void rmw_work(struct btrfs_work *work) +static void rmw_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2201,7 +2323,7 @@ static void rmw_work(struct btrfs_work *work) raid56_rmw_stripe(rbio); } -static void read_rebuild_work(struct btrfs_work *work) +static void read_rebuild_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2221,7 +2343,7 @@ static void read_rebuild_work(struct btrfs_work *work) struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, struct btrfs_device *scrub_dev, + u32 stripe_len, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_fs_info *fs_info = bioc->fs_info; @@ -2252,9 +2374,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, } ASSERT(i < rbio->real_stripes); - /* Now we just support the sectorsize equals to page size */ - ASSERT(fs_info->sectorsize == PAGE_SIZE); - ASSERT(rbio->stripe_npages == stripe_nsectors); bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); /* @@ -2268,17 +2387,19 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, /* Used for both parity scrub and missing. 
*/ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - u64 logical) + unsigned int pgoff, u64 logical) { + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; int stripe_offset; int index; ASSERT(logical >= rbio->bioc->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] + + ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + rbio->stripe_len * rbio->nr_data); stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); - index = stripe_offset >> PAGE_SHIFT; - rbio->bio_pages[index] = page; + index = stripe_offset / sectorsize; + rbio->bio_sectors[index].page = page; + rbio->bio_sectors[index].pgoff = pgoff; } /* @@ -2287,14 +2408,16 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, */ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { - int i; - int bit; - int index; - struct page *page; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + int stripe; + int sectornr; + + for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + struct page *page; + int index = (stripe * rbio->stripe_nsectors + sectornr) * + sectorsize >> PAGE_SHIFT; - for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { - for (i = 0; i < rbio->real_stripes; i++) { - index = i * rbio->stripe_npages + bit; if (rbio->stripe_pages[index]) continue; @@ -2304,6 +2427,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) rbio->stripe_pages[index] = page; } } + index_stripe_sectors(rbio); return 0; } @@ -2311,14 +2435,15 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_io_context *bioc = rbio->bioc; + const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; - int pagenr; + int sectornr; bool has_qstripe; - struct page *p_page = NULL; - struct page *q_page = NULL; + struct sector_ptr p_sector = { 0 }; + struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; struct bio *bio; int is_replace = 0; @@ -2335,7 +2460,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; - bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); + bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors); } /* @@ -2348,54 +2473,59 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!need_check) goto writeback; - p_page = alloc_page(GFP_NOFS); - if (!p_page) + p_sector.page = alloc_page(GFP_NOFS); + if (!p_sector.page) goto cleanup; - SetPageUptodate(p_page); + p_sector.pgoff = 0; + p_sector.uptodate = 1; if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_page = alloc_page(GFP_NOFS); - if (!q_page) { - __free_page(p_page); + q_sector.page = alloc_page(GFP_NOFS); + if (!q_sector.page) { + __free_page(p_sector.page); + p_sector.page = NULL; goto cleanup; } - SetPageUptodate(q_page); - pointers[rbio->real_stripes - 1] = kmap_local_page(q_page); + q_sector.pgoff = 0; + q_sector.uptodate = 1; + pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } atomic_set(&rbio->error, 0); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_page(p_page); + pointers[nr_data] = kmap_local_page(p_sector.page); - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *p; + 
for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; void *parity; + /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { - p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap_local_page(p); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; } if (has_qstripe) { /* RAID6, call the library function to fill in our P/Q */ - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ - copy_page(pointers[nr_data], pointers[0]); - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); + memcpy(pointers[nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, nr_data - 1, sectorsize); } /* Check scrubbing parity and repair it */ - p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - parity = kmap_local_page(p); - if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) - copy_page(parity, pointers[rbio->scrubp]); + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + parity = kmap_local_page(sector->page) + sector->pgoff; + if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) + memcpy(parity, pointers[rbio->scrubp], sectorsize); else /* Parity is right, needn't writeback */ - bitmap_clear(rbio->dbitmap, pagenr, 1); + bitmap_clear(rbio->dbitmap, sectornr, 1); kunmap_local(parity); for (stripe = nr_data - 1; stripe >= 0; stripe--) @@ -2403,10 +2533,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, } kunmap_local(pointers[nr_data]); - __free_page(p_page); - if (q_page) { + __free_page(p_sector.page); + p_sector.page = NULL; + if (q_sector.page) { kunmap_local(pointers[rbio->real_stripes - 1]); - __free_page(q_page); + __free_page(q_sector.page); + q_sector.page = NULL; } writeback: @@ -2415,12 +2547,12 @@ writeback: * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. 
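The scrub path above only rewrites a parity sector when the freshly computed value disagrees with what is on disk; a matching sector is instead dropped from the writeback bitmap. A userspace sketch of that check-and-repair decision:

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    /*
     * Compare a recomputed parity sector against the on-disk copy.
     * Returns true when the on-disk sector was corrupt and has been
     * corrected in memory (so it must be written back); false means
     * the sector can be cleared from the writeback bitmap.
     */
    static bool check_and_repair_parity(uint8_t *ondisk, const uint8_t *computed,
                                        uint32_t sectorsize)
    {
            if (memcmp(ondisk, computed, sectorsize) != 0) {
                    memcpy(ondisk, computed, sectorsize);
                    return true;
            }
            return false;
    }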
*/ - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - ret = rbio_add_io_page(rbio, &bio_list, - page, rbio->scrubp, pagenr, rbio->stripe_len); + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, + sectornr, rbio->stripe_len, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2428,13 +2560,13 @@ writeback: if (!is_replace) goto submit_write; - for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - ret = rbio_add_io_page(rbio, &bio_list, page, + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, bioc->tgtdev_map[rbio->scrubp], - pagenr, rbio->stripe_len); + sectornr, rbio->stripe_len, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2450,9 +2582,7 @@ submit_write: atomic_set(&rbio->stripes_pending, nr_data); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - bio->bi_opf = REQ_OP_WRITE; submit_bio(bio); } @@ -2548,7 +2678,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio) if (bio->bi_status) fail_bio_stripe(rbio, bio); else - set_bio_pages_uptodate(bio); + set_bio_pages_uptodate(rbio, bio); bio_put(bio); @@ -2568,7 +2698,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int pagenr; + int sectornr; int stripe; struct bio *bio; @@ -2584,28 +2714,29 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; /* - * we want to find all the pages missing from - * the rbio and read them from the disk. If - * page_in_rbio finds a page in the bio list - * we don't need to read it off the stripe. + * We want to find all the sectors missing from the + * rbio and read them from the disk. If + * sector_in_rbio() + * finds a sector in the bio list we don't need to read + * it off the stripe. */ - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (page) + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) continue; - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); /* - * the bio cache may have handed us an uptodate - * page. If so, be happy and use it + * The bio cache may have handed us an uptodate sector. + * If so, be happy and use it.
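Earlier in this hunk, writeback walks only the dbitmap bits that survived the parity check. A userspace sketch of that bitmap-driven submission, using a plain 64-bit word for the bitmap (so at most 64 sectors) and a callback standing in for rbio_add_io_sector(..., REQ_OP_WRITE):

    #include <stdint.h>

    /* Queue a write for every sector whose bit is still set in @dbitmap. */
    static int writeback_dirty_sectors(uint64_t dbitmap, int nsectors,
                                       int (*queue_write)(int sectornr))
    {
            for (int sectornr = 0; sectornr < nsectors; sectornr++) {
                    if (!(dbitmap & (UINT64_C(1) << sectornr)))
                            continue;       /* parity verified, nothing to write */

                    int ret = queue_write(sectornr);
                    if (ret)
                            return ret;     /* caller cleans up already queued bios */
            }
            return 0;
    }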
*/ - if (PageUptodate(page)) + if (sector->uptodate) continue; - ret = rbio_add_io_page(rbio, &bio_list, page, - stripe, pagenr, rbio->stripe_len); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, rbio->stripe_len, + REQ_OP_READ); if (ret) goto cleanup; } @@ -2628,9 +2759,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; - bio->bi_opf = REQ_OP_READ; btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); @@ -2651,7 +2780,7 @@ finish: validate_rbio_for_parity_scrub(rbio); } -static void scrub_parity_work(struct btrfs_work *work) +static void scrub_parity_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 72c00fc284b5..aaad08aefd7d 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -31,15 +31,14 @@ struct btrfs_raid_bio; struct btrfs_device; int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, int mirror_num, int generic_io); -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len); + u32 stripe_len, int mirror_num, int generic_io); +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - u64 logical); + unsigned int pgoff, u64 logical); struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, - struct btrfs_io_context *bioc, u64 stripe_len, + struct btrfs_io_context *bioc, u32 stripe_len, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index a3930da4eb3f..a3549d587464 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -277,7 +277,7 @@ copy_inline_extent: path->slots[0]), size); btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(dst)); ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); out: if (!ret && !trans) { @@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int ret; const u64 len = olen_aligned; u64 last_dest_end = destoff; + u64 prev_extent_end = off; ret = -ENOMEM; buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); @@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { - u64 next_key_min_offset = key.offset + 1; struct btrfs_file_extent_item *extent; u64 extent_gen; int type; @@ -431,14 +431,21 @@ process_slot: * The first search might have left us at an extent item that * ends before our target range's start, can happen if we have * holes and NO_HOLES feature enabled. + * + * Subsequent searches may leave us on a file range we have + * processed before - this happens due to a race with ordered + * extent completion for a file range that is outside our source + * range, but that range was part of a file extent item that + * also covered a leading part of our source range. 
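The race described above is what the prev_extent_end check right below handles: any extent item whose end does not extend past what was already cloned is skipped. A small worked sketch of that test, with hypothetical offsets in KiB for readability:

    #include <stdbool.h>
    #include <stdint.h>

    /* Mirrors the "key.offset + datal <= prev_extent_end" skip test below. */
    static bool already_cloned(uint64_t key_offset, uint64_t datal,
                               uint64_t prev_extent_end)
    {
            return key_offset + datal <= prev_extent_end;
    }

    /*
     * Example: the first iteration cloned from an extent item covering
     * [0, 256K), so prev_extent_end = 256K. If ordered extent completion
     * races and splits that item, a later search may land on its tail
     * [192K, 256K): already_cloned(192 * 1024, 64 * 1024, 256 * 1024)
     * returns true and the loop advances instead of cloning it twice.
     */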
*/ - if (key.offset + datal <= off) { + if (key.offset + datal <= prev_extent_end) { path->slots[0]++; goto process_slot; } else if (key.offset >= off + len) { break; } - next_key_min_offset = key.offset + datal; + + prev_extent_end = key.offset + datal; size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); @@ -489,12 +496,14 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; + clone_info.update_times = !no_time_update; ret = btrfs_replace_file_extents(BTRFS_I(inode), path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) goto out; - } else if (type == BTRFS_FILE_EXTENT_INLINE) { + } else { + ASSERT(type == BTRFS_FILE_EXTENT_INLINE); /* * Inline extents always have to start at file offset 0 * and can never be bigger then the sector size. We can @@ -505,8 +514,12 @@ process_slot: */ ASSERT(key.offset == 0); ASSERT(datal <= fs_info->sectorsize); - if (key.offset != 0 || datal > fs_info->sectorsize) - return -EUCLEAN; + if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) || + WARN_ON(key.offset != 0) || + WARN_ON(datal > fs_info->sectorsize)) { + ret = -EUCLEAN; + goto out; + } ret = clone_copy_inline_extent(inode, path, &new_key, drop_start, datal, size, @@ -518,17 +531,22 @@ process_slot: btrfs_release_path(path); /* - * If this is a new extent update the last_reflink_trans of both - * inodes. This is used by fsync to make sure it does not log - * multiple checksum items with overlapping ranges. For older - * extents we don't need to do it since inode logging skips the - * checksums for older extents. Also ignore holes and inline - * extents because they don't have checksums in the csum tree. + * Whenever we share an extent we update the last_reflink_trans + * of each inode to the current transaction. This is needed to + * make sure fsync does not log multiple checksum items with + * overlapping ranges (because some extent items might refer + * only to sections of the original extent). For the destination + * inode we do this regardless of the generation of the extents + * or even if they are inline extents or explicit holes, to make + * sure a full fsync does not skip them. For the source inode, + * we only need to update last_reflink_trans in case it's a new + * extent that is not a hole or an inline extent, to deal with + * the checksums problem on fsync. */ - if (extent_gen == trans->transid && disko > 0) { + if (extent_gen == trans->transid && disko > 0) BTRFS_I(src)->last_reflink_trans = trans->transid; - BTRFS_I(inode)->last_reflink_trans = trans->transid; - } + + BTRFS_I(inode)->last_reflink_trans = trans->transid; last_dest_end = ALIGN(new_key.offset + datal, fs_info->sectorsize); @@ -540,7 +558,7 @@ process_slot: break; btrfs_release_path(path); - key.offset = next_key_min_offset; + key.offset = prev_extent_end; if (fatal_signal_pending(current)) { ret = -EINTR; @@ -575,8 +593,7 @@ process_slot: * replaced file extent items. 
*/ if (last_dest_end >= i_size_read(inode)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); ret = btrfs_replace_file_extents(BTRFS_I(inode), path, last_dest_end, destoff + len - 1, NULL, &trans); @@ -605,14 +622,23 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, struct inode *inode2, u64 loff2, u64 len) { + u64 range1_end = loff1 + len - 1; + u64 range2_end = loff2 + len - 1; + if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); + swap(range1_end, range2_end); } else if (inode1 == inode2 && loff2 < loff1) { swap(loff1, loff2); + swap(range1_end, range2_end); } - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end); + + btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); + btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); } static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) @@ -636,7 +662,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, int ret; /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, loff, dst, dst_loff, len); @@ -730,7 +756,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, } /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, off, inode, destoff, len); @@ -762,7 +788,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; - bool same_inode = inode_out == inode_in; u64 wb_len; int ret; @@ -772,9 +797,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (btrfs_root_readonly(root_out)) return -EROFS; - if (file_in->f_path.mnt != file_out->f_path.mnt || - inode_in->i_sb != inode_out->i_sb) - return -EXDEV; + ASSERT(inode_in->i_sb == inode_out->i_sb); } /* Don't make the dst file partly checksummed */ @@ -803,15 +826,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, wb_len = ALIGN(*len, bs); /* - * Since we don't lock ranges, wait for ongoing lockless dio writes (as - * any in progress could create its ordered extents after we wait for - * existing ordered extents below). - */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - /* * Workaround to make sure NOCOW buffered write reach disk as NOCOW. 
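btrfs_double_extent_lock() above takes the two extent ranges in one global order (inode pointer first, then offset within the same inode), so concurrent clone or dedupe calls on the same pair of files cannot deadlock against each other. A simplified userspace-style sketch of that ordering rule, with lock_range() as a hypothetical stand-in for lock_extent():

    #include <stdint.h>

    void lock_range(void *inode, uint64_t start, uint64_t end); /* hypothetical */

    static void double_range_lock(void *inode1, uint64_t loff1,
                                  void *inode2, uint64_t loff2, uint64_t len)
    {
            uint64_t end1 = loff1 + len - 1;
            uint64_t end2 = loff2 + len - 1;

            /* Same swap rule as the patch: order by inode, then by offset. */
            if (inode1 < inode2 || (inode1 == inode2 && loff2 < loff1)) {
                    void *itmp = inode1;
                    uint64_t otmp = loff1, etmp = end1;

                    inode1 = inode2; loff1 = loff2; end1 = end2;
                    inode2 = itmp; loff2 = otmp; end2 = etmp;
            }
            lock_range(inode1, loff1, end1);
            lock_range(inode2, loff2, end2);
    }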
* * Btrfs' back references do not have a block level granularity, they diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9d8054839782..a6dc827e75af 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -362,7 +362,7 @@ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); - root = (struct btrfs_root *)node->data; + root = node->data; } spin_unlock(&rc->reloc_root_tree.lock); return btrfs_grab_root(root); @@ -1101,7 +1101,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, continue; /* - * if we are modifying block in fs tree, wait for readpage + * if we are modifying block in fs tree, wait for read_folio * to complete and drop the extent cache */ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { @@ -1563,7 +1563,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, end = (u64)-1; } - /* the lock_extent waits for readpage to complete */ + /* the lock_extent waits for read_folio to complete */ lock_extent(&BTRFS_I(inode)->io_tree, start, end); btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); unlock_extent(&BTRFS_I(inode)->io_tree, start, end); @@ -2599,9 +2599,9 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, eb = read_tree_block(fs_info, block->bytenr, block->owner, block->key.offset, block->level, NULL); - if (IS_ERR(eb)) { + if (IS_ERR(eb)) return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2818,7 +2818,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * Subpage can't handle page with DIRTY but without UPTODATE * bit as it can lead to the following deadlock: * - * btrfs_readpage() + * btrfs_read_folio() * | Page already *locked* * |- btrfs_lock_and_flush_ordered_range() * |- btrfs_start_ordered_extent() @@ -2967,11 +2967,12 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, goto release_page; if (PageReadahead(page)) - page_cache_async_readahead(inode->i_mapping, ra, NULL, page, - page_index, last_index + 1 - page_index); + page_cache_async_readahead(inode->i_mapping, ra, NULL, + page_folio(page), page_index, + last_index + 1 - page_index); if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (!PageUptodate(page)) { ret = -EIO; @@ -2997,7 +2998,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, /* Reserve metadata for this range */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - clamped_len); + clamped_len, clamped_len, + false); if (ret) goto release_page; @@ -3845,8 +3847,7 @@ out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (err) { - if (inode) - iput(inode); + iput(inode); inode = ERR_PTR(err); } return inode; @@ -3977,6 +3978,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) if (!bg) return -ENOENT; + /* + * Relocation of a data block group creates ordered extents. Without + * sb_start_write(), we can freeze the filesystem while unfinished + * ordered extents are left. Such ordered extents can cause a deadlock + * e.g. when syncfs() is waiting for their completion but they can't + * finish because they block when joining a transaction, due to the + * fact that the freeze locks are being held in write mode. 
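The comment above motivates the assertion added just below; since btrfs ASSERT() compiles out without CONFIG_BTRFS_ASSERT, it documents a rule for callers rather than enforcing it in production: relocation of a data block group must run under freeze protection. A kernel-style sketch of the caller pattern the assert expects, with do_relocate() as a hypothetical worker:

    #include <linux/fs.h>

    static int do_relocate(u64 group_start); /* hypothetical worker */

    static int relocate_data_block_group(struct super_block *sb, u64 group_start)
    {
            int ret;

            /* Hold off filesystem freezing while ordered extents may be created. */
            sb_start_write(sb);
            ret = do_relocate(group_start);
            sb_end_write(sb);

            return ret;
    }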
+ */ + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) + ASSERT(sb_write_started(fs_info->sb)); + if (btrfs_pinned_by_swapfile(fs_info, bg)) { btrfs_put_block_group(bg); return -ETXTBSY; @@ -4123,9 +4135,8 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) * this function resumes merging reloc trees with corresponding fs trees. * this is important for keeping the sharing of tree blocks */ -int btrfs_recover_relocation(struct btrfs_root *root) +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; LIST_HEAD(reloc_roots); struct btrfs_key key; struct btrfs_root *fs_root; @@ -4166,7 +4177,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.type != BTRFS_ROOT_ITEM_KEY) break; - reloc_root = btrfs_read_tree_root(root, &key); + reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ca7426ef61c8..a64b26b16904 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -509,7 +509,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, /* One for parent inode, two for dir entries */ qgroup_num_bytes = 3 * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_prealloc(root, - qgroup_num_bytes, true); + qgroup_num_bytes, true, + false); if (ret) return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2e9a322773f2..e7b0323e6efd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -45,14 +45,14 @@ struct scrub_ctx; * operations. The first one configures an upper limit for the number * of (dynamically allocated) pages that are added to a bio. */ -#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */ -#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */ +#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */ /* * The following value times PAGE_SIZE needs to be large enough to match the * largest node/leaf/sector size that shall be supported. 
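These renamed scrub limits keep the old arithmetic, just phrased in sectors: 32 sectors of 4 KiB per bio is 128 KiB, 64 such bios per context put 8 MiB in flight per device, and a 64 KiB metadata block needs at most 16 sectors. The relations, written out as compile-time checks (constants reproduced here for a standalone build):

    #include <assert.h>

    #define SZ_4K                           4096
    #define BTRFS_MAX_METADATA_BLOCKSIZE    65536
    #define SCRUB_SECTORS_PER_BIO           32
    #define SCRUB_BIOS_PER_SCTX             64
    #define SCRUB_MAX_SECTORS_PER_BLOCK     (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

    static_assert(SCRUB_SECTORS_PER_BIO * SZ_4K == 128 * 1024,
                  "128KiB per bio with 4KiB sectors");
    static_assert(SCRUB_BIOS_PER_SCTX * SCRUB_SECTORS_PER_BIO * SZ_4K == 8 * 1024 * 1024,
                  "8MiB in flight per device");
    static_assert(SCRUB_MAX_SECTORS_PER_BLOCK == 16,
                  "a 64KiB metadata block spans at most 16 4KiB sectors");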
*/ -#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) +#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) struct scrub_recover { refcount_t refs; @@ -60,7 +60,7 @@ struct scrub_recover { u64 map_length; }; -struct scrub_page { +struct scrub_sector { struct scrub_block *sblock; struct page *page; struct btrfs_device *dev; @@ -87,16 +87,16 @@ struct scrub_bio { blk_status_t status; u64 logical; u64 physical; - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; - int page_count; + struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO]; + int sector_count; int next_free; - struct btrfs_work work; + struct work_struct work; }; struct scrub_block { - struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; - int page_count; - atomic_t outstanding_pages; + struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK]; + int sector_count; + atomic_t outstanding_sectors; refcount_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; @@ -110,7 +110,7 @@ struct scrub_block { /* It is for the data with checksum */ unsigned int data_corrected:1; }; - struct btrfs_work work; + struct work_struct work; }; /* Used for the chunks with parity stripe such RAID5/6 */ @@ -129,10 +129,10 @@ struct scrub_parity { refcount_t refs; - struct list_head spages; + struct list_head sectors_list; /* Work of parity check and repair */ - struct btrfs_work work; + struct work_struct work; /* Mark the parity blocks which have data */ unsigned long *dbitmap; @@ -158,7 +158,7 @@ struct scrub_ctx { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_bio; + int sectors_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; @@ -212,43 +212,43 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, static void scrub_recheck_block_checksum(struct scrub_block *sblock); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good); -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, +static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, - int page_num, int force_write); + int sector_num, int force_write); static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); -static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, - int page_num); +static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, + int sector_num); static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); -static void scrub_page_get(struct scrub_page *spage); -static void scrub_page_put(struct scrub_page *spage); +static void scrub_sector_get(struct scrub_sector *sector); +static void scrub_sector_put(struct scrub_sector *sector); static void scrub_parity_get(struct scrub_parity *sparity); static void scrub_parity_put(struct scrub_parity *sparity); -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, - u64 physical_for_dev_replace); +static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, + u64 physical_for_dev_replace); static void 
scrub_bio_end_io(struct bio *bio); -static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_bio_end_io_worker(struct work_struct *work); static void scrub_block_complete(struct scrub_block *sblock); -static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num); -static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_page *spage); +static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, + u64 extent_logical, u32 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num); +static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector); static void scrub_wr_submit(struct scrub_ctx *sctx); static void scrub_wr_bio_end_io(struct bio *bio); -static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static void scrub_wr_bio_end_io_worker(struct work_struct *work); static void scrub_put_ctx(struct scrub_ctx *sctx); -static inline int scrub_is_page_on_raid56(struct scrub_page *spage) +static inline int scrub_is_page_on_raid56(struct scrub_sector *sector) { - return spage->recover && - (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + return sector->recover && + (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -535,9 +535,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; - for (i = 0; i < sbio->page_count; i++) { - WARN_ON(!sbio->pagev[i]->page); - scrub_block_put(sbio->pagev[i]->sblock); + for (i = 0; i < sbio->sector_count; i++) { + WARN_ON(!sbio->sectors[i]->page); + scrub_block_put(sbio->sectors[i]->sblock); } bio_put(sbio->bio); } @@ -572,7 +572,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_bio = SCRUB_PAGES_PER_BIO; + sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO; sctx->curr = -1; sctx->fs_info = fs_info; INIT_LIST_HEAD(&sctx->csum_list); @@ -586,9 +586,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sbio->index = i; sbio->sctx = sctx; - sbio->page_count = 0; - btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL, - NULL); + sbio->sector_count = 0; + INIT_WORK(&sbio->work, scrub_bio_end_io_worker); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -728,16 +727,16 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) u8 ref_level = 0; int ret; - WARN_ON(sblock->page_count < 1); - dev = sblock->pagev[0]->dev; + WARN_ON(sblock->sector_count < 1); + dev = sblock->sectors[0]->dev; fs_info = sblock->sctx->fs_info; path = btrfs_alloc_path(); if (!path) return; - swarn.physical = sblock->pagev[0]->physical; - swarn.logical = sblock->pagev[0]->logical; + swarn.physical = sblock->sectors[0]->physical; + swarn.logical = sblock->sectors[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -798,8 +797,8 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, /* * scrub_handle_errored_block gets called when either verification of the - * pages failed or the bio failed to read, e.g. with EIO. In the latter - * case, this function handles all pages in the bio, even though only one + * sectors failed or the bio failed to read, e.g. with EIO. 
In the latter + * case, this function handles all sectors in the bio, even though only one * may be bad. * The goal of this function is to repair the errored block by using the * contents of one of the mirrors. @@ -817,16 +816,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) struct scrub_block *sblock_bad; int ret; int mirror_index; - int page_num; + int sector_num; int success; bool full_stripe_locked; unsigned int nofs_flag; static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - BUG_ON(sblock_to_check->page_count < 1); + BUG_ON(sblock_to_check->sector_count < 1); fs_info = sctx->fs_info; - if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { /* * if we find an error in a super block, we just report it. * They will get written with the next transaction commit @@ -837,13 +836,13 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sctx->stat_lock); return 0; } - logical = sblock_to_check->pagev[0]->logical; - BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); - failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; - is_metadata = !(sblock_to_check->pagev[0]->flags & + logical = sblock_to_check->sectors[0]->logical; + BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1); + failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1; + is_metadata = !(sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->pagev[0]->have_csum; - dev = sblock_to_check->pagev[0]->dev; + have_csum = sblock_to_check->sectors[0]->have_csum; + dev = sblock_to_check->sectors[0]->dev; if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) return 0; @@ -854,7 +853,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * might be waiting the scrub task to pause (which needs to wait for all * the worker tasks to complete before pausing). * We do allocations in the workers through insert_full_stripe_lock() - * and scrub_add_page_to_wr_bio(), which happens down the call chain of + * and scrub_add_sector_to_wr_bio(), which happens down the call chain of * this function. */ nofs_flag = memalloc_nofs_save(); @@ -918,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto out; } - /* setup the context, map the logical blocks and alloc the pages */ + /* Setup the context, map the logical blocks and alloc the sectors */ ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); @@ -937,7 +936,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { /* - * the error disappeared after reading page by page, or + * The error disappeared after reading sector by sector, or * the area was part of a huge bio and other parts of the * bio caused I/O errors, or the block layer merged several * read requests into one and the error is caused by a @@ -998,10 +997,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * that is known to contain an error is rewritten. Afterwards * the block is known to be corrected. 
* If a mirror is found which is completely correct, and no - * checksum is present, only those pages are rewritten that had + * checksum is present, only those sectors are rewritten that had * an I/O error in the block to be repaired, since it cannot be - * determined, which copy of the other pages is better (and it - * could happen otherwise that a correct page would be + * determined, which copy of the other sectors is better (and it + * could happen otherwise that a correct sector would be * overwritten by a bad one). */ for (mirror_index = 0; ;mirror_index++) { @@ -1011,25 +1010,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) continue; /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ - if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { if (mirror_index >= BTRFS_MAX_MIRRORS) break; - if (!sblocks_for_recheck[mirror_index].page_count) + if (!sblocks_for_recheck[mirror_index].sector_count) break; sblock_other = sblocks_for_recheck + mirror_index; } else { - struct scrub_recover *r = sblock_bad->pagev[0]->recover; + struct scrub_recover *r = sblock_bad->sectors[0]->recover; int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; if (mirror_index >= max_allowed) break; - if (!sblocks_for_recheck[1].page_count) + if (!sblocks_for_recheck[1].sector_count) break; ASSERT(failed_mirror_index == 0); sblock_other = sblocks_for_recheck + 1; - sblock_other->pagev[0]->mirror_num = 1 + mirror_index; + sblock_other->sectors[0]->mirror_num = 1 + mirror_index; } /* build and submit the bios, check checksums */ @@ -1078,16 +1077,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * area are unreadable. */ success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; - page_num++) { - struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; + for (sector_num = 0; sector_num < sblock_bad->sector_count; + sector_num++) { + struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; struct scrub_block *sblock_other = NULL; - /* skip no-io-error page in scrub */ - if (!spage_bad->io_error && !sctx->is_dev_replace) + /* Skip no-io-error sectors in scrub */ + if (!sector_bad->io_error && !sctx->is_dev_replace) continue; - if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) { /* * In case of dev replace, if raid56 rebuild process * didn't work out correct data, then copy the content @@ -1096,14 +1095,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * sblock_for_recheck array to target device. */ sblock_other = NULL; - } else if (spage_bad->io_error) { - /* try to find no-io-error page in mirrors */ + } else if (sector_bad->io_error) { + /* Try to find no-io-error sector in mirrors */ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; + sblocks_for_recheck[mirror_index].sector_count > 0; mirror_index++) { if (!sblocks_for_recheck[mirror_index]. - pagev[page_num]->io_error) { + sectors[sector_num]->io_error) { sblock_other = sblocks_for_recheck + mirror_index; break; @@ -1115,27 +1114,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (sctx->is_dev_replace) { /* - * did not find a mirror to fetch the page - * from. 
scrub_write_page_to_dev_replace() - * handles this case (page->io_error), by - * filling the block with zeros before - * submitting the write request + * Did not find a mirror to fetch the sector from. + * scrub_write_sector_to_dev_replace() handles this + * case (sector->io_error), by filling the block with + * zeros before submitting the write request */ if (!sblock_other) sblock_other = sblock_bad; - if (scrub_write_page_to_dev_replace(sblock_other, - page_num) != 0) { + if (scrub_write_sector_to_dev_replace(sblock_other, + sector_num) != 0) { atomic64_inc( &fs_info->dev_replace.num_write_errors); success = 0; } } else if (sblock_other) { - ret = scrub_repair_page_from_good_copy(sblock_bad, - sblock_other, - page_num, 0); + ret = scrub_repair_sector_from_good_copy(sblock_bad, + sblock_other, + sector_num, 0); if (0 == ret) - spage_bad->io_error = 0; + sector_bad->io_error = 0; else success = 0; } @@ -1186,18 +1184,16 @@ out: struct scrub_block *sblock = sblocks_for_recheck + mirror_index; struct scrub_recover *recover; - int page_index; + int i; - for (page_index = 0; page_index < sblock->page_count; - page_index++) { - sblock->pagev[page_index]->sblock = NULL; - recover = sblock->pagev[page_index]->recover; + for (i = 0; i < sblock->sector_count; i++) { + sblock->sectors[i]->sblock = NULL; + recover = sblock->sectors[i]->recover; if (recover) { scrub_put_recover(fs_info, recover); - sblock->pagev[page_index]->recover = - NULL; + sblock->sectors[i]->recover = NULL; } - scrub_page_put(sblock->pagev[page_index]); + scrub_sector_put(sblock->sectors[i]); } } kfree(sblocks_for_recheck); @@ -1255,26 +1251,25 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, { struct scrub_ctx *sctx = original_sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = original_sblock->page_count * fs_info->sectorsize; - u64 logical = original_sblock->pagev[0]->logical; - u64 generation = original_sblock->pagev[0]->generation; - u64 flags = original_sblock->pagev[0]->flags; - u64 have_csum = original_sblock->pagev[0]->have_csum; + u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; + u64 logical = original_sblock->sectors[0]->logical; + u64 generation = original_sblock->sectors[0]->generation; + u64 flags = original_sblock->sectors[0]->flags; + u64 have_csum = original_sblock->sectors[0]->have_csum; struct scrub_recover *recover; struct btrfs_io_context *bioc; u64 sublen; u64 mapped_length; u64 stripe_offset; int stripe_index; - int page_index = 0; + int sector_index = 0; int mirror_index; int nmirrors; int ret; /* - * note: the two members refs and outstanding_pages - * are not used (and not set) in the blocks that are used for - * the recheck procedure + * Note: the two members refs and outstanding_sectors are not used (and + * not set) in the blocks that are used for the recheck procedure. 
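scrub_stripe_index_and_offset(), called in the setup loop just below, has to answer for each mirror which physical stripe a logical byte lands in and where inside that stripe. Ignoring the per-mirror rotation it performs for RAID5/6, the core math reduces to a division and remainder by the stripe length, sketched here:

    #include <stdint.h>

    /*
     * Simplified: map a logical byte inside a chunk (first stripe at
     * @raid_map0, striped every @stripe_len bytes) to a stripe number
     * and an offset inside that stripe. The real helper additionally
     * rotates the stripe index per mirror for RAID5/6.
     */
    static void stripe_index_and_offset(uint64_t logical, uint64_t raid_map0,
                                        uint32_t stripe_len, int *stripe_index,
                                        uint64_t *stripe_offset)
    {
            uint64_t off = logical - raid_map0;

            *stripe_index = (int)(off / stripe_len);
            *stripe_offset = off % stripe_len;
    }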
*/ while (length > 0) { @@ -1306,20 +1301,20 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bioc = bioc; recover->map_length = mapped_length; - ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; - struct scrub_page *spage; + struct scrub_sector *sector; sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; - spage = kzalloc(sizeof(*spage), GFP_NOFS); - if (!spage) { + sector = kzalloc(sizeof(*sector), GFP_NOFS); + if (!sector) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -1327,16 +1322,16 @@ leave_nomem: scrub_put_recover(fs_info, recover); return -ENOMEM; } - scrub_page_get(spage); - sblock->pagev[page_index] = spage; - spage->sblock = sblock; - spage->flags = flags; - spage->generation = generation; - spage->logical = logical; - spage->have_csum = have_csum; + scrub_sector_get(sector); + sblock->sectors[sector_index] = sector; + sector->sblock = sblock; + sector->flags = flags; + sector->generation = generation; + sector->logical = logical; + sector->have_csum = have_csum; if (have_csum) - memcpy(spage->csum, - original_sblock->pagev[0]->csum, + memcpy(sector->csum, + original_sblock->sectors[0]->csum, sctx->fs_info->csum_size); scrub_stripe_index_and_offset(logical, @@ -1348,28 +1343,28 @@ leave_nomem: mirror_index, &stripe_index, &stripe_offset); - spage->physical = bioc->stripes[stripe_index].physical + + sector->physical = bioc->stripes[stripe_index].physical + stripe_offset; - spage->dev = bioc->stripes[stripe_index].dev; + sector->dev = bioc->stripes[stripe_index].dev; - BUG_ON(page_index >= original_sblock->page_count); - spage->physical_for_dev_replace = - original_sblock->pagev[page_index]-> + BUG_ON(sector_index >= original_sblock->sector_count); + sector->physical_for_dev_replace = + original_sblock->sectors[sector_index]-> physical_for_dev_replace; - /* for missing devices, dev->bdev is NULL */ - spage->mirror_num = mirror_index + 1; - sblock->page_count++; - spage->page = alloc_page(GFP_NOFS); - if (!spage->page) + /* For missing devices, dev->bdev is NULL */ + sector->mirror_num = mirror_index + 1; + sblock->sector_count++; + sector->page = alloc_page(GFP_NOFS); + if (!sector->page) goto leave_nomem; scrub_get_recover(recover); - spage->recover = recover; + sector->recover = recover; } scrub_put_recover(fs_info, recover); length -= sublen; logical += sublen; - page_index++; + sector_index++; } return 0; @@ -1382,19 +1377,19 @@ static void scrub_bio_wait_endio(struct bio *bio) static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct bio *bio, - struct scrub_page *spage) + struct scrub_sector *sector) { DECLARE_COMPLETION_ONSTACK(done); int ret; int mirror_num; - bio->bi_iter.bi_sector = spage->logical >> 9; + bio->bi_iter.bi_sector = sector->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - mirror_num = spage->sblock->pagev[0]->mirror_num; - ret = raid56_parity_recover(bio, spage->recover->bioc, - spage->recover->map_length, + mirror_num = sector->sblock->sectors[0]->mirror_num; + ret = raid56_parity_recover(bio, sector->recover->bioc, + sector->recover->map_length, mirror_num, 0); if (ret) return ret; @@ -1406,26 +1401,25 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, static void 
scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, struct scrub_block *sblock) { - struct scrub_page *first_page = sblock->pagev[0]; + struct scrub_sector *first_sector = sblock->sectors[0]; struct bio *bio; - int page_num; + int i; - /* All pages in sblock belong to the same stripe on the same device. */ - ASSERT(first_page->dev); - if (!first_page->dev->bdev) + /* All sectors in sblock belong to the same stripe on the same device. */ + ASSERT(first_sector->dev); + if (!first_sector->dev->bdev) goto out; - bio = btrfs_bio_alloc(BIO_MAX_VECS); - bio_set_dev(bio, first_page->dev->bdev); + bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); - for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct scrub_page *spage = sblock->pagev[page_num]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; - WARN_ON(!spage->page); - bio_add_page(bio, spage->page, PAGE_SIZE, 0); + WARN_ON(!sector->page); + bio_add_page(bio, sector->page, PAGE_SIZE, 0); } - if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) { + if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { bio_put(bio); goto out; } @@ -1436,65 +1430,63 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, return; out: - for (page_num = 0; page_num < sblock->page_count; page_num++) - sblock->pagev[page_num]->io_error = 1; + for (i = 0; i < sblock->sector_count; i++) + sblock->sectors[i]->io_error = 1; sblock->no_io_error_seen = 0; } /* - * this function will check the on disk data for checksum errors, header - * errors and read I/O errors. If any I/O errors happen, the exact pages - * which are errored are marked as being bad. The goal is to enable scrub - * to take those pages that are not errored from all the mirrors so that - * the pages that are errored in the just handled mirror can be repaired. + * This function will check the on disk data for checksum errors, header errors + * and read I/O errors. If any I/O errors happen, the exact sectors which are + * errored are marked as being bad. The goal is to enable scrub to take those + * sectors that are not errored from all the mirrors so that the sectors that + * are errored in the just handled mirror can be repaired. 
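The recheck code below is also converted from heap-allocated bios (btrfs_bio_alloc() plus btrfsic_submit_bio_wait()) to short-lived on-stack bios. A kernel-style sketch of that bio_init()/submit_bio_wait()/bio_uninit() pattern for one synchronous sector read; the helper itself is illustrative, not from the patch:

    #include <linux/bio.h>

    /* Synchronously read @sectorsize bytes of @page from @disk_sector on @bdev. */
    static int read_one_sector(struct block_device *bdev, struct page *page,
                               unsigned int sectorsize, sector_t disk_sector)
    {
            struct bio_vec bvec;
            struct bio bio;
            int ret;

            bio_init(&bio, bdev, &bvec, 1, REQ_OP_READ);
            bio.bi_iter.bi_sector = disk_sector;
            bio_add_page(&bio, page, sectorsize, 0);

            ret = submit_bio_wait(&bio);    /* 0 on success, negative errno on error */
            bio_uninit(&bio);

            return ret;
    }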
*/ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int retry_failed_mirror) { - int page_num; + int i; sblock->no_io_error_seen = 1; /* short cut for raid56 */ - if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0])) + if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0])) return scrub_recheck_block_on_raid56(fs_info, sblock); - for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct bio *bio; - struct scrub_page *spage = sblock->pagev[page_num]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; + struct bio bio; + struct bio_vec bvec; - if (spage->dev->bdev == NULL) { - spage->io_error = 1; + if (sector->dev->bdev == NULL) { + sector->io_error = 1; sblock->no_io_error_seen = 0; continue; } - WARN_ON(!spage->page); - bio = btrfs_bio_alloc(1); - bio_set_dev(bio, spage->dev->bdev); + WARN_ON(!sector->page); + bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ); + bio_add_page(&bio, sector->page, fs_info->sectorsize, 0); + bio.bi_iter.bi_sector = sector->physical >> 9; - bio_add_page(bio, spage->page, fs_info->sectorsize, 0); - bio->bi_iter.bi_sector = spage->physical >> 9; - bio->bi_opf = REQ_OP_READ; - - if (btrfsic_submit_bio_wait(bio)) { - spage->io_error = 1; + btrfsic_check_bio(&bio); + if (submit_bio_wait(&bio)) { + sector->io_error = 1; sblock->no_io_error_seen = 0; } - bio_put(bio); + bio_uninit(&bio); } if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); } -static inline int scrub_check_fsid(u8 fsid[], - struct scrub_page *spage) +static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) { - struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices; + struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices; int ret; ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -1507,7 +1499,7 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock) sblock->checksum_error = 0; sblock->generation_error = 0; - if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA) + if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA) scrub_checksum_data(sblock); else scrub_checksum_tree_block(sblock); @@ -1516,15 +1508,14 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock) static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good) { - int page_num; + int i; int ret = 0; - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + for (i = 0; i < sblock_bad->sector_count; i++) { int ret_sub; - ret_sub = scrub_repair_page_from_good_copy(sblock_bad, - sblock_good, - page_num, 1); + ret_sub = scrub_repair_sector_from_good_copy(sblock_bad, + sblock_good, i, 1); if (ret_sub) ret = ret_sub; } @@ -1532,47 +1523,43 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, return ret; } -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int page_num, int force_write) +static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int sector_num, int force_write) { - struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; - struct scrub_page *spage_good = sblock_good->pagev[page_num]; + struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; + struct scrub_sector *sector_good = sblock_good->sectors[sector_num]; struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; const u32 
sectorsize = fs_info->sectorsize; - BUG_ON(spage_bad->page == NULL); - BUG_ON(spage_good->page == NULL); + BUG_ON(sector_bad->page == NULL); + BUG_ON(sector_good->page == NULL); if (force_write || sblock_bad->header_error || - sblock_bad->checksum_error || spage_bad->io_error) { - struct bio *bio; + sblock_bad->checksum_error || sector_bad->io_error) { + struct bio bio; + struct bio_vec bvec; int ret; - if (!spage_bad->dev->bdev) { + if (!sector_bad->dev->bdev) { btrfs_warn_rl(fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); return -EIO; } - bio = btrfs_bio_alloc(1); - bio_set_dev(bio, spage_bad->dev->bdev); - bio->bi_iter.bi_sector = spage_bad->physical >> 9; - bio->bi_opf = REQ_OP_WRITE; + bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); + bio.bi_iter.bi_sector = sector_bad->physical >> 9; + __bio_add_page(&bio, sector_good->page, sectorsize, 0); - ret = bio_add_page(bio, spage_good->page, sectorsize, 0); - if (ret != sectorsize) { - bio_put(bio); - return -EIO; - } + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + bio_uninit(&bio); - if (btrfsic_submit_bio_wait(bio)) { - btrfs_dev_stat_inc_and_print(spage_bad->dev, + if (ret) { + btrfs_dev_stat_inc_and_print(sector_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); atomic64_inc(&fs_info->dev_replace.num_write_errors); - bio_put(bio); return -EIO; } - bio_put(bio); } return 0; @@ -1581,7 +1568,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) { struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - int page_num; + int i; /* * This block is used for the check of the parity on the source device, @@ -1590,25 +1577,24 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) if (sblock->sparity) return; - for (page_num = 0; page_num < sblock->page_count; page_num++) { + for (i = 0; i < sblock->sector_count; i++) { int ret; - ret = scrub_write_page_to_dev_replace(sblock, page_num); + ret = scrub_write_sector_to_dev_replace(sblock, i); if (ret) atomic64_inc(&fs_info->dev_replace.num_write_errors); } } -static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, - int page_num) +static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) { - struct scrub_page *spage = sblock->pagev[page_num]; + struct scrub_sector *sector = sblock->sectors[sector_num]; - BUG_ON(spage->page == NULL); - if (spage->io_error) - clear_page(page_address(spage->page)); + BUG_ON(sector->page == NULL); + if (sector->io_error) + clear_page(page_address(sector->page)); - return scrub_add_page_to_wr_bio(sblock->sctx, spage); + return scrub_add_sector_to_wr_bio(sblock->sctx, sector); } static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) @@ -1633,8 +1619,8 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } -static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector) { struct scrub_bio *sbio; int ret; @@ -1650,45 +1636,38 @@ again: return -ENOMEM; } sctx->wr_curr_bio->sctx = sctx; - sctx->wr_curr_bio->page_count = 0; + sctx->wr_curr_bio->sector_count = 0; } sbio = sctx->wr_curr_bio; - if (sbio->page_count == 0) { - struct bio *bio; - - ret = fill_writer_pointer_gap(sctx, - spage->physical_for_dev_replace); + if (sbio->sector_count == 0) { + ret = fill_writer_pointer_gap(sctx, 
sector->physical_for_dev_replace); if (ret) { mutex_unlock(&sctx->wr_lock); return ret; } - sbio->physical = spage->physical_for_dev_replace; - sbio->logical = spage->logical; + sbio->physical = sector->physical_for_dev_replace; + sbio->logical = sector->logical; sbio->dev = sctx->wr_tgtdev; - bio = sbio->bio; - if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_bio); - sbio->bio = bio; + if (!sbio->bio) { + sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, + REQ_OP_WRITE, GFP_NOFS); } - - bio->bi_private = sbio; - bio->bi_end_io = scrub_wr_bio_end_io; - bio_set_dev(bio, sbio->dev->bdev); - bio->bi_iter.bi_sector = sbio->physical >> 9; - bio->bi_opf = REQ_OP_WRITE; + sbio->bio->bi_private = sbio; + sbio->bio->bi_end_io = scrub_wr_bio_end_io; + sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * sectorsize != - spage->physical_for_dev_replace || - sbio->logical + sbio->page_count * sectorsize != - spage->logical) { + } else if (sbio->physical + sbio->sector_count * sectorsize != + sector->physical_for_dev_replace || + sbio->logical + sbio->sector_count * sectorsize != + sector->logical) { scrub_wr_submit(sctx); goto again; } - ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); if (ret != sectorsize) { - if (sbio->page_count < 1) { + if (sbio->sector_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; mutex_unlock(&sctx->wr_lock); @@ -1698,10 +1677,10 @@ again: goto again; } - sbio->pagev[sbio->page_count] = spage; - scrub_page_get(spage); - sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + sbio->sectors[sbio->sector_count] = sector; + scrub_sector_get(sector); + sbio->sector_count++; + if (sbio->sector_count == sctx->sectors_per_bio) scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); @@ -1717,16 +1696,16 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. 
Then the block layer * orders the requests before sending them to the driver which * doubled the write performance on spinning disks when measured * with Linux 3.5 */ - btrfsic_submit_bio(sbio->bio); + btrfsic_check_bio(sbio->bio); + submit_bio(sbio->bio); if (btrfs_is_zoned(sctx->fs_info)) - sctx->write_pointer = sbio->physical + sbio->page_count * + sctx->write_pointer = sbio->physical + sbio->sector_count * sctx->fs_info->sectorsize; } @@ -1738,31 +1717,31 @@ static void scrub_wr_bio_end_io(struct bio *bio) sbio->status = bio->bi_status; sbio->bio = bio; - btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); - btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); + INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker); + queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } -static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +static void scrub_wr_bio_end_io_worker(struct work_struct *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; int i; - ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); + ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; - spage->io_error = 1; + sector->io_error = 1; atomic64_inc(&dev_replace->num_write_errors); } } - for (i = 0; i < sbio->page_count; i++) - scrub_page_put(sbio->pagev[i]); + for (i = 0; i < sbio->sector_count; i++) + scrub_sector_put(sbio->sectors[i]); bio_put(sbio->bio); kfree(sbio); @@ -1786,8 +1765,8 @@ static int scrub_checksum(struct scrub_block *sblock) sblock->generation_error = 0; sblock->checksum_error = 0; - WARN_ON(sblock->page_count < 1); - flags = sblock->pagev[0]->flags; + WARN_ON(sblock->sector_count < 1); + flags = sblock->sectors[0]->flags; ret = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) ret = scrub_checksum_data(sblock); @@ -1809,26 +1788,26 @@ static int scrub_checksum_data(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 csum[BTRFS_CSUM_SIZE]; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; - BUG_ON(sblock->page_count < 1); - spage = sblock->pagev[0]; - if (!spage->have_csum) + BUG_ON(sblock->sector_count < 1); + sector = sblock->sectors[0]; + if (!sector->have_csum) return 0; - kaddr = page_address(spage->page); + kaddr = page_address(sector->page); shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); /* - * In scrub_pages() and scrub_pages_for_parity() we ensure each spage + * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector * only contains one sector of data. 
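A brief aside on the two checksum idioms used in these hunks: the data path hashes a single sector with the one-shot crypto_shash_digest(), while the tree block path streams the header and the remaining sectors through an init/update/final sequence. A minimal sketch of both patterns follows; the tfm and buffer names are hypothetical and the int return values are left unchecked for brevity:

	SHASH_DESC_ON_STACK(shash, tfm);
	u8 csum[BTRFS_CSUM_SIZE];

	shash->tfm = tfm;
	/* One-shot: hash one contiguous buffer (the data checksum path) */
	crypto_shash_digest(shash, buf, len, csum);

	/* Streaming: hash several discontiguous chunks (the tree block path) */
	crypto_shash_init(shash);
	crypto_shash_update(shash, part1, len1);
	crypto_shash_update(shash, part2, len2);
	crypto_shash_final(shash, csum);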
*/ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - if (memcmp(csum, spage->csum, fs_info->csum_size)) + if (memcmp(csum, sector->csum, fs_info->csum_size)) sblock->checksum_error = 1; return sblock->checksum_error; } @@ -1849,16 +1828,16 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) const u32 sectorsize = sctx->fs_info->sectorsize; const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; int i; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; - BUG_ON(sblock->page_count < 1); + BUG_ON(sblock->sector_count < 1); - /* Each member in pagev is just one block, not a full page */ - ASSERT(sblock->page_count == num_sectors); + /* Each member in sectors is just one sector */ + ASSERT(sblock->sector_count == num_sectors); - spage = sblock->pagev[0]; - kaddr = page_address(spage->page); + sector = sblock->sectors[0]; + kaddr = page_address(sector->page); h = (struct btrfs_header *)kaddr; memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); @@ -1867,15 +1846,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (spage->logical != btrfs_stack_header_bytenr(h)) + if (sector->logical != btrfs_stack_header_bytenr(h)) sblock->header_error = 1; - if (spage->generation != btrfs_stack_header_generation(h)) { + if (sector->generation != btrfs_stack_header_generation(h)) { sblock->header_error = 1; sblock->generation_error = 1; } - if (!scrub_check_fsid(h->fsid, spage)) + if (!scrub_check_fsid(h->fsid, sector)) sblock->header_error = 1; if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, @@ -1888,7 +1867,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) sectorsize - BTRFS_CSUM_SIZE); for (i = 1; i < num_sectors; i++) { - kaddr = page_address(sblock->pagev[i]->page); + kaddr = page_address(sblock->sectors[i]->page); crypto_shash_update(shash, kaddr, sectorsize); } @@ -1906,23 +1885,23 @@ static int scrub_checksum_super(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; int fail_gen = 0; int fail_cor = 0; - BUG_ON(sblock->page_count < 1); - spage = sblock->pagev[0]; - kaddr = page_address(spage->page); + BUG_ON(sblock->sector_count < 1); + sector = sblock->sectors[0]; + kaddr = page_address(sector->page); s = (struct btrfs_super_block *)kaddr; - if (spage->logical != btrfs_super_bytenr(s)) + if (sector->logical != btrfs_super_bytenr(s)) ++fail_cor; - if (spage->generation != btrfs_super_generation(s)) + if (sector->generation != btrfs_super_generation(s)) ++fail_gen; - if (!scrub_check_fsid(s->fsid, spage)) + if (!scrub_check_fsid(s->fsid, sector)) ++fail_cor; shash->tfm = fs_info->csum_shash; @@ -1943,10 +1922,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(spage->dev, + btrfs_dev_stat_inc_and_print(sector->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(spage->dev, + btrfs_dev_stat_inc_and_print(sector->dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1966,23 +1945,23 @@ static void scrub_block_put(struct scrub_block *sblock) if (sblock->sparity) scrub_parity_put(sblock->sparity); - for (i = 0; i < sblock->page_count; i++) - scrub_page_put(sblock->pagev[i]); + for (i = 0; i < sblock->sector_count; 
i++) + scrub_sector_put(sblock->sectors[i]); kfree(sblock); } } -static void scrub_page_get(struct scrub_page *spage) +static void scrub_sector_get(struct scrub_sector *sector) { - atomic_inc(&spage->refs); + atomic_inc(&sector->refs); } -static void scrub_page_put(struct scrub_page *spage) +static void scrub_sector_put(struct scrub_sector *sector) { - if (atomic_dec_and_test(&spage->refs)) { - if (spage->page) - __free_page(spage->page); - kfree(spage); + if (atomic_dec_and_test(&sector->refs)) { + if (sector->page) + __free_page(sector->page); + kfree(sector); } } @@ -2057,13 +2036,14 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(sbio->bio); + btrfsic_check_bio(sbio->bio); + submit_bio(sbio->bio); } -static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector) { - struct scrub_block *sblock = spage->sblock; + struct scrub_block *sblock = sector->sblock; struct scrub_bio *sbio; const u32 sectorsize = sctx->fs_info->sectorsize; int ret; @@ -2078,7 +2058,7 @@ again: if (sctx->curr != -1) { sctx->first_free = sctx->bios[sctx->curr]->next_free; sctx->bios[sctx->curr]->next_free = -1; - sctx->bios[sctx->curr]->page_count = 0; + sctx->bios[sctx->curr]->sector_count = 0; spin_unlock(&sctx->list_lock); } else { spin_unlock(&sctx->list_lock); @@ -2086,37 +2066,31 @@ again: } } sbio = sctx->bios[sctx->curr]; - if (sbio->page_count == 0) { - struct bio *bio; - - sbio->physical = spage->physical; - sbio->logical = spage->logical; - sbio->dev = spage->dev; - bio = sbio->bio; - if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_bio); - sbio->bio = bio; + if (sbio->sector_count == 0) { + sbio->physical = sector->physical; + sbio->logical = sector->logical; + sbio->dev = sector->dev; + if (!sbio->bio) { + sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, + REQ_OP_READ, GFP_NOFS); } - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio_set_dev(bio, sbio->dev->bdev); - bio->bi_iter.bi_sector = sbio->physical >> 9; - bio->bi_opf = REQ_OP_READ; + sbio->bio->bi_private = sbio; + sbio->bio->bi_end_io = scrub_bio_end_io; + sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * sectorsize != - spage->physical || - sbio->logical + sbio->page_count * sectorsize != - spage->logical || - sbio->dev != spage->dev) { + } else if (sbio->physical + sbio->sector_count * sectorsize != - sector->physical || + sbio->logical + sbio->sector_count * sectorsize != + sector->logical || + sbio->dev != sector->dev) { scrub_submit(sctx); goto again; } - sbio->pagev[sbio->page_count] = spage; - ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + sbio->sectors[sbio->sector_count] = sector; + ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); if (ret != sectorsize) { - if (sbio->page_count < 1) { + if (sbio->sector_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; return -EIO; @@ -2126,9 +2100,9 @@ again: } scrub_block_get(sblock); /* one for the page added to the bio */ - atomic_inc(&sblock->outstanding_pages); - sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + atomic_inc(&sblock->outstanding_sectors); + sbio->sector_count++; + if (sbio->sector_count == sctx->sectors_per_bio) scrub_submit(sctx); return 0; @@ -2144,10 +2118,10 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
bio_put(bio); - btrfs_queue_work(fs_info->scrub_workers, &sblock->work); + queue_work(fs_info->scrub_workers, &sblock->work); } -static void scrub_missing_raid56_worker(struct btrfs_work *work) +static void scrub_missing_raid56_worker(struct work_struct *work) { struct scrub_block *sblock = container_of(work, struct scrub_block, work); struct scrub_ctx *sctx = sblock->sctx; @@ -2155,8 +2129,8 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) u64 logical; struct btrfs_device *dev; - logical = sblock->pagev[0]->logical; - dev = sblock->pagev[0]->dev; + logical = sblock->sectors[0]->logical; + dev = sblock->sectors[0]->dev; if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); @@ -2193,8 +2167,8 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = sblock->page_count * PAGE_SIZE; - u64 logical = sblock->pagev[0]->logical; + u64 length = sblock->sector_count << fs_info->sectorsize_bits; + u64 logical = sblock->sectors[0]->logical; struct btrfs_io_context *bioc = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; @@ -2213,12 +2187,12 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) * We shouldn't be scrubbing a missing device. Even for dev * replace, we should only get here for RAID 5/6. We either * managed to mount something with no mirrors remaining or - * there's a bug in scrub_remap_extent()/btrfs_map_block(). + * there's a bug in scrub_find_good_copy()/btrfs_map_block(). */ goto bioc_out; } - bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; @@ -2227,13 +2201,17 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) if (!rbio) goto rbio_out; - for (i = 0; i < sblock->page_count; i++) { - struct scrub_page *spage = sblock->pagev[i]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; - raid56_add_scrub_pages(rbio, spage->page, spage->logical); + /* + * For now, our scrub is still one page per sector, so pgoff + * is always 0. + */ + raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical); } - btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL); + INIT_WORK(&sblock->work, scrub_missing_raid56_worker); scrub_block_get(sblock); scrub_pending_bio_inc(sctx); raid56_submit_missing_rbio(rbio); @@ -2249,7 +2227,7 @@ bioc_out: spin_unlock(&sctx->stat_lock); } -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, +static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, u64 physical_for_dev_replace) @@ -2273,7 +2251,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, sblock->no_io_error_seen = 1; for (index = 0; len > 0; index++) { - struct scrub_page *spage; + struct scrub_sector *sector; /* * Here we will allocate one page for one sector to scrub. 
* This is fine if PAGE_SIZE == sectorsize, but will cost @@ -2281,8 +2259,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, */ u32 l = min(sectorsize, len); - spage = kzalloc(sizeof(*spage), GFP_KERNEL); - if (!spage) { + sector = kzalloc(sizeof(*sector), GFP_KERNEL); + if (!sector) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2290,26 +2268,26 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); - scrub_page_get(spage); - sblock->pagev[index] = spage; - spage->sblock = sblock; - spage->dev = dev; - spage->flags = flags; - spage->generation = gen; - spage->logical = logical; - spage->physical = physical; - spage->physical_for_dev_replace = physical_for_dev_replace; - spage->mirror_num = mirror_num; + ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); + scrub_sector_get(sector); + sblock->sectors[index] = sector; + sector->sblock = sblock; + sector->dev = dev; + sector->flags = flags; + sector->generation = gen; + sector->logical = logical; + sector->physical = physical; + sector->physical_for_dev_replace = physical_for_dev_replace; + sector->mirror_num = mirror_num; if (csum) { - spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->fs_info->csum_size); + sector->have_csum = 1; + memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { - spage->have_csum = 0; + sector->have_csum = 0; } - sblock->page_count++; - spage->page = alloc_page(GFP_KERNEL); - if (!spage->page) + sblock->sector_count++; + sector->page = alloc_page(GFP_KERNEL); + if (!sector->page) goto leave_nomem; len -= l; logical += l; @@ -2317,7 +2295,7 @@ leave_nomem: physical_for_dev_replace += l; } - WARN_ON(sblock->page_count == 0); + WARN_ON(sblock->sector_count == 0); if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { /* * This case should only be hit for RAID 5/6 device replace. 
See @@ -2325,11 +2303,11 @@ leave_nomem: */ scrub_missing_raid56_pages(sblock); } else { - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; + for (index = 0; index < sblock->sector_count; index++) { + struct scrub_sector *sector = sblock->sectors[index]; int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); + ret = scrub_add_sector_to_rd_bio(sctx, sector); if (ret) { scrub_block_put(sblock); return ret; @@ -2353,31 +2331,31 @@ static void scrub_bio_end_io(struct bio *bio) sbio->status = bio->bi_status; sbio->bio = bio; - btrfs_queue_work(fs_info->scrub_workers, &sbio->work); + queue_work(fs_info->scrub_workers, &sbio->work); } -static void scrub_bio_end_io_worker(struct btrfs_work *work) +static void scrub_bio_end_io_worker(struct work_struct *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; int i; - ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); + ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); if (sbio->status) { - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; - spage->io_error = 1; - spage->sblock->no_io_error_seen = 0; + sector->io_error = 1; + sector->sblock->no_io_error_seen = 0; } } - /* now complete the scrub_block items that have all pages completed */ - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; - struct scrub_block *sblock = spage->sblock; + /* Now complete the scrub_block items that have all pages completed */ + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; + struct scrub_block *sblock = sector->sblock; - if (atomic_dec_and_test(&sblock->outstanding_pages)) + if (atomic_dec_and_test(&sblock->outstanding_sectors)) scrub_block_complete(sblock); scrub_block_put(sblock); } @@ -2456,8 +2434,8 @@ static void scrub_block_complete(struct scrub_block *sblock) } if (sblock->sparity && corrupted && !sblock->data_corrected) { - u64 start = sblock->pagev[0]->logical; - u64 end = sblock->pagev[sblock->page_count - 1]->logical + + u64 start = sblock->sectors[0]->logical; + u64 end = sblock->sectors[sblock->sector_count - 1]->logical + sblock->sctx->fs_info->sectorsize; ASSERT(end - start <= U32_MAX); @@ -2532,8 +2510,11 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u64 physical_for_dev_replace) + u64 gen, int mirror_num) { + struct btrfs_device *src_dev = dev; + u64 src_physical = physical; + int src_mirror = mirror_num; int ret; u8 csum[BTRFS_CSUM_SIZE]; u32 blocksize; @@ -2561,6 +2542,18 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, WARN_ON(1); } + /* + * For the dev-replace case, we can have @dev being a missing device. + * Regular scrub will avoid running on a missing device at all, as + * that would trigger tons of read errors. + * + * Reading from a missing device would only make the read error + * counters increase unnecessarily. + * So here we change the read source to a good mirror.
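The scrub_find_good_copy() helper called below is not part of this hunk; conceptually the remap it performs reduces to a btrfs_map_block() lookup like the ones used elsewhere in this patch. A rough sketch under that assumption — the src_* out-parameters are illustrative, not the helper's actual signature:

	u64 mapped_length = len;
	struct btrfs_io_context *bioc = NULL;

	/* Map the logical range, then read from the first good stripe */
	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
			      &mapped_length, &bioc, 0);
	if (!ret && bioc) {
		*src_physical = bioc->stripes[0].physical;
		*src_dev = bioc->stripes[0].dev;
		*src_mirror = bioc->mirror_num;
	}
	btrfs_put_bioc(bioc);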
+ */ + if (sctx->is_dev_replace && !dev->bdev) + scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical, + &src_dev, &src_mirror); while (len) { u32 l = min(len, blocksize); int have_csum = 0; @@ -2571,20 +2564,20 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, if (have_csum == 0) ++sctx->stat.no_csum; } - ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, - mirror_num, have_csum ? csum : NULL, - physical_for_dev_replace); + ret = scrub_sectors(sctx, logical, l, src_physical, src_dev, + flags, gen, src_mirror, + have_csum ? csum : NULL, physical); if (ret) return ret; len -= l; logical += l; physical += l; - physical_for_dev_replace += l; + src_physical += l; } return 0; } -static int scrub_pages_for_parity(struct scrub_parity *sparity, +static int scrub_sectors_for_parity(struct scrub_parity *sparity, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum) @@ -2613,10 +2606,10 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, scrub_parity_get(sparity); for (index = 0; len > 0; index++) { - struct scrub_page *spage; + struct scrub_sector *sector; - spage = kzalloc(sizeof(*spage), GFP_KERNEL); - if (!spage) { + sector = kzalloc(sizeof(*sector), GFP_KERNEL); + if (!sector) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2624,29 +2617,29 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); /* For scrub block */ - scrub_page_get(spage); - sblock->pagev[index] = spage; + scrub_sector_get(sector); + sblock->sectors[index] = sector; /* For scrub parity */ - scrub_page_get(spage); - list_add_tail(&spage->list, &sparity->spages); - spage->sblock = sblock; - spage->dev = dev; - spage->flags = flags; - spage->generation = gen; - spage->logical = logical; - spage->physical = physical; - spage->mirror_num = mirror_num; + scrub_sector_get(sector); + list_add_tail(&sector->list, &sparity->sectors_list); + sector->sblock = sblock; + sector->dev = dev; + sector->flags = flags; + sector->generation = gen; + sector->logical = logical; + sector->physical = physical; + sector->mirror_num = mirror_num; if (csum) { - spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->fs_info->csum_size); + sector->have_csum = 1; + memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { - spage->have_csum = 0; + sector->have_csum = 0; } - sblock->page_count++; - spage->page = alloc_page(GFP_KERNEL); - if (!spage->page) + sblock->sector_count++; + sector->page = alloc_page(GFP_KERNEL); + if (!sector->page) goto leave_nomem; @@ -2656,19 +2649,19 @@ leave_nomem: physical += sectorsize; } - WARN_ON(sblock->page_count == 0); - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; + WARN_ON(sblock->sector_count == 0); + for (index = 0; index < sblock->sector_count; index++) { + struct scrub_sector *sector = sblock->sectors[index]; int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); + ret = scrub_add_sector_to_rd_bio(sctx, sector); if (ret) { scrub_block_put(sblock); return ret; } } - /* last one frees, either here or in bio completion for last page */ + /* Last one frees, either here or in bio completion for last sector */ scrub_block_put(sblock); return 0; } @@ -2707,7 +2700,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, if (have_csum == 0) goto skip; } - ret = scrub_pages_for_parity(sparity, logical, l,
physical, dev, + ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev, + flags, gen, mirror_num, have_csum ? csum : NULL); if (ret) @@ -2767,7 +2760,7 @@ static int get_raid56_logic_offset(u64 physical, int num, static void scrub_free_parity(struct scrub_parity *sparity) { struct scrub_ctx *sctx = sparity->sctx; - struct scrub_page *curr, *next; + struct scrub_sector *curr, *next; int nbits; nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); @@ -2778,15 +2771,15 @@ static void scrub_free_parity(struct scrub_parity *sparity) spin_unlock(&sctx->stat_lock); } - list_for_each_entry_safe(curr, next, &sparity->spages, list) { + list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) { list_del_init(&curr->list); - scrub_page_put(curr); + scrub_sector_put(curr); } kfree(sparity); } -static void scrub_parity_bio_endio_worker(struct btrfs_work *work) +static void scrub_parity_bio_endio_worker(struct work_struct *work) { struct scrub_parity *sparity = container_of(work, struct scrub_parity, work); @@ -2798,7 +2791,7 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work) static void scrub_parity_bio_endio(struct bio *bio) { - struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + struct scrub_parity *sparity = bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; if (bio->bi_status) @@ -2807,9 +2800,8 @@ static void scrub_parity_bio_endio(struct bio *bio) bio_put(bio); - btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL, - NULL); - btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work); + INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker); + queue_work(fs_info->scrub_parity_workers, &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) @@ -2834,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) if (ret || !bioc || !bioc->raid_map) goto bioc_out; - bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; @@ -2882,6 +2874,251 @@ static void scrub_parity_put(struct scrub_parity *sparity) scrub_parity_check_and_repair(sparity); } +/* + * Return 0 if the extent item range covers any byte of the range. + * Return <0 if the extent item is before @search_start. + * Return >0 if the extent item is after @search_start + @search_len. + */ +static int compare_extent_item_range(struct btrfs_path *path, + u64 search_start, u64 search_len) +{ + struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; + u64 len; + struct btrfs_key key; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY); + if (key.type == BTRFS_METADATA_ITEM_KEY) + len = fs_info->nodesize; + else + len = key.offset; + + if (key.objectid + len <= search_start) + return -1; + if (key.objectid >= search_start + search_len) + return 1; + return 0; +} + +/* + * Locate one extent item which covers any byte in range + * [@search_start, @search_start + @search_len) + * + * If the path is not initialized, we will initialize the search by doing + * a btrfs_search_slot(). + * If the path is already initialized, we will use the path as the initial + * slot, to avoid duplicated btrfs_search_slot() calls. + * + * NOTE: If an extent item starts before @search_start, we will still + * return the extent item.
This is for data extents crossing stripe boundaries. + * + * Return 0 if we found such extent item, and @path will point to the extent item. + * Return >0 if no such extent item can be found, and @path will be released. + * Return <0 if we hit a fatal error, and @path will be released. + */ +static int find_first_extent_item(struct btrfs_root *extent_root, + struct btrfs_path *path, + u64 search_start, u64 search_len) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct btrfs_key key; + int ret; + + /* Continue using the existing path */ + if (path->nodes[0]) + goto search_forward; + + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = search_start; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + ASSERT(ret > 0); + /* + * Here we intentionally pass 0 as @min_objectid, as there could be + * an extent item starting before @search_start. + */ + ret = btrfs_previous_extent_item(extent_root, path, 0); + if (ret < 0) + return ret; + /* + * No matter whether we have found an extent item, the next loop will + * properly do every check on the key. + */ +search_forward: + while (true) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid >= search_start + search_len) + break; + if (key.type != BTRFS_METADATA_ITEM_KEY && + key.type != BTRFS_EXTENT_ITEM_KEY) + goto next; + + ret = compare_extent_item_range(path, search_start, search_len); + if (ret == 0) + return ret; + if (ret > 0) + break; +next: + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(extent_root, path); + if (ret) { + /* Either no more item or fatal error */ + btrfs_release_path(path); + return ret; + } + } + } + btrfs_release_path(path); + return 1; +} + +static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, + u64 *size_ret, u64 *flags_ret, u64 *generation_ret) +{ + struct btrfs_key key; + struct btrfs_extent_item *ei; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || + key.type == BTRFS_EXTENT_ITEM_KEY); + *extent_start_ret = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY) + *size_ret = path->nodes[0]->fs_info->nodesize; + else + *size_ret = key.offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); + *flags_ret = btrfs_extent_flags(path->nodes[0], ei); + *generation_ret = btrfs_extent_generation(path->nodes[0], ei); +} + +static bool does_range_cross_boundary(u64 extent_start, u64 extent_len, + u64 boundary_start, u64 boundary_len) +{ + return (extent_start < boundary_start && + extent_start + extent_len > boundary_start) || + (extent_start < boundary_start + boundary_len && + extent_start + extent_len > boundary_start + boundary_len); +} + +static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, + struct scrub_parity *sparity, + struct map_lookup *map, + struct btrfs_device *sdev, + struct btrfs_path *path, + u64 logical) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical); + u64 cur_logical = logical; + int ret; + + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + + /* Path must not be populated */ + ASSERT(!path->nodes[0]); + + while (cur_logical < logical + map->stripe_len) { + struct
btrfs_io_context *bioc = NULL; + struct btrfs_device *extent_dev; + u64 extent_start; + u64 extent_size; + u64 mapped_length; + u64 extent_flags; + u64 extent_gen; + u64 extent_physical; + u64 extent_mirror_num; + + ret = find_first_extent_item(extent_root, path, cur_logical, + logical + map->stripe_len - cur_logical); + /* No more extent item in this data stripe */ + if (ret > 0) { + ret = 0; + break; + } + if (ret < 0) + break; + get_extent_info(path, &extent_start, &extent_size, &extent_flags, + &extent_gen); + + /* Metadata should not cross stripe boundaries */ + if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + does_range_cross_boundary(extent_start, extent_size, + logical, map->stripe_len)) { + btrfs_err(fs_info, + "scrub: tree block %llu spanning stripes, ignored. logical=%llu", + extent_start, logical); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + cur_logical += extent_size; + continue; + } + + /* Skip hole range which doesn't have any extent */ + cur_logical = max(extent_start, cur_logical); + + /* Truncate the range inside this data stripe */ + extent_size = min(extent_start + extent_size, + logical + map->stripe_len) - cur_logical; + extent_start = cur_logical; + ASSERT(extent_size <= U32_MAX); + + scrub_parity_mark_sectors_data(sparity, extent_start, extent_size); + + mapped_length = extent_size; + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start, + &mapped_length, &bioc, 0); + if (!ret && (!bioc || mapped_length < extent_size)) + ret = -EIO; + if (ret) { + btrfs_put_bioc(bioc); + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + extent_physical = bioc->stripes[0].physical; + extent_mirror_num = bioc->mirror_num; + extent_dev = bioc->stripes[0].dev; + btrfs_put_bioc(bioc); + + ret = btrfs_lookup_csums_range(csum_root, extent_start, + extent_start + extent_size - 1, + &sctx->csum_list, 1); + if (ret) { + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + + ret = scrub_extent_for_parity(sparity, extent_start, + extent_size, extent_physical, + extent_dev, extent_flags, + extent_gen, extent_mirror_num); + scrub_free_csums(sctx); + + if (ret) { + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + + cond_resched(); + cur_logical += extent_size; + } + btrfs_release_path(path); + return ret; +} + static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *sdev, @@ -2889,28 +3126,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, u64 logic_end) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start); - struct btrfs_root *csum_root; - struct btrfs_extent_item *extent; - struct btrfs_io_context *bioc = NULL; struct btrfs_path *path; - u64 flags; + u64 cur_logical; int ret; - int slot; - struct extent_buffer *l; - struct btrfs_key key; - u64 generation; - u64 extent_logical; - u64 extent_physical; - /* Check the comment in scrub_stripe() for why u32 is enough here */ - u32 extent_len; - u64 mapped_length; - struct btrfs_device *extent_dev; struct scrub_parity *sparity; int nsectors; int bitmap_len; - int extent_mirror_num; - int stop_loop = 0; path = btrfs_alloc_path(); if (!path) { @@ -2943,178 +3164,19 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->logic_start = logic_start; sparity->logic_end = logic_end; 
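The find_first_extent_item()/get_extent_info() pair introduced above is designed to be driven in exactly this kind of loop. A condensed usage sketch with a hypothetical caller; error handling and the actual scrubbing are trimmed:

	struct btrfs_path path = { 0 };
	u64 cur = start;
	int ret;

	path.search_commit_root = 1;
	path.skip_locking = 1;
	while (cur < start + len) {
		u64 extent_start, extent_len, flags, gen;

		ret = find_first_extent_item(extent_root, &path, cur,
					     start + len - cur);
		if (ret)	/* > 0: no more extents, < 0: error */
			break;
		get_extent_info(&path, &extent_start, &extent_len, &flags, &gen);
		/* Skip holes; the extent may also start before @cur */
		cur = max(extent_start, cur);
		/* ... process [cur, min(extent_start + extent_len, start + len)) ... */
		cur = min(extent_start + extent_len, start + len);
	}
	btrfs_release_path(&path);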
refcount_set(&sparity->refs, 1); - INIT_LIST_HEAD(&sparity->spages); + INIT_LIST_HEAD(&sparity->sectors_list); sparity->dbitmap = sparity->bitmap; sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; ret = 0; - while (logic_start < logic_end) { - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - key.type = BTRFS_METADATA_ITEM_KEY; - else - key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logic_start; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + for (cur_logical = logic_start; cur_logical < logic_end; + cur_logical += map->stripe_len) { + ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map, + sdev, path, cur_logical); if (ret < 0) - goto out; - - if (ret > 0) { - ret = btrfs_previous_extent_item(root, path, 0); - if (ret < 0) - goto out; - if (ret > 0) { - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, - path, 0, 0); - if (ret < 0) - goto out; - } - } - - stop_loop = 0; - while (1) { - u64 bytes; - - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - stop_loop = 1; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = fs_info->nodesize; - else - bytes = key.offset; - - if (key.objectid + bytes <= logic_start) - goto next; - - if (key.objectid >= logic_end) { - stop_loop = 1; - break; - } - - while (key.objectid >= logic_start + map->stripe_len) - logic_start += map->stripe_len; - - extent = btrfs_item_ptr(l, slot, - struct btrfs_extent_item); - flags = btrfs_extent_flags(l, extent); - generation = btrfs_extent_generation(l, extent); - - if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - (key.objectid < logic_start || - key.objectid + bytes > - logic_start + map->stripe_len)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. 
logical=%llu", - key.objectid, logic_start); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - goto next; - } -again: - extent_logical = key.objectid; - ASSERT(bytes <= U32_MAX); - extent_len = bytes; - - if (extent_logical < logic_start) { - extent_len -= logic_start - extent_logical; - extent_logical = logic_start; - } - - if (extent_logical + extent_len > - logic_start + map->stripe_len) - extent_len = logic_start + map->stripe_len - - extent_logical; - - scrub_parity_mark_sectors_data(sparity, extent_logical, - extent_len); - - mapped_length = extent_len; - bioc = NULL; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, - extent_logical, &mapped_length, &bioc, - 0); - if (!ret) { - if (!bioc || mapped_length < extent_len) - ret = -EIO; - } - if (ret) { - btrfs_put_bioc(bioc); - goto out; - } - extent_physical = bioc->stripes[0].physical; - extent_mirror_num = bioc->mirror_num; - extent_dev = bioc->stripes[0].dev; - btrfs_put_bioc(bioc); - - csum_root = btrfs_csum_root(fs_info, extent_logical); - ret = btrfs_lookup_csums_range(csum_root, - extent_logical, - extent_logical + extent_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - - ret = scrub_extent_for_parity(sparity, extent_logical, - extent_len, - extent_physical, - extent_dev, flags, - generation, - extent_mirror_num); - - scrub_free_csums(sctx); - - if (ret) - goto out; - - if (extent_logical + extent_len < - key.objectid + bytes) { - logic_start += map->stripe_len; - - if (logic_start >= logic_end) { - stop_loop = 1; - break; - } - - if (logic_start < key.objectid + bytes) { - cond_resched(); - goto again; - } - } -next: - path->slots[0]++; - } - - btrfs_release_path(path); - - if (stop_loop) break; - - logic_start += map->stripe_len; - } -out: - if (ret < 0) { - ASSERT(logic_end - logic_start <= U32_MAX); - scrub_parity_mark_sectors_error(sparity, logic_start, - logic_end - logic_start); } + scrub_parity_put(sparity); scrub_submit(sctx); mutex_lock(&sctx->wr_lock); @@ -3165,6 +3227,206 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, return ret; } +/* + * Scrub one range which can only has simple mirror based profile. + * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in + * RAID0/RAID10). + * + * Since we may need to handle a subset of block group, we need @logical_start + * and @logical_length parameter. + */ +static int scrub_simple_mirror(struct scrub_ctx *sctx, + struct btrfs_root *extent_root, + struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + u64 logical_start, u64 logical_length, + struct btrfs_device *device, + u64 physical, int mirror_num) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + const u64 logical_end = logical_start + logical_length; + /* An artificial limit, inherit from old scrub behavior */ + const u32 max_length = SZ_64K; + struct btrfs_path path = { 0 }; + u64 cur_logical = logical_start; + int ret; + + /* The range must be inside the bg */ + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + + path.search_commit_root = 1; + path.skip_locking = 1; + /* Go through each extent items inside the logical range */ + while (cur_logical < logical_end) { + u64 extent_start; + u64 extent_len; + u64 extent_flags; + u64 extent_gen; + u64 scrub_len; + + /* Canceled? */ + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) { + ret = -ECANCELED; + break; + } + /* Paused? 
*/ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* Push queued extents */ + sctx->flush_all_writes = true; + scrub_submit(sctx); + mutex_lock(&sctx->wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_lock); + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + sctx->flush_all_writes = false; + scrub_blocked_if_needed(fs_info); + } + /* Block group removed? */ + spin_lock(&bg->lock); + if (bg->removed) { + spin_unlock(&bg->lock); + ret = 0; + break; + } + spin_unlock(&bg->lock); + + ret = find_first_extent_item(extent_root, &path, cur_logical, + logical_end - cur_logical); + if (ret > 0) { + /* No more extents, just update the accounting */ + sctx->stat.last_physical = physical + logical_length; + ret = 0; + break; + } + if (ret < 0) + break; + get_extent_info(&path, &extent_start, &extent_len, + &extent_flags, &extent_gen); + /* Skip hole range which doesn't have any extent */ + cur_logical = max(extent_start, cur_logical); + + /* + * Scrub len has three limits: + * - Extent size limit + * - Scrub range limit + * This is especially important for RAID0/RAID10 to reuse + * this function + * - Max scrub size limit + */ + scrub_len = min(min(extent_start + extent_len, + logical_end), cur_logical + max_length) - + cur_logical; + + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { + ret = btrfs_lookup_csums_range(csum_root, cur_logical, + cur_logical + scrub_len - 1, + &sctx->csum_list, 1); + if (ret) + break; + } + if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + does_range_cross_boundary(extent_start, extent_len, + logical_start, logical_length)) { + btrfs_err(fs_info, +"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)", + extent_start, logical_start, logical_end); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + cur_logical += scrub_len; + continue; + } + ret = scrub_extent(sctx, map, cur_logical, scrub_len, + cur_logical - logical_start + physical, + device, extent_flags, extent_gen, + mirror_num); + scrub_free_csums(sctx); + if (ret) + break; + if (sctx->is_dev_replace) + sync_replace_for_zoned(sctx); + cur_logical += scrub_len; + /* Don't hold the CPU for too long */ + cond_resched(); + } + btrfs_release_path(&path); + return ret; +} + +/* Calculate the full stripe length for simple stripe based profiles */ +static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + + return map->num_stripes / map->sub_stripes * map->stripe_len; +} + +/* Get the logical bytenr for the stripe */ +static u64 simple_stripe_get_logical(struct map_lookup *map, + struct btrfs_block_group *bg, + int stripe_index) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + ASSERT(stripe_index < map->num_stripes); + + /* + * (stripe_index / sub_stripes) gives how many data stripes we need to + * skip. + */ + return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start; + +/* Get the mirror number for the stripe */ +static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + ASSERT(stripe_index < map->num_stripes); + + /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1...
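To make the arithmetic of the three helpers above concrete, take a RAID10 chunk with num_stripes = 4, sub_stripes = 2 and a 64K stripe_len; bg_start stands in for bg->start and the numbers are purely illustrative:

	/* One rotation covers 4 / 2 * 64K = 128K of logical space */
	u64 full_stripe_len = 4 / 2 * SZ_64K;

	/* Device stripe 2 is the first copy of the second data stripe */
	u64 logical = (2 / 2) * SZ_64K + bg_start;	/* bg_start + 64K */

	/* Device stripe 3 is the second copy of that same data stripe */
	int mirror_num = 3 % 2 + 1;			/* == 2 */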
*/ + return stripe_index % map->sub_stripes + 1; +} + +static int scrub_simple_stripe(struct scrub_ctx *sctx, + struct btrfs_root *extent_root, + struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + struct btrfs_device *device, + int stripe_index) +{ + const u64 logical_increment = simple_stripe_full_stripe_len(map); + const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); + const u64 orig_physical = map->stripes[stripe_index].physical; + const int mirror_num = simple_stripe_mirror_num(map, stripe_index); + u64 cur_logical = orig_logical; + u64 cur_physical = orig_physical; + int ret = 0; + + while (cur_logical < bg->start + bg->length) { + /* + * Inside each stripe, RAID0 is just SINGLE, and RAID10 is + * just RAID1, so we can reuse scrub_simple_mirror() to scrub + * this stripe. + */ + ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map, + cur_logical, map->stripe_len, device, + cur_physical, mirror_num); + if (ret) + return ret; + /* Skip to next stripe which belongs to the target device */ + cur_logical += logical_increment; + /* For physical offset, we just go to next stripe */ + cur_physical += map->stripe_len; + } + return ret; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct map_lookup *map, @@ -3175,59 +3437,22 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root; struct btrfs_root *csum_root; - struct btrfs_extent_item *extent; struct blk_plug plug; + const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; - u64 flags; int ret; - int slot; - u64 nstripes; - struct extent_buffer *l; - u64 physical; + u64 physical = map->stripes[stripe_index].physical; + const u64 physical_end = physical + dev_extent_len; u64 logical; u64 logic_end; - u64 physical_end; - u64 generation; - int mirror_num; - struct btrfs_key key; - u64 increment = map->stripe_len; + /* The logical increment after finishing one stripe */ + u64 increment; + /* Offset inside the chunk */ u64 offset; - u64 extent_logical; - u64 extent_physical; - /* - * Unlike chunk length, extent length should never go beyond - * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here. 
- */ - u32 extent_len; u64 stripe_logical; u64 stripe_end; - struct btrfs_device *extent_dev; - int extent_mirror_num; int stop_loop = 0; - physical = map->stripes[stripe_index].physical; - offset = 0; - nstripes = div64_u64(dev_extent_len, map->stripe_len); - mirror_num = 1; - increment = map->stripe_len; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - offset = map->stripe_len * stripe_index; - increment = map->stripe_len * map->num_stripes; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - int factor = map->num_stripes / map->sub_stripes; - offset = map->stripe_len * (stripe_index / map->sub_stripes); - increment = map->stripe_len * factor; - mirror_num = stripe_index % map->sub_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - mirror_num = stripe_index % map->num_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - mirror_num = stripe_index % map->num_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical, stripe_index, map, &offset, - NULL); - increment = map->stripe_len * nr_data_stripes(map); - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3241,21 +3466,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, path->skip_locking = 1; path->reada = READA_FORWARD; - logical = chunk_logical + offset; - physical_end = physical + nstripes * map->stripe_len; - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical_end, stripe_index, - map, &logic_end, NULL); - logic_end += chunk_logical; - } else { - logic_end = logical + increment * nstripes; - } wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); - root = btrfs_extent_root(fs_info, logical); - csum_root = btrfs_csum_root(fs_info, logical); + root = btrfs_extent_root(fs_info, bg->start); + csum_root = btrfs_csum_root(fs_info, bg->start); /* * collect all data csums for the stripe to avoid seeking during @@ -3272,241 +3488,83 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } /* - * now find all extents for each stripe and scrub them + * There used to be a big double loop to handle all profiles using the + * same routine, which grew larger and more gross over time. + * + * So here we handle each profile differently, so that simpler profiles + * have simpler scrubbing functions. */ - ret = 0; - while (physical < physical_end) { - /* - * canceled? - */ - if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sctx->cancel_req)) { - ret = -ECANCELED; - goto out; - } + if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID56_MASK))) { /* - * check to see if we have to pause + * The above check rules out all complex profiles; the remaining + * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple + * mirrored duplication without striping. + * + * Only @physical and @mirror_num need to be calculated using + * @stripe_index.
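Put differently: for the mirror-only profiles every device stripe holds a complete copy of the block group, so both values fall straight out of the chunk map, which is exactly what the call below passes:

	/* SINGLE/DUP/RAID1/RAID1C*: stripe i is simply copy i of the whole bg */
	u64 physical = map->stripes[stripe_index].physical;
	int mirror_num = stripe_index + 1;	/* mirror numbers are 1-based */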
*/ - if (atomic_read(&fs_info->scrub_pause_req)) { - /* push queued extents */ - sctx->flush_all_writes = true; - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - wait_event(sctx->list_wait, - atomic_read(&sctx->bios_in_flight) == 0); - sctx->flush_all_writes = false; - scrub_blocked_if_needed(fs_info); - } - - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = get_raid56_logic_offset(physical, stripe_index, - map, &logical, - &stripe_logical); - logical += chunk_logical; - if (ret) { - /* it is parity strip */ - stripe_logical += chunk_logical; - stripe_end = stripe_logical + increment; - ret = scrub_raid56_parity(sctx, map, scrub_dev, - stripe_logical, - stripe_end); - if (ret) - goto out; - goto skip; - } - } - - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - key.type = BTRFS_METADATA_ITEM_KEY; - else - key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = btrfs_previous_extent_item(root, path, 0); - if (ret < 0) - goto out; - if (ret > 0) { - /* there's no smaller item, so stick with the - * larger one */ - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, - path, 0, 0); - if (ret < 0) - goto out; - } - } - - stop_loop = 0; - while (1) { - u64 bytes; - - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - stop_loop = 1; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = fs_info->nodesize; - else - bytes = key.offset; - - if (key.objectid + bytes <= logical) - goto next; - - if (key.objectid >= logical + map->stripe_len) { - /* out of this device extent */ - if (key.objectid >= logic_end) - stop_loop = 1; - break; - } - - /* - * If our block group was removed in the meanwhile, just - * stop scrubbing since there is no point in continuing. - * Continuing would prevent reusing its device extents - * for new block groups for a long time. - */ - spin_lock(&bg->lock); - if (bg->removed) { - spin_unlock(&bg->lock); - ret = 0; - goto out; - } - spin_unlock(&bg->lock); - - extent = btrfs_item_ptr(l, slot, - struct btrfs_extent_item); - flags = btrfs_extent_flags(l, extent); - generation = btrfs_extent_generation(l, extent); - - if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - (key.objectid < logical || - key.objectid + bytes > - logical + map->stripe_len)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. 
logical=%llu", - key.objectid, logical); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - goto next; - } - -again: - extent_logical = key.objectid; - ASSERT(bytes <= U32_MAX); - extent_len = bytes; - - /* - * trim extent to this stripe - */ - if (extent_logical < logical) { - extent_len -= logical - extent_logical; - extent_logical = logical; - } - if (extent_logical + extent_len > - logical + map->stripe_len) { - extent_len = logical + map->stripe_len - - extent_logical; - } + ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, + bg->start, bg->length, scrub_dev, + map->stripes[stripe_index].physical, + stripe_index + 1); + offset = 0; + goto out; + } + if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + ret = scrub_simple_stripe(sctx, root, csum_root, bg, map, + scrub_dev, stripe_index); + offset = map->stripe_len * (stripe_index / map->sub_stripes); + goto out; + } - extent_physical = extent_logical - logical + physical; - extent_dev = scrub_dev; - extent_mirror_num = mirror_num; - if (sctx->is_dev_replace) - scrub_remap_extent(fs_info, extent_logical, - extent_len, &extent_physical, - &extent_dev, - &extent_mirror_num); - - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_range(csum_root, - extent_logical, - extent_logical + extent_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - } + /* Only RAID56 goes through the old code */ + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + ret = 0; - ret = scrub_extent(sctx, map, extent_logical, extent_len, - extent_physical, extent_dev, flags, - generation, extent_mirror_num, - extent_logical - logical + physical); + /* Calculate the logical end of the stripe */ + get_raid56_logic_offset(physical_end, stripe_index, + map, &logic_end, NULL); + logic_end += chunk_logical; - scrub_free_csums(sctx); + /* Initialize @offset in case we need to go to out: label */ + get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); + increment = map->stripe_len * nr_data_stripes(map); + /* + * Due to the rotation, for RAID56 it's better to iterate each stripe + * using their physical offset. + */ + while (physical < physical_end) { + ret = get_raid56_logic_offset(physical, stripe_index, map, + &logical, &stripe_logical); + logical += chunk_logical; + if (ret) { + /* it is parity strip */ + stripe_logical += chunk_logical; + stripe_end = stripe_logical + increment; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + stripe_logical, + stripe_end); if (ret) goto out; + goto next; + } - if (sctx->is_dev_replace) - sync_replace_for_zoned(sctx); - - if (extent_logical + extent_len < - key.objectid + bytes) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* - * loop until we find next data stripe - * or we have finished all stripes. - */ -loop: - physical += map->stripe_len; - ret = get_raid56_logic_offset(physical, - stripe_index, map, - &logical, &stripe_logical); - logical += chunk_logical; - - if (ret && physical < physical_end) { - stripe_logical += chunk_logical; - stripe_end = stripe_logical + - increment; - ret = scrub_raid56_parity(sctx, - map, scrub_dev, - stripe_logical, - stripe_end); - if (ret) - goto out; - goto loop; - } - } else { - physical += map->stripe_len; - logical += increment; - } - if (logical < key.objectid + bytes) { - cond_resched(); - goto again; - } - - if (physical >= physical_end) { - stop_loop = 1; - break; - } - } + /* + * Now we're at a data stripe, scrub each extents in the range. 
+ * + * At this stage, if we ignore the repair part, inside each data + * stripe it is no different than SINGLE profile. + * We can reuse scrub_simple_mirror() here, as the repair part + * is still based on @mirror_num. + */ + ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, + logical, map->stripe_len, + scrub_dev, physical, 1); + if (ret < 0) + goto out; next: - path->slots[0]++; - } - btrfs_release_path(path); -skip: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); @@ -3699,6 +3757,31 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + ASSERT(cache->start <= chunk_offset); + /* + * We are using the commit root to search for device extents, so + * that means we could have found a device extent item from a + * block group that was deleted in the current transaction. The + * logical start offset of the deleted block group, stored at + * @chunk_offset, might be part of the logical address range of + * a new block group (which uses different physical extents). + * In this case btrfs_lookup_block_group() has returned the new + * block group, and its start address is less than @chunk_offset. + * + * We skip such new block groups, because it's pointless to + * process them, as we won't find their extents because we search + * for them using the commit root of the extent tree. For a device + * replace it's also fine to skip it, we won't miss copying them + * to the target device because we have the write duplication + * setup through the regular write path (by btrfs_map_block()), + * and we have committed a transaction when we started the device + * replace, right after setting up the device replace state. + */ + if (cache->start < chunk_offset) { + btrfs_put_block_group(cache); + goto skip; + } + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { spin_lock(&cache->lock); if (!cache->to_copy) { @@ -3822,7 +3905,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->item_needs_writeback = 1; up_write(&dev_replace->rwsem); - ASSERT(cache->start == chunk_offset); ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, dev_extent_len); @@ -3940,9 +4022,9 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (!btrfs_check_super_location(scrub_dev, bytenr)) continue; - ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, bytenr); + ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, + NULL, bytenr); if (ret) return ret; } @@ -3955,22 +4037,23 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) { if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, &fs_info->scrub_lock)) { - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = NULL; - - scrub_workers = fs_info->scrub_workers; - scrub_wr_comp = fs_info->scrub_wr_completion_workers; - scrub_parity = fs_info->scrub_parity_workers; + struct workqueue_struct *scrub_workers = fs_info->scrub_workers; + struct workqueue_struct *scrub_wr_comp = + fs_info->scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity = + fs_info->scrub_parity_workers; fs_info->scrub_workers = NULL; fs_info->scrub_wr_completion_workers = NULL; fs_info->scrub_parity_workers = NULL; mutex_unlock(&fs_info->scrub_lock); - btrfs_destroy_workqueue(scrub_workers); - btrfs_destroy_workqueue(scrub_wr_comp); - btrfs_destroy_workqueue(scrub_parity); + if 
(scrub_workers) + destroy_workqueue(scrub_workers); + if (scrub_wr_comp) + destroy_workqueue(scrub_wr_comp); + if (scrub_parity) + destroy_workqueue(scrub_parity); } } @@ -3980,9 +4063,9 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = NULL; + struct workqueue_struct *scrub_workers = NULL; + struct workqueue_struct *scrub_wr_comp = NULL; + struct workqueue_struct *scrub_parity = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; @@ -3990,18 +4073,16 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, - is_dev_replace ? 1 : max_active, 4); + scrub_workers = alloc_workqueue("btrfs-scrub", flags, + is_dev_replace ? 1 : max_active); if (!scrub_workers) goto fail_scrub_workers; - scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, - max_active, 2); + scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active); if (!scrub_wr_comp) goto fail_scrub_wr_completion_workers; - scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, - max_active, 2); + scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active); if (!scrub_parity) goto fail_scrub_parity_workers; @@ -4022,11 +4103,11 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->scrub_lock); ret = 0; - btrfs_destroy_workqueue(scrub_parity); + destroy_workqueue(scrub_parity); fail_scrub_parity_workers: - btrfs_destroy_workqueue(scrub_wr_comp); + destroy_workqueue(scrub_wr_comp); fail_scrub_wr_completion_workers: - btrfs_destroy_workqueue(scrub_workers); + destroy_workqueue(scrub_workers); fail_scrub_workers: return ret; } @@ -4058,18 +4139,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } if (fs_info->nodesize > - PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || - fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { + SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits || + fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) { /* - * would exhaust the array bounds of pagev member in + * Would exhaust the array bounds of sectorv member in * struct scrub_block */ btrfs_err(fs_info, - "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", - fs_info->nodesize, - SCRUB_MAX_PAGES_PER_BLOCK, - fs_info->sectorsize, - SCRUB_MAX_PAGES_PER_BLOCK); +"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails", + fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK, + fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK); return -EINVAL; } @@ -4137,7 +4216,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, /* * In order to avoid deadlock with reclaim when there is a transaction * trying to pause scrub, make sure we use GFP_NOFS for all the - * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity() + * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() * invoked by our callees. 
The pausing request is done when the * transaction commit starts, and it blocks the transaction until scrub * is paused (done at specific points at scrub_stripe() or right above @@ -4271,11 +4350,11 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; } -static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num) +static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, + u64 extent_logical, u32 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num) { u64 mapped_length; struct btrfs_io_context *bioc = NULL; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 201eb2628aea..fa56890ff81f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -10,7 +10,6 @@ #include <linux/mount.h> #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> -#include <linux/radix-tree.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/compat.h> @@ -128,11 +127,18 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct radix_tree_root name_cache; + struct xarray name_cache; struct list_head name_cache_list; int name_cache_size; + /* + * The inode we are currently processing. It's not NULL only when we + * need to issue write commands for data extents from this inode. + */ + struct inode *cur_inode; struct file_ra_state ra; + u64 page_cache_clear_start; + bool clean_page_cache; /* * We process inodes by their increasing order, so if before an @@ -262,14 +268,13 @@ struct orphan_dir_info { struct name_cache_entry { struct list_head list; /* - * radix_tree has only 32bit entries but we need to handle 64bit inums. - * We use the lower 32bit of the 64bit inum to store it in the tree. If - * more then one inum would fall into the same entry, we use radix_list - * to store the additional entries. radix_list is also used to store - * entries where two entries have the same inum but different - * generations. + * On 32bit kernels, xarray has only 32bit indices, but we need to + * handle 64bit inums. We use the lower 32bit of the 64bit inum to store + * it in the tree. If more than one inum would fall into the same entry, + * we use inum_aliases to store the additional entries. inum_aliases is + * also used to store entries with the same inum but different generations. */ - struct list_head radix_list; + struct list_head inum_aliases; u64 ino; u64 gen; u64 parent_ino; @@ -528,17 +533,12 @@ out: static int fs_path_copy(struct fs_path *p, struct fs_path *from) { - int ret; - p->reversed = from->reversed; fs_path_reset(p); - ret = fs_path_add_path(p, from); - - return ret; + return fs_path_add_path(p, from); } - static void fs_path_unreverse(struct fs_path *p) { char *tmp; @@ -2024,9 +2024,9 @@ out: } /* - * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, + * Insert a name cache entry. On 32bit kernels the xarray index is 32bit, * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::radix_list. + * takes care of this with the help of name_cache_entry::inum_aliases. * In case of error, nce is kfreed. 
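 *
 * A minimal sketch of the clash handling described above, with stand-in
 * types (demo_entry and demo_insert() are illustrative; the real entry
 * is struct name_cache_entry):
 */
struct demo_entry {
	struct list_head aliases;	/* plays the role of inum_aliases */
	u64 ino;
};

static int demo_insert(struct xarray *xa, struct demo_entry *e)
{
	/* The xarray slot keyed by the low 32 bits holds a list head. */
	struct list_head *head = xa_load(xa, (unsigned long)e->ino);

	if (!head) {
		head = kmalloc(sizeof(*head), GFP_KERNEL);
		if (!head)
			return -ENOMEM;
		INIT_LIST_HEAD(head);
		/* xa_insert() fails with -EBUSY if the slot got taken. */
		if (xa_insert(xa, (unsigned long)e->ino, head, GFP_KERNEL)) {
			kfree(head);
			return -EBUSY;
		}
	}
	/* Clashing inums (or same inum, different generation) chain here. */
	list_add_tail(&e->aliases, head);
	return 0;
}
/*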
*/ static int name_cache_insert(struct send_ctx *sctx, @@ -2035,8 +2035,7 @@ static int name_cache_insert(struct send_ctx *sctx, int ret = 0; struct list_head *nce_head; - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); + nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); if (!nce_head) { @@ -2045,14 +2044,14 @@ static int name_cache_insert(struct send_ctx *sctx, } INIT_LIST_HEAD(nce_head); - ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); + ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL); if (ret < 0) { kfree(nce_head); kfree(nce); return ret; } } - list_add_tail(&nce->radix_list, nce_head); + list_add_tail(&nce->inum_aliases, nce_head); list_add_tail(&nce->list, &sctx->name_cache_list); sctx->name_cache_size++; @@ -2064,15 +2063,14 @@ static void name_cache_delete(struct send_ctx *sctx, { struct list_head *nce_head; - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); + nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); if (!nce_head) { btrfs_err(sctx->send_root->fs_info, "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", nce->ino, sctx->name_cache_size); } - list_del(&nce->radix_list); + list_del(&nce->inum_aliases); list_del(&nce->list); sctx->name_cache_size--; @@ -2080,7 +2078,7 @@ static void name_cache_delete(struct send_ctx *sctx, * We may not get to the final release of nce_head if the lookup fails */ if (nce_head && list_empty(nce_head)) { - radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); + xa_erase(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } } @@ -2091,11 +2089,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, struct list_head *nce_head; struct name_cache_entry *cur; - nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); + nce_head = xa_load(&sctx->name_cache, (unsigned long)ino); if (!nce_head) return NULL; - list_for_each_entry(cur, nce_head, radix_list) { + list_for_each_entry(cur, nce_head, inum_aliases) { if (cur->ino == ino && cur->gen == gen) return cur; } @@ -2680,61 +2678,43 @@ out: static int did_create_dir(struct send_ctx *sctx, u64 dir) { int ret = 0; + int iter_ret = 0; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_key di_key; - struct extent_buffer *eb; struct btrfs_dir_item *di; - int slot; path = alloc_path_for_send(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); - if (ret < 0) - goto out; - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(sctx->send_root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } + btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) { + struct extent_buffer *eb = path->nodes[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } - di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(eb, di, &di_key); if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; - goto out; + 
break; } - - path->slots[0]++; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -2938,6 +2918,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, u64 send_progress) { int ret = 0; + int iter_ret = 0; struct btrfs_root *root = sctx->parent_root; struct btrfs_path *path; struct btrfs_key key; @@ -2964,23 +2945,9 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (odi) key.offset = odi->last_dir_index_offset; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); if (found_key.objectid != key.objectid || found_key.type != key.type) break; @@ -3015,8 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, ret = 0; goto out; } - - path->slots[0]++; + } + if (iter_ret < 0) { + ret = iter_ret; + goto out; } free_orphan_dir_info(sctx, odi); @@ -3584,7 +3553,7 @@ static int check_ino_in_path(struct btrfs_root *root, } /* - * Check if ino ino1 is an ancestor of inode ino2 in the given root for any + * Check if inode ino1 is an ancestor of inode ino2 in the given root for any * possible path (in case ino2 is not a directory and has multiple hard links). * Return 1 if true, 0 if false and < 0 on error. */ @@ -3596,6 +3565,7 @@ static int is_ancestor(struct btrfs_root *root, { bool free_fs_path = false; int ret = 0; + int iter_ret = 0; struct btrfs_path *path = NULL; struct btrfs_key key; @@ -3616,26 +3586,12 @@ static int is_ancestor(struct btrfs_root *root, key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (true) { + btrfs_for_each_slot(root, &key, &key, path, iter_ret) { struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; u32 cur_offset = 0; u32 item_size; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != ino2) break; if (key.type != BTRFS_INODE_REF_KEY && @@ -3673,10 +3629,12 @@ static int is_ancestor(struct btrfs_root *root, if (ret) goto out; } - path->slots[0]++; } ret = 0; - out: + if (iter_ret < 0) + ret = iter_ret; + +out: btrfs_free_path(path); if (free_fs_path) fs_path_free(fs_path); @@ -4556,13 +4514,12 @@ out: static int process_all_refs(struct send_ctx *sctx, enum btrfs_compare_tree_result cmd) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; iterate_inode_ref_t cb; int pending_move = 0; @@ -4586,24 +4543,7 @@ static int process_all_refs(struct send_ctx *sctx, key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - - 
btrfs_item_key_to_cpu(eb, &found_key, slot); - + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || (found_key.type != BTRFS_INODE_REF_KEY && found_key.type != BTRFS_INODE_EXTREF_KEY)) @@ -4612,8 +4552,11 @@ static int process_all_refs(struct send_ctx *sctx, ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); if (ret < 0) goto out; - - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto out; } btrfs_release_path(path); @@ -4875,13 +4818,12 @@ out: static int process_all_new_xattrs(struct send_ctx *sctx) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; path = alloc_path_for_send(); if (!path) @@ -4892,39 +4834,21 @@ static int process_all_new_xattrs(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(eb, &found_key, slot); + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } ret = iterate_dir_item(root, path, __process_new_xattr, sctx); if (ret < 0) - goto out; - - path->slots[0]++; + break; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -4951,7 +4875,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; struct page *page; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; @@ -4962,40 +4885,33 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (ret) return ret; - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); - last_index = (offset + len - 1) >> PAGE_SHIFT; - /* initial readahead */ - memset(&sctx->ra, 0, sizeof(struct file_ra_state)); - file_ra_state_init(&sctx->ra, inode->i_mapping); - while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_lock_page(inode->i_mapping, index); + page = find_lock_page(sctx->cur_inode->i_mapping, index); if (!page) { - page_cache_sync_readahead(inode->i_mapping, &sctx->ra, - NULL, index, last_index + 1 - index); + page_cache_sync_readahead(sctx->cur_inode->i_mapping, + &sctx->ra, NULL, index, + last_index + 1 - index); - page = find_or_create_page(inode->i_mapping, index, - GFP_KERNEL); + page = find_or_create_page(sctx->cur_inode->i_mapping, + index, GFP_KERNEL); if (!page) { ret = -ENOMEM; break; } } - if (PageReadahead(page)) { - page_cache_async_readahead(inode->i_mapping, &sctx->ra, - NULL, page, index, last_index + 1 - index); - } + if (PageReadahead(page)) + page_cache_async_readahead(sctx->cur_inode->i_mapping, + &sctx->ra, NULL, page_folio(page), + index, last_index + 1 - index); if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (!PageUptodate(page)) { 
unlock_page(page); @@ -5018,7 +4934,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) len -= cur_len; sctx->send_size += cur_len; } - iput(inode); + return ret; } @@ -5225,12 +5141,49 @@ static int send_extent_data(struct send_ctx *sctx, const u64 offset, const u64 len) { + const u64 end = offset + len; u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); + if (sctx->cur_inode == NULL) { + struct btrfs_root *root = sctx->send_root; + + sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(sctx->cur_inode)) { + int err = PTR_ERR(sctx->cur_inode); + + sctx->cur_inode = NULL; + return err; + } + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); + + /* + * It's very likely there are no pages from this inode in the page + * cache, so after reading extents and sending their data, we clean + * the page cache to avoid trashing the page cache (adding pressure + * to the page cache and forcing eviction of other data more useful + * for applications). + * + * We decide if we should clean the page cache simply by checking + * if the inode's mapping nrpages is 0 when we first open it, and + * not by using something like filemap_range_has_page() before + * reading an extent because when we ask the readahead code to + * read a given file range, it may (and almost always does) read + * pages from beyond that range (see the documentation for + * page_cache_sync_readahead()), so it would not be reliable, + * because after reading the first extent future calls to + * filemap_range_has_page() would return true because the readahead + * on the previous extent resulted in reading pages of the current + * extent as well. + */ + sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0); + sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE); + } + while (sent < len) { u64 size = min(len - sent, read_size); int ret; @@ -5240,6 +5193,37 @@ static int send_extent_data(struct send_ctx *sctx, return ret; sent += size; } + + if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + /* + * Always operate only on ranges that are a multiple of the page + * size. This is not only to prevent zeroing parts of a page in + * the case of subpage sector size, but also to guarantee we evict + * pages, as passing a range that is smaller than page size does + * not evict the respective page (only zeroes part of its content). + * + * Always start from the end offset of the last range cleared. + * This is because the readahead code may (and very often does) + * read pages beyond the range we request for readahead. So if + * we have an extent layout like this: + * + * [ extent A ] [ extent B ] [ extent C ] + * + * When we ask page_cache_sync_readahead() to read extent A, it + * may also trigger reads for pages of extent B. If we are doing + * an incremental send and extent B has not changed between the + * parent and send snapshots, some or all of its pages may end + * up being read and placed in the page cache. So when truncating + * the page cache we always start from the end offset of the + * previously processed extent up to the end of the current
+ */ + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + end - 1); + sctx->page_cache_clear_start = end; + } + return 0; } @@ -5970,13 +5954,12 @@ out: static int process_all_extents(struct send_ctx *sctx) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; root = sctx->send_root; path = alloc_path_for_send(); @@ -5986,41 +5969,21 @@ static int process_all_extents(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(eb, &found_key, slot); - + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } ret = process_extent(sctx, path, &found_key); if (ret < 0) - goto out; - - path->slots[0]++; + break; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -6210,8 +6173,11 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx) { LIST_HEAD(deleted_refs); struct btrfs_path *path; + struct btrfs_root *root = sctx->parent_root; struct btrfs_key key; + struct btrfs_key found_key; struct parent_paths_ctx ctx; + int iter_ret = 0; int ret; path = alloc_path_for_send(); @@ -6221,39 +6187,26 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx) key.objectid = sctx->cur_ino; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); - if (ret < 0) - goto out; ctx.refs = &deleted_refs; ctx.sctx = sctx; - while (true) { - struct extent_buffer *eb = path->nodes[0]; - int slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(sctx->parent_root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.objectid != sctx->cur_ino) + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + if (found_key.objectid != key.objectid) break; - if (key.type != BTRFS_INODE_REF_KEY && - key.type != BTRFS_INODE_EXTREF_KEY) + if (found_key.type != key.type && + found_key.type != BTRFS_INODE_EXTREF_KEY) break; - ret = iterate_inode_ref(sctx->parent_root, path, &key, 1, + ret = iterate_inode_ref(root, path, &found_key, 1, record_parent_ref, &ctx); if (ret < 0) goto out; - - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto out; } while (!list_empty(&deleted_refs)) { @@ -6275,6 +6228,30 @@ out: return ret; } +static void close_current_inode(struct send_ctx *sctx) +{ + u64 i_size; + + if (sctx->cur_inode == NULL) + return; + + i_size = i_size_read(sctx->cur_inode); + + /* + * If we are doing an incremental send, we may have extents between the + * last processed extent and the i_size that have not been processed + * because they haven't changed but we may have read some of their pages + * through readahead, see the comments at send_extent_data(). 
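 *
 * A minimal sketch of that final cleanup with concrete numbers, assuming
 * 4KiB pages (demo_final_clear() is illustrative only): for i_size =
 * 10000, round_up(10000, 4096) - 1 = 12287, so the whole last page is
 * evicted rather than partially zeroed:
 */
static void demo_final_clear(struct address_space *mapping,
			     u64 clear_start, u64 i_size)
{
	/* Expand to a whole-page range; sub-page ranges only zero data. */
	u64 end = round_up(i_size, PAGE_SIZE) - 1;

	if (clear_start < i_size)
		truncate_inode_pages_range(mapping, clear_start, end);
}
/*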
+ */ + if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size) + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + round_up(i_size, PAGE_SIZE) - 1); + + iput(sctx->cur_inode); + sctx->cur_inode = NULL; +} + static int changed_inode(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -6285,6 +6262,8 @@ static int changed_inode(struct send_ctx *sctx, u64 left_gen = 0; u64 right_gen = 0; + close_current_inode(sctx); + sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; sctx->cur_inode_last_extent = (u64)-1; @@ -7477,10 +7456,10 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) root->root_key.objectid, root->dedupe_in_progress); } -long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; + struct btrfs_root *send_root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; @@ -7539,7 +7518,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); + xa_init_flags(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); sctx->flags = arg->flags; @@ -7771,6 +7750,8 @@ out: name_cache_free(sctx); + close_current_inode(sctx); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 23bcefc84e49..08602fdd600a 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -126,7 +126,7 @@ enum { #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) #ifdef __KERNEL__ -long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); #endif #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 294242c194d8..2dd8754cb990 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -181,6 +181,12 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) found->full = 0; } +/* + * Block groups with more than this value (percent) of unusable space will be + * scheduled for background reclaim. + */ +#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) + static int create_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -203,6 +209,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; + if (btrfs_is_zoned(info)) + space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; + ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) return ret; @@ -519,7 +528,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; } - trans = (struct btrfs_trans_handle *)current->journal_info; + trans = current->journal_info; /* * If we are doing more ordered than delalloc we need to just wait on @@ -737,6 +746,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, u64 thresh = div_factor_fine(space_info->total_bytes, 90); u64 used; + lockdep_assert_held(&space_info->lock); + /* If we're just plain full then async reclaim just slows us down.
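 *
 * A minimal sketch of that cutoff (demo_thresh() is illustrative only):
 * div_factor_fine(x, 90) computes x * 90 / 100, so with 10GiB of total
 * metadata space the preemptive flusher stands down once roughly 9GiB
 * is consumed by used + reserved + the global reserve:
 */
static u64 demo_thresh(u64 total_bytes)
{
	/* Same result as div_factor_fine(total_bytes, 90). */
	return total_bytes * 90 / 100;
}
/*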
*/ if ((space_info->bytes_used + space_info->bytes_reserved + global_rsv_size) >= thresh) return false; @@ -1061,7 +1072,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv->reserved; if (block_rsv_size < space_info->bytes_may_use) delalloc_size = space_info->bytes_may_use - block_rsv_size; - spin_unlock(&space_info->lock); /* * We don't want to include the global_rsv in our calculation, @@ -1092,6 +1102,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush = FLUSH_DELAYED_REFS_NR; } + spin_unlock(&space_info->lock); + /* * We don't want to reclaim everything, just a portion, so scale * down the to_reclaim by 1/4. If it takes us down to 0, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index d841fed73492..c096695598c1 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -3,6 +3,8 @@ #ifndef BTRFS_SPACE_INFO_H #define BTRFS_SPACE_INFO_H +#include "volumes.h" + struct btrfs_space_info { spinlock_t lock; @@ -24,6 +26,12 @@ the space info if we had an ENOSPC in the allocator. */ + /* + * Once a block group drops below this threshold (percent) we'll + * schedule it for reclaim. + */ + int bg_reclaim_threshold; + int clamp; /* Used to scale our threshold for preemptive flushing. The value is >> clamp, so turns out to be a 2^clamp divisor. */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index ef7ae20d2b77..a105b291444f 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -63,6 +63,29 @@ * This means a slightly higher tree locking latency. */ +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) +{ + if (fs_info->sectorsize >= PAGE_SIZE) + return false; + + /* + * Only data pages (either through DIO or compression) can have no + * mapping. And if page->mapping->host is a data inode, it's subpage. + * As we have ruled out the sectorsize >= PAGE_SIZE case already. + */ + if (!page->mapping || !page->mapping->host || + is_data_inode(page->mapping->host)) + return true; + + /* + * Now the only remaining case is metadata, for which we only take the subpage
+ */ + if (fs_info->nodesize < PAGE_SIZE) + return true; + return false; +} + void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) { unsigned int cur = 0; @@ -107,7 +130,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, ASSERT(PageLocked(page)); /* Either not subpage, or the page already has private attached */ - if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) + if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page)) return 0; subpage = btrfs_alloc_subpage(fs_info, type); @@ -124,10 +147,10 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* Either not subpage, or already detached */ - if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) + if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page)) return; - subpage = (struct btrfs_subpage *)detach_page_private(page); + subpage = detach_page_private(page); ASSERT(subpage); btrfs_free_subpage(subpage); } @@ -175,7 +198,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->mapping); @@ -190,7 +213,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->mapping); @@ -319,7 +342,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { lock_page(page); return 0; } @@ -336,7 +359,7 @@ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) return unlock_page(page); btrfs_subpage_clamp_range(page, &start, &len); if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) @@ -620,7 +643,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ set_page_func(page); \ return; \ } \ @@ -629,7 +652,7 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ clear_page_func(page); \ return; \ } \ @@ -638,14 +661,14 @@ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ return test_page_func(page); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ } \ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, 
u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ set_page_func(page); \ return; \ } \ @@ -655,7 +678,7 @@ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ clear_page_func(page); \ return; \ } \ @@ -665,7 +688,7 @@ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ return test_page_func(page); \ btrfs_subpage_clamp_range(page, &start, &len); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ @@ -694,7 +717,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, return; ASSERT(!PageDirty(page)); - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->private); @@ -722,8 +745,8 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, struct btrfs_subpage *subpage; ASSERT(PageLocked(page)); - /* For regular page size case, we just unlock the page */ - if (fs_info->sectorsize == PAGE_SIZE) + /* For non-subpage case, we just unlock the page */ + if (!btrfs_is_subpage(fs_info, page)) return unlock_page(page); ASSERT(PagePrivate(page) && page->private); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 7accb5c40d33..0e80ad336904 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -74,6 +74,8 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page); + void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4d947ba32da9..6627dd7875ee 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,6 +66,52 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); +#ifdef CONFIG_PRINTK + +#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) + +/* + * Characters to print to indicate error conditions or uncommon filesystem state.
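 *
 * As a usage sketch of the mapping defined just below: a filesystem with
 * both BTRFS_FS_STATE_ERROR and BTRFS_FS_STATE_TRANS_ABORTED set would
 * log something like (device name and call site invented for the example):
 *
 *   BTRFS: error (device sda1: state EA) in cleanup_transaction:1958
 *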
+ */ +static const char fs_state_chars[] = { + [BTRFS_FS_STATE_ERROR] = 'E', + [BTRFS_FS_STATE_REMOUNTING] = 'M', + [BTRFS_FS_STATE_RO] = 0, + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, + [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', +}; + +static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) +{ + unsigned int bit; + bool states_printed = false; + unsigned long fs_state = READ_ONCE(info->fs_state); + char *curr = buf; + + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); + curr += sizeof(STATE_STRING_PREFACE) - 1; + + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { + *curr++ = fs_state_chars[bit]; + states_printed = true; + } + } + + /* If no states were printed, reset the buffer */ + if (!states_printed) + curr = buf; + + *curr++ = 0; +} +#endif + /* * Generally the error codes correspond to their respective errors, but there * are a few special cases. @@ -128,6 +174,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function { struct super_block *sb = fs_info->sb; #ifdef CONFIG_PRINTK + char statestr[STATE_STRING_BUF_LEN]; const char *errstr; #endif @@ -140,6 +187,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function #ifdef CONFIG_PRINTK errstr = btrfs_decode_error(errno); + btrfs_state_to_string(fs_info, statestr); if (fmt) { struct va_format vaf; va_list args; @@ -148,12 +196,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function vaf.fmt = fmt; vaf.va = &args; - pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, function, line, errno, errstr, &vaf); + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, statestr, function, line, errno, errstr, &vaf); va_end(args); } else { - pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n", - sb->s_id, function, line, errno, errstr); + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", + sb->s_id, statestr, function, line, errno, errstr); } #endif @@ -213,7 +261,7 @@ static struct ratelimit_state printk_limits[] = { RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), }; -void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; @@ -240,11 +288,15 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . 
vaf.va = &args; if (__ratelimit(ratelimit)) { - if (fs_info) - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, - fs_info->sb->s_id, &vaf); - else - printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + if (fs_info) { + char statestr[STATE_STRING_BUF_LEN]; + + btrfs_state_to_string(fs_info, statestr); + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + fs_info->sb->s_id, statestr, &vaf); + } else { + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } } va_end(args); @@ -711,6 +763,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force = false; no_compress++; } else { + btrfs_err(info, "unrecognized compression value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -769,8 +823,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_thread_pool: ret = match_int(&args[0], &intarg); if (ret) { + btrfs_err(info, "unrecognized thread_pool value %s", + args[0].from); goto out; } else if (intarg == 0) { + btrfs_err(info, "invalid value 0 for thread_pool"); ret = -EINVAL; goto out; } @@ -831,8 +888,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ratio: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized metadata_ratio value %s", + args[0].from); goto out; + } info->metadata_ratio = intarg; btrfs_info(info, "metadata ratio %u", info->metadata_ratio); @@ -849,6 +909,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, DISCARD_ASYNC, "turning on async discard"); } else { + btrfs_err(info, "unrecognized discard mode value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -861,6 +923,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_space_cache: case Opt_space_cache_version: + /* + * We already set FREE_SPACE_TREE above because we have + * compat_ro(FREE_SPACE_TREE) set, and we aren't going + * to allow v1 to be set for extent tree v2, simply + * ignore this setting if we're extent tree v2. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; if (token == Opt_space_cache || strcmp(args[0].from, "v1") == 0) { btrfs_clear_opt(info->mount_opt, @@ -873,6 +943,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, FREE_SPACE_TREE, "enabling free space tree"); } else { + btrfs_err(info, "unrecognized space_cache value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -881,6 +953,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: + /* + * We cannot operate without the free space tree with + * extent tree v2, ignore this option. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; if (btrfs_test_opt(info, SPACE_CACHE)) { btrfs_clear_and_info(info, SPACE_CACHE, "disabling disk space caching"); @@ -896,6 +974,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, "the 'inode_cache' option is deprecated and has no effect since 5.11"); break; case Opt_clear_cache: + /* + * We cannot clear the free space tree with extent tree + * v2, ignore this option. 
+ */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; btrfs_set_and_info(info, CLEAR_CACHE, "force clearing of disk cache"); break; @@ -942,8 +1026,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_check_integrity_print_mask: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, + "unrecognized check_integrity_print_mask value %s", + args[0].from); goto out; + } info->check_integrity_print_mask = intarg; btrfs_info(info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); @@ -958,13 +1046,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) + if (strcmp(args[0].from, "panic") == 0) { btrfs_set_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else if (strcmp(args[0].from, "bug") == 0) + } else if (strcmp(args[0].from, "bug") == 0) { btrfs_clear_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else { + } else { + btrfs_err(info, "unrecognized fatal_errors value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -972,8 +1062,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_commit_interval: intarg = 0; ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized commit_interval value %s", + args[0].from); + ret = -EINVAL; goto out; + } if (intarg == 0) { btrfs_info(info, "using default commit interval %us", @@ -987,8 +1081,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_rescue: ret = parse_rescue_options(info, args[0].from); - if (ret < 0) + if (ret < 0) { + btrfs_err(info, "unrecognized rescue value %s", + args[0].from); goto out; + } break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: @@ -1831,6 +1928,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, old_pool_size, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); @@ -1840,8 +1938,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, - new_pool_size); } static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, @@ -1914,6 +2010,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + /* V1 cache is not supported for subpage mount. 
*/ + if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + btrfs_warn(fs_info, + "v1 space cache is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + ret = -EINVAL; + goto restore; + } btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); @@ -2383,6 +2487,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, { struct btrfs_ioctl_vol_args *vol; struct btrfs_device *device = NULL; + dev_t devt = 0; int ret = -ENOTTY; if (!capable(CAP_SYS_ADMIN)) @@ -2402,7 +2507,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, mutex_unlock(&uuid_mutex); break; case BTRFS_IOC_FORGET_DEV: - ret = btrfs_forget_devices(vol->name); + if (vol->name[0] != 0) { + ret = lookup_bdev(vol->name, &devt); + if (ret) + break; + } + ret = btrfs_forget_devices(devt); break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index beb7f72d50b8..92a1fa8e3da6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -283,9 +283,11 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); -/* Remove once support for zoned allocation is feature complete */ #ifdef CONFIG_BTRFS_DEBUG +/* Remove once support for zoned allocation is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +/* Remove once support for extent tree v2 is feature complete */ +BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -314,6 +316,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid1c34), #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(zoned), + BTRFS_FEAT_ATTR_PTR(extent_tree_v2), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), @@ -391,11 +394,9 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; - /* 4K sector size is also supported with 64K page size */ - if (PAGE_SIZE == SZ_64K) + /* An artificial limit to only support 4K and PAGE_SIZE */ + if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); - - /* Only sectorsize == PAGE_SIZE is now supported */ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); return ret; @@ -719,6 +720,42 @@ SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + ssize_t ret; + + ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); + + return ret; +} + +static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int thresh; + int ret; + + ret = kstrtoint(buf, 10, &thresh); + if (ret) + return ret; + + if (thresh < 0 || thresh > 100) + return -EINVAL; + + WRITE_ONCE(space_info->bg_reclaim_threshold, thresh); + + return len; +} + +BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, + btrfs_sinfo_bg_reclaim_threshold_show, + btrfs_sinfo_bg_reclaim_threshold_store); + /* * Allocation information about block group types. 
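 *
 * A minimal sketch of how the knob above is meant to be consumed,
 * assuming the background reclaim check compares a block group's
 * unusable percentage against it (demo_should_reclaim() is illustrative;
 * the real consumer lives in the reclaim code, not in this hunk):
 */
static bool demo_should_reclaim(u64 unusable, u64 total, int thresh)
{
	/* thresh == 0 disables reclaim; otherwise compare percentages. */
	return thresh > 0 && total > 0 && unusable * 100 >= (u64)thresh * total;
}
/*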
* @@ -735,6 +772,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), + BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), NULL, }; ATTRIBUTE_GROUPS(space_info); @@ -919,6 +957,9 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, case BTRFS_EXCLOP_BALANCE: str = "balance\n"; break; + case BTRFS_EXCLOP_BALANCE_PAUSED: + str = "balance paused\n"; + break; case BTRFS_EXCLOP_DEV_ADD: str = "device add\n"; break; @@ -1104,6 +1145,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; +static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == + ARRAY_SIZE(btrfs_feature_attrs)); +static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == + ARRAY_SIZE(btrfs_feature_attrs[0])); + static const u64 supported_feature_masks[FEAT_MAX] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, @@ -1272,11 +1318,6 @@ static void init_feature_attrs(void) struct btrfs_feature_attr *fa; int set, i; - BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != - ARRAY_SIZE(btrfs_feature_attrs)); - BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != - ARRAY_SIZE(btrfs_feature_attrs[0])); - memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); memset(btrfs_unknown_feature_names, 0, sizeof(btrfs_unknown_feature_names)); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d8e56edd6991..1591bfa55bcc 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - struct radix_tree_iter iter; - void **slot; + unsigned long index; + struct extent_buffer *eb; struct btrfs_device *dev, *tmp; if (!fs_info) @@ -163,25 +163,9 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - spin_lock(&fs_info->buffer_lock); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { - struct extent_buffer *eb; - - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); - if (!eb) - continue; - /* Shouldn't happen but that kind of thinking creates CVE's */ - if (radix_tree_exception(eb)) { - if (radix_tree_deref_retry(eb)) - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - spin_unlock(&fs_info->buffer_lock); + xa_for_each(&fs_info->extent_buffers, index, eb) { free_extent_buffer_stale(eb); - spin_lock(&fs_info->buffer_lock); } - spin_unlock(&fs_info->buffer_lock); btrfs_mapping_tree_free(&fs_info->mapping_tree); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, @@ -202,7 +186,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) if (!root) return; /* Will be freed by btrfs_free_fs_roots */ - if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) + if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state))) return; btrfs_global_root_delete(root); btrfs_put_root(root); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 319fed82d741..c5b3a631bf4f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -15,6 +15,7 @@ static void 
free_extent_map_tree(struct extent_map_tree *em_tree) struct extent_map *em; struct rb_node *node; + write_lock(&em_tree->lock); while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { node = rb_first_cached(&em_tree->map); em = rb_entry(node, struct extent_map, rb_node); @@ -32,6 +33,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) #endif free_extent_map(em); } + write_unlock(&em_tree->lock); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1f1c25db6f6b..06c0a958d114 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,7 +23,7 @@ #include "space-info.h" #include "zoned.h" -#define BTRFS_ROOT_TRANS_TAG 0 +#define BTRFS_ROOT_TRANS_TAG XA_MARK_0 /* * Transaction states and transitions @@ -221,7 +221,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) * the caching thread will re-start it's search from 3, and thus find * the hole from [4,6) to add to the free space cache. */ - spin_lock(&fs_info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { struct btrfs_block_group *cache = caching_ctl->block_group; @@ -234,7 +234,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) cache->last_byte_to_unpin = caching_ctl->progress; } } - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); up_write(&fs_info->commit_root_sem); } @@ -437,15 +437,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, */ smp_wmb(); - spin_lock(&fs_info->fs_roots_radix_lock); + spin_lock(&fs_info->fs_roots_lock); if (root->last_trans == trans->transid && !force) { - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); return 0; } - radix_tree_tag_set(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_radix_lock); + xa_set_mark(&fs_info->fs_roots, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_lock); root->last_trans = trans->transid; /* this is pretty tricky. 
We don't want to @@ -487,11 +487,9 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, spin_unlock(&cur_trans->dropped_roots_lock); /* Make sure we don't try to update the root at commit time */ - spin_lock(&fs_info->fs_roots_radix_lock); - radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_radix_lock); + xa_clear_mark(&fs_info->fs_roots, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); } int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, @@ -1404,9 +1402,8 @@ void btrfs_add_dead_root(struct btrfs_root *root) static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *gang[8]; - int i; - int ret; + struct btrfs_root *root; + unsigned long index; /* * At this point no one can be using this transaction to modify any tree @@ -1414,57 +1411,46 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); - spin_lock(&fs_info->fs_roots_radix_lock); - while (1) { - ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, - (void **)gang, 0, - ARRAY_SIZE(gang), - BTRFS_ROOT_TRANS_TAG); - if (ret == 0) - break; - for (i = 0; i < ret; i++) { - struct btrfs_root *root = gang[i]; - int ret2; - - /* - * At this point we can neither have tasks logging inodes - * from a root nor trying to commit a log tree. - */ - ASSERT(atomic_read(&root->log_writers) == 0); - ASSERT(atomic_read(&root->log_commit[0]) == 0); - ASSERT(atomic_read(&root->log_commit[1]) == 0); - - radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_radix_lock); - - btrfs_free_log(trans, root); - ret2 = btrfs_update_reloc_root(trans, root); - if (ret2) - return ret2; - - /* see comments in should_cow_block() */ - clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_mb__after_atomic(); - - if (root->commit_root != root->node) { - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); - btrfs_set_root_node(&root->root_item, - root->node); - } + spin_lock(&fs_info->fs_roots_lock); + xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) { + int ret; + + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. 
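The hunks above replace radix tree tags (radix_tree_tag_set() plus repeated gang lookups into a fixed-size array) with xarray marks. A kernel-context sketch of the same pattern, using a hypothetical demo_roots xarray rather than fs_info->fs_roots; note that xa_for_each_marked() walks the marked entries directly, so no batch array needs refilling:

/*
 * Illustrative sketch (hypothetical demo_ names) of tagging entries
 * and walking only the tagged ones with an xarray.
 */
#include <linux/xarray.h>

static DEFINE_XARRAY(demo_roots);

/* Store an entry and tag it as modified in the current transaction. */
static int demo_record_root(unsigned long id, void *root)
{
	void *old = xa_store(&demo_roots, id, root, GFP_KERNEL);

	if (xa_is_err(old))
		return xa_err(old);
	xa_set_mark(&demo_roots, id, XA_MARK_0);
	return 0;
}

/* Visit only the tagged entries, clearing the tag as they are handled. */
static void demo_commit_roots(void)
{
	unsigned long id;
	void *entry;

	xa_for_each_marked(&demo_roots, id, entry, XA_MARK_0)
		xa_clear_mark(&demo_roots, id, XA_MARK_0);
}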
+ */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); + + xa_clear_mark(&fs_info->fs_roots, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_lock); + + btrfs_free_log(trans, root); + ret = btrfs_update_reloc_root(trans, root); + if (ret) + return ret; + + /* See comments in should_cow_block() */ + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); - ret2 = btrfs_update_root(trans, fs_info->tree_root, - &root->root_key, - &root->root_item); - if (ret2) - return ret2; - spin_lock(&fs_info->fs_roots_radix_lock); - btrfs_qgroup_free_meta_all_pertrans(root); + if (root->commit_root != root->node) { + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + btrfs_set_root_node(&root->root_item, root->node); } + + ret = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, &root->root_item); + if (ret) + return ret; + spin_lock(&fs_info->fs_roots_lock); + btrfs_qgroup_free_meta_all_pertrans(root); } - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); return 0; } @@ -1911,6 +1897,14 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; + + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + root_item = &fs_info->block_group_root->root_item; + + super->block_group_root = root_item->bytenr; + super->block_group_root_generation = root_item->generation; + super->block_group_root_level = root_item->level; + } } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) @@ -2362,6 +2356,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_set_root_node(&fs_info->block_group_root->root_item, + fs_info->block_group_root->node); + list_add_tail(&fs_info->block_group_root->dirty_list, + &cur_trans->switch_commits); + } + switch_commit_roots(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); @@ -2490,10 +2491,10 @@ cleanup_transaction: * because btrfs_commit_super will poke cleaner thread and it will process it a * few seconds later. 
*/ -int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) +int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) { + struct btrfs_root *root; int ret; - struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); if (list_empty(&fs_info->dead_roots)) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index ba8a9826eb37..970ff316069d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -217,7 +217,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); -int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); +int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index aae5697dde32..9e0e0ae2288c 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -639,8 +639,10 @@ static void block_group_err(const struct extent_buffer *eb, int slot, static int check_block_group_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_block_group_item bgi; u32 item_size = btrfs_item_size(leaf, slot); + u64 chunk_objectid; u64 flags; u64 type; @@ -663,8 +665,23 @@ static int check_block_group_item(struct extent_buffer *leaf, read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); - if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) != - BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { + chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + /* + * We don't init the nr_global_roots until we load the global + * roots, so this could be 0 at mount time. If it's 0 we'll + * just assume we're fine, and later we'll check against our + * actual value. + */ + if (unlikely(fs_info->nr_global_roots && + chunk_objectid >= fs_info->nr_global_roots)) { + block_group_err(leaf, slot, + "invalid block group global root id, have %llu, needs to be < %llu", + chunk_objectid, + fs_info->nr_global_roots); + return -EUCLEAN; + } + } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { block_group_err(leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", btrfs_stack_block_group_chunk_objectid(&bgi), @@ -1648,7 +1665,6 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) /* These trees must never be empty */ if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID || owner == BTRFS_CHUNK_TREE_OBJECTID || - owner == BTRFS_EXTENT_TREE_OBJECTID || owner == BTRFS_DEV_TREE_OBJECTID || owner == BTRFS_FS_TREE_OBJECTID || owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { @@ -1657,12 +1673,25 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) owner); return -EUCLEAN; } + /* Unknown tree */ if (unlikely(owner == 0)) { generic_err(leaf, 0, "invalid owner, root 0 is not defined"); return -EUCLEAN; } + + /* EXTENT_TREE_V2 can have empty extent trees.
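As a concrete (hypothetical) example of the global root id bound introduced in check_block_group_item() above: with fs_info->nr_global_roots == 4, the valid global root ids a block group item may carry in its chunk_objectid field are 0 through 3, so a value of 4 or higher fails the check with -EUCLEAN once the global roots have been loaded.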
*/ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) { + generic_err(leaf, 0, + "invalid root, root %llu must never be empty", + owner); + return -EUCLEAN; + } + return 0; } @@ -1826,3 +1855,58 @@ out: return ret; } ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); + +int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) +{ + const bool is_subvol = is_fstree(root_owner); + const u64 eb_owner = btrfs_header_owner(eb); + + /* + * Skip dummy fs, as selftests don't create unique ebs for each dummy + * root. + */ + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state)) + return 0; + /* + * There are several call sites (backref walking, qgroup, and data + * reloc) passing 0 as @root_owner, as they are not holding the + * tree root. In that case, we can not do a reliable ownership check, + * so just exit. + */ + if (root_owner == 0) + return 0; + /* + * These trees use key.offset as their owner, our callers don't have + * the extra capacity to pass key.offset here. So we just skip them. + */ + if (root_owner == BTRFS_TREE_LOG_OBJECTID || + root_owner == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!is_subvol) { + /* For non-subvolume trees, the eb owner should match root owner */ + if (unlikely(root_owner != eb_owner)) { + btrfs_crit(eb->fs_info, +"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root_owner, btrfs_header_bytenr(eb), eb_owner, + root_owner); + return -EUCLEAN; + } + return 0; + } + + /* + * For subvolume trees, owners can mismatch, but they should all belong + * to subvolume trees. + */ + if (unlikely(is_subvol != is_fstree(eb_owner))) { + btrfs_crit(eb->fs_info, +"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root_owner, btrfs_header_bytenr(eb), eb_owner, + BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID); + return -EUCLEAN; + } + return 0; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 32fecc9dc1dd..ece497e26558 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -25,5 +25,6 @@ int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6bc8834ac8f7..370388fadf96 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -270,12 +270,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -static int btrfs_write_tree_block(struct extent_buffer *buf) -{ - return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, - buf->start + buf->len - 1); -} - static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { filemap_fdatawait_range(buf->pages[0]->mapping, @@ -294,16 +288,6 @@ struct walk_control { */ int free; - /* should we write out the extent buffer? This is used - * while flushing the log tree to disk during a sync - */ - int write; - - /* should we wait for the extent buffer io to finish? Also used - * while flushing the log tree to disk for a sync - */ - int wait; - /* pin only walk, we record which extents on disk belong to the * log trees */ @@ -349,22 +333,20 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. 
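btrfs_check_eb_owner(), added above, is intended to run right after a tree block has been read, once the expected owner is known. A hedged, kernel-context call-site sketch (demo_ naming is hypothetical, not a call site from this patch):

/*
 * Illustrative sketch: reject a freshly read extent buffer whose
 * header owner does not match the tree it was read for.
 */
static int demo_validate_eb(struct extent_buffer *eb,
			    const struct btrfs_root *root)
{
	/* Returns -EUCLEAN (and logs the mismatch) on corruption. */
	return btrfs_check_eb_owner(eb, root->root_key.objectid);
}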
*/ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, gen, level, NULL); if (ret) return ret; } - if (wc->pin) + if (wc->pin) { ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, eb->len); + if (ret) + return ret; - if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { - if (wc->pin && btrfs_header_level(eb) == 0) + if (btrfs_buffer_uptodate(eb, gen, 0) && + btrfs_header_level(eb) == 0) ret = btrfs_exclude_logged_extents(eb); - if (wc->write) - btrfs_write_tree_block(eb); - if (wc->wait) - btrfs_wait_tree_block_writeback(eb); } return ret; } @@ -912,11 +894,30 @@ update_inode: btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: - if (inode) - iput(inode); + iput(inode); return ret; } +static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_inode *inode, + const char *name, + int name_len) +{ + int ret; + + ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); + if (ret) + return ret; + /* + * Whenever we need to check if a name exists or not, we check the + * fs/subvolume tree. So after an unlink we must run delayed items, so + * that future checks for a name during log replay see that the name + * does not exist anymore. + */ + return btrfs_run_delayed_items(trans); +} + /* * when cleaning up conflicts between the directory names in the * subvolume, directory names in the log and directory names in the @@ -959,12 +960,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name, + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, name_len); - if (ret) - goto out; - else - ret = btrfs_run_delayed_items(trans); out: kfree(name); iput(inode); @@ -1124,14 +1121,11 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, dir, inode, + ret = unlink_inode_for_log_replay(trans, dir, inode, victim_name, victim_name_len); kfree(victim_name); if (ret) return ret; - ret = btrfs_run_delayed_items(trans); - if (ret) - return ret; *search_done = 1; goto again; } @@ -1196,14 +1190,11 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, + ret = unlink_inode_for_log_replay(trans, BTRFS_I(victim_parent), inode, victim_name, victim_name_len); - if (!ret) - ret = btrfs_run_delayed_items( - trans); } iput(victim_parent); kfree(victim_name); @@ -1358,19 +1349,10 @@ again: kfree(name); goto out; } - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), inode, name, namelen); kfree(name); iput(dir); - /* - * Whenever we need to check if a name exists or not, we - * check the subvolume tree. So after an unlink we must - * run delayed items, so that future checks for a name - * during log replay see that the name does not exists - * anymore.
- */ - if (!ret) - ret = btrfs_run_delayed_items(trans); if (ret) goto out; goto again; @@ -1466,8 +1448,8 @@ static int add_link(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode), - name, namelen); + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode), + name, namelen); if (ret) goto out; /* @@ -1476,10 +1458,6 @@ static int add_link(struct btrfs_trans_handle *trans, */ if (other_inode->i_nlink == 0) inc_nlink(other_inode); - - ret = btrfs_run_delayed_items(trans); - if (ret) - goto out; add_link: ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, namelen, 0, ref_index); @@ -1612,7 +1590,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_inode_ref_exists(inode, dir, key->type, name, namelen); if (ret > 0) { - ret = btrfs_unlink_inode(trans, + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), name, namelen); @@ -1623,15 +1601,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ if (!ret && inode->i_nlink == 0) inc_nlink(inode); - /* - * Whenever we need to check if a name exists or - * not, we check the subvolume tree. So after an - * unlink we must run delayed items, so that future - * checks for a name during log replay see that the - * name does not exists anymore. - */ - if (!ret) - ret = btrfs_run_delayed_items(trans); } if (ret < 0) goto out; @@ -2368,15 +2337,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, goto out; inc_nlink(inode); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len); - if (ret) - goto out; - - ret = btrfs_run_delayed_items(trans); - if (ret) - goto out; - + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), + name, name_len); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2612,7 +2574,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, int i; int ret; - ret = btrfs_read_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, gen, level, NULL); if (ret) return ret; @@ -2823,7 +2785,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_buffer(next, ptr_gen, + ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); @@ -2852,7 +2814,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); return ret; @@ -3225,6 +3187,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_alloc_log_tree_node(trans, log_root_tree); if (ret) { mutex_unlock(&fs_info->tree_root->log_mutex); + blk_finish_plug(&plug); goto out; } } @@ -3495,35 +3458,156 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* - * Check if an inode was logged in the current transaction. This may often - * return some false positives, because logged_trans is an in memory only field, - * not persisted anywhere. This is meant to be used in contexts where a false - * positive has no functional consequences. + * Check if an inode was logged in the current transaction. 
This correctly deals + * with the case where the inode was logged but has a logged_trans of 0, which + * happens if the inode is evicted and loaded again, as logged_trans is an in + * memory only field (not persisted). + * + * Returns 1 if the inode was logged before in the transaction, 0 if it was not, + * and < 0 on error. */ -static bool inode_logged(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) +static int inode_logged(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path_in) { + struct btrfs_path *path = path_in; + struct btrfs_key key; + int ret; + if (inode->logged_trans == trans->transid) - return true; + return 1; - if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) - return false; + /* + * If logged_trans is not 0, then we know the inode was not logged + * in this transaction, so we can return 0 right away. + */ + if (inode->logged_trans > 0) + return 0; + + /* + * If no log tree was created for this root in this transaction, then + * the inode can not have been logged in this transaction. In that case + * set logged_trans to anything greater than 0 and less than the current + * transaction's ID, to avoid the search below in a future call in case + * a log tree gets created after this. + */ + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { + inode->logged_trans = trans->transid - 1; + return 0; + } + + /* + * We have a log tree and the inode's logged_trans is 0. We can't tell + * for sure if the inode was logged before in this transaction by looking + * only at logged_trans. We could be pessimistic and assume it was, but + * that can lead to unnecessarily logging an inode during rename and link + * operations, and then further updating the log in followup rename and + * link operations, especially if it's a directory, which adds latency + * visible to applications doing a series of rename or link operations. + * + * A logged_trans of 0 here can mean several things: + * + * 1) The inode was never logged since the filesystem was mounted, and may + * or may not have been evicted and loaded again; + * + * 2) The inode was logged in a previous transaction, then evicted and + * then loaded again; + * + * 3) The inode was logged in the current transaction, then evicted and + * then loaded again. + * + * For cases 1) and 2) we don't want to return 1, but we need to detect + * case 3) and return 1. So we do a search in the log root for the inode + * item. + */ + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + + if (path_in) + btrfs_release_path(path); + else + btrfs_free_path(path); + + /* + * Logging an inode always results in logging its inode item. So if we + * did not find the item we know the inode was not logged for sure. + */ + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* + * Set logged_trans to a value greater than 0 and less than the + * current transaction to avoid doing the search in future calls. + */ + inode->logged_trans = trans->transid - 1; + return 0; + } + + /* + * The inode was previously logged and then evicted, set logged_trans to + * the current transaction's ID, to avoid future tree searches as long as + * the inode is not evicted again.
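Because inode_logged() now returns a tri-state int rather than a bool, callers have to handle the error case explicitly. A sketch of the pattern the later hunks in this patch settle on (it mirrors the updated btrfs_del_dir_entries_in_log() below):

	ret = inode_logged(trans, dir, NULL);
	if (ret == 0)
		return;		/* Not logged: nothing to update. */
	else if (ret < 0) {
		/* Unknown: force a full commit so the log stays consistent. */
		btrfs_set_log_full_commit(trans);
		return;
	}
	/* ret == 1: the inode is in the log and must be updated. */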
+ */ + inode->logged_trans = trans->transid; + + /* + * If it's a directory, then we must set last_dir_index_offset to the + * maximum possible value, so that the next attempt to log the inode does + * not skip checking if dir index keys found in modified subvolume tree + * leaves have been logged before, otherwise it would result in attempts + * to insert duplicate dir index keys in the log tree. This must be done + * because last_dir_index_offset is an in-memory only field, not persisted + * in the inode item or any other on-disk structure, so its value is lost + * once the inode is evicted. + */ + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->last_dir_index_offset = (u64)-1; + + return 1; +} + +/* + * Delete a directory entry from the log if it exists. + * + * Returns < 0 on error + * 1 if the entry does not exist + * 0 if the entry existed and was successfully deleted + */ +static int del_logged_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dir_ino, + const char *name, int name_len, + u64 index) +{ + struct btrfs_dir_item *di; /* - * The inode's logged_trans is always 0 when we load it (because it is - * not persisted in the inode item or elsewhere). So if it is 0, the - * inode was last modified in the current transaction then the inode may - * have been logged before in the current transaction, then evicted and - * loaded again in the current transaction - or may have never been logged - * in the current transaction, but since we can not be sure, we have to - * assume it was, otherwise our callers can leave an inconsistent log. + * We only log dir index items of a directory, so we don't need to look + * for dir item keys.
- */ - di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, - index, name, name_len, -1); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto fail; - } - if (di) { - ret = btrfs_delete_one_dir_name(trans, log, path, di); - if (ret) { - err = ret; - goto fail; - } - } - - /* - * We do not need to update the size field of the directory's inode item - * because on log replay we update the field to reflect all existing - * entries in the directory (see overwrite_item()). - */ -fail: + ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), + name, name_len, index); btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (err < 0) + if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); } @@ -3617,8 +3679,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, u64 index; int ret; - if (!inode_logged(trans, inode)) + ret = inode_logged(trans, inode, NULL); + if (ret == 0) + return; + else if (ret < 0) { + btrfs_set_log_full_commit(trans); return; + } ret = join_running_log_trans(root); if (ret) @@ -3653,11 +3720,29 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - if (ret) + /* + * -EEXIST is fine and can happen sporadically when we are logging a + * directory and have concurrent insertions in the subvolume's tree for + * items from other inodes and that result in pushing off some dir items + * from one leaf to another in order to accommodate for the new items. + * This results in logging the same dir index range key. + */ + if (ret && ret != -EEXIST) return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); + if (ret == -EEXIST) { + const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); + + /* + * btrfs_del_dir_entries_in_log() might have been called during + * an unlink between the initial insertion of this key and the + * current update, or we might be logging a single entry deletion + * during a rename, so set the new last_offset to the max value. + */ + last_offset = max(last_offset, curr_end); + } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(path); @@ -3743,19 +3828,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, - struct btrfs_log_ctx *ctx) + struct btrfs_log_ctx *ctx, + u64 *last_old_dentry_offset) { struct btrfs_root *log = inode->root->log_root; struct extent_buffer *src = path->nodes[0]; const int nritems = btrfs_header_nritems(src); const u64 ino = btrfs_ino(inode); - const bool inode_logged_before = inode_logged(trans, inode); bool last_found = false; int batch_start = 0; int batch_size = 0; int i; for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -3766,7 +3852,27 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, break; } + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); ctx->last_dir_item_offset = key.offset; + + /* + * Skip ranges of items that consist only of dir item keys created + * in past transactions. However if we find a gap, we must log a + * dir index range item for that gap, so that index keys in that + * gap are deleted during log replay. 
+ */ + if (btrfs_dir_transid(src, di) < trans->transid) { + if (key.offset > *last_old_dentry_offset + 1) { + ret = insert_dir_log_key(trans, log, dst_path, + ino, *last_old_dentry_offset + 1, + key.offset - 1); + if (ret < 0) + return ret; + } + + *last_old_dentry_offset = key.offset; + continue; + } /* * We must make sure that when we log a directory entry, the * corresponding inode, after log replay, has a matching link @@ -3790,25 +3896,23 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, * resulting in -ENOTEMPTY errors. */ if (!ctx->log_new_dentries) { - struct btrfs_dir_item *di; struct btrfs_key di_key; - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(src, di, &di_key); - if ((btrfs_dir_transid(src, di) == trans->transid || - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && - di_key.type != BTRFS_ROOT_ITEM_KEY) + if (di_key.type != BTRFS_ROOT_ITEM_KEY) ctx->log_new_dentries = true; } - if (!inode_logged_before) + if (!ctx->logged_before) goto add_to_batch; /* * If we were logged before and have logged dir items, we can skip * checking if any item with a key offset larger than the last one * we logged is in the log tree, saving time and avoiding adding - * contention on the log tree. + * contention on the log tree. We can only rely on the value of + * last_dir_index_offset when we know for sure that the inode was + * previously logged in the current transaction. */ if (key.offset > inode->last_dir_index_offset) goto add_to_batch; @@ -3878,7 +3982,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; int err = 0; int ret; - u64 first_offset = min_offset; + u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); @@ -3912,10 +4016,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ if (ret == 0) { struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) - first_offset = max(min_offset, tmp.offset) + 1; + last_old_dentry_offset = tmp.offset; } goto done; } @@ -3924,17 +4029,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); if (ret == 0) { struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.type == BTRFS_DIR_INDEX_KEY) { - first_offset = tmp.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &tmp); - if (ret) { - err = ret; - goto done; - } - } + /* + * The dir index key before the first one we found that needs to + * be logged might be in a previous leaf, and there might be a + * gap between these keys, meaning that we had deletions that + * happened. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the + * previous key's offset plus 1, so that those deletes are replayed. 
+ */ + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; } btrfs_release_path(path); @@ -3956,7 +4062,8 @@ search: * from our directory */ while (1) { - ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, + &last_old_dentry_offset); if (ret != 0) { if (ret < 0) err = ret; @@ -3982,14 +4089,16 @@ search: goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { - ctx->last_dir_item_offset = min_key.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &min_key); - if (ret) - err = ret; - else - last_offset = min_key.offset; + /* + * The next leaf was not changed in the current transaction + * and has at least one dir index key. + * We check for the next key because there might have been + * one or more deletions between the last key we logged and + * that next key. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's + * offset minus 1, so that those deletes are replayed. + */ + last_offset = min_key.offset - 1; goto done; } if (need_resched()) { @@ -4005,13 +4114,21 @@ done: if (err == 0) { *last_offset_ret = last_offset; /* - * insert the log range keys to indicate where the log - * is valid + * In case the leaf was changed in the current transaction but + * all its dir items are from a past transaction, the last item + * in the leaf is a dir item and there's no gap between that last + * dir item and the first one on the next leaf (which did not + * change in the current transaction), then we don't need to log + * a range, last_old_dentry_offset is == to last_offset. */ - ret = insert_dir_log_key(trans, log, path, ino, first_offset, - last_offset); - if (ret) - err = ret; + ASSERT(last_old_dentry_offset <= last_offset); + if (last_old_dentry_offset < last_offset) { + ret = insert_dir_log_key(trans, log, path, ino, + last_old_dentry_offset + 1, + last_offset); + if (ret) + err = ret; + } } return err; } @@ -4038,22 +4155,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, u64 max_key; int ret; - /* - * If this is the first time we are being logged in the current - * transaction, or we were logged before but the inode was evicted and - * reloaded later, in which case its logged_trans is 0, reset the value - * of the last logged key offset. Note that we don't use the helper - * function inode_logged() here - that is because the function returns - * true after an inode eviction, assuming the worst case as it can not - * know for sure if the inode was logged before. So we can not skip key - * searches in the case the inode was evicted, because it may not have - * been logged in this transaction and may have been logged in a past - * transaction, so we need to reset the last dir index offset to (u64)-1. 
- */ - if (inode->logged_trans != trans->transid) - inode->last_dir_index_offset = (u64)-1; - - min_key = 0; + min_key = BTRFS_DIR_START_INDEX; max_key = 0; ctx->last_dir_item_offset = inode->last_dir_index_offset; @@ -4089,9 +4191,6 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, struct btrfs_key found_key; int start_slot; - if (!inode_logged(trans, inode)) - return 0; - key.objectid = btrfs_ino(inode); key.type = max_key_type; key.offset = (u64)-1; @@ -4311,23 +4410,18 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int start_slot, int nr, int inode_only, u64 logged_isize) { - struct btrfs_fs_info *fs_info = trans->fs_info; - unsigned long src_offset; - unsigned long dst_offset; struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; - struct btrfs_inode_item *inode_item; struct extent_buffer *src = src_path->nodes[0]; - int ret; + int ret = 0; struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; char *ins_data; int i; - struct list_head ordered_sums; - int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; - - INIT_LIST_HEAD(&ordered_sums); + int dst_index; + const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); + const u64 i_size = i_size_read(&inode->vfs_inode); ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); @@ -4339,28 +4433,152 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.keys = ins_keys; batch.data_sizes = ins_sizes; batch.total_data_size = 0; - batch.nr = nr; + batch.nr = 0; + dst_index = 0; for (i = 0; i < nr; i++) { - ins_sizes[i] = btrfs_item_size(src, i + start_slot); - batch.total_data_size += ins_sizes[i]; - btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + const int src_slot = start_slot + i; + struct btrfs_root *csum_root; + struct btrfs_ordered_sum *sums; + struct btrfs_ordered_sum *sums_next; + LIST_HEAD(ordered_sums); + u64 disk_bytenr; + u64 disk_num_bytes; + u64 extent_offset; + u64 extent_num_bytes; + bool is_old_extent; + + btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); + + if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) + goto add_to_batch; + + extent = btrfs_item_ptr(src, src_slot, + struct btrfs_file_extent_item); + + is_old_extent = (btrfs_file_extent_generation(src, extent) < + trans->transid); + + /* + * Don't copy extents from past generations. That would make us + * log a lot more metadata for common cases like doing only a + * few random writes into a file and then fsync it for the first + * time or after the full sync flag is set on the inode. We can + * get leaves full of extent items, most of which are from past + * generations, so we can skip them - as long as the inode has + * not been the target of a reflink operation in this transaction, + * as in that case it might have had file extent items with old + * generations copied into it. We also must always log prealloc + * extents that start at or beyond eof, otherwise we would lose + * them on log replay. + */ + if (is_old_extent && + ins_keys[dst_index].offset < i_size && + inode->last_reflink_trans < trans->transid) + continue; + + if (skip_csum) + goto add_to_batch; + + /* Only regular extents have checksums. */ + if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) + goto add_to_batch; + + /* + * If it's an extent created in a past transaction, then its + * checksums are already accessible from the committed csum tree, + * no need to log them. 
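Both loops in the reworked copy_items() apply the same skip rule, so it can help to read it as a single predicate. A hedged restatement with hypothetical demo_ naming (the real code open-codes these conditions):

/*
 * A file extent item may be omitted from the log only if it was
 * created in a past transaction, its key offset is below EOF (prealloc
 * extents starting at or beyond EOF must always be logged), and the
 * inode was not the target of a reflink in the current transaction.
 */
#include <stdbool.h>
#include <stdint.h>

static bool demo_can_skip_extent(uint64_t extent_gen, uint64_t file_offset,
				 uint64_t i_size, uint64_t last_reflink_trans,
				 uint64_t transid)
{
	return extent_gen < transid &&
	       file_offset < i_size &&
	       last_reflink_trans < transid;
}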
+ */ + if (is_old_extent) + goto add_to_batch; + + disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); + /* If it's an explicit hole, there are no checksums. */ + if (disk_bytenr == 0) + goto add_to_batch; + + disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); + + if (btrfs_file_extent_compression(src, extent)) { + extent_offset = 0; + extent_num_bytes = disk_num_bytes; + } else { + extent_offset = btrfs_file_extent_offset(src, extent); + extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); + } + + csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); + disk_bytenr += extent_offset; + ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, + disk_bytenr + extent_num_bytes - 1, + &ordered_sums, 0); + if (ret) + goto out; + + list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { + if (!ret) + ret = log_csums(trans, inode, log, sums); + list_del(&sums->list); + kfree(sums); + } + if (ret) + goto out; + +add_to_batch: + ins_sizes[dst_index] = btrfs_item_size(src, src_slot); + batch.total_data_size += ins_sizes[dst_index]; + batch.nr++; + dst_index++; } + + /* + * We have a leaf full of old extent items that don't need to be logged, + * so we don't need to do anything. + */ + if (batch.nr == 0) + goto out; + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); - if (ret) { - kfree(ins_data); - return ret; - } + if (ret) + goto out; + + dst_index = 0; + for (i = 0; i < nr; i++) { + const int src_slot = start_slot + i; + const int dst_slot = dst_path->slots[0] + dst_index; + struct btrfs_key key; + unsigned long src_offset; + unsigned long dst_offset; + + /* + * We're done, all the remaining items in the source leaf + * correspond to old file extent items. + */ + if (dst_index >= batch.nr) + break; - for (i = 0; i < nr; i++, dst_path->slots[0]++) { - dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], - dst_path->slots[0]); + btrfs_item_key_to_cpu(src, &key, src_slot); - src_offset = btrfs_item_ptr_offset(src, start_slot + i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + goto copy_item; + + extent = btrfs_item_ptr(src, src_slot, + struct btrfs_file_extent_item); + + /* See the comment in the previous loop, same logic. 
*/ + if (btrfs_file_extent_generation(src, extent) < trans->transid && + key.offset < i_size && + inode->last_reflink_trans < trans->transid) + continue; - if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { - inode_item = btrfs_item_ptr(dst_path->nodes[0], - dst_path->slots[0], +copy_item: + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); + src_offset = btrfs_item_ptr_offset(src, src_slot); + + if (key.type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *inode_item; + + inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, &inode->vfs_inode, @@ -4368,71 +4586,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, logged_isize); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, - src_offset, ins_sizes[i]); + src_offset, ins_sizes[dst_index]); } - /* take a reference on file data extents so that truncates - * or deletes of this inode don't have to relog the inode - * again - */ - if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && - !skip_csum) { - int found_type; - extent = btrfs_item_ptr(src, start_slot + i, - struct btrfs_file_extent_item); - - if (btrfs_file_extent_generation(src, extent) < trans->transid) - continue; - - found_type = btrfs_file_extent_type(src, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - struct btrfs_root *csum_root; - u64 ds, dl, cs, cl; - ds = btrfs_file_extent_disk_bytenr(src, - extent); - /* ds == 0 is a hole */ - if (ds == 0) - continue; - - dl = btrfs_file_extent_disk_num_bytes(src, - extent); - cs = btrfs_file_extent_offset(src, extent); - cl = btrfs_file_extent_num_bytes(src, - extent); - if (btrfs_file_extent_compression(src, - extent)) { - cs = 0; - cl = dl; - } - - csum_root = btrfs_csum_root(fs_info, ds); - ret = btrfs_lookup_csums_range(csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums, 0); - if (ret) - break; - } - } + dst_index++; } btrfs_mark_buffer_dirty(dst_path->nodes[0]); btrfs_release_path(dst_path); +out: kfree(ins_data); - /* - * we have to do this after the loop above to avoid changing the - * log tree while trying to change the log tree. 
- */ - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); - if (!ret) - ret = log_csums(trans, inode, log, sums); - list_del(&sums->list); - kfree(sums); - } - return ret; } @@ -4568,14 +4732,34 @@ static int log_one_extent(struct btrfs_trans_handle *trans, { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *log = inode->root->log_root; - struct btrfs_file_extent_item *fi; + struct btrfs_file_extent_item fi = { 0 }; struct extent_buffer *leaf; - struct btrfs_map_token token; struct btrfs_key key; u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; + btrfs_set_stack_file_extent_generation(&fi, trans->transid); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); + else + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - + extent_offset); + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); + } + + btrfs_set_stack_file_extent_offset(&fi, extent_offset); + btrfs_set_stack_file_extent_num_bytes(&fi, em->len); + btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); + btrfs_set_stack_file_extent_compression(&fi, em->compress_type); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; @@ -4589,12 +4773,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans, * are small, with a root at level 2 or 3 at most, due to their short * life span. 
*/ - if (inode_logged(trans, inode)) { + if (ctx->logged_before) { drop_args.path = path; drop_args.start = em->start; drop_args.end = em->start + em->len; drop_args.replace_extent = true; - drop_args.extent_item_size = sizeof(*fi); + drop_args.extent_item_size = sizeof(fi); ret = btrfs_drop_extents(trans, log, inode, &drop_args); if (ret) return ret; @@ -4606,44 +4790,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans, key.offset = em->start; ret = btrfs_insert_empty_item(trans, log, path, &key, - sizeof(*fi)); + sizeof(fi)); if (ret) return ret; } leaf = path->nodes[0]; - btrfs_init_map_token(&token, leaf); - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - btrfs_set_token_file_extent_generation(&token, fi, trans->transid); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - btrfs_set_token_file_extent_type(&token, fi, - BTRFS_FILE_EXTENT_PREALLOC); - else - btrfs_set_token_file_extent_type(&token, fi, - BTRFS_FILE_EXTENT_REG); - - block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, - em->block_start); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); - } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, - em->block_start - - extent_offset); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); - } else { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); - } - - btrfs_set_token_file_extent_offset(&token, fi, extent_offset); - btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); - btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); - btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); - btrfs_set_token_file_extent_encryption(&token, fi, 0); - btrfs_set_token_file_extent_other_encoding(&token, fi, 0); + write_extent_buffer(leaf, &fi, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(fi)); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -4857,7 +5011,6 @@ process: WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); if (ret) @@ -5551,6 +5704,13 @@ next_key: } else { break; } + + /* + * We may process many leaves full of items for our inode, so + * avoid monopolizing a cpu for too long by rescheduling while + * not holding locks on any tree. + */ + cond_resched(); } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, @@ -5595,8 +5755,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = inode->root->log_root; - int err = 0; - int ret = 0; + int ret; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; @@ -5605,6 +5764,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool xattrs_logged = false; bool recursive_logging = false; bool inode_item_dropped = true; + const bool orig_logged_before = ctx->logged_before; path = btrfs_alloc_path(); if (!path) @@ -5638,8 +5798,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * and figure out which index ranges have to be logged. 
*/ if (S_ISDIR(inode->vfs_inode.i_mode)) { - err = btrfs_commit_inode_delayed_items(trans, inode); - if (err) + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) goto out; } @@ -5655,6 +5815,29 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* + * For symlinks, we must always log their content, which is stored in an + * inline extent, otherwise we could end up with an empty symlink after + * log replay, which is invalid on linux (symlink(2) returns -ENOENT if + * one attempts to create an empty symlink). + * We don't need to worry about flushing delalloc, because when we create + * the inline extent when the symlink is created (we never have delalloc + * for symlinks). + */ + if (S_ISLNK(inode->vfs_inode.i_mode)) + inode_only = LOG_INODE_ALL; + + /* + * Before logging the inode item, cache the value returned by + * inode_logged(), because after that we have the need to figure out if + * the inode was previously logged in this transaction. + */ + ret = inode_logged(trans, inode, path); + if (ret < 0) + goto out_unlock; + ctx->logged_before = (ret == 1); + ret = 0; + + /* * This is for cases where logging a directory could result in losing a * a file after replaying the log. For example, if we move a file from a * directory A to a directory B, then fsync directory A, we have no way @@ -5665,7 +5848,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, inode_only == LOG_INODE_ALL && inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans); - err = 1; + ret = 1; goto out_unlock; } @@ -5679,9 +5862,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, max_key_type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, inode, + max_key_type); } else { - if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) { + if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). @@ -5695,22 +5880,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * (zeroes), as if an expanding truncate happened, * instead of getting a file of 4Kb only. 
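The symlink special case above is easy to motivate from user space: an empty symlink target is not even creatable through the syscall interface, so replaying one would produce a file that could never have existed. A runnable demonstration:

/* symlink(2) refuses an empty target with ENOENT. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	if (symlink("", "/tmp/demo-symlink") == -1)
		printf("symlink with empty target: %s\n", strerror(errno));
	return 0;
}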
*/ - err = logged_inode_size(log, inode, path, &logged_isize); - if (err) + ret = logged_inode_size(log, inode, path, &logged_isize); + if (ret) goto out_unlock; } if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, - max_key.type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, + inode, max_key.type); } else { clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - if (inode_logged(trans, inode)) + if (ctx->logged_before) ret = truncate_inode_items(trans, log, inode, 0, 0); } @@ -5720,8 +5906,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, - max_key.type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; @@ -5730,37 +5917,35 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } } - if (ret) { - err = ret; + if (ret) goto out_unlock; - } - err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, recursive_logging, inode_only, ctx, &need_log_inode_item); - if (err) + if (ret) goto out_unlock; btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_all_xattrs(trans, inode, path, dst_path); - if (err) + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + if (ret) goto out_unlock; xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, inode, path); - if (err) + ret = btrfs_log_holes(trans, inode, path); + if (ret) goto out_unlock; } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (need_log_inode_item) { - err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); - if (err) + ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); + if (ret) goto out_unlock; /* * If we are doing a fast fsync and the inode was logged before @@ -5771,18 +5956,16 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - err = btrfs_log_all_xattrs(trans, inode, path, dst_path); - if (err) + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + if (ret) goto out_unlock; btrfs_release_path(path); } } if (fast_search) { ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); - if (ret) { - err = ret; + if (ret) goto out_unlock; - } } else if (inode_only == LOG_INODE_ALL) { struct extent_map *em, *n; @@ -5794,10 +5977,8 @@ log_extents: if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { ret = log_directory_changes(trans, inode, path, dst_path, ctx); - if (ret) { - err = ret; + if (ret) goto out_unlock; - } } spin_lock(&inode->lock); @@ -5836,12 +6017,24 @@ log_extents: if (inode_only != LOG_INODE_EXISTS) inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); + + /* + * Reset the last_reflink_trans so that the next fsync does not need to + * go through the slower path when logging extents and their checksums. 
+ */ + if (inode_only == LOG_INODE_ALL) + inode->last_reflink_trans = 0; + out_unlock: mutex_unlock(&inode->log_mutex); out: btrfs_free_path(path); btrfs_free_path(dst_path); - return err; + + if (recursive_logging) + ctx->logged_before = orig_logged_before; + + return ret; } /* @@ -5926,7 +6119,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *log = root->log_root; struct btrfs_path *path; LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; @@ -5968,7 +6160,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, min_key.offset = 0; again: btrfs_release_path(path); - ret = btrfs_search_forward(log, &min_key, path, trans->transid); + ret = btrfs_search_forward(root, &min_key, path, trans->transid); if (ret < 0) { goto next_dir_inode; } else if (ret > 0) { @@ -5976,7 +6168,6 @@ again: goto next_dir_inode; } -process_leaf: leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); for (i = path->slots[0]; i < nritems; i++) { @@ -5994,8 +6185,7 @@ process_leaf: di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); type = btrfs_dir_type(leaf, di); - if (btrfs_dir_transid(leaf, di) < trans->transid && - type != BTRFS_FT_DIR) + if (btrfs_dir_transid(leaf, di) < trans->transid) continue; btrfs_dir_item_key_to_cpu(leaf, di, &di_key); if (di_key.type == BTRFS_ROOT_ITEM_KEY) @@ -6014,7 +6204,7 @@ process_leaf: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) + if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); @@ -6033,16 +6223,6 @@ process_leaf: } break; } - if (i == nritems) { - ret = btrfs_next_leaf(log, path); - if (ret < 0) { - goto next_dir_inode; - } else if (ret > 0) { - ret = 0; - goto next_dir_inode; - } - goto process_leaf; - } if (min_key.offset < (u64)-1) { min_key.offset++; goto again; @@ -6773,15 +6953,32 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, mutex_unlock(&dir->log_mutex); } -/* - * Call this after adding a new name for a file and it will properly - * update the log to reflect the new name. +/** + * Update the log after adding a new name for an inode. + * + * @trans: Transaction handle. + * @old_dentry: The dentry associated with the old name and the old + * parent directory. + * @old_dir: The inode of the previous parent directory for the case + * of a rename. For a link operation, it must be NULL. + * @old_dir_index: The index number associated with the old name, meaningful + * only for rename operations (when @old_dir is not NULL). + * Ignored for link operations. + * @parent: The dentry associated with the directory under which the + * new name is located. + * + * Call this after adding a new name for an inode, as a result of a link or + * rename operation, and it will properly update the log to reflect the new name. 
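With the reworked btrfs_log_new_name() signature documented above, the two kinds of callers differ only in the old-directory arguments. A hedged sketch of the expected call shapes (the real call sites live in fs/btrfs/inode.c, outside this diff):

	/* After a link: there is no old parent directory. */
	btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);

	/* After a rename: pass the old parent and the old dir index. */
	btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
			   old_dir_index, parent);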
 */ void btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent) + struct dentry *old_dentry, struct btrfs_inode *old_dir, + u64 old_dir_index, struct dentry *parent) { + struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); + struct btrfs_root *root = inode->root; struct btrfs_log_ctx ctx; + bool log_pinned = false; + int ret; /* * this will force the logging code to walk the dentry chain @@ -6794,26 +6991,83 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * if this inode hasn't been logged and the directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (!inode_logged(trans, inode) && - (!old_dir || !inode_logged(trans, old_dir))) - return; + ret = inode_logged(trans, inode, NULL); + if (ret < 0) { + goto out; + } else if (ret == 0) { + if (!old_dir) + return; + /* + * If the inode was not logged and we are doing a rename (old_dir is not + * NULL), check if old_dir was logged - if it was not, we can return and + * do nothing. + */ + ret = inode_logged(trans, old_dir, NULL); + if (ret < 0) + goto out; + else if (ret == 0) + return; + } + ret = 0; /* * If we are doing a rename (old_dir is not NULL) from a directory that - * was previously logged, make sure the next log attempt on the directory - * is not skipped and logs the inode again. This is because the log may - * not currently be authoritative for a range including the old - * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we - * do not end up with both the new and old dentries around (in case the - * inode is a directory we would have a directory with two hard links and - * 2 inode references for different parents). The next log attempt of - * old_dir will happen at btrfs_log_all_parents(), called through - * btrfs_log_inode_parent() below, because we have previously set - * inode->last_unlink_trans to the current transaction ID, either here or - * at btrfs_record_unlink_dir() in case the inode is a directory. + * was previously logged, make sure that on log replay we get the old + * dir entry deleted. This is needed because we will also log the new + * name of the renamed inode, so we need to make sure that after log + * replay we don't end up with both the new and old dir entries existing. */ - if (old_dir) - old_dir->logged_trans = 0; + if (old_dir && old_dir->logged_trans == trans->transid) { + struct btrfs_root *log = old_dir->root->log_root; + struct btrfs_path *path; + + ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + + /* + * We have two inodes to update in the log, the old directory and + * the inode that got renamed, so we must pin the log to prevent + * anyone from syncing the log until we have updated both inodes + * in the log. + */ + log_pinned = true; + btrfs_pin_log_trans(root); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* + * Another concurrent task might be logging the old directory, + * as that can be triggered when logging another inode that had + * or still has a dentry in the old directory. We lock the old + * directory's log_mutex to ensure the deletion of the old + * name is persisted, because during directory logging we + * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of + * the old name's dir index item is in the delayed items, so + * it could be missed by an in-progress directory logging.
+ */ + mutex_lock(&old_dir->log_mutex); + ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), + old_dentry->d_name.name, + old_dentry->d_name.len, old_dir_index); + if (ret > 0) { + /* + * The dentry does not exist in the log, so record its + * deletion. + */ + btrfs_release_path(path); + ret = insert_dir_log_key(trans, log, path, + btrfs_ino(old_dir), + old_dir_index, old_dir_index); + } + mutex_unlock(&old_dir->log_mutex); + + btrfs_free_path(path); + if (ret < 0) + goto out; + } btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; @@ -6825,5 +7079,16 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); +out: + /* + * If an error happened mark the log for a full commit because it's not + * consistent and up to date or we couldn't find out if one of the + * inodes was logged before in this transaction. Do it before unpinning + * the log, to avoid any races with someone else trying to commit it. + */ + if (ret < 0) + btrfs_set_log_full_commit(trans); + if (log_pinned) + btrfs_end_log_trans(root); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index f6811c3df38a..1620f8170629 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -17,6 +17,8 @@ struct btrfs_log_ctx { int log_transid; bool log_new_dentries; bool logging_new_name; + /* Indicate if the inode being logged was logged before. */ + bool logged_before; /* Tracks the last logged dir item/index key offset. */ u64 last_dir_item_offset; struct inode *inode; @@ -32,6 +34,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_transid = 0; ctx->log_new_dentries = false; ctx->logging_new_name = false; + ctx->logged_before = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->ordered_extents); @@ -86,7 +89,7 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); void btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent); + struct dentry *old_dentry, struct btrfs_inode *old_dir, + u64 old_dir_index, struct dentry *parent); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b07d382d53a8..9c20049d1fec 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -164,24 +164,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { */ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) { - if (flags & BTRFS_BLOCK_GROUP_RAID10) - return BTRFS_RAID_RAID10; - else if (flags & BTRFS_BLOCK_GROUP_RAID1) - return BTRFS_RAID_RAID1; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) - return BTRFS_RAID_RAID1C3; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) - return BTRFS_RAID_RAID1C4; - else if (flags & BTRFS_BLOCK_GROUP_DUP) - return BTRFS_RAID_DUP; - else if (flags & BTRFS_BLOCK_GROUP_RAID0) - return BTRFS_RAID_RAID0; - else if (flags & BTRFS_BLOCK_GROUP_RAID5) - return BTRFS_RAID_RAID5; - else if (flags & BTRFS_BLOCK_GROUP_RAID6) - return BTRFS_RAID_RAID6; - - return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ + const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); + + if (!profile) + return BTRFS_RAID_SINGLE; + + return BTRFS_BG_FLAG_TO_INDEX(profile); } const char *btrfs_bg_type_to_raid_name(u64 flags) @@ -405,7 +393,6 @@ void btrfs_free_device(struct btrfs_device *device) 
WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); - bio_put(device->flush_bio); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -534,30 +521,20 @@ error: return ret; } -static bool device_path_matched(const char *path, struct btrfs_device *device) -{ - int found; - - rcu_read_lock(); - found = strcmp(rcu_str_deref(device->name), path); - rcu_read_unlock(); - - return found == 0; -} - -/* - * Search and remove all stale (devices which are not mounted) devices. +/** + * Search and remove all stale devices (which are not mounted). * When both inputs are NULL, it will search and release all stale devices. - * path: Optional. When provided will it release all unmounted devices - * matching this path only. - * skip_dev: Optional. Will skip this device when searching for the stale + * + * @devt: Optional. When provided, it will release all unmounted devices + * matching this devt only. + * @skip_device: Optional. Will skip this device when searching for the stale * devices. - * Return: 0 for success or if @path is NULL. - * -EBUSY if @path is a mounted device. - * -ENOENT if @path does not match any device in the list. + * + * Return: 0 for success or if @devt is 0. + * -EBUSY if @devt is a mounted device. + * -ENOENT if @devt does not match any device in the list. */ -static int btrfs_free_stale_devices(const char *path, - struct btrfs_device *skip_device) +static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; @@ -565,7 +542,7 @@ static int btrfs_free_stale_devices(const char *path, lockdep_assert_held(&uuid_mutex); - if (path) + if (devt) ret = -ENOENT; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { @@ -575,13 +552,11 @@ static int btrfs_free_stale_devices(const char *path, &fs_devices->devices, dev_list) { if (skip_device && skip_device == device) continue; - if (path && !device->name) - continue; - if (path && !device_path_matched(path, device)) + if (devt && devt != device->devt) continue; if (fs_devices->opened) { /* for an already deleted device return 0 */ - if (path && ret != 0) + if (devt && ret != 0) ret = -EBUSY; break; } @@ -614,7 +589,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, fmode_t flags, void *holder) { - struct request_queue *q; struct block_device *bdev; struct btrfs_super_block *disk_super; u64 devid; @@ -656,8 +630,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - q = bdev_get_queue(bdev); - if (!blk_queue_nonrot(q)) + if (!bdev_nonrot(bdev)) fs_devices->rotating = true; device->bdev = bdev; @@ -781,11 +754,17 @@ static noinline struct btrfs_device *device_list_add(const char *path, struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_t path_devt; + int error; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + error = lookup_bdev(path, &path_devt); + if (error) + return ERR_PTR(error); + if (fsid_change_in_progress) { if (!has_metadata_uuid) fs_devices = find_fsid_inprogress(disk_super); @@ -868,6 +847,7 @@ static noinline struct btrfs_device
*device_list_add(const char *path, return ERR_PTR(-ENOMEM); } rcu_assign_pointer(device->name, name); + device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); fs_devices->num_devices++; @@ -928,25 +908,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, /* * We are going to replace the device path for a given devid, * make sure it's the same device if the device is mounted + * + * NOTE: the device->fs_info may not be reliable here so pass + * in a NULL to message helpers instead. This avoids a possible + * use-after-free when the fs_info and fs_info->sb are already + * torn down. */ if (device->bdev) { - int error; - dev_t path_dev; - - error = lookup_bdev(path, &path_dev); - if (error) { + if (device->devt != path_devt) { mutex_unlock(&fs_devices->device_list_mutex); - return ERR_PTR(error); - } - - if (device->bdev->bd_dev != path_dev) { - mutex_unlock(&fs_devices->device_list_mutex); - /* - * device->fs_info may not be reliable here, so - * pass in a NULL instead. This avoids a - * possible use-after-free when the fs_info and - * fs_info->sb are already torn down. - */ btrfs_warn_in_rcu(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, @@ -954,7 +924,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - btrfs_info_in_rcu(device->fs_info, + btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, rcu_str_deref(device->name), path, current->comm, @@ -972,6 +942,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, fs_devices->missing_devices--; clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } + device->devt = path_devt; } /* @@ -1331,12 +1302,12 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev return disk_super; } -int btrfs_forget_devices(const char *path) +int btrfs_forget_devices(dev_t devt) { int ret; mutex_lock(&uuid_mutex); - ret = btrfs_free_stale_devices(strlen(path) ? 
path : NULL, NULL); + ret = btrfs_free_stale_devices(devt, NULL); mutex_unlock(&uuid_mutex); return ret; @@ -1385,10 +1356,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, } device = device_list_add(path, disk_super, &new_device_added); - if (!IS_ERR(device)) { - if (new_device_added) - btrfs_free_stale_devices(path, device); - } + if (!IS_ERR(device) && new_device_added) + btrfs_free_stale_devices(device->devt, device); btrfs_release_disk_super(disk_super); @@ -1914,23 +1883,18 @@ static void update_dev_time(const char *device_path) path_put(&path); } -static int btrfs_rm_dev_item(struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_trans_handle *trans; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1941,21 +1905,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) if (ret) { if (ret > 0) ret = -ENOENT; - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - } - out: btrfs_free_path(path); - if (!ret) - ret = btrfs_commit_transaction(trans); return ret; } @@ -2096,12 +2051,18 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode) { + struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 num_devices; int ret = 0; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); + return -EINVAL; + } + /* * The device list in fs_devices is accessed without locks (neither * uuid_mutex nor device_list_mutex) as it won't change on a mounted @@ -2111,7 +2072,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) - goto out; + return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { @@ -2119,27 +2080,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; - goto out; + return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", rcu_str_deref(device->name), device->devid); - ret = -ETXTBSY; - goto out; + return -ETXTBSY; } - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto out; - } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && - fs_info->fs_devices->rw_devices == 1) { - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto out; - } + fs_info->fs_devices->rw_devices == 1) + return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); @@ -2152,14 +2108,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (ret) goto error_undo; - /* - * TODO: 
the superblock still includes this device in its num_devices - * counter although write_all_supers() is not locked out. This - * could give a filesystem state which requires a degraded mount. - */ - ret = btrfs_rm_dev_item(device); - if (ret) + trans = btrfs_start_transaction(fs_info->chunk_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto error_undo; + } + + ret = btrfs_rm_dev_item(trans, device); + if (ret) { + /* Any error in dev item removal is critical */ + btrfs_crit(fs_info, + "failed to remove device item for devid %llu: %d", + device->devid, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); @@ -2242,7 +2206,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } -out: + ret = btrfs_commit_transaction(trans); + return ret; error_undo: @@ -2253,7 +2218,7 @@ error_undo: device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } - goto out; + return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) @@ -2606,7 +2571,6 @@ error: int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) { struct btrfs_root *root = fs_info->dev_root; - struct request_queue *q; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; @@ -2668,6 +2632,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->fs_info = fs_info; device->bdev = bdev; + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error_free_device; ret = btrfs_get_dev_zone_info(device, false); if (ret) @@ -2679,7 +2646,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_free_zone; } - q = bdev_get_queue(bdev); set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = trans->transid; device->io_width = fs_info->sectorsize; @@ -2727,7 +2693,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!blk_queue_nonrot(q)) + if (!bdev_nonrot(bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -2814,7 +2780,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path * We can ignore the return value as it typically returns -EINVAL and * only succeeds if the device was an alien. */ - btrfs_forget_devices(device_path); + btrfs_forget_devices(device->devt); /* Update ctime/mtime for blkid or udev */ update_dev_time(device_path); @@ -3251,6 +3217,12 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) u64 length; int ret; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "relocate: not supported on extent tree v2 yet"); + return -EINVAL; + } + /* * Prevent races with automatic removal of unused block groups. 
* After we relocate and before we remove the chunk with offset @@ -4078,13 +4050,6 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return true; - if (fs_info->sectorsize < PAGE_SIZE && - bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { - btrfs_err(fs_info, - "RAID56 is not yet supported for sectorsize %u with page size %lu", - fs_info->sectorsize, PAGE_SIZE); - return false; - } /* Profile is valid and does not have bits outside of the allowed set */ if (alloc_profile_is_valid(bargs->target, 1) && (bargs->target & ~allowed) == 0) @@ -4445,10 +4410,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; + sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); + sb_end_write(fs_info->sb); return ret; } @@ -6326,7 +6293,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, u64 offset; u64 stripe_offset; u64 stripe_nr; - u64 stripe_len; + u32 stripe_len; u64 raid56_full_stripe_start = (u64)-1; int data_stripes; @@ -6337,19 +6304,13 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, offset = logical - em->start; /* Len of a stripe in a chunk */ stripe_len = map->stripe_len; - /* Stripe where this block falls in */ - stripe_nr = div64_u64(offset, stripe_len); - /* Offset of stripe in the chunk */ - stripe_offset = stripe_nr * stripe_len; - if (offset < stripe_offset) { - btrfs_crit(fs_info, -"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", - stripe_offset, offset, em->start, logical, stripe_len); - return -EINVAL; - } + /* + * stripe_nr is the stripe this block falls in, and + * stripe_offset is the offset of this block in its stripe. + */ + stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); + ASSERT(stripe_offset < U32_MAX); - /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - stripe_offset; data_stripes = nr_data_stripes(map); /* Only stripe-based profiles need to check against stripe length.
*/ @@ -6751,11 +6712,11 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); - bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); } static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) @@ -6837,10 +6798,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, continue; } - if (dev_nr < total_devs - 1) - bio = btrfs_bio_clone(first_bio); - else + if (dev_nr < total_devs - 1) { + bio = btrfs_bio_clone(dev->bdev, first_bio); + } else { bio = first_bio; + bio_set_dev(bio, dev->bdev); + } submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); } @@ -6962,16 +6925,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (!dev) return ERR_PTR(-ENOMEM); - /* - * Preallocate a bio that's always going to be used for flushing device - * barriers and matches the device lifespan - */ - dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); - if (!dev->flush_bio) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->post_commit_list); @@ -7060,6 +7013,27 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, } #endif +static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, devid, uuid, true); + return ERR_PTR(-ENOENT); + } + + dev = add_missing_dev(fs_info->fs_devices, devid, uuid); + if (IS_ERR(dev)) { + btrfs_err(fs_info, "failed to init missing device %llu: %ld", + devid, PTR_ERR(dev)); + return dev; + } + btrfs_report_missing_device(fs_info, devid, uuid, false); + + return dev; +} + static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { @@ -7147,28 +7121,17 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, BTRFS_UUID_SIZE); args.uuid = uuid; map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); - if (!map->stripes[i].dev && - !btrfs_test_opt(fs_info, DEGRADED)) { - free_extent_map(em); - btrfs_report_missing_device(fs_info, devid, uuid, true); - return -ENOENT; - } if (!map->stripes[i].dev) { - map->stripes[i].dev = - add_missing_dev(fs_info->fs_devices, devid, - uuid); + map->stripes[i].dev = handle_missing_device(fs_info, + devid, uuid); if (IS_ERR(map->stripes[i].dev)) { free_extent_map(em); - btrfs_err(fs_info, - "failed to init missing dev %llu: %ld", - devid, PTR_ERR(map->stripes[i].dev)); return PTR_ERR(map->stripes[i].dev); } - btrfs_report_missing_device(fs_info, devid, uuid, false); } + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &(map->stripes[i].dev->dev_state)); - } write_lock(&map_tree->lock); @@ -7373,7 +7336,6 @@ static int read_one_dev(struct extent_buffer *leaf, int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; @@ -7389,30 +7351,16 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); + /* - * This will create extent buffer of nodesize, superblock 
size is - * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will - * overallocate but we can keep it as-is, only the first page is used. + * We allocated a dummy extent, just to use extent buffer accessors. + * There will be unused space after BTRFS_SUPER_INFO_SIZE, but + * that's fine, we will not go beyond system chunk array anyway. */ - sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, - root->root_key.objectid, 0); - if (IS_ERR(sb)) - return PTR_ERR(sb); + sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); + if (!sb) + return -ENOMEM; set_extent_buffer_uptodate(sb); - /* - * The sb extent buffer is artificial and just used to read the system array. - * set_extent_buffer_uptodate() call does not properly mark all it's - * pages up-to-date when the page is larger: extent does not cover the - * whole page and consequently check_page_uptodate does not find all - * the page's extents up-to-date (the hole beyond sb), - * write_extent_buffer then triggers a WARN_ON. - * - * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, - * but sb spans only this function. Add an explicit SetPageUptodate call - * to silence the warning eg. on PowerPC 64. - */ - if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) - SetPageUptodate(sb->pages[0]); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); @@ -7575,6 +7523,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) struct btrfs_key found_key; int ret; int slot; + int iter_ret = 0; u64 total_dev = 0; u64 last_ra_node = 0; @@ -7618,30 +7567,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - while (1) { - struct extent_buffer *node; + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + struct extent_buffer *node = path->nodes[1]; leaf = path->nodes[0]; slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto error; - break; - } - node = path->nodes[1]; + if (node) { if (last_ra_node != node->start) { readahead_tree_node_children(node); last_ra_node = node->start; } } - btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_DEV_ITEM_KEY) { struct btrfs_dev_item *dev_item; dev_item = btrfs_item_ptr(leaf, slot, @@ -7666,7 +7603,11 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) if (ret) goto error; } - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto error; } /* @@ -7674,12 +7615,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * do another round of validation checks. 
*/ if (total_dev != fs_info->fs_devices->total_devices) { - btrfs_err(fs_info, - "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_warn(fs_info, +"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", btrfs_super_num_devices(fs_info->super_copy), total_dev); - ret = -EINVAL; - goto error; + fs_info->fs_devices->total_devices = total_dev; + btrfs_set_super_num_devices(fs_info->super_copy, total_dev); } if (btrfs_super_total_bytes(fs_info->super_copy) < fs_info->fs_devices->total_rw_bytes) { @@ -8291,7 +8232,7 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) static int relocating_repair_kthread(void *data) { - struct btrfs_block_group *cache = (struct btrfs_block_group *)data; + struct btrfs_block_group *cache = data; struct btrfs_fs_info *fs_info = cache->fs_info; u64 target; int ret = 0; @@ -8299,10 +8240,12 @@ static int relocating_repair_kthread(void *data) target = cache->start; btrfs_put_block_group(cache); + sb_start_write(fs_info->sb); if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { btrfs_info(fs_info, "zoned: skip relocating block group %llu to repair: EBUSY", target); + sb_end_write(fs_info->sb); return -EBUSY; } @@ -8330,6 +8273,7 @@ out: btrfs_put_block_group(cache); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 005c9e2a491a..6721002000ee 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,17 +17,51 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K +/* Used by sanity check for btrfs_raid_types. */ +#define const_ffs(n) (__builtin_ctzll(n) + 1) + +/* + * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires + * RAID0 always to be the lowest profile bit. + * Although it's part of on-disk format and should never change, do extra + * compile-time sanity checks. + */ +static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < + const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); +static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) > + ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); + +/* ilog2() can handle both constants and variables */ +#define BTRFS_BG_FLAG_TO_INDEX(profile) \ + ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1)) + +enum btrfs_raid_types { + /* SINGLE is the special one as it doesn't have on-disk bit. 
*/ + BTRFS_RAID_SINGLE = 0, + + BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0), + BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1), + BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP), + BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10), + BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5), + BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6), + BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3), + BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4), + + BTRFS_NR_RAID_TYPES +}; + struct btrfs_io_geometry { /* remaining bytes before crossing a stripe */ u64 len; /* offset of logical address in chunk */ u64 offset; /* length of single IO stripe */ - u64 stripe_len; + u32 stripe_len; + /* offset of address in stripe */ + u32 stripe_offset; /* number of stripe where address falls */ u64 stripe_nr; - /* offset of address in stripe */ - u64 stripe_offset; /* offset of raid56 stripe into the chunk */ u64 raid56_stripe_offset; }; @@ -72,6 +106,11 @@ struct btrfs_device { /* the mode sent to blkdev_get */ fmode_t mode; + /* + * Device's major-minor number. Must be set even if the device is not + * opened (bdev == NULL), unless the device is missing. + */ + dev_t devt; unsigned long dev_state; blk_status_t last_flush_error; @@ -116,8 +155,8 @@ struct btrfs_device { /* bytes used on the current transaction */ u64 commit_bytes_used; - /* for sending down flush barriers */ - struct bio *flush_bio; + /* Bio used for flushing device barriers */ + struct bio flush_bio; struct completion flush_wait; /* per-device scrub information */ @@ -323,6 +362,9 @@ struct btrfs_fs_devices { struct btrfs_bio { unsigned int mirror_num; + /* for direct I/O */ + u64 file_offset; + /* @device is for stripe IO submission. 
*/ struct btrfs_device *device; u8 *csum; @@ -422,7 +464,7 @@ struct map_lookup { u64 type; int io_align; int io_width; - u64 stripe_len; + u32 stripe_len; int num_stripes; int sub_stripes; int verified_stripes; /* For mount time dev extent verification */ @@ -505,7 +547,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); -int btrfs_forget_devices(const char *path); +int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 99abf41b89b9..7421abcf325a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -262,7 +262,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, ret); out: if (start_trans) btrfs_end_transaction(trans); @@ -271,10 +272,12 @@ out: ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { + struct btrfs_key found_key; struct btrfs_key key; struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; + int iter_ret = 0; int ret = 0; size_t total_size = 0, size_left = size; @@ -293,44 +296,23 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) path->reada = READA_FORWARD; /* search for our xattrs */ - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf; int slot; struct btrfs_dir_item *di; - struct btrfs_key found_key; u32 item_size; u32 cur; leaf = path->nodes[0]; slot = path->slots[0]; - /* this is where we start walking through the path */ - if (slot >= btrfs_header_nritems(leaf)) { - /* - * if we've reached the last slot in this leaf we need - * to go to the next leaf and reset everything - */ - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto err; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* check to make sure this item is what we want */ if (found_key.objectid != key.objectid) break; if (found_key.type > BTRFS_XATTR_ITEM_KEY) break; if (found_key.type < BTRFS_XATTR_ITEM_KEY) - goto next_item; + continue; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); item_size = btrfs_item_size(leaf, slot); @@ -350,8 +332,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) goto next; if (!buffer || (name_len + 1) > size_left) { - ret = -ERANGE; - goto err; + iter_ret = -ERANGE; + break; } read_extent_buffer(leaf, buffer, name_ptr, name_len); @@ -363,12 +345,13 @@ next: cur += this_len; di = (struct btrfs_dir_item *)((char *)di + this_len); } -next_item: - path->slots[0]++; } - ret = total_size; -err: + if (iter_ret < 0) + ret = iter_ret; + else + ret = total_size; + btrfs_free_path(path); return ret; @@ -403,10 +386,13 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct btrfs_root *root = BTRFS_I(inode)->root; name = xattr_full_name(handler, name); - ret = btrfs_validate_prop(name, value, size); + ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size); if (ret) return 
ret; + if (btrfs_ignore_prop(BTRFS_I(inode), name)) + return 0; + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -416,7 +402,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, ret); } btrfs_end_transaction(trans); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f559d517c7c4..79e8c8cd75ed 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -51,11 +51,13 @@ #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) /* - * Maximum supported zone size. Currently, SMR disks have a zone size of - * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not - * expect the zone size to become larger than 8GiB in the near future. + * Minimum / maximum supported zone size. Currently, SMR disks have a zone + * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. + * We do not expect the zone size to become larger than 8GiB or smaller than + * 4MiB in the near future. */ #define BTRFS_MAX_ZONE_SIZE SZ_8G +#define BTRFS_MIN_ZONE_SIZE SZ_4M #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) @@ -350,7 +352,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; - struct request_queue *queue = bdev_get_queue(bdev); unsigned int max_active_zones; unsigned int nactive; sector_t nr_sectors; @@ -402,6 +403,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); ret = -EINVAL; goto out; + } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu smaller than supported minimum %u", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); + ret = -EINVAL; + goto out; } nr_sectors = bdev_nr_sectors(bdev); @@ -410,7 +418,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; - max_active_zones = queue_max_active_zones(queue); + max_active_zones = bdev_max_active_zones(bdev); if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { btrfs_err_in_rcu(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", @@ -652,8 +660,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) if (model == BLK_ZONED_HM || (model == BLK_ZONED_HA && incompat_zoned) || (model == BLK_ZONED_NONE && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info = - device->zone_info; + struct btrfs_zoned_device_info *zone_info; zone_info = device->zone_info; zoned_devices++; @@ -1215,12 +1222,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) struct btrfs_device *device; u64 logical = cache->start; u64 length = cache->length; - u64 physical = 0; int ret; int i; unsigned int nofs_flag; u64 *alloc_offsets = NULL; u64 *caps = NULL; + u64 *physical = NULL; unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1264,6 +1271,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); + if 
(!physical) { + ret = -ENOMEM; + goto out; + } + active = bitmap_zalloc(map->num_stripes, GFP_NOFS); if (!active) { ret = -ENOMEM; @@ -1277,14 +1290,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) int dev_replace_is_ongoing = 0; device = map->stripes[i].dev; - physical = map->stripes[i].physical; + physical[i] = map->stripes[i].physical; if (device->bdev == NULL) { alloc_offsets[i] = WP_MISSING_DEV; continue; } - is_sequential = btrfs_dev_is_sequential(device, physical); + is_sequential = btrfs_dev_is_sequential(device, physical[i]); if (is_sequential) num_sequential++; else @@ -1299,21 +1312,21 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * This zone will be used for allocation, so mark this zone * non-empty. */ - btrfs_dev_clear_zone_empty(device, physical); + btrfs_dev_clear_zone_empty(device, physical[i]); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); up_read(&dev_replace->rwsem); /* * The group is mapped to a sequential zone. Get the zone write * pointer to determine the allocation offset within the zone. */ - WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); + WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); nofs_flag = memalloc_nofs_save(); - ret = btrfs_get_dev_zone(device, physical, &zone); + ret = btrfs_get_dev_zone(device, physical[i], &zone); memalloc_nofs_restore(nofs_flag); if (ret == -EIO || ret == -EOPNOTSUPP) { ret = 0; @@ -1339,7 +1352,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) case BLK_ZONE_COND_READONLY: btrfs_err(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", - physical >> device->zone_info->zone_size_shift, + physical[i] >> device->zone_info->zone_size_shift, rcu_str_deref(device->name), device->devid); alloc_offsets[i] = WP_MISSING_DEV; break; @@ -1404,7 +1417,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) if (alloc_offsets[0] == WP_MISSING_DEV) { btrfs_err(fs_info, "zoned: cannot recover write pointer for zone %llu", - physical); + physical[0]); ret = -EIO; goto out; } @@ -1413,6 +1426,42 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) cache->zone_is_active = test_bit(0, active); break; case BTRFS_BLOCK_GROUP_DUP: + if (map->type & BTRFS_BLOCK_GROUP_DATA) { + btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); + ret = -EINVAL; + goto out; + } + if (alloc_offsets[0] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[0]); + ret = -EIO; + goto out; + } + if (alloc_offsets[1] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[1]); + ret = -EIO; + goto out; + } + if (alloc_offsets[0] != alloc_offsets[1]) { + btrfs_err(fs_info, + "zoned: write pointer offset mismatch of zones in DUP profile"); + ret = -EIO; + goto out; + } + if (test_bit(0, active) != test_bit(1, active)) { + if (!btrfs_zone_activate(cache)) { + ret = -EIO; + goto out; + } + } else { + cache->zone_is_active = test_bit(0, active); + } + cache->alloc_offset = alloc_offsets[0]; + cache->zone_capacity = min(caps[0], caps[1]); + break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID0: case BTRFS_BLOCK_GROUP_RAID10: 
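The DUP case added in the hunk above rests on a single invariant: both mirrored zones must report the same recovered write pointer, since every allocation in a DUP block group is written to both stripes. A condensed, self-contained sketch of that consistency check follows (standalone C; the helper name dup_alloc_offset and the local WP_MISSING_DEV definition are illustrative, not part of the patch):

#include <errno.h>
#include <stdint.h>

#define WP_MISSING_DEV ((uint64_t)-1)	/* sentinel for an unrecoverable zone, as above */

/*
 * Derive the allocation offset of a DUP block group from its two
 * per-stripe write pointers. Returns 0 and stores the offset, or
 * -EIO when either zone is unrecoverable or the mirrors disagree.
 */
static int dup_alloc_offset(const uint64_t wp[2], uint64_t *alloc_offset)
{
	if (wp[0] == WP_MISSING_DEV || wp[1] == WP_MISSING_DEV)
		return -EIO;	/* write pointer cannot be recovered */
	if (wp[0] != wp[1])
		return -EIO;	/* mirrored zones are out of sync */
	*alloc_offset = wp[0];
	return 0;
}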
@@ -1465,6 +1514,7 @@ out: cache->physical_map = NULL; } bitmap_free(active); + kfree(physical); kfree(caps); kfree(alloc_offsets); free_extent_map(em); @@ -1759,7 +1809,6 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, map = em->map_lookup; /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); device = map->stripes[0].dev; free_extent_map(em); @@ -1781,48 +1830,47 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) struct btrfs_device *device; u64 physical; bool ret; + int i; if (!btrfs_is_zoned(block_group->fs_info)) return true; map = block_group->physical_map; - /* Currently support SINGLE profile only */ - ASSERT(map->num_stripes == 1); - device = map->stripes[0].dev; - physical = map->stripes[0].physical; - - if (device->zone_info->max_active_zones == 0) - return true; spin_lock(&block_group->lock); - if (block_group->zone_is_active) { ret = true; goto out_unlock; } /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { + if (btrfs_zoned_bg_is_full(block_group)) { ret = false; goto out_unlock; } - if (!btrfs_dev_set_active_zone(device, physical)) { - /* Cannot activate the zone */ - ret = false; - goto out_unlock; + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + physical = map->stripes[i].physical; + + if (device->zone_info->max_active_zones == 0) + continue; + + if (!btrfs_dev_set_active_zone(device, physical)) { + /* Cannot activate the zone */ + ret = false; + goto out_unlock; + } } /* Successfully activated all the zones */ block_group->zone_is_active = 1; - spin_unlock(&block_group->lock); /* For the active block group list */ btrfs_get_block_group(block_group); spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(list_empty(&block_group->active_bg_list)); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -1833,26 +1881,13 @@ out_unlock: return ret; } -int btrfs_zone_finish(struct btrfs_block_group *block_group) +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct map_lookup *map; - struct btrfs_device *device; - u64 physical; + bool need_zone_finish; int ret = 0; - - if (!btrfs_is_zoned(fs_info)) - return 0; - - map = block_group->physical_map; - /* Currently support SINGLE profile only */ - ASSERT(map->num_stripes == 1); - - device = map->stripes[0].dev; - physical = map->stripes[0].physical; - - if (device->zone_info->max_active_zones == 0) - return 0; + int i; spin_lock(&block_group->lock); if (!block_group->zone_is_active) { @@ -1863,40 +1898,56 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) /* Check if we have unwritten allocated space */ if ((block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && - block_group->alloc_offset > block_group->meta_write_pointer) { + block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { spin_unlock(&block_group->lock); return -EAGAIN; } - spin_unlock(&block_group->lock); - - ret = btrfs_inc_block_group_ro(block_group, false); - if (ret) - return ret; - - /* Ensure all writes in this block group finish */ - btrfs_wait_block_group_reservations(block_group); - /* No need to wait for NOCOW writers. Zoned mode does not allow that. 
*/ - btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, - block_group->length); - - spin_lock(&block_group->lock); /* - * Bail out if someone already deactivated the block group, or - * allocated space is left in the block group. + * If we are sure that the block group is full (= no more room left for + * new allocation) and the IO for the last usable block is completed, we + * don't need to wait for the other IOs. This holds because we ensure + * the sequential IO submissions using the ZONE_APPEND command for data + * and block_group->meta_write_pointer for metadata. */ - if (!block_group->zone_is_active) { + if (!fully_written) { spin_unlock(&block_group->lock); - btrfs_dec_block_group_ro(block_group); - return 0; - } - if (block_group->reserved) { - spin_unlock(&block_group->lock); - btrfs_dec_block_group_ro(block_group); - return -EAGAIN; + ret = btrfs_inc_block_group_ro(block_group, false); + if (ret) + return ret; + + /* Ensure all writes in this block group finish */ + btrfs_wait_block_group_reservations(block_group); + /* No need to wait for NOCOW writers. Zoned mode does not allow that */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, + block_group->length); + + spin_lock(&block_group->lock); + + /* + * Bail out if someone already deactivated the block group, or + * allocated space is left in the block group. + */ + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return 0; + } + + if (block_group->reserved) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return -EAGAIN; + } } + /* + * The block group is not fully allocated, so not fully written yet. We + * need to send ZONE_FINISH command to free up an active zone. 
+ */ + need_zone_finish = !btrfs_zoned_bg_is_full(block_group); + block_group->zone_is_active = 0; block_group->alloc_offset = block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; @@ -1904,41 +1955,61 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, - GFP_NOFS); - btrfs_dec_block_group_ro(block_group); + map = block_group->physical_map; + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + const u64 physical = map->stripes[i].physical; - if (!ret) { - btrfs_dev_clear_active_zone(device, physical); + if (device->zone_info->max_active_zones == 0) + continue; - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(!list_empty(&block_group->active_bg_list)); - list_del_init(&block_group->active_bg_list); - spin_unlock(&fs_info->zone_active_bgs_lock); + if (need_zone_finish) { + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); + + if (ret) + return ret; + } - /* For active_bg_list */ - btrfs_put_block_group(block_group); + btrfs_dev_clear_active_zone(device, physical); } - return ret; + if (!fully_written) + btrfs_dec_block_group_ro(block_group); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + /* For active_bg_list */ + btrfs_put_block_group(block_group); + + return 0; +} + +int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + if (!btrfs_is_zoned(block_group->fs_info)) + return 0; + + return do_zone_finish(block_group, false); } bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { + struct btrfs_fs_info *fs_info = fs_devices->fs_info; struct btrfs_device *device; bool ret = false; - if (!btrfs_is_zoned(fs_devices->fs_info)) + if (!btrfs_is_zoned(fs_info)) return true; - /* Non-single profiles are not supported yet */ - ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); - /* Check if there is a device with active zones left */ - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + mutex_lock(&fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; if (!device->bdev) @@ -1950,7 +2021,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) break; } } - mutex_unlock(&fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } @@ -1958,9 +2029,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; - struct map_lookup *map; - struct btrfs_device *device; - u64 physical; + u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) return; @@ -1968,42 +2037,52 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len block_group = btrfs_lookup_block_group(fs_info, logical); ASSERT(block_group); - if (logical + length < block_group->start + block_group->zone_capacity) - goto out; - - spin_lock(&block_group->lock); + /* No MIXED_BG on zoned btrfs. 
*/ + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) + min_alloc_bytes = fs_info->sectorsize; + else + min_alloc_bytes = fs_info->nodesize; - if (!block_group->zone_is_active) { - spin_unlock(&block_group->lock); + /* Bail out if we can allocate more data from this block group. */ + if (logical + length + min_alloc_bytes <= + block_group->start + block_group->zone_capacity) goto out; - } - block_group->zone_is_active = 0; - /* We should have consumed all the free space */ - ASSERT(block_group->alloc_offset == block_group->zone_capacity); - ASSERT(block_group->free_space_ctl->free_space == 0); - btrfs_clear_treelog_bg(block_group); - btrfs_clear_data_reloc_bg(block_group); - spin_unlock(&block_group->lock); + do_zone_finish(block_group, true); - map = block_group->physical_map; - device = map->stripes[0].dev; - physical = map->stripes[0].physical; +out: + btrfs_put_block_group(block_group); +} - if (!device->zone_info->max_active_zones) - goto out; +static void btrfs_zone_finish_endio_workfn(struct work_struct *work) +{ + struct btrfs_block_group *bg = + container_of(work, struct btrfs_block_group, zone_finish_work); - btrfs_dev_clear_active_zone(device, physical); + wait_on_extent_buffer_writeback(bg->last_eb); + free_extent_buffer(bg->last_eb); + btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); + btrfs_put_block_group(bg); +} - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(!list_empty(&block_group->active_bg_list)); - list_del_init(&block_group->active_bg_list); - spin_unlock(&fs_info->zone_active_bgs_lock); +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) +{ + if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) + return; - btrfs_put_block_group(block_group); + if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { + btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", + bg->start); + return; + } -out: - btrfs_put_block_group(block_group); + /* For the work */ + btrfs_get_block_group(bg); + atomic_inc(&eb->refs); + bg->last_eb = eb; + INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); + queue_work(system_unbound_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) @@ -2033,3 +2112,57 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_devices->device_list_mutex); } + +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 used = 0; + u64 total = 0; + u64 factor; + + ASSERT(btrfs_is_zoned(fs_info)); + + if (fs_info->bg_reclaim_threshold == 0) + return false; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + total += device->disk_total_bytes; + used += device->bytes_used; + } + mutex_unlock(&fs_devices->device_list_mutex); + + factor = div64_u64(used * 100, total); + return factor >= fs_info->bg_reclaim_threshold; +} + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + /* It should be called on a previous data relocation block group. 
*/ + ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + + spin_lock(&block_group->lock); + if (!block_group->zoned_data_reloc_ongoing) + goto out; + + /* All relocation extents are written. */ + if (block_group->start + block_group->alloc_offset == logical + length) { + /* Now, release this block group for further allocations. */ + block_group->zoned_data_reloc_ongoing = 0; + } + +out: + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index cbf016a7bb5d..6b2eec99162b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -10,11 +10,7 @@ #include "block-group.h" #include "btrfs_inode.h" -/* - * Block groups with more than this value (percents) of unusable space will be - * scheduled for background reclaim. - */ -#define BTRFS_DEFAULT_RECLAIM_THRESH 75 +#define BTRFS_DEFAULT_RECLAIM_THRESH (75) struct btrfs_zoned_device_info { /* @@ -76,8 +72,13 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -233,9 +234,20 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { } +static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) { } + static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } + +static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + return false; +} + +static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -359,7 +371,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) - btrfs_inode_lock(&inode->vfs_inode, 0); + mutex_lock(&root->fs_info->zoned_data_reloc_io_lock); } static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) @@ -367,7 +379,13 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) - btrfs_inode_unlock(&inode->vfs_inode, 0); + mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock); +} + +static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg) +{ + ASSERT(btrfs_is_zoned(bg->fs_info)); + return (bg->alloc_offset == bg->zone_capacity); } #endif diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index fc42dd0badd7..0fe31a6f6e68 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -93,22 +93,26 @@ static inline struct workspace *list_to_workspace(struct 
list_head *list) void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_alloc_workspace(unsigned int level); -/* - * zstd_reclaim_timer_fn - reclaim timer + +/** + * Timer callback to free unused workspaces. + * * @timer: timer * * This scans the lru_list and attempts to reclaim any workspace that hasn't * been used for ZSTD_BTRFS_RECLAIM_JIFFIES. + * + * The context is softirq and does not need the _bh locking primitives. */ static void zstd_reclaim_timer_fn(struct timer_list *timer) { unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; - spin_lock_bh(&wsm.lock); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { - spin_unlock_bh(&wsm.lock); + spin_unlock(&wsm.lock); return; } @@ -137,7 +141,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) if (!list_empty(&wsm.lru_list)) mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); - spin_unlock_bh(&wsm.lock); + spin_unlock(&wsm.lock); }
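The timer callback above boils down to an age-based sweep over an LRU list: any workspace idle for at least ZSTD_BTRFS_RECLAIM_JIFFIES is freed. A minimal userspace analogue (the struct ws layout and the reclaim_idle helper are hypothetical, for illustration only):

#include <stdint.h>
#include <stdlib.h>

struct ws {
	struct ws *next;
	uint64_t last_used;	/* timestamp of the last time this workspace was used */
};

/* Free every workspace idle for at least @max_idle and return the new list head. */
static struct ws *reclaim_idle(struct ws *head, uint64_t now, uint64_t max_idle)
{
	struct ws **link = &head;

	while (*link) {
		struct ws *cur = *link;

		if (now - cur->last_used >= max_idle) {
			*link = cur->next;	/* unlink the stale entry, then free it */
			free(cur);
		} else {
			link = &cur->next;
		}
	}
	return head;
}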