Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Makefile | 22
-rw-r--r--  fs/btrfs/backref.c | 51
-rw-r--r--  fs/btrfs/backref.h | 9
-rw-r--r--  fs/btrfs/block-group.c | 458
-rw-r--r--  fs/btrfs/block-group.h | 23
-rw-r--r--  fs/btrfs/block-rsv.c | 11
-rw-r--r--  fs/btrfs/btrfs_inode.h | 35
-rw-r--r--  fs/btrfs/check-integrity.c | 19
-rw-r--r--  fs/btrfs/compression.c | 38
-rw-r--r--  fs/btrfs/ctree.c | 291
-rw-r--r--  fs/btrfs/ctree.h | 270
-rw-r--r--  fs/btrfs/delalloc-space.c | 29
-rw-r--r--  fs/btrfs/delayed-inode.c | 25
-rw-r--r--  fs/btrfs/delayed-inode.h | 3
-rw-r--r--  fs/btrfs/delayed-ref.c | 79
-rw-r--r--  fs/btrfs/delayed-ref.h | 28
-rw-r--r--  fs/btrfs/dev-replace.c | 233
-rw-r--r--  fs/btrfs/dev-replace.h | 3
-rw-r--r--  fs/btrfs/dir-item.c | 1
-rw-r--r--  fs/btrfs/discard.c | 112
-rw-r--r--  fs/btrfs/discard.h | 3
-rw-r--r--  fs/btrfs/disk-io.c | 1009
-rw-r--r--  fs/btrfs/disk-io.h | 30
-rw-r--r--  fs/btrfs/export.c | 1
-rw-r--r--  fs/btrfs/extent-io-tree.h | 71
-rw-r--r--  fs/btrfs/extent-tree.c | 537
-rw-r--r--  fs/btrfs/extent_io.c | 1427
-rw-r--r--  fs/btrfs/extent_io.h | 67
-rw-r--r--  fs/btrfs/extent_map.c | 18
-rw-r--r--  fs/btrfs/file-item.c | 368
-rw-r--r--  fs/btrfs/file.c | 852
-rw-r--r--  fs/btrfs/free-space-cache.c | 681
-rw-r--r--  fs/btrfs/free-space-cache.h | 24
-rw-r--r--  fs/btrfs/free-space-tree.c | 36
-rw-r--r--  fs/btrfs/inode-item.c | 6
-rw-r--r--  fs/btrfs/inode-map.c | 582
-rw-r--r--  fs/btrfs/inode-map.h | 16
-rw-r--r--  fs/btrfs/inode.c | 1264
-rw-r--r--  fs/btrfs/ioctl.c | 103
-rw-r--r--  fs/btrfs/locking.c | 459
-rw-r--r--  fs/btrfs/locking.h | 24
-rw-r--r--  fs/btrfs/ordered-data.c | 269
-rw-r--r--  fs/btrfs/ordered-data.h | 62
-rw-r--r--  fs/btrfs/print-tree.c | 25
-rw-r--r--  fs/btrfs/print-tree.h | 2
-rw-r--r--  fs/btrfs/qgroup.c | 193
-rw-r--r--  fs/btrfs/raid56.c | 18
-rw-r--r--  fs/btrfs/reada.c | 81
-rw-r--r--  fs/btrfs/ref-verify.c | 71
-rw-r--r--  fs/btrfs/reflink.c | 38
-rw-r--r--  fs/btrfs/relocation.c | 222
-rw-r--r--  fs/btrfs/scrub.c | 490
-rw-r--r--  fs/btrfs/send.c | 101
-rw-r--r--  fs/btrfs/space-info.c | 367
-rw-r--r--  fs/btrfs/space-info.h | 25
-rw-r--r--  fs/btrfs/struct-funcs.c | 18
-rw-r--r--  fs/btrfs/subpage.c | 278
-rw-r--r--  fs/btrfs/subpage.h | 91
-rw-r--r--  fs/btrfs/super.c | 225
-rw-r--r--  fs/btrfs/sysfs.c | 134
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 13
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 26
-rw-r--r--  fs/btrfs/tests/extent-map-tests.c | 2
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 1
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 21
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 4
-rw-r--r--  fs/btrfs/transaction.c | 286
-rw-r--r--  fs/btrfs/transaction.h | 8
-rw-r--r--  fs/btrfs/tree-checker.c | 361
-rw-r--r--  fs/btrfs/tree-defrag.c | 1
-rw-r--r--  fs/btrfs/tree-log.c | 451
-rw-r--r--  fs/btrfs/uuid-tree.c | 3
-rw-r--r--  fs/btrfs/volumes.c | 557
-rw-r--r--  fs/btrfs/volumes.h | 30
-rw-r--r--  fs/btrfs/xattr.c | 8
-rw-r--r--  fs/btrfs/zoned.c | 1461
-rw-r--r--  fs/btrfs/zoned.h | 307
77 files changed, 9898 insertions, 5670 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index e738f6206ea5..b634c42115ea 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,9 +1,25 @@
# SPDX-License-Identifier: GPL-2.0
+# Subset of W=1 warnings
+subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter
+subdir-ccflags-y += -Wmissing-declarations
+subdir-ccflags-y += -Wmissing-format-attribute
+subdir-ccflags-y += -Wmissing-prototypes
+subdir-ccflags-y += -Wold-style-definition
+subdir-ccflags-y += -Wmissing-include-dirs
+subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable)
+subdir-ccflags-y += $(call cc-option, -Wunused-const-variable)
+subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned)
+subdir-ccflags-y += $(call cc-option, -Wstringop-truncation)
+# The following turn off the warnings enabled by -Wextra
+subdir-ccflags-y += -Wno-missing-field-initializers
+subdir-ccflags-y += -Wno-sign-compare
+subdir-ccflags-y += -Wno-type-limits
+
obj-$(CONFIG_BTRFS_FS) := btrfs.o
btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
- file-item.o inode-item.o inode-map.o disk-io.o \
+ file-item.o inode-item.o disk-io.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
@@ -11,11 +27,13 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
- block-rsv.o delalloc-space.o block-group.o discard.o reflink.o
+ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
+ subpage.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b3268f4ea5f3..f47c1528eb9a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -544,7 +544,18 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int level = ref->level;
struct btrfs_key search_key = ref->key_for_search;
- root = btrfs_get_fs_root(fs_info, ref->root_id, false);
+ /*
+ * If we're search_commit_root we could possibly be holding locks on
+ * other tree nodes. This happens when qgroups does backref walks when
+ * adding new delayed refs. To deal with this we need to look in cache
+ * for the root, and if we don't find it then we need to search the
+ * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage
+ * here.
+ */
+ if (path->search_commit_root)
+ root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id);
+ else
+ root = btrfs_get_fs_root(fs_info, ref->root_id, false);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out_free;
@@ -772,8 +783,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(ref->key_for_search.type);
BUG_ON(!ref->wanted_disk_byte);
- eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0,
- ref->level - 1, NULL);
+ eb = read_tree_block(fs_info, ref->wanted_disk_byte,
+ ref->root_id, 0, ref->level - 1, NULL);
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
@@ -1320,7 +1331,7 @@ again:
struct extent_buffer *eb;
eb = read_tree_block(fs_info, ref->parent, 0,
- ref->level, NULL);
+ 0, ref->level, NULL);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
@@ -1330,14 +1341,12 @@ again:
goto out;
}
- if (!path->skip_locking) {
+ if (!path->skip_locking)
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- }
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie, ignore_offset);
if (!path->skip_locking)
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
@@ -1492,7 +1501,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
}
/**
- * btrfs_check_shared - tell us whether an extent is shared
+ * Check if an extent is shared or not
+ *
+ * @root: root inode belongs to
+ * @inum: inode number of the inode whose extent we are checking
+ * @bytenr: logical bytenr of the extent we are checking
+ * @roots: list of roots this extent is shared among
+ * @tmp: temporary list used for iteration
*
* btrfs_check_shared uses the backref walking code but will short
* circuit as soon as it finds a root or inode that doesn't match the
@@ -1660,13 +1675,11 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
s64 bytes_left = ((s64)size) - 1;
struct extent_buffer *eb = eb_in;
struct btrfs_key found_key;
- int leave_spinning = path->leave_spinning;
struct btrfs_inode_ref *iref;
if (bytes_left >= 0)
dest[bytes_left] = '\0';
- path->leave_spinning = 1;
while (1) {
bytes_left -= name_len;
if (bytes_left >= 0)
@@ -1674,7 +1687,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
name_off, name_len);
if (eb != eb_in) {
if (!path->skip_locking)
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
}
ret = btrfs_find_item(fs_root, path, parent, 0,
@@ -1694,8 +1707,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
- if (!path->skip_locking)
- btrfs_set_lock_blocking_read(eb);
path->nodes[0] = NULL;
path->locks[0] = 0;
}
@@ -1712,7 +1723,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
}
btrfs_release_path(path);
- path->leave_spinning = leave_spinning;
if (ret)
return ERR_PTR(ret);
@@ -2537,13 +2547,6 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
list_del(&edge->list[UPPER]);
btrfs_backref_free_edge(cache, edge);
- if (RB_EMPTY_NODE(&upper->rb_node)) {
- BUG_ON(!list_empty(&node->upper));
- btrfs_backref_drop_node(cache, node);
- node = upper;
- node->lowest = 1;
- continue;
- }
/*
* Add the node to leaf node list if no other child block
* cached.
@@ -2620,7 +2623,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
/* Only reloc backref cache cares about a specific root */
if (cache->is_reloc) {
root = find_reloc_root(cache->fs_info, cur->bytenr);
- if (WARN_ON(!root))
+ if (!root)
return -ENOENT;
cur->root = root;
} else {
@@ -3113,7 +3116,7 @@ void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
list_del_init(&lower->list);
if (lower == node)
node = NULL;
- btrfs_backref_free_node(cache, lower);
+ btrfs_backref_drop_node(cache, lower);
}
btrfs_backref_cleanup_node(cache, node);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ff705cc564a9..17abde7f794c 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -296,6 +296,9 @@ static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node)
{
if (node) {
+ ASSERT(list_empty(&node->list));
+ ASSERT(list_empty(&node->lower));
+ ASSERT(node->eb == NULL);
cache->nr_nodes--;
btrfs_put_root(node->root);
kfree(node);
@@ -340,11 +343,11 @@ static inline void btrfs_backref_drop_node_buffer(
static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
struct btrfs_backref_node *node)
{
- BUG_ON(!list_empty(&node->upper));
+ ASSERT(list_empty(&node->upper));
btrfs_backref_drop_node_buffer(node);
- list_del(&node->list);
- list_del(&node->lower);
+ list_del_init(&node->list);
+ list_del_init(&node->lower);
if (!RB_EMPTY_NODE(&node->rb_node))
rb_erase(&node->rb_node, &tree->rb_root);
btrfs_backref_free_node(tree, node);
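The backref.h hunk above replaces list_del() with list_del_init() and adds ASSERT(list_empty(...)) checks in btrfs_backref_free_node(). A minimal sketch (separate from the diff itself) of why the _init variant matters, using the generic kernel list API; demo_node and demo_drop() are hypothetical names that only illustrate the pattern:

#include <linux/bug.h>
#include <linux/list.h>

struct demo_node {		/* hypothetical stand-in for btrfs_backref_node */
	struct list_head list;
};

static void demo_drop(struct list_head *head, struct demo_node *node)
{
	list_add_tail(&node->list, head);
	list_del(&node->list);
	/* entry now holds LIST_POISON1/2, so list_empty() stays false */

	INIT_LIST_HEAD(&node->list);
	list_add_tail(&node->list, head);
	list_del_init(&node->list);
	/* entry points back at itself, so a later emptiness check is reliable */
	WARN_ON(!list_empty(&node->list));
}

Only list_del_init() leaves the entry in a state where list_empty() returns true, which is exactly what the new assertions in btrfs_backref_free_node() rely on.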
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c0f1d6818df7..5064be59dac5 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -15,6 +15,7 @@
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
+#include "zoned.h"
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -424,6 +425,23 @@ int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
return ret;
}
+static bool space_cache_v1_done(struct btrfs_block_group *cache)
+{
+ bool ret;
+
+ spin_lock(&cache->lock);
+ ret = cache->cached != BTRFS_CACHE_FAST;
+ spin_unlock(&cache->lock);
+
+ return ret;
+}
+
+void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl)
+{
+ wait_event(caching_ctl->wait, space_cache_v1_done(cache));
+}
+
#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
@@ -639,11 +657,36 @@ static noinline void caching_thread(struct btrfs_work *work)
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ ret = load_free_space_cache(block_group);
+ if (ret == 1) {
+ ret = 0;
+ goto done;
+ }
+
+ /*
+ * We failed to load the space cache, set ourselves to
+ * CACHE_STARTED and carry on.
+ */
+ spin_lock(&block_group->lock);
+ block_group->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&block_group->lock);
+ wake_up(&caching_ctl->wait);
+ }
+
+ /*
+ * If we are in the transaction that populated the free space tree we
+ * can't actually cache from the free space tree as our commit root and
+ * real root are the same, so we could change the contents of the blocks
+ * while caching. Instead do the slow caching in this case, and after
+ * the transaction has committed we will be safe.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
ret = load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
-
+done:
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
@@ -679,9 +722,13 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
{
DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_caching_control *caching_ctl;
+ struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
+ /* Allocator for zoned filesystems does not use the cache at all */
+ if (btrfs_is_zoned(fs_info))
+ return 0;
+
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
if (!caching_ctl)
return -ENOMEM;
@@ -691,119 +738,41 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
caching_ctl->progress = cache->start;
- refcount_set(&caching_ctl->count, 1);
+ refcount_set(&caching_ctl->count, 2);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
- /*
- * This should be a rare occasion, but this could happen I think in the
- * case where one thread starts to load the space cache info, and then
- * some other thread starts a transaction commit which tries to do an
- * allocation while the other thread is still loading the space cache
- * info. The previous loop should have kept us from choosing this block
- * group, but if we've moved to the state where we will wait on caching
- * block groups we need to first check if we're doing a fast load here,
- * so we can wait for it to finish, otherwise we could end up allocating
- * from a block group who's cache gets evicted for one reason or
- * another.
- */
- while (cache->cached == BTRFS_CACHE_FAST) {
- struct btrfs_caching_control *ctl;
-
- ctl = cache->caching_ctl;
- refcount_inc(&ctl->count);
- prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&cache->lock);
-
- schedule();
-
- finish_wait(&ctl->wait, &wait);
- btrfs_put_caching_control(ctl);
- spin_lock(&cache->lock);
- }
-
if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
kfree(caching_ctl);
- return 0;
+
+ caching_ctl = cache->caching_ctl;
+ if (caching_ctl)
+ refcount_inc(&caching_ctl->count);
+ spin_unlock(&cache->lock);
+ goto out;
}
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
- cache->cached = BTRFS_CACHE_FAST;
+ if (btrfs_test_opt(fs_info, SPACE_CACHE))
+ cache->cached = BTRFS_CACHE_FAST;
+ else
+ cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
- if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
- mutex_lock(&caching_ctl->mutex);
- ret = load_free_space_cache(cache);
-
- spin_lock(&cache->lock);
- if (ret == 1) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_FINISHED;
- cache->last_byte_to_unpin = (u64)-1;
- caching_ctl->progress = (u64)-1;
- } else {
- if (load_cache_only) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_NO;
- } else {
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
- }
- }
- spin_unlock(&cache->lock);
-#ifdef CONFIG_BTRFS_DEBUG
- if (ret == 1 &&
- btrfs_should_fragment_free_space(cache)) {
- u64 bytes_used;
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- bytes_used = cache->length - cache->used;
- cache->space_info->bytes_used += bytes_used >> 1;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- fragment_free_space(cache);
- }
-#endif
- mutex_unlock(&caching_ctl->mutex);
-
- wake_up(&caching_ctl->wait);
- if (ret == 1) {
- btrfs_put_caching_control(caching_ctl);
- btrfs_free_excluded_extents(cache);
- return 0;
- }
- } else {
- /*
- * We're either using the free space tree or no caching at all.
- * Set cached to the appropriate value and wakeup any waiters.
- */
- spin_lock(&cache->lock);
- if (load_cache_only) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_NO;
- } else {
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
- }
- spin_unlock(&cache->lock);
- wake_up(&caching_ctl->wait);
- }
-
- if (load_cache_only) {
- btrfs_put_caching_control(caching_ctl);
- return 0;
- }
-
- down_write(&fs_info->commit_root_sem);
+ spin_lock(&fs_info->block_group_cache_lock);
refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- up_write(&fs_info->commit_root_sem);
+ spin_unlock(&fs_info->block_group_cache_lock);
btrfs_get_block_group(cache);
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
+out:
+ if (load_cache_only && caching_ctl)
+ btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
+ if (caching_ctl)
+ btrfs_put_caching_control(caching_ctl);
return ret;
}
@@ -892,8 +861,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_block_group *block_group;
struct btrfs_free_cluster *cluster;
- struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_key key;
struct inode *inode;
struct kobject *kobj = NULL;
int ret;
@@ -934,6 +901,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
+ btrfs_clear_treelog_bg(block_group);
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -971,42 +940,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&trans->transaction->dirty_bgs_lock);
mutex_unlock(&trans->transaction->cache_write_mutex);
- if (!IS_ERR(inode)) {
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret) {
- btrfs_add_delayed_iput(inode);
- goto out;
- }
- clear_nlink(inode);
- /* One for the block groups ref */
- spin_lock(&block_group->lock);
- if (block_group->iref) {
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- iput(inode);
- } else {
- spin_unlock(&block_group->lock);
- }
- /* One for our lookup ref */
- btrfs_add_delayed_iput(inode);
- }
-
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.type = 0;
- key.offset = block_group->start;
-
- ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
+ ret = btrfs_remove_free_space_inode(trans, inode, block_group);
+ if (ret)
goto out;
- if (ret > 0)
- btrfs_release_path(path);
- if (ret == 0) {
- ret = btrfs_del_item(trans, tree_root, path);
- if (ret)
- goto out;
- btrfs_release_path(path);
- }
spin_lock(&fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
@@ -1043,7 +979,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
if (block_group->has_caching_ctl) {
- down_write(&fs_info->commit_root_sem);
+ spin_lock(&fs_info->block_group_cache_lock);
if (!caching_ctl) {
struct btrfs_caching_control *ctl;
@@ -1057,7 +993,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
if (caching_ctl)
list_del_init(&caching_ctl->list);
- up_write(&fs_info->commit_root_sem);
+ spin_unlock(&fs_info->block_group_cache_lock);
if (caching_ctl) {
/* Once for the caching bgs list and once for us. */
btrfs_put_caching_control(caching_ctl);
@@ -1079,12 +1015,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
WARN_ON(block_group->space_info->total_bytes
< block_group->length);
WARN_ON(block_group->space_info->bytes_readonly
- < block_group->length);
+ < block_group->length - block_group->zone_unusable);
+ WARN_ON(block_group->space_info->bytes_zone_unusable
+ < block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
}
block_group->space_info->total_bytes -= block_group->length;
- block_group->space_info->bytes_readonly -= block_group->length;
+ block_group->space_info->bytes_readonly -=
+ (block_group->length - block_group->zone_unusable);
+ block_group->space_info->bytes_zone_unusable -=
+ block_group->zone_unusable;
block_group->space_info->disk_total -= block_group->length * factor;
spin_unlock(&block_group->space_info->lock);
@@ -1228,7 +1169,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
}
num_bytes = cache->length - cache->reserved - cache->pinned -
- cache->bytes_super - cache->used;
+ cache->bytes_super - cache->zone_unusable - cache->used;
/*
* Data never overcommits, even in mixed mode, so do just the straight
@@ -1259,6 +1200,12 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
if (!ret) {
sinfo->bytes_readonly += num_bytes;
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes to readonly */
+ sinfo->bytes_readonly += cache->zone_unusable;
+ sinfo->bytes_zone_unusable -= cache->zone_unusable;
+ cache->zone_unusable = 0;
+ }
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
}
@@ -1333,6 +1280,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip deletion if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex))
+ return;
+
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
int trimming;
@@ -1352,8 +1306,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
-
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
@@ -1442,9 +1394,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
-block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -block_group->pinned,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned);
block_group->pinned = 0;
spin_unlock(&block_group->lock);
@@ -1460,8 +1410,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
goto flip_async;
- /* DISCARD can flip during remount */
- trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
+ /*
+ * DISCARD can flip during remount. On zoned filesystems, we
+ * need to reset sequential-required zones.
+ */
+ trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_is_zoned(fs_info);
/* Implicit trim during transaction commit. */
if (trimming)
@@ -1499,11 +1453,11 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
end_trans:
btrfs_end_transaction(trans);
next:
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
return;
flip_async:
@@ -1632,8 +1586,11 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
/**
- * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+ * Map a physical disk address to a list of logical addresses
+ *
+ * @fs_info: the filesystem
* @chunk_start: logical address of block group
+ * @bdev: physical device to resolve, can be NULL to indicate any device
* @physical: physical address to map to logical addresses
* @logical: return array of logical addresses which map to @physical
* @naddrs: length of @logical
@@ -1643,9 +1600,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
* Used primarily to exclude those portions of a block group that contain super
* block copies.
*/
-EXPORT_FOR_TESTS
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len)
+ struct block_device *bdev, u64 physical, u64 **logical,
+ int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
@@ -1663,6 +1620,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
map = em->map_lookup;
data_stripe_length = em->orig_block_len;
io_stripe_size = map->stripe_len;
+ chunk_start = em->start;
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -1677,14 +1635,18 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
for (i = 0; i < map->num_stripes; i++) {
bool already_inserted = false;
u64 stripe_nr;
+ u64 offset;
int j;
if (!in_range(physical, map->stripes[i].physical,
data_stripe_length))
continue;
+ if (bdev && map->stripes[i].dev->bdev != bdev)
+ continue;
+
stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+ stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
@@ -1698,7 +1660,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
* instead of map->stripe_len
*/
- bytenr = chunk_start + stripe_nr * io_stripe_size;
+ bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
/* Ensure we don't add duplicate addresses */
for (j = 0; j < nr; j++) {
@@ -1723,6 +1685,7 @@ out:
static int exclude_super_stripes(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
+ const bool zoned = btrfs_is_zoned(fs_info);
u64 bytenr;
u64 *logical;
int stripe_len;
@@ -1739,11 +1702,19 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(fs_info, cache->start,
+ ret = btrfs_rmap_block(fs_info, cache->start, NULL,
bytenr, &logical, &nr, &stripe_len);
if (ret)
return ret;
+ /* Shouldn't have super stripes in sequential zones */
+ if (zoned && nr) {
+ btrfs_err(fs_info,
+ "zoned: block group %llu must not contain super block",
+ cache->start);
+ return -EUCLEAN;
+ }
+
while (nr--) {
u64 len = min_t(u64, stripe_len,
cache->start + cache->length - logical[nr]);
@@ -1805,7 +1776,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
- btrfs_init_free_space_ctl(cache);
+ btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
@@ -1867,24 +1838,8 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
return ret;
}
-static void read_block_group_item(struct btrfs_block_group *cache,
- struct btrfs_path *path,
- const struct btrfs_key *key)
-{
- struct extent_buffer *leaf = path->nodes[0];
- struct btrfs_block_group_item bgi;
- int slot = path->slots[0];
-
- cache->length = key->offset;
-
- read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
- sizeof(bgi));
- cache->used = btrfs_stack_block_group_used(&bgi);
- cache->flags = btrfs_stack_block_group_flags(&bgi);
-}
-
static int read_one_block_group(struct btrfs_fs_info *info,
- struct btrfs_path *path,
+ struct btrfs_block_group_item *bgi,
const struct btrfs_key *key,
int need_clear)
{
@@ -1899,7 +1854,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
if (!cache)
return -ENOMEM;
- read_block_group_item(cache, path, key);
+ cache->length = key->offset;
+ cache->used = btrfs_stack_block_group_used(bgi);
+ cache->flags = btrfs_stack_block_group_flags(bgi);
set_free_space_tree_thresholds(cache);
@@ -1926,6 +1883,13 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
+ ret = btrfs_load_block_group_zone_info(cache, false);
+ if (ret) {
+ btrfs_err(info, "zoned: failed to load zone info of bg %llu",
+ cache->start);
+ goto error;
+ }
+
/*
* We need to exclude the super stripes now so that the space info has
* super bytes accounted for, otherwise we'll think we have more space
@@ -1939,12 +1903,20 @@ static int read_one_block_group(struct btrfs_fs_info *info,
}
/*
- * Check for two cases, either we are full, and therefore don't need
- * to bother with the caching work since we won't find any space, or we
- * are empty, and we can just add all the space in and be done with it.
- * This saves us _a_lot_ of time, particularly in the full case.
+ * For zoned filesystem, space after the allocation offset is the only
+ * free space for a block group. So, we don't need any caching work.
+ * btrfs_calc_zone_unusable() will set the amount of free space and
+ * zone_unusable space.
+ *
+ * For regular filesystem, check for two cases, either we are full, and
+ * therefore don't need to bother with the caching work since we won't
+ * find any space, or we are empty, and we can just add all the space
+ * in and be done with it. This saves us _a_lot_ of time, particularly
+ * in the full case.
*/
- if (cache->length == cache->used) {
+ if (btrfs_is_zoned(info)) {
+ btrfs_calc_zone_unusable(cache);
+ } else if (cache->length == cache->used) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
@@ -1963,7 +1935,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
}
trace_btrfs_add_block_group(info, cache, 0);
btrfs_update_space_info(info, cache->flags, cache->length,
- cache->used, cache->bytes_super, &space_info);
+ cache->used, cache->bytes_super,
+ cache->zone_unusable, &space_info);
cache->space_info = space_info;
@@ -1985,6 +1958,51 @@ error:
return ret;
}
+static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct btrfs_space_info *space_info;
+ struct rb_node *node;
+ int ret = 0;
+
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_block_group *bg;
+
+ em = rb_entry(node, struct extent_map, rb_node);
+ map = em->map_lookup;
+ bg = btrfs_create_block_group_cache(fs_info, em->start);
+ if (!bg) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /* Fill dummy cache as FULL */
+ bg->length = em->len;
+ bg->flags = map->type;
+ bg->last_byte_to_unpin = (u64)-1;
+ bg->cached = BTRFS_CACHE_FINISHED;
+ bg->used = em->len;
+ bg->flags = map->type;
+ ret = btrfs_add_block_group_cache(fs_info, bg);
+ if (ret) {
+ btrfs_remove_free_space_cache(bg);
+ btrfs_put_block_group(bg);
+ break;
+ }
+ btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
+ 0, 0, &space_info);
+ bg->space_info = space_info;
+ link_block_group(bg);
+
+ set_avail_alloc_bits(fs_info, bg->flags);
+ }
+ if (!ret)
+ btrfs_init_global_block_rsv(fs_info);
+ return ret;
+}
+
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_path *path;
@@ -1995,6 +2013,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
+ if (!info->extent_root)
+ return fill_dummy_bgs(info);
+
key.objectid = 0;
key.offset = 0;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -2010,20 +2031,31 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
need_clear = 1;
while (1) {
+ struct btrfs_block_group_item bgi;
+ struct extent_buffer *leaf;
+ int slot;
+
ret = find_first_block_group(info, path, &key);
if (ret > 0)
break;
if (ret != 0)
goto error;
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- ret = read_one_block_group(info, path, &key, need_clear);
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_release_path(path);
+ ret = read_one_block_group(info, &bgi, &key, need_clear);
if (ret < 0)
goto error;
key.objectid += key.offset;
key.offset = 0;
- btrfs_release_path(path);
}
+ btrfs_release_path(path);
list_for_each_entry(space_info, &info->space_info, list) {
int i;
@@ -2151,7 +2183,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- cache->needs_free_space = 1;
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ cache->needs_free_space = 1;
+
+ ret = btrfs_load_block_group_zone_info(cache, true);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
@@ -2193,7 +2233,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, &cache->space_info);
+ cache->bytes_super, 0, &cache->space_info);
btrfs_update_global_block_rsv(fs_info);
link_block_group(cache);
@@ -2301,8 +2341,15 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
spin_lock(&cache->lock);
if (!--cache->ro) {
num_bytes = cache->length - cache->reserved -
- cache->pinned - cache->bytes_super - cache->used;
+ cache->pinned - cache->bytes_super -
+ cache->zone_unusable - cache->used;
sinfo->bytes_readonly -= num_bytes;
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes back */
+ cache->zone_unusable = cache->alloc_offset - cache->used;
+ sinfo->bytes_zone_unusable += cache->zone_unusable;
+ sinfo->bytes_readonly -= cache->zone_unusable;
+ }
list_del_init(&cache->ro_list);
}
spin_unlock(&cache->lock);
@@ -2360,6 +2407,9 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
int retries = 0;
int ret = 0;
+ if (!btrfs_test_opt(fs_info, SPACE_CACHE))
+ return 0;
+
/*
* If this block group is smaller than 100 megs don't bother caching the
* block group.
@@ -2400,7 +2450,7 @@ again:
* time.
*/
BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret) {
/*
* So theoretically we could recover from this, simply set the
@@ -2573,8 +2623,10 @@ again:
if (!path) {
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
/*
@@ -2668,16 +2720,14 @@ again:
btrfs_put_block_group(cache);
if (drop_reserve)
btrfs_delayed_refs_rsv_release(fs_info, 1);
-
- if (ret)
- break;
-
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
* removed.
*/
mutex_unlock(&trans->transaction->cache_write_mutex);
+ if (ret)
+ goto out;
mutex_lock(&trans->transaction->cache_write_mutex);
}
mutex_unlock(&trans->transaction->cache_write_mutex);
@@ -2686,7 +2736,8 @@ again:
* Go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
- ret = btrfs_run_delayed_refs(trans, 0);
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2700,7 +2751,12 @@ again:
goto again;
}
spin_unlock(&cur_trans->dirty_bgs_lock);
- } else if (ret < 0) {
+ }
+out:
+ if (ret < 0) {
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&dirty, &cur_trans->dirty_bgs);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
@@ -2904,10 +2960,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(
- &cache->space_info->total_bytes_pinned,
- num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(cache->space_info,
+ num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
@@ -3306,14 +3360,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- down_write(&info->commit_root_sem);
+ spin_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
- up_write(&info->commit_root_sem);
+ spin_unlock(&info->block_group_cache_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
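One of the btrfs_rmap_block() hunks above switches from div64_u64() to div64_u64_rem() and adds the remainder back into the computed logical address. A condensed sketch (separate from the diff itself) of that arithmetic; rmap_one_stripe_sketch() is a hypothetical name and the RAID0/RAID10 stripe-index adjustment done by the real function is left out:

#include <linux/math64.h>
#include <linux/types.h>

static u64 rmap_one_stripe_sketch(u64 chunk_start, u64 stripe_physical,
				  u64 physical, u64 stripe_len,
				  u64 io_stripe_size)
{
	u64 offset;
	u64 stripe_nr;

	/* split the distance into this stripe into index + intra-stripe offset */
	stripe_nr = div64_u64_rem(physical - stripe_physical, stripe_len,
				  &offset);

	/* keep the offset so the result points at the exact byte, not the stripe start */
	return chunk_start + stripe_nr * io_stripe_size + offset;
}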
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index adfd7583a17b..29678426247d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -95,6 +95,8 @@ struct btrfs_block_group {
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
+ unsigned int to_copy:1;
+ unsigned int relocating_repair:1;
int disk_cache_state;
@@ -181,8 +183,19 @@ struct btrfs_block_group {
*/
int needs_free_space;
+ /* Flag indicating this block group is placed on a sequential zone */
+ bool seq_zone;
+
/* Record locked full stripes for RAID5/6 block group */
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
+
+ /*
+ * Allocation offset for the block group to implement sequential
+ * allocation. This is used only on a zoned filesystem.
+ */
+ u64 alloc_offset;
+ u64 zone_unusable;
+ u64 meta_write_pointer;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -268,6 +281,11 @@ void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
+void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl);
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ struct block_device *bdev, u64 physical, u64 **logical,
+ int *naddrs, int *stripe_len);
static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
{
@@ -294,9 +312,4 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
void btrfs_freeze_block_group(struct btrfs_block_group *cache);
void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len);
-#endif
-
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 7e1549a84fcc..04a6226e0388 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -426,6 +426,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
+ /*
+ * Our various recovery options can leave us with NULL roots, so check
+ * here and just bail before we go dereferencing NULLs everywhere.
+ */
+ if (!fs_info->extent_root || !fs_info->csum_root ||
+ !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root)
+ return;
+
fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
@@ -511,7 +519,8 @@ again:
/*DEFAULT_RATELIMIT_BURST*/ 1);
if (__ratelimit(&_rs))
WARN(1, KERN_DEBUG
- "BTRFS: block rsv returned %d\n", ret);
+ "BTRFS: block rsv %d returned %d\n",
+ block_rsv->type, ret);
}
try_reserve:
ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 92dd86bceae3..28e202e89660 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -35,6 +35,22 @@ enum {
BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
+ /*
+ * Set and used when logging an inode and it serves to signal that an
+ * inode does not have xattrs, so subsequent fsyncs can avoid searching
+ * for xattrs to log. This bit must be cleared whenever a xattr is added
+ * to an inode.
+ */
+ BTRFS_INODE_NO_XATTRS,
+ /*
+ * Set when we are in a context where we need to start a transaction and
+ * have dirty pages with the respective file range locked. This is to
+ * ensure that when reserving space for the transaction, if we are low
+ * on available space and need to flush delalloc, we will not flush
+ * delalloc for this inode, because that could result in a deadlock (on
+ * the file range, inode's io_tree).
+ */
+ BTRFS_INODE_NO_DELALLOC_FLUSH,
};
/* in memory btrfs inode */
@@ -50,7 +66,8 @@ struct btrfs_inode {
/*
* Lock for counters and all fields used to determine if the inode is in
* the log or not (last_trans, last_sub_trans, last_log_commit,
- * logged_trans).
+ * logged_trans), to access/update new_delalloc_bytes and to update the
+ * VFS' inode number of bytes used.
*/
spinlock_t lock;
@@ -203,16 +220,6 @@ struct btrfs_inode {
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
- /*
- * To avoid races between lockless (i_mutex not held) direct IO writes
- * and concurrent fsync requests. Direct IO writes must acquire read
- * access on this semaphore for creating an extent map and its
- * corresponding ordered extent. The fast fsync path must acquire write
- * access on this semaphore before it collects ordered extents and
- * extent maps.
- */
- struct rw_semaphore dio_sem;
-
struct inode vfs_inode;
};
@@ -318,7 +325,8 @@ struct btrfs_dio_private {
struct inode *inode;
u64 logical_offset;
u64 disk_bytenr;
- u64 bytes;
+ /* Used for bio::bi_size */
+ u32 bytes;
/*
* References to this structure. There is one reference per in-flight
@@ -341,8 +349,7 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
struct btrfs_root *root = inode->root;
- struct btrfs_super_block *sb = root->fs_info->super_copy;
- const u16 csum_size = btrfs_super_csum_size(sb);
+ const u32 csum_size = root->fs_info->csum_size;
/* Output minus objectid, which is more meaningful */
if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
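The two inode flags added above are bit indices into btrfs_inode::runtime_flags and are manipulated with the atomic bitops, as in this minimal sketch (separate from the diff itself); the helper names are invented for illustration, only the flag names and the runtime_flags field come from the source:

#include <linux/bitops.h>
#include "btrfs_inode.h"	/* assumes the sketch lives in fs/btrfs/ */

/* hypothetical helper: set after logging an inode that has no xattr items */
static void note_inode_has_no_xattrs(struct btrfs_inode *inode)
{
	set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
}

/* hypothetical helper: the bit must be cleared whenever an xattr is added */
static void note_inode_gained_xattr(struct btrfs_inode *inode)
{
	clear_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
}

/* hypothetical helper: delalloc flushing must skip inodes with this bit set */
static bool must_skip_delalloc_flush(struct btrfs_inode *inode)
{
	return test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
}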
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 81a8c87a5afb..113cb85c1fd4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -233,7 +233,6 @@ struct btrfsic_stack_frame {
struct btrfsic_state {
u32 print_mask;
int include_extent_data;
- int csum_size;
struct list_head all_blocks_list;
struct btrfsic_block_hashtable block_hashtable;
struct btrfsic_block_link_hashtable block_link_hashtable;
@@ -660,8 +659,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
return -1;
}
- state->csum_size = btrfs_super_csum_size(selected_super);
-
for (pass = 0; pass < 3; pass++) {
int num_copies;
int mirror_num;
@@ -954,7 +951,7 @@ static noinline_for_stack int btrfsic_process_metablock(
sf->prev = NULL;
continue_with_new_stack_frame:
- sf->block->generation = le64_to_cpu(sf->hdr->generation);
+ sf->block->generation = btrfs_stack_header_generation(sf->hdr);
if (0 == sf->hdr->level) {
struct btrfs_leaf *const leafhdr =
(struct btrfs_leaf *)sf->hdr;
@@ -1723,7 +1720,7 @@ static noinline_for_stack int btrfsic_test_for_metadata(
crypto_shash_update(shash, data, sublen);
}
crypto_shash_final(shash, csum);
- if (memcmp(csum, h->csum, state->csum_size))
+ if (memcmp(csum, h->csum, fs_info->csum_size))
return 1;
return 0; /* is metadata */
@@ -2677,7 +2674,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
mutex_lock(&btrfsic_mutex);
/* since btrfsic_submit_bio() is also called before
* btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno);
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
unsigned int i = 0;
@@ -2693,10 +2690,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
bio_is_patched = 0;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n",
+ pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
bio_op(bio), bio->bi_opf, segs,
- (unsigned long long)bio->bi_iter.bi_sector,
- dev_bytenr, bio->bi_disk);
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
mapped_datav = kmalloc_array(segs,
sizeof(*mapped_datav), GFP_NOFS);
@@ -2725,8 +2721,8 @@ static void __btrfsic_submit_bio(struct bio *bio)
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_disk);
+ pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, bio->bi_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
@@ -2797,7 +2793,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
state->fs_info = fs_info;
state->print_mask = print_mask;
state->include_extent_data = including_extent_data;
- state->csum_size = 0;
state->metablock_size = fs_info->nodesize;
state->datablock_size = fs_info->sectorsize;
INIT_LIST_HEAD(&state->all_blocks_list);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index eeface30facd..6d203acfdeb3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -131,10 +131,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
unsigned long disk_size)
{
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
-
return sizeof(struct compressed_bio) +
- (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
+ (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
}
static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
@@ -142,7 +140,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
struct page *page;
unsigned long i;
char *kaddr;
@@ -150,7 +148,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
- if (inode->flags & BTRFS_INODE_NODATASUM)
+ if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
return 0;
shash->tfm = fs_info->csum_shash;
@@ -220,7 +218,7 @@ static void end_compressed_bio_read(struct bio *bio)
inode = cb->inode;
ret = check_compressed_csum(BTRFS_I(inode), bio,
- (u64)bio->bi_iter.bi_sector << 9);
+ bio->bi_iter.bi_sector << 9);
if (ret)
goto csum_failed;
@@ -544,13 +542,19 @@ static noinline int add_ra_bio_pages(struct inode *inode,
goto next;
}
- end = last_offset + PAGE_SIZE - 1;
/*
* at this point, we have a locked page in the page cache
* for these bytes in the file. But, we have to make
* sure they map to this compressed extent on disk.
*/
- set_page_extent_mapped(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ break;
+ }
+
+ end = last_offset + PAGE_SIZE - 1;
lock_extent(tree, last_offset, end);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
@@ -622,13 +626,12 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
unsigned long pg_index;
struct page *page;
struct bio *comp_bio;
- u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
+ u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
u64 em_len;
u64 em_start;
struct extent_map *em;
blk_status_t ret = BLK_STS_RESOURCE;
int faili = 0;
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
@@ -722,15 +725,12 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
*/
refcount_inc(&cb->pending_bios);
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(inode, comp_bio,
- (u64)-1, sums);
- BUG_ON(ret); /* -ENOMEM */
- }
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
+ BUG_ON(ret); /* -ENOMEM */
nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
fs_info->sectorsize);
- sums += csum_size * nr_sectors;
+ sums += fs_info->csum_size * nr_sectors;
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
@@ -751,10 +751,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
- BUG_ON(ret); /* -ENOMEM */
- }
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
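Several compression.c hunks above replace btrfs_super_csum_size(fs_info->super_copy) with the cached fs_info->csum_size when sizing per-bio checksum buffers. A small sketch (separate from the diff itself) of that sizing arithmetic, with a hypothetical helper name:

#include <linux/kernel.h>
#include "ctree.h"	/* assumes the sketch lives in fs/btrfs/ */

static u32 csum_bytes_for_bio_sketch(const struct btrfs_fs_info *fs_info,
				     u32 bio_bytes)
{
	u32 nr_sectors = DIV_ROUND_UP(bio_bytes, fs_info->sectorsize);

	/* one csum_size checksum per sectorsize block of compressed data */
	return nr_sectors * fs_info->csum_size;
}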
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 113da62dc17f..d56730a67885 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -221,9 +221,12 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
-
- if (ret)
+ if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
return ret;
+ }
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
@@ -1278,14 +1281,11 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
if (!tm)
return eb;
- btrfs_set_path_blocking(path);
- btrfs_set_lock_blocking_read(eb);
-
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
if (!eb_rewin) {
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
return NULL;
}
@@ -1297,13 +1297,13 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
} else {
eb_rewin = btrfs_clone_extent_buffer(eb);
if (!eb_rewin) {
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
return NULL;
}
}
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
@@ -1356,7 +1356,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- old = read_tree_block(fs_info, logical, 0, level, NULL);
+ old = read_tree_block(fs_info, logical, root->root_key.objectid,
+ 0, level, NULL);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
if (!IS_ERR(old))
free_extent_buffer(old);
@@ -1373,9 +1374,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
} else {
- btrfs_set_lock_blocking_read(eb_root);
eb = btrfs_clone_extent_buffer(eb_root);
- btrfs_tree_read_unlock_blocking(eb_root);
+ btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
}
@@ -1483,10 +1483,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
search_start = buf->start & ~((u64)SZ_1G - 1);
- if (parent)
- btrfs_set_lock_blocking_write(parent);
- btrfs_set_lock_blocking_write(buf);
-
/*
* Before CoWing this block for later modification, check if it's
* the subtree root and do the delayed subtree trace if needed.
@@ -1501,6 +1497,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
return ret;
}
+ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
/*
* helper function for defrag to decide if two blocks pointed to by a
@@ -1578,7 +1575,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *cur;
u64 blocknr;
- u64 gen;
u64 search_start = *last_ret;
u64 last_block = 0;
u64 other;
@@ -1586,14 +1582,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
int end_slot;
int i;
int err = 0;
- int parent_level;
- int uptodate;
u32 blocksize;
int progress_passed = 0;
struct btrfs_disk_key disk_key;
- parent_level = btrfs_header_level(parent);
-
WARN_ON(trans->transaction != fs_info->running_transaction);
WARN_ON(trans->transid != fs_info->generation);
@@ -1604,10 +1596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (parent_nritems <= 1)
return 0;
- btrfs_set_lock_blocking_write(parent);
-
for (i = start_slot; i <= end_slot; i++) {
- struct btrfs_key first_key;
int close = 1;
btrfs_node_key(parent, &disk_key, i);
@@ -1616,8 +1605,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
progress_passed = 1;
blocknr = btrfs_node_blockptr(parent, i);
- gen = btrfs_node_ptr_generation(parent, i);
- btrfs_node_key_to_cpu(parent, &first_key, i);
if (last_block == 0)
last_block = blocknr;
@@ -1634,36 +1621,13 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
continue;
}
- cur = find_extent_buffer(fs_info, blocknr);
- if (cur)
- uptodate = btrfs_buffer_uptodate(cur, gen, 0);
- else
- uptodate = 0;
- if (!cur || !uptodate) {
- if (!cur) {
- cur = read_tree_block(fs_info, blocknr, gen,
- parent_level - 1,
- &first_key);
- if (IS_ERR(cur)) {
- return PTR_ERR(cur);
- } else if (!extent_buffer_uptodate(cur)) {
- free_extent_buffer(cur);
- return -EIO;
- }
- } else if (!uptodate) {
- err = btrfs_read_buffer(cur, gen,
- parent_level - 1,&first_key);
- if (err) {
- free_extent_buffer(cur);
- return err;
- }
- }
- }
+ cur = btrfs_read_node_slot(parent, i);
+ if (IS_ERR(cur))
+ return PTR_ERR(cur);
if (search_start == 0)
search_start = last_block;
btrfs_tree_lock(cur);
- btrfs_set_lock_blocking_write(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
@@ -1723,9 +1687,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
oip = offset_in_page(offset);
if (oip + key_size <= PAGE_SIZE) {
- const unsigned long idx = offset >> PAGE_SHIFT;
+ const unsigned long idx = get_eb_page_index(offset);
char *kaddr = page_address(eb->pages[idx]);
+ oip = get_eb_offset_in_page(eb, offset);
tmp = (struct btrfs_disk_key *)(kaddr + oip);
} else {
read_extent_buffer(eb, &unaligned, offset, key_size);
@@ -1801,6 +1766,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
btrfs_node_key_to_cpu(parent, &first_key, slot);
eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
+ btrfs_header_owner(parent),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
@@ -1835,8 +1801,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
mid = path->nodes[level];
- WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
- path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
+ WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
WARN_ON(btrfs_header_generation(mid) != trans->transid);
orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1865,7 +1830,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
btrfs_tree_lock(child);
- btrfs_set_lock_blocking_write(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
BTRFS_NESTING_COW);
if (ret) {
@@ -1904,7 +1868,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (left) {
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
- btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left,
BTRFS_NESTING_LEFT_COW);
@@ -1920,7 +1883,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (right) {
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
- btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right,
BTRFS_NESTING_RIGHT_COW);
@@ -2084,7 +2046,6 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 left_nr;
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
- btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2139,7 +2100,6 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 right_nr;
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
- btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2243,7 +2203,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
search = btrfs_node_blockptr(node, nr);
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
- readahead_tree_block(fs_info, search);
+ btrfs_readahead_node_child(node, nr);
nread += blocksize;
}
nscan++;
@@ -2252,16 +2212,11 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
}
}
-static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
+ struct extent_buffer *parent;
int slot;
int nritems;
- struct extent_buffer *parent;
- struct extent_buffer *eb;
- u64 gen;
- u64 block1 = 0;
- u64 block2 = 0;
parent = path->nodes[level + 1];
if (!parent)
@@ -2270,32 +2225,10 @@ static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- if (slot > 0) {
- block1 = btrfs_node_blockptr(parent, slot - 1);
- gen = btrfs_node_ptr_generation(parent, slot - 1);
- eb = find_extent_buffer(fs_info, block1);
- /*
- * if we get -eagain from btrfs_buffer_uptodate, we
- * don't want to return eagain here. That will loop
- * forever
- */
- if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
- block1 = 0;
- free_extent_buffer(eb);
- }
- if (slot + 1 < nritems) {
- block2 = btrfs_node_blockptr(parent, slot + 1);
- gen = btrfs_node_ptr_generation(parent, slot + 1);
- eb = find_extent_buffer(fs_info, block2);
- if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
- block2 = 0;
- free_extent_buffer(eb);
- }
-
- if (block1)
- readahead_tree_block(fs_info, block1);
- if (block2)
- readahead_tree_block(fs_info, block2);
+ if (slot > 0)
+ btrfs_readahead_node_child(parent, slot - 1);
+ if (slot + 1 < nritems)
+ btrfs_readahead_node_child(parent, slot + 1);
}
@@ -2399,14 +2332,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
- /* the pages were up to date, but we failed
- * the generation number check. Do a full
- * read for the generation number that is correct.
- * We must do this without dropping locks so
- * we can trust our generation number
- */
- btrfs_set_path_blocking(p);
-
/* now we're allowed to do a blocking uptodate check */
ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
if (!ret) {
@@ -2426,14 +2351,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* out which blocks to read.
*/
btrfs_unlock_up_safe(p, level + 1);
- btrfs_set_path_blocking(p);
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
ret = -EAGAIN;
- tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
- &first_key);
+ tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
+ gen, parent_level - 1, &first_key);
if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
@@ -2468,58 +2392,42 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
int *write_lock_level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ int ret = 0;
if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
- int sret;
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
- btrfs_set_path_blocking(p);
- reada_for_balance(fs_info, p, level);
- sret = split_node(trans, root, p, level);
+ reada_for_balance(p, level);
+ ret = split_node(trans, root, p, level);
- BUG_ON(sret > 0);
- if (sret) {
- ret = sret;
- goto done;
- }
b = p->nodes[level];
} else if (ins_len < 0 && btrfs_header_nritems(b) <
BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
- int sret;
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
- btrfs_set_path_blocking(p);
- reada_for_balance(fs_info, p, level);
- sret = balance_level(trans, root, p, level);
+ reada_for_balance(p, level);
+ ret = balance_level(trans, root, p, level);
+ if (ret)
+ return ret;
- if (sret) {
- ret = sret;
- goto done;
- }
b = p->nodes[level];
if (!b) {
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
BUG_ON(btrfs_header_nritems(b) == 1);
}
- return 0;
-
-again:
- ret = -EAGAIN;
-done:
return ret;
}
@@ -2616,7 +2524,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = __btrfs_read_lock_root_node(root, p->recurse);
+ b = btrfs_read_lock_root_node(root);
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2651,8 +2559,14 @@ out:
* @p: Holds all btree nodes along the search path
* @root: The root node of the tree
* @key: The key we are looking for
- * @ins_len: Indicates purpose of search, for inserts it is 1, for
- * deletions it's -1. 0 for plain searches
+ * @ins_len: Indicates purpose of search:
+ * >0 for inserts, it is the size of the item to be inserted (*)
+ * <0 for deletions
+ * 0 for plain searches, not modifying the tree
+ *
+ * (*) If the size of the item to be inserted doesn't include
+ * sizeof(struct btrfs_item), then p->search_for_extension must
+ * be set.
* @cow: boolean should CoW operations be performed. Must always be 1
* when modifying the tree.
*
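For illustration, a minimal sketch of the two @ins_len conventions documented
above; trans, root and the key/sizes are hypothetical and error handling is
omitted:

	struct btrfs_path *path = btrfs_alloc_path();
	struct btrfs_key key = { .objectid = 256, .type = BTRFS_EXTENT_DATA_KEY,
				 .offset = 0 };
	int ret;

	/* Insert a brand new item carrying 64 bytes of data: the item header
	 * must be part of ins_len. */
	ret = btrfs_search_slot(trans, root, &key, path,
				64 + sizeof(struct btrfs_item), 1);

	/* Extend an already existing item by 64 bytes of data: pass only the
	 * data size and flag the search as an extension. */
	btrfs_release_path(path);
	path->search_for_extension = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 64, 1);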
@@ -2752,7 +2666,6 @@ again:
goto again;
}
- btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
&b,
@@ -2814,6 +2727,20 @@ cow_done:
if (level == 0) {
p->slots[level] = slot;
+ /*
+ * Item key already exists. In this case, if we are
+ * allowed to insert the item (for example, in dir_item
+ * case, item key collision is allowed), it will be
+ * merged with the original item. Only the item size
+ * grows, no new btrfs item will be added. If
+ * search_for_extension is not set, ins_len already
+ * accounts the size btrfs_item, deduct it here so leaf
+ * space check will be correct.
+ */
+ if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
+ ASSERT(ins_len >= sizeof(struct btrfs_item));
+ ins_len -= sizeof(struct btrfs_item);
+ }
if (ins_len > 0 &&
btrfs_leaf_free_space(b) < ins_len) {
if (write_lock_level < 1) {
@@ -2822,7 +2749,6 @@ cow_done:
goto again;
}
- btrfs_set_path_blocking(p);
err = split_leaf(trans, root, key,
p, ins_len, ret == 0);
@@ -2884,17 +2810,10 @@ cow_done:
if (!p->skip_locking) {
level = btrfs_header_level(b);
if (level <= write_lock_level) {
- if (!btrfs_try_tree_write_lock(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_lock(b);
- }
+ btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
- p->recurse);
- }
+ btrfs_tree_read_lock(b);
p->locks[level] = BTRFS_READ_LOCK;
}
p->nodes[level] = b;
@@ -2902,16 +2821,11 @@ cow_done:
}
ret = 1;
done:
- /*
- * we don't really know what they plan on doing with the path
- * from here on, so for now just mark it as blocking
- */
- if (!p->leave_spinning)
- btrfs_set_path_blocking(p);
if (ret < 0 && !p->skip_release_on_error)
btrfs_release_path(p);
return ret;
}
+ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
/*
* Like btrfs_search_slot, this looks for a key in the given tree. It uses the
@@ -2999,10 +2913,7 @@ again:
}
level = btrfs_header_level(b);
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
+ btrfs_tree_read_lock(b);
b = tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
ret = -ENOMEM;
@@ -3013,8 +2924,6 @@ again:
}
ret = 1;
done:
- if (!p->leave_spinning)
- btrfs_set_path_blocking(p);
if (ret < 0)
btrfs_release_path(p);
@@ -3441,7 +3350,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
atomic_inc(&c->refs);
path->nodes[level] = c;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
return 0;
}
@@ -3562,7 +3471,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
- ret = 0;
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
@@ -3580,7 +3488,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(split);
free_extent_buffer(split);
}
- return ret;
+ return 0;
}
/*
@@ -3814,7 +3722,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
- btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
@@ -4053,7 +3960,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
- btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
@@ -4448,7 +4354,6 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
goto err;
}
- btrfs_set_path_blocking(path);
ret = split_leaf(trans, root, &key, path, ins_len, 1);
if (ret)
goto err;
@@ -4478,8 +4383,6 @@ static noinline int split_item(struct btrfs_path *path,
leaf = path->nodes[0];
BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
- btrfs_set_path_blocking(path);
-
item = btrfs_item_nr(path->slots[0]);
orig_offset = btrfs_item_offset(leaf, item);
item_size = btrfs_item_size(leaf, item);
@@ -5055,7 +4958,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (leaf == root->node) {
btrfs_set_header_level(leaf, 0);
} else {
- btrfs_set_path_blocking(path);
btrfs_clean_tree_block(leaf);
btrfs_del_leaf(trans, root, path, leaf);
}
@@ -5077,7 +4979,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
slot = path->slots[1];
atomic_inc(&leaf->refs);
- btrfs_set_path_blocking(path);
wret = push_leaf_left(trans, root, path, 1, 1,
1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
@@ -5248,7 +5149,6 @@ find_next_key:
*/
if (slot >= nritems) {
path->slots[level] = slot;
- btrfs_set_path_blocking(path);
sret = btrfs_find_next_key(root, path, min_key, level,
min_trans);
if (sret == 0) {
@@ -5265,7 +5165,6 @@ find_next_key:
ret = 0;
goto out;
}
- btrfs_set_path_blocking(path);
cur = btrfs_read_node_slot(cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
@@ -5282,7 +5181,6 @@ out:
path->keep_locks = keep_locks;
if (ret == 0) {
btrfs_unlock_up_safe(path, path->lowest_level + 1);
- btrfs_set_path_blocking(path);
memcpy(min_key, &found_key, sizeof(found_key));
}
return ret;
@@ -5384,8 +5282,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key key;
u32 nritems;
int ret;
- int old_spinning = path->leave_spinning;
- int next_rw_lock = 0;
+ int i;
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
@@ -5395,11 +5292,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
again:
level = 1;
next = NULL;
- next_rw_lock = 0;
btrfs_release_path(path);
path->keep_locks = 1;
- path->leave_spinning = 1;
if (time_seq)
ret = btrfs_search_old_slot(root, &key, path, time_seq);
@@ -5459,13 +5354,22 @@ again:
continue;
}
- if (next) {
- btrfs_tree_unlock_rw(next, next_rw_lock);
- free_extent_buffer(next);
+
+ /*
+ * Our current level is where we're going to start from, and to
+ * make sure lockdep doesn't complain we need to drop our locks
+ * and nodes from 0 to our current level.
+ */
+ for (i = 0; i < level; i++) {
+ if (path->locks[i]) {
+ btrfs_tree_read_unlock(path->nodes[i]);
+ path->locks[i] = 0;
+ }
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
}
next = c;
- next_rw_lock = path->locks[level];
ret = read_block_for_search(root, path, &next, level,
slot, &key);
if (ret == -EAGAIN)
@@ -5491,28 +5395,18 @@ again:
cond_resched();
goto again;
}
- if (!ret) {
- btrfs_set_path_blocking(path);
- __btrfs_tree_read_lock(next,
- BTRFS_NESTING_RIGHT,
- path->recurse);
- }
- next_rw_lock = BTRFS_READ_LOCK;
+ if (!ret)
+ btrfs_tree_read_lock(next);
}
break;
}
path->slots[level] = slot;
while (1) {
level--;
- c = path->nodes[level];
- if (path->locks[level])
- btrfs_tree_unlock_rw(c, path->locks[level]);
-
- free_extent_buffer(c);
path->nodes[level] = next;
path->slots[level] = 0;
if (!path->skip_locking)
- path->locks[level] = next_rw_lock;
+ path->locks[level] = BTRFS_READ_LOCK;
if (!level)
break;
@@ -5526,23 +5420,12 @@ again:
goto done;
}
- if (!path->skip_locking) {
- ret = btrfs_try_tree_read_lock(next);
- if (!ret) {
- btrfs_set_path_blocking(path);
- __btrfs_tree_read_lock(next,
- BTRFS_NESTING_RIGHT,
- path->recurse);
- }
- next_rw_lock = BTRFS_READ_LOCK;
- }
+ if (!path->skip_locking)
+ btrfs_tree_read_lock(next);
}
ret = 0;
done:
unlock_up(path, 0, 1, 0, NULL);
- path->leave_spinning = old_spinning;
- if (!old_spinning)
- btrfs_set_path_blocking(path);
return ret;
}
@@ -5564,7 +5447,6 @@ int btrfs_previous_item(struct btrfs_root *root,
while (1) {
if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;
@@ -5606,7 +5488,6 @@ int btrfs_previous_extent_item(struct btrfs_root *root,
while (1) {
if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aac3d6f4e35b..3bc00aed13b2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -17,7 +17,6 @@
#include <linux/wait.h>
#include <linux/slab.h>
#include <trace/events/btrfs.h>
-#include <asm/kmap_types.h>
#include <asm/unaligned.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
@@ -28,6 +27,7 @@
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
#include <linux/crc32c.h>
+#include <linux/iomap.h>
#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
@@ -67,12 +67,6 @@ struct btrfs_ref;
#define BTRFS_OLDEST_GENERATION 0ULL
/*
- * the max metadata block size. This limit is somewhat artificial,
- * but the memmove costs go through the roof for larger blocks.
- */
-#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
-
-/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
*/
@@ -137,6 +131,8 @@ enum {
* defrag
*/
BTRFS_FS_STATE_REMOUNTING,
+ /* Filesystem in RO mode */
+ BTRFS_FS_STATE_RO,
/* Track if a transaction abort has been reported on this filesystem */
BTRFS_FS_STATE_TRANS_ABORTED,
/*
@@ -302,7 +298,8 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
- BTRFS_FEATURE_INCOMPAT_RAID1C34)
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED)
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -370,11 +367,15 @@ struct btrfs_path {
unsigned int search_for_split:1;
unsigned int keep_locks:1;
unsigned int skip_locking:1;
- unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
- unsigned int recurse:1;
+ /*
+ * Indicates that the new item (passed to btrfs_search_slot) extends an
+ * already existing item and that ins_len contains only the data size,
+ * not the item header (i.e. sizeof(struct btrfs_item) is not included).
+ */
+ unsigned int search_for_extension:1;
};
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
@@ -469,10 +470,11 @@ struct btrfs_discard_ctl {
struct btrfs_block_group *block_group;
struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
u64 prev_discard;
+ u64 prev_discard_time;
atomic_t discardable_extents;
atomic64_t discardable_bytes;
u64 max_discard_size;
- unsigned long delay;
+ u64 delay_ms;
u32 iops_limit;
u32 kbps_limit;
u64 discard_extent_bytes;
@@ -559,6 +561,12 @@ enum {
/* Indicate that the discard workqueue can service discards. */
BTRFS_FS_DISCARD_RUNNING,
+
+ /* Indicate that we need to cleanup space cache v1 */
+ BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
+
+ /* Indicate that we can't trust the free space tree for caching yet */
+ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
};
/*
@@ -790,7 +798,7 @@ struct btrfs_fs_info {
/* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes;
- struct percpu_counter dio_bytes;
+ struct percpu_counter ordered_bytes;
s32 dirty_metadata_batch;
s32 delalloc_batch;
@@ -878,7 +886,10 @@ struct btrfs_fs_info {
*/
struct ulist *qgroup_ulist;
- /* protect user change for quota operations */
+ /*
+ * Protects user-initiated changes to quota settings. If a transaction is
+ * needed, it must be started before taking this lock.
+ */
struct mutex qgroup_ioctl_lock;
/* list of dirty qgroups to be written at next commit */
@@ -909,6 +920,7 @@ struct btrfs_fs_info {
/* Extent buffer radix tree */
spinlock_t buffer_lock;
+ /* Entries are eb->start / sectorsize */
struct radix_tree_root buffer_radix;
/* next backup root to be overwritten */
@@ -922,6 +934,7 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
struct work_struct async_data_reclaim_work;
+ struct work_struct preempt_reclaim_work;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
@@ -931,6 +944,10 @@ struct btrfs_fs_info {
/* Cached block sizes */
u32 nodesize;
u32 sectorsize;
+ /* ilog2 of sectorsize, use to avoid 64bit division */
+ u32 sectorsize_bits;
+ u32 csum_size;
+ u32 csums_per_leaf;
u32 stripesize;
/* Block groups and devices containing active swapfiles. */
@@ -948,6 +965,21 @@ struct btrfs_fs_info {
/* Type of exclusive operation running */
unsigned long exclusive_operation;
+ /*
+ * Zone size > 0 when in ZONED mode, otherwise it's used only to check
+ * whether the mode is enabled
+ */
+ union {
+ u64 zone_size;
+ u64 zoned;
+ };
+
+ /* Max size to emit ZONE_APPEND write command */
+ u64 max_zone_append_size;
+ struct mutex zoned_meta_io_lock;
+ spinlock_t treelog_bg_lock;
+ u64 treelog_bg;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1018,7 +1050,7 @@ enum {
BTRFS_ROOT_DEAD_RELOC_TREE,
/* Mark dead root stored on device whose cleanup needs to be resumed */
BTRFS_ROOT_DEAD_TREE,
- /* The root has a log tree. Used only for subvolume roots. */
+ /* The root has a log tree. Used for subvolume roots and the tree root. */
BTRFS_ROOT_HAS_LOG_TREE,
/* Qgroup flushing is in progress */
BTRFS_ROOT_QGROUP_FLUSHING,
@@ -1057,15 +1089,6 @@ struct btrfs_root {
spinlock_t accounting_lock;
struct btrfs_block_rsv *block_rsv;
- /* free ino cache stuff */
- struct btrfs_free_space_ctl *free_ino_ctl;
- enum btrfs_caching_type ino_cache_state;
- spinlock_t ino_cache_lock;
- wait_queue_head_t ino_cache_wait;
- struct btrfs_free_space_ctl *free_ino_pinned;
- u64 ino_cache_progress;
- struct inode *ino_cache_inode;
-
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
@@ -1086,7 +1109,7 @@ struct btrfs_root {
u32 type;
- u64 highest_objectid;
+ u64 free_objectid;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -1224,6 +1247,63 @@ struct btrfs_replace_extent_info {
int insertions;
};
+/* Arguments for btrfs_drop_extents() */
+struct btrfs_drop_extents_args {
+ /* Input parameters */
+
+ /*
+ * If NULL, btrfs_drop_extents() will allocate and free its own path.
+ * If 'replace_extent' is true, this must not be NULL. Also the path
+ * is always released except if 'replace_extent' is true and
+ * btrfs_drop_extents() sets 'extent_inserted' to true, in which case
+ * the path is kept locked.
+ */
+ struct btrfs_path *path;
+ /* Start offset of the range to drop extents from */
+ u64 start;
+ /* End (exclusive, last byte + 1) of the range to drop extents from */
+ u64 end;
+ /* If true drop all the extent maps in the range */
+ bool drop_cache;
+ /*
+ * If true it means we want to insert a new extent after dropping all
+ * the extents in the range. If this is true, the 'extent_item_size'
+ * parameter must be set as well and the 'extent_inserted' field will
+ * be set to true by btrfs_drop_extents() if it could insert the new
+ * extent.
+ * Note: when this is set to true the path must not be NULL.
+ */
+ bool replace_extent;
+ /*
+ * Used if 'replace_extent' is true. Size of the file extent item to
+ * insert after dropping all existing extents in the range
+ */
+ u32 extent_item_size;
+
+ /* Output parameters */
+
+ /*
+ * Set to the minimum between the input parameter 'end' and the end
+ * (exclusive, last byte + 1) of the last dropped extent. This is always
+ * set even if btrfs_drop_extents() returns an error.
+ */
+ u64 drop_end;
+ /*
+ * The number of allocated bytes found in the range. This can be smaller
+ * than the range's length when there are holes in the range.
+ */
+ u64 bytes_found;
+ /*
+ * Only set if 'replace_extent' is true. Set to true if we were able
+ * to insert a replacement extent after dropping all extents in the
+ * range, otherwise set to false by btrfs_drop_extents().
+ * Also, if btrfs_drop_extents() has set this to true it means it
+ * returned with the path locked, otherwise if it has set this to
+ * false it has returned with the path released.
+ */
+ bool extent_inserted;
+};
+
struct btrfs_file_private {
void *filldir_buf;
};
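For illustration, a hedged sketch of driving btrfs_drop_extents() (declared
further down in this header) through the new argument structure; trans, root,
inode and the 1MiB range are hypothetical:

	struct btrfs_drop_extents_args args = { 0 };
	int ret;

	args.start = 0;			/* first byte of the range to drop */
	args.end = SZ_1M;		/* exclusive end of the range */
	args.drop_cache = true;		/* also drop extent maps in the range */

	ret = btrfs_drop_extents(trans, root, inode, &args);
	if (ret == 0)
		/* args.bytes_found: allocated bytes seen while dropping */
		trace_printk("dropped range had %llu allocated bytes\n",
			     args.bytes_found);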
@@ -1282,7 +1362,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
-#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
+/* bit 17 is free */
#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18)
#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
@@ -1295,6 +1375,8 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29)
+#define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30)
+#define BTRFS_MOUNT_IGNOREDATACSUMS (1 << 31)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
@@ -1327,9 +1409,7 @@ do { \
* transaction commit)
*/
-#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0)
-#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1)
-#define BTRFS_PENDING_COMMIT (2)
+#define BTRFS_PENDING_COMMIT (0)
#define btrfs_test_pending(info, opt) \
test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
@@ -1402,7 +1482,7 @@ struct btrfs_map_token {
};
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
- ((bytes) >> (fs_info)->sb->s_blocksize_bits)
+ ((bytes) >> (fs_info)->sectorsize_bits)
static inline void btrfs_init_map_token(struct btrfs_map_token *token,
struct extent_buffer *eb)
@@ -1487,13 +1567,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
- const type *p = page_address(eb->pages[0]); \
+ const type *p = page_address(eb->pages[0]) + \
+ offset_in_page(eb->start); \
return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->pages[0]); \
+ type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
put_unaligned_le##bits(val, &p->member); \
}
@@ -2083,6 +2164,7 @@ BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
generation, 64);
BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8);
BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
@@ -2515,7 +2597,17 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
+/*
+ * Take the number of bytes to be checksummed and figure out how many leaves
+ * it would require to store the csums for that many bytes.
+ */
+static inline u64 btrfs_csum_bytes_to_leaves(
+ const struct btrfs_fs_info *fs_info, u64 csum_bytes)
+{
+ const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
+
+ return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
+}
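As a worked example (all numbers illustrative): with 4KiB sectors, 1MiB of data
produces 1MiB >> 12 = 256 checksums; if fs_info->csums_per_leaf happened to be
2000, DIV_ROUND_UP_ULL(256, 2000) = 1 leaf would be enough, while 16MiB of data
(4096 checksums) would need DIV_ROUND_UP_ULL(4096, 2000) = 3 leaves.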
/*
* Use this if we would be adding new items, as we could split nodes as we cow
@@ -2590,7 +2682,6 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
u64 len);
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref);
@@ -2654,6 +2745,7 @@ enum btrfs_flush_state {
ALLOC_CHUNK_FORCE = 8,
RUN_DELAYED_IPUTS = 9,
COMMIT_TRANS = 10,
+ FORCE_COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
@@ -2810,10 +2902,26 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
* If we remount the fs to be R/O or umount the fs, the cleaner needn't do
* anything except sleeping. This function is used to check the status of
* the fs.
+ * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount,
+ * since setting and checking for SB_RDONLY in the superblock's flags is not
+ * atomic.
*/
static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
{
- return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info);
+ return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
+ btrfs_fs_closing(fs_info);
+}
+
+static inline void btrfs_set_sb_rdonly(struct super_block *sb)
+{
+ sb->s_flags |= SB_RDONLY;
+ set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
+}
+
+static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
+{
+ sb->s_flags &= ~SB_RDONLY;
+ clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
}
/* tree mod log functions from ctree.c */
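For illustration, a minimal sketch of how the new RO state bit is meant to be
produced and consumed; both helper names here are hypothetical:

static void hypothetical_remount_ro(struct btrfs_fs_info *fs_info)
{
	/* Flip SB_RDONLY and BTRFS_FS_STATE_RO together so that readers
	 * which only test the state bit never see a half-updated view. */
	btrfs_set_sb_rdonly(fs_info->sb);
}

static bool hypothetical_cleaner_should_run(struct btrfs_fs_info *fs_info)
{
	/* Replaces the old, non-atomic test of sb->s_flags & SB_RDONLY */
	return !btrfs_need_cleaner_sleep(fs_info);
}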
@@ -2937,8 +3045,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u64 offset, u8 *dst);
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2965,13 +3072,13 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len);
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len);
-void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
+void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size);
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
struct page *page, u64 start, u64 end, int mirror);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
@@ -2991,22 +3098,22 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
const char *name, int name_len, int add_backref, u64 index);
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
-int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
- int front);
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+ int front);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct inode *inode, u64 new_size,
+ struct btrfs_inode *inode, u64 new_size,
u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
+ bool in_reclaim_context);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- u64 new_dirid);
+ struct btrfs_root *parent_root);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
@@ -3017,6 +3124,8 @@ void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
unsigned long bio_flags);
+bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
+ unsigned int size);
void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
@@ -3035,14 +3144,13 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode);
+ struct btrfs_root *root, struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode);
+ struct btrfs_root *root, struct btrfs_inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
-int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
@@ -3060,7 +3168,18 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
-ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
+extern const struct iomap_ops btrfs_dio_iomap_ops;
+extern const struct iomap_dio_ops btrfs_dio_ops;
+
+/* Inode locking type flags, by default the exclusive lock is taken */
+#define BTRFS_ILOCK_SHARED (1U << 0)
+#define BTRFS_ILOCK_TRY (1U << 1)
+
+int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags);
+void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
+void btrfs_update_inode_bytes(struct btrfs_inode *inode,
+ const u64 add_bytes,
+ const u64 del_bytes);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
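For illustration, a hedged sketch of how the ilock flags above compose; the
surrounding function and error handling are hypothetical:

	int ret;

	/* Non-blocking shared lock, e.g. for an opportunistic fast path */
	ret = btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY);
	if (ret)
		return ret;	/* contended, caller falls back to a slow path */
	/* ... read-only work under the shared lock ... */
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);

	/* No flags: take and release the exclusive lock, always blocking */
	btrfs_inode_lock(inode, 0);
	btrfs_inode_unlock(inode, 0);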
@@ -3090,16 +3209,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
int skip_pinned);
extern const struct file_operations btrfs_file_operations;
-int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
- struct btrfs_path *path, u64 start, u64 end,
- u64 *drop_end, int drop_cache,
- int replace_extent,
- u32 extent_item_size,
- int *key_inserted);
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode, u64 start,
- u64 end, int drop_cache);
+ struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_drop_extents_args *args);
int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
struct btrfs_replace_extent_info *extent_info,
@@ -3109,7 +3221,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached);
+ struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes);
@@ -3289,6 +3401,39 @@ static inline void assertfail(const char *expr, const char* file, int line) { }
#endif
/*
+ * Get the correct offset inside the page of extent buffer.
+ *
+ * @eb: target extent buffer
+ * @offset: offset inside the extent buffer
+ *
+ * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
+ */
+static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
+ unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
+ * to PAGE_SIZE, thus adding it won't cause any difference.
+ *
+ * For sectorsize < PAGE_SIZE, we must only read the data that belongs
+ * to the eb, thus we have to take the eb->start into consideration.
+ */
+ return offset_in_page(offset + eb->start);
+}
+
+static inline unsigned long get_eb_page_index(unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ *
+ * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
+ * and have ensured that all tree blocks are contained in one page,
+ * thus we always get index == 0.
+ */
+ return offset >> PAGE_SHIFT;
+}
+
+/*
* Use that for functions that are conditionally exported for sanity tests but
* otherwise static
*/
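Worked example for the subpage case (numbers illustrative): with 64KiB pages
and 16KiB nodes, an extent buffer at eb->start = 0x34000 is not page aligned,
so for offset 0x800 get_eb_page_index(0x800) returns 0 while
get_eb_offset_in_page(eb, 0x800) returns offset_in_page(0x800 + 0x34000) =
0x4800; with sectorsize == PAGE_SIZE, eb->start is page aligned and the added
term changes nothing.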
@@ -3564,6 +3709,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
int btrfs_reada_wait(void *handle);
void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct extent_buffer *eb, int err);
+void btrfs_reada_remove_dev(struct btrfs_device *dev);
+void btrfs_reada_undo_remove_dev(struct btrfs_device *dev);
static inline int is_fstree(u64 rootid)
{
@@ -3595,4 +3742,9 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
}
#endif
+static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->zoned != 0;
+}
+
#endif
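For illustration, a minimal sketch of the zone_size/zoned union in use; the
helper name and the alignment check are hypothetical:

static bool hypothetical_start_is_zone_aligned(const struct btrfs_fs_info *fs_info,
					       u64 start)
{
	/* On a regular (non-zoned) filesystem there is no zone constraint */
	if (!btrfs_is_zoned(fs_info))
		return true;

	/* In ZONED mode the same union member holds the zone size in bytes */
	return IS_ALIGNED(start, fs_info->zone_size);
}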
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index bacee09b7bfd..56642ca7af10 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -191,12 +191,14 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
}
/**
- * btrfs_inode_rsv_release - release any excessive reservation.
- * @inode - the inode we need to release from.
- * @qgroup_free - free or convert qgroup meta.
- * Unlike normal operation, qgroup meta reservation needs to know if we are
- * freeing qgroup reservation or just converting it into per-trans. Normally
- * @qgroup_free is true for error handling, and false for normal release.
+ * Release any excessive reservation
+ *
+ * @inode: the inode we need to release from
+ * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup
+ * meta reservation needs to know if we are freeing qgroup
+ * reservation or just converting it into per-trans. Normally
+ * @qgroup_free is true for error handling, and false for normal
+ * release.
*
* This is the same as btrfs_block_rsv_release, except that it handles the
* tracepoint for the reservation.
@@ -361,7 +363,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
}
/**
- * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * Release a metadata reservation for an inode
+ *
* @inode: the inode to release the reservation for.
* @num_bytes: the number of bytes we are releasing.
* @qgroup_free: free qgroup reservation or convert it to per-trans reservation
@@ -455,11 +458,13 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
}
/**
- * btrfs_delalloc_release_space - release data and metadata space for delalloc
- * @inode: inode we're releasing space for
- * @start: start position of the space already reserved
- * @len: the len of the space already reserved
- * @release_bytes: the len of the space we consumed or didn't use
+ * Release data and metadata space for delalloc
+ *
+ * @inode: inode we're releasing space for
+ * @reserved: list of changed/reserved ranges
+ * @start: start position of the space already reserved
+ * @len: length of the space already reserved
+ * @qgroup_free: should qgroup reserved-space also be freed
*
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5aba81e16113..ec0b50b8c5d6 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -740,13 +740,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
goto out;
}
- /*
- * we need allocate some memory space, but it might cause the task
- * to sleep, so we set all locked nodes in the path to blocking locks
- * first.
- */
- btrfs_set_path_blocking(path);
-
keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
if (!keys) {
ret = -ENOMEM;
@@ -1154,7 +1147,6 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &fs_info->delayed_block_rsv;
@@ -1162,7 +1154,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
delayed_root = fs_info->delayed_root;
curr_node = btrfs_first_delayed_node(delayed_root);
- while (curr_node && (!count || (count && nr--))) {
+ while (curr_node && (!count || nr--)) {
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
if (ret) {
@@ -1219,7 +1211,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
btrfs_release_delayed_node(delayed_node);
return -ENOMEM;
}
- path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
@@ -1264,7 +1255,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
ret = -ENOMEM;
goto trans_out;
}
- path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &fs_info->delayed_block_rsv;
@@ -1333,7 +1323,6 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
if (!delayed_node)
break;
- path->leave_spinning = 1;
root = delayed_node->root;
trans = btrfs_join_transaction(root);
@@ -1826,27 +1815,29 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
}
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
{
struct btrfs_delayed_node *delayed_node;
int ret = 0;
- delayed_node = btrfs_get_or_create_delayed_node(BTRFS_I(inode));
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
mutex_lock(&delayed_node->mutex);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item,
+ &inode->vfs_inode);
goto release_node;
}
- ret = btrfs_delayed_inode_reserve_metadata(trans, root, BTRFS_I(inode),
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
delayed_node);
if (ret)
goto release_node;
- fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count++;
atomic_inc(&root->fs_info->delayed_root->items);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index ca96ef007d8f..b2412160c5bc 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -110,7 +110,8 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode);
+ struct btrfs_root *root,
+ struct btrfs_inode *inode);
int btrfs_fill_inode(struct inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 353cc2994d10..63be7d01a9a3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -69,9 +69,10 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
}
/**
- * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
- * @fs_info - the fs_info for our fs.
- * @nr - the number of items to drop.
+ * Release a ref head's reservation
+ *
+ * @fs_info: the filesystem
+ * @nr: number of items to drop
*
* This drops the delayed ref head's count from the delayed refs rsv and frees
* any excess reservation we had.
@@ -114,10 +115,11 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
}
/**
- * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
- * @fs_info - the fs info for our fs.
- * @src - the source block rsv to transfer from.
- * @num_bytes - the number of bytes to transfer.
+ * Transfer bytes to our delayed refs rsv
+ *
+ * @fs_info: the filesystem
+ * @src: source block rsv to transfer from
+ * @num_bytes: number of bytes to transfer
*
* This transfers up to the num_bytes amount from the src rsv to the
* delayed_refs_rsv. Any extra bytes are returned to the space info.
@@ -162,9 +164,10 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
}
/**
- * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
- * @fs_info - the fs_info for our fs.
- * @flush - control how we can flush for this reservation.
+ * Refill based on our delayed refs usage
+ *
+ * @fs_info: the filesystem
+ * @flush: control how we can flush for this reservation.
*
* This will refill the delayed block_rsv up to 1 items size worth of space and
* will return -ENOSPC if we can't make the reservation.
@@ -648,12 +651,12 @@ inserted:
*/
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *existing,
- struct btrfs_delayed_ref_head *update,
- int *old_ref_mod_ret)
+ struct btrfs_delayed_ref_head *update)
{
struct btrfs_delayed_ref_root *delayed_refs =
&trans->transaction->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 flags = btrfs_ref_head_to_space_flags(existing);
int old_ref_mod;
BUG_ON(existing->is_data != update->is_data);
@@ -701,8 +704,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
* currently, for refs we just added we know we're a-ok.
*/
old_ref_mod = existing->total_ref_mod;
- if (old_ref_mod_ret)
- *old_ref_mod_ret = old_ref_mod;
existing->ref_mod += update->ref_mod;
existing->total_ref_mod += update->ref_mod;
@@ -724,6 +725,27 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates += csum_leaves;
}
}
+
+ /*
+ * This handles the following conditions:
+ *
+ * 1. We had a ref mod of 0 or more and went negative, indicating that
+ * we may be freeing space, so add our space to the
+ * total_bytes_pinned counter.
+ * 2. We were negative and went to 0 or positive, so no longer can say
+ * that the space would be pinned, decrement our counter from the
+ * total_bytes_pinned counter.
+ * 3. We are now at 0 and have ->must_insert_reserved set, which means
+ * this was a new allocation and then we dropped it, and thus must
+ * add our space to the total_bytes_pinned counter.
+ */
+ if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
+ btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
+ else if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
+ btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes);
+ else if (existing->total_ref_mod == 0 && existing->must_insert_reserved)
+ btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
+
spin_unlock(&existing->lock);
}
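As a worked example of the transitions above: a head ref sitting at
total_ref_mod == 0 that absorbs a drop (update->ref_mod == -1) goes negative,
so its num_bytes are added to total_bytes_pinned (case 1); a later add that
brings it back to 0 or above subtracts the same amount again (case 2); and a
head that returns to 0 with must_insert_reserved set was a freshly allocated
extent that got dropped, so its bytes are counted as pinned as well (case 3).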
@@ -798,8 +820,7 @@ static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
- int action, int *qrecord_inserted_ret,
- int *old_ref_mod, int *new_ref_mod)
+ int action, int *qrecord_inserted_ret)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -821,8 +842,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- update_existing_head_ref(trans, existing, head_ref,
- old_ref_mod);
+ update_existing_head_ref(trans, existing, head_ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
@@ -830,14 +850,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
- if (old_ref_mod)
- *old_ref_mod = 0;
+ u64 flags = btrfs_ref_head_to_space_flags(head_ref);
+
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_updates +=
btrfs_csum_bytes_to_leaves(trans->fs_info,
head_ref->num_bytes);
}
+ if (head_ref->ref_mod < 0)
+ btrfs_mod_total_bytes_pinned(trans->fs_info, flags,
+ head_ref->num_bytes);
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
@@ -845,8 +868,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
- if (new_ref_mod)
- *new_ref_mod = head_ref->total_ref_mod;
return head_ref;
}
@@ -909,8 +930,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
*/
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op,
- int *old_ref_mod, int *new_ref_mod)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_tree_ref *ref;
@@ -977,8 +997,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* the spin lock
*/
head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted,
- old_ref_mod, new_ref_mod);
+ action, &qrecord_inserted);
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
@@ -1006,8 +1025,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
*/
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- u64 reserved, int *old_ref_mod,
- int *new_ref_mod)
+ u64 reserved)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_data_ref *ref;
@@ -1073,8 +1091,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
* the spin lock
*/
head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted,
- old_ref_mod, new_ref_mod);
+ action, &qrecord_inserted);
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
@@ -1117,7 +1134,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
spin_lock(&delayed_refs->lock);
add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
- NULL, NULL, NULL);
+ NULL);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 1c977e6d45dc..e22fba272e4f 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -135,6 +135,11 @@ struct btrfs_delayed_data_ref {
u64 offset;
};
+enum btrfs_delayed_ref_flags {
+ /* Indicate that we are flushing delayed refs for the commit */
+ BTRFS_DELAYED_REFS_FLUSHING,
+};
+
struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root_cached href_root;
@@ -158,12 +163,7 @@ struct btrfs_delayed_ref_root {
u64 pending_csums;
- /*
- * set when the tree is flushing before a transaction commit,
- * used by the throttling code to decide if new updates need
- * to be run right away
- */
- int flushing;
+ unsigned long flags;
u64 run_delayed_start;
@@ -326,6 +326,16 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
}
}
+static inline u64 btrfs_ref_head_to_space_flags(
+ struct btrfs_delayed_ref_head *head_ref)
+{
+ if (head_ref->is_data)
+ return BTRFS_BLOCK_GROUP_DATA;
+ else if (head_ref->is_system)
+ return BTRFS_BLOCK_GROUP_SYSTEM;
+ return BTRFS_BLOCK_GROUP_METADATA;
+}
+
static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head)
{
if (refcount_dec_and_test(&head->refs))
@@ -334,12 +344,10 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op,
- int *old_ref_mod, int *new_ref_mod);
+ struct btrfs_delayed_extent_op *extent_op);
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- u64 reserved, int *old_ref_mod,
- int *new_ref_mod);
+ u64 reserved);
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 4a0243cb9d97..3a9c1e046ebe 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -21,6 +21,8 @@
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
+#include "zoned.h"
+#include "block-group.h"
/*
* Device replace overview
@@ -91,6 +93,17 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
no_valid_dev_replace_entry_found:
+ /*
+ * We don't have a replace item or it's corrupted. If there is
+ * a replace target, fail the mount.
+ */
+ if (btrfs_find_device(fs_info->fs_devices,
+ BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ btrfs_err(fs_info,
+ "found replace target device without a valid replace item");
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = 0;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
@@ -143,16 +156,27 @@ no_valid_dev_replace_entry_found:
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- dev_replace->srcdev = NULL;
- dev_replace->tgtdev = NULL;
+ /*
+ * We don't have an active replace item but if there is a
+ * replace target, fail the mount.
+ */
+ if (btrfs_find_device(fs_info->fs_devices,
+ BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ btrfs_err(fs_info,
+ "replace devid present without an active replace item");
+ ret = -EUCLEAN;
+ } else {
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ }
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
- src_devid, NULL, NULL, true);
+ src_devid, NULL, NULL);
dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL, true);
+ NULL, NULL);
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -237,6 +261,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return PTR_ERR(bdev);
}
+ if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ btrfs_err(fs_info,
+ "dev-replace: zoned type of target device mismatch with filesystem");
+ ret = -EINVAL;
+ goto error;
+ }
+
sync_blockdev(bdev);
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
@@ -291,6 +322,10 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_info->fs_devices;
+ ret = btrfs_get_dev_zone_info(device);
+ if (ret)
+ goto error;
+
mutex_lock(&fs_info->fs_devices->device_list_mutex);
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
@@ -425,6 +460,185 @@ static char* btrfs_dev_name(struct btrfs_device *device)
return rcu_str_deref(device->name);
}
+static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *src_dev)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_block_group *cache;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+ u64 chunk_offset;
+
+ /* Do not use "to_copy" on non zoned filesystem for now */
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ mutex_lock(&fs_info->chunk_mutex);
+
+ /* Ensure we don't have pending new block group */
+ spin_lock(&fs_info->trans_lock);
+ while (fs_info->running_transaction &&
+ !list_empty(&fs_info->running_transaction->dev_update_list)) {
+ spin_unlock(&fs_info->trans_lock);
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ mutex_lock(&fs_info->chunk_mutex);
+ if (ret == -ENOENT) {
+ spin_lock(&fs_info->trans_lock);
+ continue;
+ } else {
+ goto unlock;
+ }
+ }
+
+ ret = btrfs_commit_transaction(trans);
+ mutex_lock(&fs_info->chunk_mutex);
+ if (ret)
+ goto unlock;
+
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ path->reada = READA_FORWARD;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ key.objectid = src_dev->devid;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto free_path;
+ if (ret > 0) {
+ if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto free_path;
+ if (ret > 0) {
+ ret = 0;
+ goto free_path;
+ }
+ } else {
+ ret = 0;
+ }
+ }
+
+ while (1) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid != src_dev->devid)
+ break;
+
+ if (found_key.type != BTRFS_DEV_EXTENT_KEY)
+ break;
+
+ if (found_key.offset < key.offset)
+ break;
+
+ dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
+
+ chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ if (!cache)
+ goto skip;
+
+ spin_lock(&cache->lock);
+ cache->to_copy = 1;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+
+skip:
+ ret = btrfs_next_item(root, path);
+ if (ret != 0) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+ }
+
+free_path:
+ btrfs_free_path(path);
+unlock:
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ return ret;
+}
+
+bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
+ struct btrfs_block_group *cache,
+ u64 physical)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 chunk_offset = cache->start;
+ int num_extents, cur_extent;
+ int i;
+
+ /* Do not use "to_copy" on non zoned filesystem for now */
+ if (!btrfs_is_zoned(fs_info))
+ return true;
+
+ spin_lock(&cache->lock);
+ if (cache->removed) {
+ spin_unlock(&cache->lock);
+ return true;
+ }
+ spin_unlock(&cache->lock);
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(!IS_ERR(em));
+ map = em->map_lookup;
+
+ num_extents = cur_extent = 0;
+ for (i = 0; i < map->num_stripes; i++) {
+ /* We have more device extents to copy */
+ if (srcdev != map->stripes[i].dev)
+ continue;
+
+ num_extents++;
+ if (physical == map->stripes[i].physical)
+ cur_extent = i;
+ }
+
+ free_extent_map(em);
+
+ if (num_extents > 1 && cur_extent < num_extents - 1) {
+ /*
+ * There are more stripes on this device. Keep this block group
+ * read-only until we finish all of them.
+ */
+ return false;
+ }
+
+ /* Last stripe on this device */
+ spin_lock(&cache->lock);
+ cache->to_copy = 0;
+ spin_unlock(&cache->lock);
+
+ return true;
+}
+
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
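Worked example for the stripe accounting above (layout hypothetical): if the
source device backs the stripes at map indices 0 and 2 of a block group,
num_extents ends up as 2; finishing the copy of the stripe at index 0 leaves
cur_extent = 0, which is smaller than num_extents - 1, so the block group stays
read-only with to_copy still set, whereas finishing the stripe at index 2 fails
that test and to_copy is cleared.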
@@ -466,6 +680,10 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
return ret;
+ ret = mark_block_group_to_copy(fs_info, src_device);
+ if (ret)
+ return ret;
+
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -681,13 +899,16 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ if (!scrub_ret)
+ btrfs_reada_remove_dev(src_device);
+
/*
* We have to use this loop approach because at this point src_device
* has to be available for transaction commit to complete, yet new
@@ -696,6 +917,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
while (1) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
+ btrfs_reada_undo_remove_dev(src_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
@@ -746,6 +968,7 @@ error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ btrfs_reada_undo_remove_dev(src_device);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 60b70dacc299..3911049a5f23 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -18,5 +18,8 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
+ struct btrfs_block_group *cache,
+ u64 physical);
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 863367c2c620..98b63ebed539 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -127,7 +127,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
btrfs_cpu_key_to_disk(&disk_key, location);
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 741c7e19c32f..306ff20af70f 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -185,10 +185,12 @@ static struct btrfs_block_group *find_next_block_group(
}
/**
- * peek_discard_list - wrap find_next_block_group()
- * @discard_ctl: discard control
+ * Wrap find_next_block_group()
+ *
+ * @discard_ctl: discard control
* @discard_state: the discard_state of the block_group after state management
* @discard_index: the discard_index of the block_group after state management
+ * @now: time when discard was invoked, in ns
*
* This wraps find_next_block_group() and sets the block_group to be in use.
* discard_state's control flow is managed here. Variables related to
@@ -199,16 +201,15 @@ static struct btrfs_block_group *find_next_block_group(
static struct btrfs_block_group *peek_discard_list(
struct btrfs_discard_ctl *discard_ctl,
enum btrfs_discard_state *discard_state,
- int *discard_index)
+ int *discard_index, u64 now)
{
struct btrfs_block_group *block_group;
- const u64 now = ktime_get_ns();
spin_lock(&discard_ctl->lock);
again:
block_group = find_next_block_group(discard_ctl, now);
- if (block_group && now > block_group->discard_eligible_time) {
+ if (block_group && now >= block_group->discard_eligible_time) {
if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
block_group->used != 0) {
if (btrfs_is_block_group_data_only(block_group))
@@ -222,12 +223,11 @@ again:
block_group->discard_state = BTRFS_DISCARD_EXTENTS;
}
discard_ctl->block_group = block_group;
+ }
+ if (block_group) {
*discard_state = block_group->discard_state;
*discard_index = block_group->discard_index;
- } else {
- block_group = NULL;
}
-
spin_unlock(&discard_ctl->lock);
return block_group;
@@ -330,32 +330,19 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
btrfs_discard_schedule_work(discard_ctl, false);
}
-/**
- * btrfs_discard_schedule_work - responsible for scheduling the discard work
- * @discard_ctl: discard control
- * @override: override the current timer
- *
- * Discards are issued by a delayed workqueue item. @override is used to
- * update the current delay as the baseline delay interval is reevaluated on
- * transaction commit. This is also maxed with any other rate limit.
- */
-void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
- bool override)
+static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ u64 now, bool override)
{
struct btrfs_block_group *block_group;
- const u64 now = ktime_get_ns();
-
- spin_lock(&discard_ctl->lock);
if (!btrfs_run_discard_work(discard_ctl))
- goto out;
-
+ return;
if (!override && delayed_work_pending(&discard_ctl->work))
- goto out;
+ return;
block_group = find_next_block_group(discard_ctl, now);
if (block_group) {
- unsigned long delay = discard_ctl->delay;
+ u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
/*
@@ -366,9 +353,9 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
if (kbps_limit && discard_ctl->prev_discard) {
u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
u64 bps_delay = div64_u64(discard_ctl->prev_discard *
- MSEC_PER_SEC, bps_limit);
+ NSEC_PER_SEC, bps_limit);
- delay = max(delay, msecs_to_jiffies(bps_delay));
+ delay = max(delay, bps_delay);
}
/*
@@ -378,13 +365,39 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
if (now < block_group->discard_eligible_time) {
u64 bg_timeout = block_group->discard_eligible_time - now;
- delay = max(delay, nsecs_to_jiffies(bg_timeout));
+ delay = max(delay, bg_timeout);
+ }
+
+ if (override && discard_ctl->prev_discard) {
+ u64 elapsed = now - discard_ctl->prev_discard_time;
+
+ if (delay > elapsed)
+ delay -= elapsed;
+ else
+ delay = 0;
}
mod_delayed_work(discard_ctl->discard_workers,
- &discard_ctl->work, delay);
+ &discard_ctl->work, nsecs_to_jiffies(delay));
}
-out:
+}
+
+/*
+ * btrfs_discard_schedule_work - responsible for scheduling the discard work
+ * @discard_ctl: discard control
+ * @override: override the current timer
+ *
+ * Discards are issued by a delayed workqueue item. @override is used to
+ * update the current delay as the baseline delay interval is reevaluated on
+ * transaction commit. This is also maxed with any other rate limit.
+ */
+void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ bool override)
+{
+ const u64 now = ktime_get_ns();
+
+ spin_lock(&discard_ctl->lock);
+ __btrfs_discard_schedule_work(discard_ctl, now, override);
spin_unlock(&discard_ctl->lock);
}
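
The refactor above moves the scheduling math into __btrfs_discard_schedule_work() and switches it from jiffies to nanoseconds: the base delay is delay_ms converted to ns, the kbps limit translates the size of the previous discard into a minimum spacing of prev_discard / bps_limit seconds, and on override the time already elapsed since prev_discard_time is credited back. A minimal standalone sketch of that arithmetic, assuming illustrative values rather than the kernel's tunables:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define SZ_1K 1024ULL

/* Userspace sketch of the nanosecond-based discard rate limiting above. */
static uint64_t rate_limit_delay_ns(uint64_t base_delay_ns, uint64_t prev_discard,
				    uint64_t kbps_limit, uint64_t elapsed_ns)
{
	uint64_t delay = base_delay_ns;

	if (kbps_limit && prev_discard) {
		uint64_t bps_limit = kbps_limit * SZ_1K;
		uint64_t bps_delay = prev_discard * NSEC_PER_SEC / bps_limit;

		if (bps_delay > delay)
			delay = bps_delay;
	}
	/* The override path credits time already spent since the last discard. */
	delay = delay > elapsed_ns ? delay - elapsed_ns : 0;
	return delay;
}

int main(void)
{
	/* 64 MiB discarded, limited to 100 MiB/s, 100 ms already elapsed. */
	uint64_t d = rate_limit_delay_ns(10ULL * 1000000, 64ULL << 20,
					 100 * 1024, 100ULL * 1000000);
	printf("next discard in %llu ms\n", (unsigned long long)(d / 1000000));
	return 0;
}

Running it prints "next discard in 540 ms": 64 MiB at a 100 MiB/s cap needs 640 ms of spacing, of which 100 ms have already passed.
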
@@ -429,13 +442,18 @@ static void btrfs_discard_workfn(struct work_struct *work)
int discard_index = 0;
u64 trimmed = 0;
u64 minlen = 0;
+ u64 now = ktime_get_ns();
discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
block_group = peek_discard_list(discard_ctl, &discard_state,
- &discard_index);
+ &discard_index, now);
if (!block_group || !btrfs_run_discard_work(discard_ctl))
return;
+ if (now < block_group->discard_eligible_time) {
+ btrfs_discard_schedule_work(discard_ctl, false);
+ return;
+ }
/* Perform discarding */
minlen = discard_minlen[discard_index];
@@ -465,8 +483,6 @@ static void btrfs_discard_workfn(struct work_struct *work)
discard_ctl->discard_extent_bytes += trimmed;
}
- discard_ctl->prev_discard = trimmed;
-
/* Determine next steps for a block_group */
if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
if (discard_state == BTRFS_DISCARD_BITMAPS) {
@@ -482,11 +498,13 @@ static void btrfs_discard_workfn(struct work_struct *work)
}
}
+ now = ktime_get_ns();
spin_lock(&discard_ctl->lock);
+ discard_ctl->prev_discard = trimmed;
+ discard_ctl->prev_discard_time = now;
discard_ctl->block_group = NULL;
+ __btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
-
- btrfs_discard_schedule_work(discard_ctl, false);
}
/**
@@ -519,7 +537,6 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
s64 discardable_bytes;
u32 iops_limit;
unsigned long delay;
- unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC;
discardable_extents = atomic_read(&discard_ctl->discardable_extents);
if (!discardable_extents)
@@ -550,12 +567,13 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
iops_limit = READ_ONCE(discard_ctl->iops_limit);
if (iops_limit)
- lower_limit = max_t(unsigned long, lower_limit,
- MSEC_PER_SEC / iops_limit);
+ delay = MSEC_PER_SEC / iops_limit;
+ else
+ delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
- delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
- delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC);
- discard_ctl->delay = msecs_to_jiffies(delay);
+ delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC,
+ BTRFS_DISCARD_MAX_DELAY_MSEC);
+ discard_ctl->delay_ms = delay;
spin_unlock(&discard_ctl->lock);
}
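
As a worked example of the new delay_ms selection (constant values are per the definitions in discard.h and not restated here): with iops_limit set to 10, the delay becomes MSEC_PER_SEC / 10 = 100 ms regardless of how many discardable extents exist; with the iops limit unset and 1000 discardable extents it becomes BTRFS_DISCARD_TARGET_MSEC / 1000; either result is then clamped to the [BTRFS_DISCARD_MIN_DELAY_MSEC, BTRFS_DISCARD_MAX_DELAY_MSEC] range before being stored in delay_ms for __btrfs_discard_schedule_work() to convert to nanoseconds.
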
@@ -563,15 +581,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
/**
* btrfs_discard_update_discardable - propagate discard counters
* @block_group: block_group of interest
- * @ctl: free_space_ctl of @block_group
*
* This propagates deltas of counters up to the discard_ctl. It maintains a
* current counter and a previous counter passing the delta up to the global
* stat. Then the current counter value becomes the previous counter value.
*/
-void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
- struct btrfs_free_space_ctl *ctl)
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
+ struct btrfs_free_space_ctl *ctl;
struct btrfs_discard_ctl *discard_ctl;
s32 extents_delta;
s64 bytes_delta;
@@ -581,8 +598,10 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
!btrfs_is_block_group_data_only(block_group))
return;
+ ctl = block_group->free_space_ctl;
discard_ctl = &block_group->fs_info->discard_ctl;
+ lockdep_assert_held(&ctl->tree_lock);
extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
ctl->discardable_extents[BTRFS_STAT_PREV];
if (extents_delta) {
@@ -684,10 +703,11 @@ void btrfs_discard_init(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
discard_ctl->prev_discard = 0;
+ discard_ctl->prev_discard_time = 0;
atomic_set(&discard_ctl->discardable_extents, 0);
atomic64_set(&discard_ctl->discardable_bytes, 0);
discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
- discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC;
+ discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
discard_ctl->kbps_limit = 0;
discard_ctl->discard_extent_bytes = 0;
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index 353228d62f5a..57b9202f427f 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -28,8 +28,7 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl);
/* Update operations */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl);
-void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
- struct btrfs_free_space_ctl *ctl);
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group);
/* Setup/cleanup operations */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8e3438672a82..41b718cfea40 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,7 +29,6 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
@@ -42,6 +41,7 @@
#include "block-group.h"
#include "discard.h"
#include "space-info.h"
+#include "zoned.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -109,15 +109,13 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
- void *private_data;
+ struct inode *inode;
struct bio *bio;
extent_submit_bio_start_t *submit_bio_start;
int mirror_num;
- /*
- * bio_offset is optional, can be used if the pages in the bio
- * can't tell us where in the file the bio should go
- */
- u64 bio_offset;
+
+ /* Optional parameter for submit_bio_start used by direct io */
+ u64 dio_file_offset;
struct btrfs_work work;
blk_status_t status;
};
@@ -150,40 +148,41 @@ struct async_submit_bio {
# error
# endif
+#define DEFINE_LEVEL(stem, level) \
+ .names[level] = "btrfs-" stem "-0" #level,
+
+#define DEFINE_NAME(stem) \
+ DEFINE_LEVEL(stem, 0) \
+ DEFINE_LEVEL(stem, 1) \
+ DEFINE_LEVEL(stem, 2) \
+ DEFINE_LEVEL(stem, 3) \
+ DEFINE_LEVEL(stem, 4) \
+ DEFINE_LEVEL(stem, 5) \
+ DEFINE_LEVEL(stem, 6) \
+ DEFINE_LEVEL(stem, 7)
+
static struct btrfs_lockdep_keyset {
u64 id; /* root objectid */
- const char *name_stem; /* lock name stem */
- char names[BTRFS_MAX_LEVEL + 1][20];
- struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
+ /* Longest entry: btrfs-free-space-00 */
+ char names[BTRFS_MAX_LEVEL][20];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
- { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
- { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
- { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
- { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
- { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
- { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
- { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" },
- { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
- { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
- { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
- { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" },
- { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
- { .id = 0, .name_stem = "tree" },
+ { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
+ { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
+ { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
+ { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
+ { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
+ { .id = 0, DEFINE_NAME("tree") },
};
-void __init btrfs_init_lockdep(void)
-{
- int i, j;
-
- /* initialize lockdep class names */
- for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
- struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
-
- for (j = 0; j < ARRAY_SIZE(ks->names); j++)
- snprintf(ks->names[j], sizeof(ks->names[j]),
- "btrfs-%s-%02d", ks->name_stem, j);
- }
-}
+#undef DEFINE_LEVEL
+#undef DEFINE_NAME
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
int level)
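
The DEFINE_LEVEL/DEFINE_NAME macros replace the removed btrfs_init_lockdep() runtime loop with designated initializers built purely from string-literal pasting. A small standalone sketch showing the expansion (the kernel table initializes char[20] members; a const char * array is used here only to keep the demo short):

#include <stdio.h>

#define DEFINE_LEVEL(stem, level) \
	[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem) \
	DEFINE_LEVEL(stem, 0) DEFINE_LEVEL(stem, 1) DEFINE_LEVEL(stem, 2) \
	DEFINE_LEVEL(stem, 3) DEFINE_LEVEL(stem, 4) DEFINE_LEVEL(stem, 5) \
	DEFINE_LEVEL(stem, 6) DEFINE_LEVEL(stem, 7)

/* DEFINE_NAME("root") expands to [0] = "btrfs-root-00", ..., [7] = "btrfs-root-07", */
static const char *root_names[8] = { DEFINE_NAME("root") };

int main(void)
{
	for (int i = 0; i < 8; i++)
		printf("%s\n", root_names[i]);
	return 0;
}
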
@@ -210,15 +209,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
+ const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- kaddr = page_address(buf->pages[0]);
+ kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
- PAGE_SIZE - BTRFS_CSUM_SIZE);
+ first_page_part - BTRFS_CSUM_SIZE);
for (i = 1; i < num_pages; i++) {
kaddr = page_address(buf->pages[i]);
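
A quick worked example of the subpage-aware hashing above: on a 64K-page system with a 16K nodesize, an extent buffer that starts 32K into its page gets kaddr pointing 32K past page_address() and first_page_part = 16K, so only 16K - BTRFS_CSUM_SIZE bytes are hashed from that page and the loop over further pages never runs (num_pages = 16K >> 16 = 0). On a 4K-page system with the same nodesize, first_page_part is 4K, the buffer is page aligned, and pages 1-3 are hashed by the loop as before.
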
@@ -248,10 +248,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- if (need_lock) {
+ if (need_lock)
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- }
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
@@ -280,7 +278,7 @@ out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
if (need_lock)
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
return ret;
}
@@ -319,7 +317,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
- if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
+ if (memcmp(disk_sb->csum, result, fs_info->csum_size))
return 1;
return 0;
@@ -443,16 +441,16 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
}
/*
- * checksum a dirty tree block before IO. This has extra checks to make sure
- * we only fill in the checksum field in the first page of a multi-page block
+ * Checksum a dirty tree block before IO. This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block.
+ * For subpage extent buffers we need the bvec to also provide the offset within the page.
*/
-
-static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
{
+ struct page *page = bvec->bv_page;
u64 start = page_offset(page);
u64 found_start;
u8 result[BTRFS_CSUM_SIZE];
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
struct extent_buffer *eb;
int ret;
@@ -461,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
return 0;
found_start = btrfs_header_bytenr(eb);
+
+ if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
+ WARN_ON(found_start != 0);
+ return 0;
+ }
+
/*
* Please do not consolidate these warnings into a single if.
* It is useful to know what went wrong.
@@ -489,7 +493,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
return ret;
}
- write_extent_buffer(eb, result, 0, csum_size);
+ write_extent_buffer(eb, result, 0, fs_info->csum_size);
return 0;
}
@@ -523,65 +527,37 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
return 1;
}
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int mirror)
+/* Do basic extent buffer checks at read time */
+static int validate_extent_buffer(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
- int found_level;
- struct extent_buffer *eb;
- struct btrfs_fs_info *fs_info;
- u16 csum_size;
- int ret = 0;
+ const u32 csum_size = fs_info->csum_size;
+ u8 found_level;
u8 result[BTRFS_CSUM_SIZE];
- int reads_done;
-
- if (!page->private)
- goto out;
-
- eb = (struct extent_buffer *)page->private;
- fs_info = eb->fs_info;
- csum_size = btrfs_super_csum_size(fs_info->super_copy);
-
- /* the pending IO might have been the only thing that kept this buffer
- * in memory. Make sure we have a ref for all this other checks
- */
- atomic_inc(&eb->refs);
-
- reads_done = atomic_dec_and_test(&eb->io_pages);
- if (!reads_done)
- goto err;
-
- eb->read_mirror = mirror;
- if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
- ret = -EIO;
- goto err;
- }
+ int ret = 0;
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
eb->start, found_start);
ret = -EIO;
- goto err;
+ goto out;
}
if (check_tree_block_fsid(eb)) {
btrfs_err_rl(fs_info, "bad fsid on block %llu",
eb->start);
ret = -EIO;
- goto err;
+ goto out;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "bad tree block level %d on %llu",
(int)btrfs_header_level(eb), eb->start);
ret = -EIO;
- goto err;
+ goto out;
}
- btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
- eb, found_level);
-
csum_tree_block(eb, result);
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
@@ -595,7 +571,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb));
ret = -EUCLEAN;
- goto err;
+ goto out;
}
/*
@@ -617,6 +593,94 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
btrfs_err(fs_info,
"block=%llu read time tree block corruption detected",
eb->start);
+out:
+ return ret;
+}
+
+static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
+ int mirror)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct extent_buffer *eb;
+ bool reads_done;
+ int ret = 0;
+
+ /*
+ * We don't allow bio merge for subpage metadata read, so we should
+ * only get one eb for each endio hook.
+ */
+ ASSERT(end == start + fs_info->nodesize - 1);
+ ASSERT(PagePrivate(page));
+
+ eb = find_extent_buffer(fs_info, start);
+ /*
+ * When we are reading one tree block, eb must have been inserted into
+ * the radix tree. If not, something is wrong.
+ */
+ ASSERT(eb);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ /* Subpage read must finish in page read */
+ ASSERT(reads_done);
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
+ }
+ ret = validate_extent_buffer(eb);
+ if (ret < 0)
+ goto err;
+
+ if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+ btree_readahead_hook(eb, ret);
+
+ set_extent_buffer_uptodate(eb);
+
+ free_extent_buffer(eb);
+ return ret;
+err:
+ /*
+ * end_bio_extent_readpage decrements io_pages in case of error,
+ * make sure it has something to decrement.
+ */
+ atomic_inc(&eb->io_pages);
+ clear_extent_buffer_uptodate(eb);
+ free_extent_buffer(eb);
+ return ret;
+}
+
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+ struct page *page, u64 start, u64 end,
+ int mirror)
+{
+ struct extent_buffer *eb;
+ int ret = 0;
+ int reads_done;
+
+ ASSERT(page->private);
+
+ if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ return validate_subpage_buffer(page, start, end, mirror);
+
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * The pending IO might have been the only thing that kept this buffer
+	 * The pending IO might have been the only thing that kept this buffer
+	 * in memory. Make sure we have a ref for all the other checks below.
+ atomic_inc(&eb->refs);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ if (!reads_done)
+ goto err;
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
+ }
+ ret = validate_extent_buffer(eb);
err:
if (reads_done &&
test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -632,7 +696,7 @@ err:
clear_extent_buffer_uptodate(eb);
}
free_extent_buffer(eb);
-out:
+
return ret;
}
@@ -645,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio)
fs_info = end_io_wq->info;
end_io_wq->status = bio->bi_status;
- if (bio_op(bio) == REQ_OP_WRITE) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
wq = fs_info->endio_meta_write_workers;
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
@@ -694,8 +758,8 @@ static void run_one_async_start(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- ret = async->submit_bio_start(async->private_data, async->bio,
- async->bio_offset);
+ ret = async->submit_bio_start(async->inode, async->bio,
+ async->dio_file_offset);
if (ret)
async->status = ret;
}
@@ -715,7 +779,7 @@ static void run_one_async_done(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- inode = async->private_data;
+ inode = async->inode;
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
@@ -745,18 +809,19 @@ static void run_one_async_free(struct btrfs_work *work)
kfree(async);
}
-blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
- u64 bio_offset, void *private_data,
+ u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start)
{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
return BLK_STS_RESOURCE;
- async->private_data = private_data;
+ async->inode = inode;
async->bio = bio;
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
@@ -764,7 +829,7 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
run_one_async_free);
- async->bio_offset = bio_offset;
+ async->dio_file_offset = dio_file_offset;
async->status = 0;
@@ -785,7 +850,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
+ ret = csum_dirty_buffer(root->fs_info, bvec);
if (ret)
break;
}
@@ -793,8 +858,8 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
-static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
- u64 bio_offset)
+static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
+ u64 dio_file_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -806,6 +871,8 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
static int check_async_write(struct btrfs_fs_info *fs_info,
struct btrfs_inode *bi)
{
+ if (btrfs_is_zoned(fs_info))
+ return 0;
if (atomic_read(&bi->sync_writers))
return 0;
if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
@@ -820,7 +887,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
int async = check_async_write(fs_info, BTRFS_I(inode));
blk_status_t ret;
- if (bio_op(bio) != REQ_OP_WRITE) {
+ if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
@@ -840,8 +907,8 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
- ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
- 0, inode, btree_submit_bio_start);
+ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
+ 0, btree_submit_bio_start);
}
if (ret)
@@ -947,47 +1014,33 @@ static const struct address_space_operations btree_aops = {
.set_page_dirty = btree_set_page_dirty,
};
-void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
- struct extent_buffer *buf = NULL;
- int ret;
-
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
- if (IS_ERR(buf))
- return;
-
- ret = read_extent_buffer_pages(buf, WAIT_NONE, 0);
- if (ret < 0)
- free_extent_buffer_stale(buf);
- else
- free_extent_buffer(buf);
-}
-
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
- u64 bytenr)
+ u64 bytenr, u64 owner_root,
+ int level)
{
if (btrfs_is_testing(fs_info))
return alloc_test_extent_buffer(fs_info, bytenr);
- return alloc_extent_buffer(fs_info, bytenr);
+ return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
}
/*
* Read tree block at logical address @bytenr and do variant basic but critical
* verification.
*
+ * @owner_root: the objectid of the root owner for this block.
* @parent_transid: expected transid of this tree block, skip check if 0
* @level: expected level, mandatory check
* @first_key: expected key in slot 0, skip check if NULL
*/
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+ u64 owner_root, u64 parent_transid,
+ int level, struct btrfs_key *first_key)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
if (IS_ERR(buf))
return buf;
@@ -1012,8 +1065,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
-buf->len,
fs_info->dirty_metadata_batch);
- /* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking_write(buf);
clear_extent_buffer_dirty(buf);
}
}
@@ -1030,7 +1081,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->orphan_cleanup_state = 0;
root->last_trans = 0;
- root->highest_objectid = 0;
+ root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
@@ -1155,7 +1206,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
- goto fail;
+ goto fail_unlock;
}
root->node = leaf;
@@ -1164,8 +1215,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->commit_root = btrfs_root_node(root);
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- root->root_item.flags = 0;
- root->root_item.byte_limit = 0;
+ btrfs_set_root_flags(&root->root_item, 0);
+ btrfs_set_root_limit(&root->root_item, 0);
btrfs_set_root_bytenr(&root->root_item, leaf->start);
btrfs_set_root_generation(&root->root_item, trans->transid);
btrfs_set_root_level(&root->root_item, 0);
@@ -1177,7 +1228,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
generate_random_guid(root->root_item.uuid);
else
export_guid(root->root_item.uuid, &guid_null);
- root->root_item.drop_level = 0;
+ btrfs_set_root_drop_level(&root->root_item, 0);
+
+ btrfs_tree_unlock(leaf);
key.objectid = objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1186,13 +1239,12 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (ret)
goto fail;
- btrfs_tree_unlock(leaf);
-
return root;
-fail:
+fail_unlock:
if (leaf)
btrfs_tree_unlock(leaf);
+fail:
btrfs_put_root(root);
return ERR_PTR(ret);
@@ -1202,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
- struct extent_buffer *leaf;
root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
if (!root)
@@ -1212,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+ return root;
+}
+
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct extent_buffer *leaf;
+
/*
* DON'T set SHAREABLE bit for log trees.
*
@@ -1224,16 +1283,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
- if (IS_ERR(leaf)) {
- btrfs_put_root(root);
- return ERR_CAST(leaf);
- }
+ if (IS_ERR(leaf))
+ return PTR_ERR(leaf);
root->node = leaf;
btrfs_mark_buffer_dirty(root->node);
btrfs_tree_unlock(root->node);
- return root;
+
+ return 0;
}
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -1244,6 +1302,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
+
+ if (!btrfs_is_zoned(fs_info)) {
+ int ret = btrfs_alloc_log_tree_node(trans, log_root);
+
+ if (ret) {
+ btrfs_put_root(log_root);
+ return ret;
+ }
+ }
+
WARN_ON(fs_info->log_root_tree);
fs_info->log_root_tree = log_root;
return 0;
@@ -1255,11 +1323,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log_root;
struct btrfs_inode_item *inode_item;
+ int ret;
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
+ ret = btrfs_alloc_log_tree_node(trans, log_root);
+ if (ret) {
+ btrfs_put_root(log_root);
+ return ret;
+ }
+
log_root->last_trans = trans->transid;
log_root->root_key.offset = root->root_key.objectid;
@@ -1281,57 +1356,61 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key)
+static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
- struct btrfs_path *path;
u64 generation;
int ret;
int level;
- path = btrfs_alloc_path();
- if (!path)
- return ERR_PTR(-ENOMEM);
-
root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
- if (!root) {
- ret = -ENOMEM;
- goto alloc_fail;
- }
+ if (!root)
+ return ERR_PTR(-ENOMEM);
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto find_fail;
+ goto fail;
}
generation = btrfs_root_generation(&root->root_item);
level = btrfs_root_level(&root->root_item);
root->node = read_tree_block(fs_info,
btrfs_root_bytenr(&root->root_item),
- generation, level, NULL);
+ key->objectid, generation, level, NULL);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
- goto find_fail;
+ goto fail;
} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
- goto find_fail;
+ goto fail;
}
root->commit_root = btrfs_root_node(root);
-out:
- btrfs_free_path(path);
return root;
-
-find_fail:
+fail:
btrfs_put_root(root);
-alloc_fail:
- root = ERR_PTR(ret);
- goto out;
+ return ERR_PTR(ret);
+}
+
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
+{
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ root = read_tree_root_path(tree_root, path, key);
+ btrfs_free_path(path);
+
+ return root;
}
/*
@@ -1344,14 +1423,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
int ret;
unsigned int nofs_flag;
- root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
- root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
- GFP_NOFS);
- if (!root->free_ino_pinned || !root->free_ino_ctl) {
- ret = -ENOMEM;
- goto fail;
- }
-
/*
* We might be called under a transaction (e.g. indirect backref
* resolution) which could deadlock if it triggers memory reclaim
@@ -1368,10 +1439,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
btrfs_check_and_init_root_item(&root->root_item);
}
- btrfs_init_free_ino_ctl(root);
- spin_lock_init(&root->ino_cache_lock);
- init_waitqueue_head(&root->ino_cache_wait);
-
/*
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
@@ -1388,14 +1455,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
}
mutex_lock(&root->objectid_mutex);
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(root);
if (ret) {
mutex_unlock(&root->objectid_mutex);
goto fail;
}
- ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+ ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
mutex_unlock(&root->objectid_mutex);
@@ -1419,6 +1485,31 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
return root;
}
+static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ if (objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->tree_root);
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->extent_root);
+ if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->chunk_root);
+ if (objectid == BTRFS_DEV_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->dev_root);
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->csum_root);
+ if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->quota_root) ?
+ fs_info->quota_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_UUID_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->uuid_root) ?
+ fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->free_space_root) ?
+ fs_info->free_space_root : ERR_PTR(-ENOENT);
+ return NULL;
+}
+
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root)
{
@@ -1453,7 +1544,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
root = list_first_entry(&fs_info->allocated_roots,
struct btrfs_root, leak_list);
btrfs_err(fs_info, "leaked root %s refcount %d",
- btrfs_root_name(root->root_key.objectid, buf),
+ btrfs_root_name(&root->root_key, buf),
refcount_read(&root->refs));
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
@@ -1466,7 +1557,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
- percpu_counter_destroy(&fs_info->dio_bytes);
+ percpu_counter_destroy(&fs_info->ordered_bytes);
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
btrfs_free_csum_hash(fs_info);
btrfs_free_stripe_hash_table(fs_info);
@@ -1518,25 +1609,9 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
int ret;
- if (objectid == BTRFS_ROOT_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->tree_root);
- if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->extent_root);
- if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->chunk_root);
- if (objectid == BTRFS_DEV_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->dev_root);
- if (objectid == BTRFS_CSUM_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->csum_root);
- if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->quota_root) ?
- fs_info->quota_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_UUID_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->uuid_root) ?
- fs_info->uuid_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->free_space_root) ?
- fs_info->free_space_root : ERR_PTR(-ENOENT);
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
@@ -1622,6 +1697,52 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
}
/*
+ * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * @fs_info: the fs_info
+ * @objectid: the objectid we need to lookup
+ *
+ * This is exclusively used for backref walking, and exists specifically because
+ * of how qgroups do lookups. Qgroups will do a backref lookup at delayed ref
+ * creation time, which means we may have to read the tree_root in order to look
+ * up a fs root that is not in memory. If the root is not in memory we will
+ * read the tree root commit root and look up the fs root from there. This is a
+ * temporary root, it will not be inserted into the radix tree as it doesn't
+ * have the most uptodate information, it'll simply be discarded once the
+ * backref code is finished using the root.
+ */
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid)
+{
+ struct btrfs_root *root;
+ struct btrfs_key key;
+
+ ASSERT(path->search_commit_root && path->skip_locking);
+
+ /*
+ * This can return -ENOENT if we ask for a root that doesn't exist, but
+ * since this is called via the backref walking code we won't be looking
+ * up a root that doesn't exist, unless there's corruption. So if root
+ * != NULL just return it.
+ */
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ root = btrfs_lookup_fs_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = read_tree_root_path(fs_info->tree_root, path, &key);
+ btrfs_release_path(path);
+
+ return root;
+}
+
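
The ASSERT at the top of btrfs_get_fs_root_commit_root() encodes its calling convention: the supplied path must already be configured for commit-root, lock-free searches, and because the returned root is never inserted into the radix tree the caller is responsible for dropping it. A hedged, kernel-context sketch of a backref-walking caller (not standalone code; the surrounding variables are assumed):

	struct btrfs_path *path;
	struct btrfs_root *root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	root = btrfs_get_fs_root_commit_root(fs_info, path, root_objectid);
	if (IS_ERR(root)) {
		btrfs_free_path(path);
		return PTR_ERR(root);
	}

	/* ... resolve backrefs against the commit root ... */

	btrfs_put_root(root);
	btrfs_free_path(path);
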
+/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
*/
@@ -1695,7 +1816,7 @@ static int cleaner_kthread(void *arg)
*/
btrfs_delete_unused_bgs(fs_info);
sleep:
- clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
+ clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
if (kthread_should_park())
kthread_parkme();
if (kthread_should_stop())
@@ -1715,13 +1836,13 @@ static int transaction_kthread(void *arg)
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
u64 transid;
- time64_t now;
+ time64_t delta;
unsigned long delay;
bool cannot_commit;
do {
cannot_commit = false;
- delay = HZ * fs_info->commit_interval;
+ delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
@@ -1731,12 +1852,13 @@ static int transaction_kthread(void *arg)
goto sleep;
}
- now = ktime_get_seconds();
+ delta = ktime_get_seconds() - cur->start_time;
if (cur->state < TRANS_STATE_COMMIT_START &&
- (now < cur->start_time ||
- now - cur->start_time < fs_info->commit_interval)) {
+ delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
- delay = HZ * 5;
+ delay -= msecs_to_jiffies((delta - 1) * 1000);
+ delay = min(delay,
+ msecs_to_jiffies(fs_info->commit_interval * 1000));
goto sleep;
}
transid = cur->transid;
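
The transaction kthread no longer polls every 5 seconds; it now computes how much of the commit interval is left for the currently running transaction and sleeps roughly that long, capped at one full interval. A minimal standalone sketch of the arithmetic with jiffies replaced by milliseconds (values in main() are illustrative):

#include <stdio.h>

/*
 * Mirrors the sleep calculation above for 1 <= delta < commit_interval;
 * the cap also covers a freshly started transaction (delta == 0).
 */
static unsigned long remaining_sleep_ms(unsigned long commit_interval_s,
					long delta_s)
{
	unsigned long delay = commit_interval_s * 1000;

	if (delta_s < (long)commit_interval_s) {
		delay -= (delta_s - 1) * 1000;
		if (delay > commit_interval_s * 1000)
			delay = commit_interval_s * 1000;
	}
	return delay;
}

int main(void)
{
	/* 30s commit interval, transaction started 10s ago -> sleep ~21s. */
	printf("%lu ms\n", remaining_sleep_ms(30, 10));
	return 0;
}
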
@@ -1985,8 +2107,6 @@ void btrfs_put_root(struct btrfs_root *root)
free_anon_bdev(root->anon_dev);
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
- kfree(root->free_ino_ctl);
- kfree(root->free_ino_pinned);
#ifdef CONFIG_BTRFS_DEBUG
spin_lock(&root->fs_info->fs_roots_radix_lock);
list_del_init(&root->leak_list);
@@ -2201,8 +2321,9 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return -ENOMEM;
log_tree_root->node = read_tree_block(fs_info, bytenr,
- fs_info->generation + 1,
- level, NULL);
+ BTRFS_TREE_LOG_OBJECTID,
+ fs_info->generation + 1, level,
+ NULL);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@@ -2247,30 +2368,42 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->extent_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->extent_root = root;
location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->dev_root = root;
+ btrfs_init_devices_late(fs_info);
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->dev_root = root;
- btrfs_init_devices_late(fs_info);
- location.objectid = BTRFS_CSUM_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ /* If IGNOREDATACSUMS is set don't bother reading the csum root. */
+ if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
+ location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->csum_root = root;
+ }
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->csum_root = root;
/*
* This tree can share blocks with some other fs tree during relocation
@@ -2279,11 +2412,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
root = btrfs_get_fs_root(tree_root->fs_info,
BTRFS_DATA_RELOC_TREE_OBJECTID, true);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->data_reloc_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->data_reloc_root = root;
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
@@ -2296,9 +2432,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
location.objectid = BTRFS_UUID_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- if (ret != -ENOENT)
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ if (ret != -ENOENT)
+ goto out;
+ }
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->uuid_root = root;
@@ -2308,11 +2446,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->free_space_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->free_space_root = root;
}
return 0;
@@ -2373,13 +2514,21 @@ static int validate_super(struct btrfs_fs_info *fs_info,
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
ret = -EINVAL;
}
- /* Only PAGE SIZE is supported yet */
- if (sectorsize != PAGE_SIZE) {
+
+ /*
+ * For 4K page size, we only support 4K sector size.
+ * For 64K page size, we support read-write for 64K sector size, and
+ * read-only for 4K sector size.
+ */
+ if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
+ (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
+ sectorsize != SZ_64K))) {
btrfs_err(fs_info,
- "sectorsize %llu not supported yet, only support %lu",
+ "sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
ret = -EINVAL;
}
+
if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
@@ -2568,6 +2717,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
generation = btrfs_super_generation(sb);
level = btrfs_super_root_level(sb);
tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
+ BTRFS_ROOT_TREE_OBJECTID,
generation, level, NULL);
if (IS_ERR(tree_root->node)) {
handle_error = true;
@@ -2591,14 +2741,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
* No need to hold btrfs_root::objectid_mutex since the fs
* hasn't been fully initialised and we are the only user
*/
- ret = btrfs_find_highest_objectid(tree_root,
- &tree_root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(tree_root);
if (ret < 0) {
handle_error = true;
continue;
}
- ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+ ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
ret = btrfs_read_roots(fs_info);
if (ret < 0) {
@@ -2640,11 +2789,13 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
+ spin_lock_init(&fs_info->treelog_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
+ mutex_init(&fs_info->zoned_meta_io_lock);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2732,6 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
/* Usable values until the real ones are cached from the superblock */
fs_info->nodesize = 4096;
fs_info->sectorsize = 4096;
+ fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2748,7 +2900,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
- ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -2774,6 +2926,9 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
return -ENOMEM;
btrfs_init_delayed_root(fs_info->delayed_root);
+ if (sb_rdonly(sb))
+ set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
+
return btrfs_alloc_stripe_hash_table(fs_info);
}
@@ -2814,6 +2969,110 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
return 0;
}
+/*
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount
+ * code paths.
+ */
+void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
+{
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+}
+
+/*
+ * Mounting logic specific to read-write file systems. Shared by open_ctree
+ * and btrfs_remount when remounting from read-only to read-write.
+ */
+int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
+ bool clear_free_space_tree = false;
+
+ if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ clear_free_space_tree = true;
+ } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
+ btrfs_warn(fs_info, "free space tree is invalid");
+ clear_free_space_tree = true;
+ }
+
+ if (clear_free_space_tree) {
+ btrfs_info(fs_info, "clearing free space tree");
+ ret = btrfs_clear_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to clear free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ if (ret)
+ goto out;
+
+ down_read(&fs_info->cleanup_work_sem);
+ if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+ (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
+ up_read(&fs_info->cleanup_work_sem);
+ goto out;
+ }
+ up_read(&fs_info->cleanup_work_sem);
+
+ mutex_lock(&fs_info->cleaner_mutex);
+ ret = btrfs_recover_relocation(fs_info->tree_root);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
+ goto out;
+ }
+
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info, "creating free space tree");
+ ret = btrfs_create_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to create free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
+ ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret)
+ goto out;
+
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info, "failed to resume dev_replace");
+ goto out;
+ }
+
+ btrfs_qgroup_rescan_resume(fs_info);
+
+ if (!fs_info->uuid_root) {
+ btrfs_info(fs_info, "creating UUID tree");
+ ret = btrfs_create_uuid_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to create the UUID tree %d", ret);
+ goto out;
+ }
+ }
+
+ ret = btrfs_find_orphan_roots(fs_info);
+out:
+ return ret;
+}
+
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
char *options)
{
@@ -2829,7 +3088,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
struct btrfs_root *chunk_root;
int ret;
int err = -EINVAL;
- int clear_free_space_tree = 0;
int level;
ret = init_mount_fs_info(fs_info, sb);
@@ -2882,6 +3140,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
+ fs_info->csum_size = btrfs_super_csum_size(disk_super);
+
ret = btrfs_init_csum_hash(fs_info, csum_type);
if (ret) {
err = ret;
@@ -2996,6 +3256,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
/* Cache block sizes */
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
/*
@@ -3026,6 +3288,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
+	/* 4K sector size with 64K page size is currently supported read-only */
+ if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
+ if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
+ btrfs_err(fs_info,
+ "subpage sectorsize %u only supported read-only for page size %lu",
+ sectorsize, PAGE_SIZE);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+ }
+
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3052,6 +3325,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
chunk_root->node = read_tree_block(fs_info,
btrfs_super_chunk_root(disk_super),
+ BTRFS_CHUNK_TREE_OBJECTID,
generation, level, NULL);
if (IS_ERR(chunk_root->node) ||
!extent_buffer_uptodate(chunk_root->node)) {
@@ -3075,11 +3349,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Keep the devid that is marked to be the target device for the
- * device replace procedure
+	 * At this point we know all the devices that make up this filesystem,
+	 * including the seed devices, but we don't know yet whether the replace
+	 * target is required. So free the devices that are not part of this
+	 * filesystem, but skip the replace target device, which is checked
+	 * below in btrfs_init_dev_replace().
*/
- btrfs_free_extra_devids(fs_devices, 0);
-
+ btrfs_free_extra_devids(fs_devices);
if (!fs_devices->latest_bdev) {
btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
@@ -3090,6 +3366,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_tree_roots;
/*
+ * Get zone type information of zoned block devices. This will also
+ * handle emulation of a zoned filesystem if a regular device has the
+ * zoned incompat feature flag set.
+ */
+ ret = btrfs_get_dev_zone_info_all_devices(fs_info);
+ if (ret) {
+ btrfs_err(fs_info,
+ "zoned: failed to read device zone info: %d",
+ ret);
+ goto fail_block_groups;
+ }
+
+ /*
* If we have a uuid root and we're not being told to rescan we need to
* check the generation here so we can set the
* BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
@@ -3126,7 +3415,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_block_groups;
}
- btrfs_free_extra_devids(fs_devices, 1);
+ ret = btrfs_check_zoned_mode(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to initialize zoned mode: %d",
+ ret);
+ goto fail_block_groups;
+ }
ret = btrfs_sysfs_add_fsid(fs_devices);
if (ret) {
@@ -3212,26 +3506,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
}
- ret = btrfs_find_orphan_roots(fs_info);
- if (ret)
- goto fail_qgroup;
-
- if (!sb_rdonly(sb)) {
- ret = btrfs_cleanup_fs_roots(fs_info);
- if (ret)
- goto fail_qgroup;
-
- mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(tree_root);
- mutex_unlock(&fs_info->cleaner_mutex);
- if (ret < 0) {
- btrfs_warn(fs_info, "failed to recover relocation: %d",
- ret);
- err = -EINVAL;
- goto fail_qgroup;
- }
- }
-
fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
@@ -3241,78 +3515,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
if (sb_rdonly(sb))
- return 0;
-
- if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
- btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- clear_free_space_tree = 1;
- } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
- !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
- btrfs_warn(fs_info, "free space tree is invalid");
- clear_free_space_tree = 1;
- }
-
- if (clear_free_space_tree) {
- btrfs_info(fs_info, "clearing free space tree");
- ret = btrfs_clear_free_space_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to clear free space tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- }
-
- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
- !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- btrfs_info(fs_info, "creating free space tree");
- ret = btrfs_create_free_space_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create free space tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- }
+ goto clear_oneshot;
- down_read(&fs_info->cleanup_work_sem);
- if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
- (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
- up_read(&fs_info->cleanup_work_sem);
- close_ctree(fs_info);
- return ret;
- }
- up_read(&fs_info->cleanup_work_sem);
-
- ret = btrfs_resume_balance_async(fs_info);
- if (ret) {
- btrfs_warn(fs_info, "failed to resume balance: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
-
- ret = btrfs_resume_dev_replace_async(fs_info);
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
- btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
close_ctree(fs_info);
return ret;
}
-
- btrfs_qgroup_rescan_resume(fs_info);
btrfs_discard_resume(fs_info);
- if (!fs_info->uuid_root) {
- btrfs_info(fs_info, "creating UUID tree");
- ret = btrfs_create_uuid_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create the UUID tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- } else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
- fs_info->generation !=
- btrfs_super_uuid_tree_generation(disk_super)) {
+ if (fs_info->uuid_root &&
+ (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
+ fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
btrfs_info(fs_info, "checking UUID tree");
ret = btrfs_check_uuid_tree(fs_info);
if (ret) {
@@ -3322,14 +3536,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
return ret;
}
}
- set_bit(BTRFS_FS_OPEN, &fs_info->flags);
- /*
- * backuproot only affect mount behavior, and if open_ctree succeeded,
- * no need to keep the flag
- */
- btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+clear_oneshot:
+ btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
@@ -3410,10 +3621,17 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
{
struct btrfs_super_block *super;
struct page *page;
- u64 bytenr;
+ u64 bytenr, bytenr_orig;
struct address_space *mapping = bdev->bd_inode->i_mapping;
+ int ret;
+
+ bytenr_orig = btrfs_sb_offset(copy_num);
+ ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
+ if (ret == -ENOENT)
+ return ERR_PTR(-EINVAL);
+ else if (ret)
+ return ERR_PTR(ret);
- bytenr = btrfs_sb_offset(copy_num);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
return ERR_PTR(-EINVAL);
@@ -3427,7 +3645,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
return ERR_PTR(-ENODATA);
}
- if (btrfs_super_bytenr(super) != bytenr) {
+ if (btrfs_super_bytenr(super) != bytenr_orig) {
btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
@@ -3482,7 +3700,8 @@ static int write_dev_supers(struct btrfs_device *device,
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
int i;
int errors = 0;
- u64 bytenr;
+ int ret;
+ u64 bytenr, bytenr_orig;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
@@ -3494,12 +3713,22 @@ static int write_dev_supers(struct btrfs_device *device,
struct bio *bio;
struct btrfs_super_block *disk_super;
- bytenr = btrfs_sb_offset(i);
+ bytenr_orig = btrfs_sb_offset(i);
+ ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
+ if (ret == -ENOENT) {
+ continue;
+ } else if (ret < 0) {
+ btrfs_err(device->fs_info,
+ "couldn't get super block location for mirror %d",
+ i);
+ errors++;
+ continue;
+ }
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
- btrfs_set_super_bytenr(sb, bytenr);
+ btrfs_set_super_bytenr(sb, bytenr_orig);
crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
@@ -3544,6 +3773,7 @@ static int write_dev_supers(struct btrfs_device *device,
bio->bi_opf |= REQ_FUA;
btrfsic_submit_bio(bio);
+ btrfs_advance_sb_log(device, i);
}
return errors < i ? 0 : -1;
}
@@ -3560,6 +3790,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
int i;
int errors = 0;
bool primary_failed = false;
+ int ret;
u64 bytenr;
if (max_mirrors == 0)
@@ -3568,7 +3799,15 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
for (i = 0; i < max_mirrors; i++) {
struct page *page;
- bytenr = btrfs_sb_offset(i);
+ ret = btrfs_sb_log_location(device, i, READ, &bytenr);
+ if (ret == -ENOENT) {
+ break;
+ } else if (ret < 0) {
+ errors++;
+ if (i == 0)
+ primary_failed = true;
+ continue;
+ }
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
@@ -3882,14 +4121,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
}
}
- if (root->free_ino_pinned)
- __btrfs_remove_free_space_cache(root->free_ino_pinned);
- if (root->free_ino_ctl)
- __btrfs_remove_free_space_cache(root->free_ino_ctl);
- if (root->ino_cache_inode) {
- iput(root->ino_cache_inode);
- root->ino_cache_inode = NULL;
- }
if (drop_ref)
btrfs_put_root(root);
}
@@ -4001,6 +4232,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
+ cancel_work_sync(&fs_info->preempt_reclaim_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4053,9 +4285,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
- if (percpu_counter_sum(&fs_info->dio_bytes))
+ if (percpu_counter_sum(&fs_info->ordered_bytes))
btrfs_info(fs_info, "at unmount dio bytes count %lld",
- percpu_counter_sum(&fs_info->dio_bytes));
+ percpu_counter_sum(&fs_info->ordered_bytes));
btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -4069,6 +4301,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
+ /* We shouldn't have any transaction open at this point */
+ ASSERT(list_empty(&fs_info->trans_list));
+
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
free_root_pointers(fs_info, true);
btrfs_free_fs_roots(fs_info);
@@ -4112,8 +4347,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info = buf->fs_info;
u64 transid = btrfs_header_generation(buf);
int was_dirty;
@@ -4126,8 +4360,6 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
- root = BTRFS_I(buf->pages[0]->mapping->host)->root;
- fs_info = root->fs_info;
btrfs_assert_tree_locked(buf);
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
@@ -4576,6 +4808,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
+ btrfs_free_redirty_list(cur_trans);
+
 cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
}
@@ -4632,3 +4866,58 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
+
+int btrfs_init_root_free_objectid(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *l;
+ struct btrfs_key search_key;
+ struct btrfs_key found_key;
+ int slot;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+ search_key.type = -1;
+ search_key.offset = (u64)-1;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0); /* Corruption */
+ if (path->slots[0] > 0) {
+ slot = path->slots[0] - 1;
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ root->free_objectid = max_t(u64, found_key.objectid + 1,
+ BTRFS_FIRST_FREE_OBJECTID);
+ } else {
+ root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
+{
+ int ret;
+ mutex_lock(&root->objectid_mutex);
+
+ if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ btrfs_warn(root->fs_info,
+ "the objectid of root %llu reaches its highest value",
+ root->root_key.objectid);
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ *objectid = root->free_objectid++;
+ ret = 0;
+out:
+ mutex_unlock(&root->objectid_mutex);
+ return ret;
+}
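
For orientation, the two helpers above replace the old inode-map based objectid allocation. A minimal, purely hypothetical usage sketch follows (this caller does not exist in the tree; in practice the seeding step runs once when a root is set up):

	/*
	 * Illustration only, not a real call site: seed and consume the
	 * per-root objectid counter using the helpers added above.
	 */
	static int example_alloc_inode_objectid(struct btrfs_root *root, u64 *ino)
	{
		int ret;

		/* Seed root->free_objectid from the highest key in the root. */
		ret = btrfs_init_root_free_objectid(root);
		if (ret)
			return ret;

		/* Hand out root->free_objectid++ under root->objectid_mutex. */
		return btrfs_get_free_objectid(root, ino);
	}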
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index fee69ced58b4..0e7e9526b6a8 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -43,13 +43,15 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 parent_transid, int level,
- struct btrfs_key *first_key);
-void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr);
+ u64 owner_root, u64 parent_transid,
+ int level, struct btrfs_key *first_key);
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
- u64 bytenr);
+ u64 bytenr, u64 owner_root,
+ int level);
void btrfs_clean_tree_block(struct extent_buffer *buf);
+void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
+int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
@@ -69,6 +71,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, dev_t anon_dev);
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
@@ -76,7 +81,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
struct page *page, u64 start, u64 end,
int mirror);
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
@@ -109,12 +114,14 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
struct btrfs_key *first_key);
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
-blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset, void *private_data,
- extent_submit_bio_start_t *submit_bio_start);
+blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
@@ -128,16 +135,15 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
+int btrfs_init_root_free_objectid(struct btrfs_root *root);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_init_lockdep(void);
void btrfs_set_buffer_lockdep_class(u64 objectid,
struct extent_buffer *eb, int level);
#else
-static inline void btrfs_init_lockdep(void)
-{ }
static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
struct extent_buffer *eb, int level)
{
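
The prototype changes above add an owner root (and, for btrfs_find_create_tree_block, a level) to the tree block helpers. A conversion sketch with a made-up wrapper, matching the call-site updates later in this diff:

	/* Made-up helper illustrating the new read_tree_block() arguments. */
	static struct extent_buffer *example_read_owned_block(struct btrfs_root *root,
							      u64 bytenr, u64 gen, int level,
							      struct btrfs_key *first_key)
	{
		/* owner_root is new: the objectid of the tree owning the block. */
		return read_tree_block(root->fs_info, bytenr, root->root_key.objectid,
				       gen, level, first_key);
	}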
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1a8d419d9e1f..1d4c2397d0d6 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -222,7 +222,6 @@ static int btrfs_get_name(struct dentry *parent, char *name,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
if (ino == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = BTRFS_I(inode)->root->root_key.objectid;
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 9800a8306368..04083ee5ae6e 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -21,10 +21,24 @@ struct io_failure_record;
#define EXTENT_NORESERVE (1U << 11)
#define EXTENT_QGROUP_RESERVED (1U << 12)
#define EXTENT_CLEAR_DATA_RESV (1U << 13)
+/*
+ * Must be cleared only during ordered extent completion or on error paths if we
+ * did not manage to submit bios and create the ordered extents for the range.
+ * Should not be cleared during page release and page invalidation (if there is
+ * an ordered extent in flight); that is left for the ordered extent completion.
+ */
#define EXTENT_DELALLOC_NEW (1U << 14)
+/*
+ * When an ordered extent successfully completes for a region marked as a new
+ * delalloc range, use this flag when clearing a new delalloc range to indicate
+ * that the VFS' inode number of bytes should be incremented and the inode's new
+ * delalloc bytes decremented, in an atomic way to prevent races with stat(2).
+ */
+#define EXTENT_ADD_INODE_BYTES (1U << 15)
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
+ EXTENT_ADD_INODE_BYTES)
/*
* Redefined bits above which are used only in the device allocation tree,
@@ -73,7 +87,7 @@ struct extent_state {
/* ADD NEW ELEMENTS AFTER THIS */
wait_queue_head_t wq;
refcount_t refs;
- unsigned state;
+ u32 state;
struct io_failure_record *failrec;
@@ -105,19 +119,18 @@ void __cold extent_io_exit(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, unsigned bits, int contig);
+ u64 max_bytes, u32 bits, int contig);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled,
- struct extent_state *cached_state);
+ u32 bits, int filled, struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
+ u32 bits, struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
+ u32 bits, int wake, int delete,
struct extent_state **cached);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
+ u32 bits, int wake, int delete,
struct extent_state **cached, gfp_t mask,
struct extent_changeset *changeset);
@@ -141,7 +154,7 @@ static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
+ u64 end, u32 bits)
{
int wake = 0;
@@ -152,17 +165,19 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
+ u32 bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask);
+ u32 bits, unsigned exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask,
+ struct extent_changeset *changeset);
int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits);
+ u32 bits);
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
+ u64 end, u32 bits)
{
- return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
+ return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
+ NULL);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -175,8 +190,8 @@ static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
- NULL, mask);
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL,
+ mask, NULL);
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -188,16 +203,16 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
+ u32 bits, u32 clear_bits,
struct extent_state **cached_state);
static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned int extra_bits,
+ u64 end, u32 extra_bits,
struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
- NULL, cached_state, GFP_NOFS);
+ 0, NULL, cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
@@ -205,30 +220,30 @@ static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, GFP_NOFS);
+ 0, NULL, cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
u64 end)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
- GFP_NOFS);
+ return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL,
+ GFP_NOFS, NULL);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
- cached_state, mask);
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
+ cached_state, mask, NULL);
}
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
+ u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state);
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits);
+ u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits);
+ u64 *start_ret, u64 *end_ret, u32 bits);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
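
As a quick reference for the new set_extent_bit() argument order introduced above (exclusive_bits and the changeset are now explicit), a sketch with a made-up wrapper that mirrors the in-header helpers:

	/* Made-up wrapper: the common non-exclusive, non-recording pattern. */
	static inline int example_set_dirty(struct extent_io_tree *tree,
					    u64 start, u64 end)
	{
		/* bits, exclusive_bits, failed_start, cached_state, mask, changeset */
		return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL,
				      GFP_NOFS, NULL);
	}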
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3b21fee13e77..78ad31a59e59 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,6 +34,8 @@
#include "block-group.h"
#include "discard.h"
#include "rcu-string.h"
+#include "zoned.h"
+#include "dev-replace.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -82,41 +84,6 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
EXTENT_UPTODATE);
}
-static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
-{
- if (ref->type == BTRFS_REF_METADATA) {
- if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
- return BTRFS_BLOCK_GROUP_SYSTEM;
- else
- return BTRFS_BLOCK_GROUP_METADATA;
- }
- return BTRFS_BLOCK_GROUP_DATA;
-}
-
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *ref)
-{
- struct btrfs_space_info *space_info;
- u64 flags = generic_ref_to_space_flags(ref);
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
-static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *ref)
-{
- struct btrfs_space_info *space_info;
- u64 flags = generic_ref_to_space_flags(ref);
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
@@ -844,6 +811,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
want = extent_ref_type(parent, owner);
if (insert) {
extra_size = btrfs_extent_inline_ref_size(want);
+ path->search_for_extension = 1;
path->keep_locks = 1;
} else
extra_size = -1;
@@ -996,6 +964,7 @@ again:
out:
if (insert) {
path->keep_locks = 0;
+ path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
return err;
@@ -1297,6 +1266,46 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
+static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
+{
+ struct btrfs_device *dev = stripe->dev;
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ u64 phys = stripe->physical;
+ u64 len = stripe->length;
+ u64 discarded = 0;
+ int ret = 0;
+
+ /* Zone reset on a zoned filesystem */
+ if (btrfs_can_zone_reset(dev, phys, len)) {
+ u64 src_disc;
+
+ ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
+ if (ret)
+ goto out;
+
+ if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
+ dev != dev_replace->srcdev)
+ goto out;
+
+ src_disc = discarded;
+
+ /* Send to replace target as well */
+ ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
+ &discarded);
+ discarded += src_disc;
+ } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
+ ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
+ } else {
+ ret = 0;
+ *bytes = 0;
+ }
+
+out:
+ *bytes = discarded;
+ return ret;
+}
+
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes)
{
@@ -1331,20 +1340,13 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
stripe = bbio->stripes;
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
- struct request_queue *req_q;
if (!stripe->dev->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
- req_q = bdev_get_queue(stripe->dev->bdev);
- if (!blk_queue_discard(req_q))
- continue;
- ret = btrfs_issue_discard(stripe->dev->bdev,
- stripe->physical,
- stripe->length,
- &bytes);
+ ret = do_discard_extent(stripe, &bytes);
if (!ret) {
discarded_bytes += bytes;
} else if (ret != -EOPNOTSUPP) {
@@ -1386,7 +1388,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int old_ref_mod, new_ref_mod;
int ret;
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
@@ -1395,17 +1396,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
- ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
- NULL, &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
else
- ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
btrfs_ref_tree_mod(fs_info, generic_ref);
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
- sub_pinned_bytes(fs_info, generic_ref);
-
return ret;
}
@@ -1465,7 +1461,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
parent, root_objectid, owner,
@@ -1489,7 +1484,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->leave_spinning = 1;
/* now insert the actual backref */
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
BUG_ON(refs_to_add != 1);
@@ -1605,7 +1599,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
- path->leave_spinning = 1;
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
if (ret < 0) {
err = ret;
@@ -1796,34 +1789,28 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
{
int nr_items = 1; /* Dropping this ref head update. */
- if (head->total_ref_mod < 0) {
- struct btrfs_space_info *space_info;
- u64 flags;
+ /*
+ * We had csum deletions accounted for in our delayed refs rsv, so we
+ * need to drop the csum leaves for this update from our delayed_refs_rsv.
+ */
+ if (head->total_ref_mod < 0 && head->is_data) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= head->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+ }
- if (head->is_data)
- flags = BTRFS_BLOCK_GROUP_DATA;
- else if (head->is_system)
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- else
- flags = BTRFS_BLOCK_GROUP_METADATA;
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -head->num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ /*
+ * We were dropping refs, or had a new ref and dropped it, and thus must
+ * adjust down our total_bytes_pinned; the space may or may not have
+ * been pinned and so is accounted for properly in the pinned space by
+ * now.
+ */
+ if (head->total_ref_mod < 0 ||
+ (head->total_ref_mod == 0 && head->must_insert_reserved)) {
+ u64 flags = btrfs_ref_head_to_space_flags(head);
- /*
- * We had csum deletions accounted for in our delayed refs rsv,
- * we need to drop the csum leaves for this update from our
- * delayed_refs_rsv.
- */
- if (head->is_data) {
- spin_lock(&delayed_refs->lock);
- delayed_refs->pending_csums -= head->num_bytes;
- spin_unlock(&delayed_refs->lock);
- nr_items += btrfs_csum_bytes_to_leaves(fs_info,
- head->num_bytes);
- }
+ btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
}
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
@@ -2133,25 +2120,6 @@ static u64 find_middle(struct rb_root *root)
#endif
/*
- * Takes the number of bytes to be csumm'ed and figures out how many leaves it
- * would require to store the csums for that many bytes.
- */
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
- u64 csum_size;
- u64 num_csums_per_leaf;
- u64 num_csums;
-
- csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
- num_csums_per_leaf = div64_u64(csum_size,
- (u64)btrfs_super_csum_size(fs_info->super_copy));
- num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
- num_csums += num_csums_per_leaf - 1;
- num_csums = div64_u64(num_csums, num_csums_per_leaf);
- return num_csums;
-}
-
-/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
* 0, which means to process everything in the tree at the start
@@ -2180,7 +2148,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
if (count == 0)
- count = atomic_read(&delayed_refs->num_entries) * 2;
+ count = delayed_refs->num_heads_ready;
again:
#ifdef SCRAMBLE_DELAYED_REFS
@@ -2592,8 +2560,7 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
- num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
@@ -2622,8 +2589,6 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
struct btrfs_block_group *cache;
int ret;
- btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
-
cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
if (!cache)
return -EINVAL;
@@ -2635,11 +2600,19 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
* the pinned extents.
*/
btrfs_cache_block_group(cache, 1);
+ /*
+ * Make sure we wait until the cache is completely built in case it is
+ * missing or is invalid and therefore needs to be rebuilt.
+ */
+ ret = btrfs_wait_block_group_cache_done(cache);
+ if (ret)
+ goto out;
pin_down_extent(trans, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+out:
btrfs_put_block_group(cache);
return ret;
}
@@ -2649,45 +2622,22 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
{
int ret;
struct btrfs_block_group *block_group;
- struct btrfs_caching_control *caching_ctl;
block_group = btrfs_lookup_block_group(fs_info, start);
if (!block_group)
return -EINVAL;
- btrfs_cache_block_group(block_group, 0);
- caching_ctl = btrfs_get_caching_control(block_group);
-
- if (!caching_ctl) {
- /* Logic error */
- BUG_ON(!btrfs_block_group_done(block_group));
- ret = btrfs_remove_free_space(block_group, start, num_bytes);
- } else {
- mutex_lock(&caching_ctl->mutex);
-
- if (start >= caching_ctl->progress) {
- ret = btrfs_add_excluded_extent(fs_info, start,
- num_bytes);
- } else if (start + num_bytes <= caching_ctl->progress) {
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- } else {
- num_bytes = caching_ctl->progress - start;
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- if (ret)
- goto out_lock;
+ btrfs_cache_block_group(block_group, 1);
+ /*
+ * Make sure we wait until the cache is completely built in case it is
+ * missing or is invalid and therefore needs to be rebuilt.
+ */
+ ret = btrfs_wait_block_group_cache_done(block_group);
+ if (ret)
+ goto out;
- num_bytes = (start + num_bytes) -
- caching_ctl->progress;
- start = caching_ctl->progress;
- ret = btrfs_add_excluded_extent(fs_info, start,
- num_bytes);
- }
-out_lock:
- mutex_unlock(&caching_ctl->mutex);
- btrfs_put_caching_control(caching_ctl);
- }
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+out:
btrfs_put_block_group(block_group);
return ret;
}
@@ -2730,31 +2680,6 @@ btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
atomic_inc(&bg->reservations);
}
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_caching_control *next;
- struct btrfs_caching_control *caching_ctl;
- struct btrfs_block_group *cache;
-
- down_write(&fs_info->commit_root_sem);
-
- list_for_each_entry_safe(caching_ctl, next,
- &fs_info->caching_block_groups, list) {
- cache = caching_ctl->block_group;
- if (btrfs_block_group_done(cache)) {
- cache->last_byte_to_unpin = (u64)-1;
- list_del_init(&caching_ctl->list);
- btrfs_put_caching_control(caching_ctl);
- } else {
- cache->last_byte_to_unpin = caching_ctl->progress;
- }
- }
-
- up_write(&fs_info->commit_root_sem);
-
- btrfs_update_global_block_rsv(fs_info);
-}
-
/*
* Returns the free cluster for the given space info and sets empty_cluster to
* what it should be based on the mount options.
@@ -2816,11 +2741,13 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
- if (start < cache->last_byte_to_unpin) {
- len = min(len, cache->last_byte_to_unpin - start);
- if (return_free_space)
- btrfs_add_free_space(cache, start, len);
+ down_read(&fs_info->commit_root_sem);
+ if (start < cache->last_byte_to_unpin && return_free_space) {
+ u64 add_len = min(len, cache->last_byte_to_unpin - start);
+
+ btrfs_add_free_space(cache, start, add_len);
}
+ up_read(&fs_info->commit_root_sem);
start += len;
total_unpinned += len;
@@ -2844,11 +2771,14 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
cache->pinned -= len;
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
space_info->max_extent_size = 0;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(space_info, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
+ } else if (btrfs_is_zoned(fs_info)) {
+ /* Need reset before reusing in a zoned block group */
+ space_info->bytes_zone_unusable += len;
+ readonly = true;
}
spin_unlock(&cache->lock);
if (!readonly && return_free_space &&
@@ -2901,9 +2831,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
- if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
- clear_extent_bits(&fs_info->excluded_extents, start,
- end, EXTENT_UPTODATE);
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
@@ -3040,8 +2967,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
if (!is_data && refs_to_drop != 1) {
@@ -3106,7 +3031,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_release_path(path);
- path->leave_spinning = 1;
/* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
@@ -3185,7 +3109,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_tree_block_info *bi;
if (item_size < sizeof(*ei) + sizeof(*bi)) {
btrfs_crit(info,
-"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu",
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
key.objectid, key.type, key.offset,
owner_objectid, item_size,
sizeof(*ei) + sizeof(*bi));
@@ -3384,7 +3308,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ref generic_ref = { 0 };
- int pin = 1;
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
@@ -3393,13 +3316,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
root->root_key.objectid);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- int old_ref_mod, new_ref_mod;
-
btrfs_ref_tree_mod(fs_info, &generic_ref);
- ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
BUG_ON(ret); /* -ENOMEM */
- pin = old_ref_mod >= 0 && new_ref_mod < 0;
}
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
@@ -3407,11 +3326,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
- if (!ret)
+ if (!ret) {
+ btrfs_redirty_list_add(trans->transaction, buf);
goto out;
+ }
}
- pin = 0;
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -3420,6 +3340,13 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
+ if (btrfs_is_zoned(fs_info)) {
+ btrfs_redirty_list_add(trans->transaction, buf);
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
+ btrfs_put_block_group(cache);
+ goto out;
+ }
+
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
@@ -3428,9 +3355,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
}
out:
- if (pin)
- add_pinned_bytes(fs_info, &generic_ref);
-
if (last_ref) {
/*
* Deleting the buffer, clear the corrupt flag since it doesn't
@@ -3444,7 +3368,6 @@ out:
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int old_ref_mod, new_ref_mod;
int ret;
if (btrfs_is_testing(fs_info))
@@ -3460,14 +3383,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
- old_ref_mod = new_ref_mod = 0;
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
- ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
} else {
- ret = btrfs_add_delayed_data_ref(trans, ref, 0,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_data_ref(trans, ref, 0);
}
if (!((ref->type == BTRFS_REF_METADATA &&
@@ -3476,9 +3396,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
- add_pinned_bytes(fs_info, ref);
-
return ret;
}
@@ -3555,6 +3472,7 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
+ BTRFS_EXTENT_ALLOC_ZONED,
};
/*
@@ -3579,6 +3497,9 @@ struct find_free_extent_ctl {
bool have_caching_bg;
bool orig_have_caching_bg;
+ /* Allocation is called for tree-log */
+ bool for_treelog;
+
/* RAID index, converted from flags */
int index;
@@ -3807,6 +3728,118 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
return find_free_extent_unclustered(block_group, ffe_ctl);
}
+/*
+ * Tree-log block group locking
+ * ============================
+ *
+ * fs_info::treelog_bg_lock protects fs_info::treelog_bg, which holds the
+ * start address of the block group that is reserved only for tree-log
+ * metadata.
+ *
+ * Lock nesting
+ * ============
+ *
+ * space_info::lock
+ * block_group::lock
+ * fs_info::treelog_bg_lock
+ */
+
+/*
+ * Simple allocator for sequential-only block group. It only allows sequential
+ * allocation. No need to play with trees. This function also reserves the
+ * bytes as in btrfs_add_reserved_bytes.
+ */
+static int do_allocation_zoned(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *space_info = block_group->space_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ u64 start = block_group->start;
+ u64 num_bytes = ffe_ctl->num_bytes;
+ u64 avail;
+ u64 bytenr = block_group->start;
+ u64 log_bytenr;
+ int ret = 0;
+ bool skip;
+
+ ASSERT(btrfs_is_zoned(block_group->fs_info));
+
+ /*
+ * Do not allow non-tree-log blocks in the dedicated tree-log block
+ * group, and vice versa.
+ */
+ spin_lock(&fs_info->treelog_bg_lock);
+ log_bytenr = fs_info->treelog_bg;
+ skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+ (!ffe_ctl->for_treelog && bytenr == log_bytenr));
+ spin_unlock(&fs_info->treelog_bg_lock);
+ if (skip)
+ return 1;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ spin_lock(&fs_info->treelog_bg_lock);
+
+ ASSERT(!ffe_ctl->for_treelog ||
+ block_group->start == fs_info->treelog_bg ||
+ fs_info->treelog_bg == 0);
+
+ if (block_group->ro) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Do not allow a block group that is currently in use to become the
+ * dedicated tree-log block group.
+ */
+ if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ avail = block_group->length - block_group->alloc_offset;
+ if (avail < num_bytes) {
+ if (ffe_ctl->max_extent_size < avail) {
+ /*
+ * With sequential allocator, free space is always
+ * contiguous
+ */
+ ffe_ctl->max_extent_size = avail;
+ ffe_ctl->total_free_space = avail;
+ }
+ ret = 1;
+ goto out;
+ }
+
+ if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
+ fs_info->treelog_bg = block_group->start;
+
+ ffe_ctl->found_offset = start + block_group->alloc_offset;
+ block_group->alloc_offset += num_bytes;
+ spin_lock(&ctl->tree_lock);
+ ctl->free_space -= num_bytes;
+ spin_unlock(&ctl->tree_lock);
+
+ /*
+ * We do not check if found_offset is aligned to stripesize. The
+ * address is rewritten anyway when zone append writing is used.
+ */
+
+ ffe_ctl->search_start = ffe_ctl->found_offset;
+
+out:
+ if (ret && ffe_ctl->for_treelog)
+ fs_info->treelog_bg = 0;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
+
static int do_allocation(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **bg_ret)
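
To make the sequential allocator's space check concrete, a toy example with invented numbers (no real device layout implied): a 256 MiB zoned block group with 200 MiB already allocated cannot satisfy a 100 MiB request, so avail = 56 MiB is recorded as both max_extent_size and total_free_space (free space here is always contiguous) and the group is skipped.

	/* Toy numbers only; mirrors the avail check in do_allocation_zoned(). */
	static int example_zoned_avail_check(void)
	{
		u64 length = 256ULL << 20;        /* block_group->length       */
		u64 alloc_offset = 200ULL << 20;  /* block_group->alloc_offset */
		u64 num_bytes = 100ULL << 20;     /* ffe_ctl->num_bytes        */
		u64 avail = length - alloc_offset;

		if (avail < num_bytes)
			return 1;   /* record avail, skip this block group */
		return 0;           /* enough room at start + alloc_offset */
	}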
@@ -3814,6 +3847,8 @@ static int do_allocation(struct btrfs_block_group *block_group,
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
default:
BUG();
}
@@ -3828,6 +3863,9 @@ static void release_block_group(struct btrfs_block_group *block_group,
ffe_ctl->retry_clustered = false;
ffe_ctl->retry_unclustered = false;
break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+ break;
default:
BUG();
}
@@ -3856,6 +3894,9 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
case BTRFS_EXTENT_ALLOC_CLUSTERED:
found_extent_clustered(ffe_ctl, ins);
break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+ break;
default:
BUG();
}
@@ -3871,6 +3912,9 @@ static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
*/
ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
return 0;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Give up here */
+ return -ENOSPC;
default:
BUG();
}
@@ -4039,6 +4083,14 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
case BTRFS_EXTENT_ALLOC_CLUSTERED:
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ }
+ return 0;
default:
BUG();
}
@@ -4081,6 +4133,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool full_search = false;
+ bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
WARN_ON(num_bytes < fs_info->sectorsize);
@@ -4094,6 +4147,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
ffe_ctl.orig_have_caching_bg = false;
ffe_ctl.found_offset = 0;
ffe_ctl.hint_byte = hint_byte_orig;
+ ffe_ctl.for_treelog = for_treelog;
ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
/* For clustered allocation */
@@ -4102,6 +4156,9 @@ static noinline int find_free_extent(struct btrfs_root *root,
ffe_ctl.last_ptr = NULL;
ffe_ctl.use_cluster = true;
+ if (btrfs_is_zoned(fs_info))
+ ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;
+
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -4165,8 +4222,11 @@ search:
struct btrfs_block_group *bg_ret;
/* If the block group is read-only, we can skip it entirely. */
- if (unlikely(block_group->ro))
+ if (unlikely(block_group->ro)) {
+ if (for_treelog)
+ btrfs_clear_treelog_bg(block_group);
continue;
+ }
btrfs_grab_block_group(block_group, delalloc);
ffe_ctl.search_start = block_group->start;
@@ -4244,20 +4304,21 @@ have_block_group:
/* move on to the next group */
if (ffe_ctl.search_start + num_bytes >
block_group->start + block_group->length) {
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- num_bytes);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl.found_offset, num_bytes);
goto loop;
}
if (ffe_ctl.found_offset < ffe_ctl.search_start)
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- ffe_ctl.search_start - ffe_ctl.found_offset);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl.found_offset,
+ ffe_ctl.search_start - ffe_ctl.found_offset);
ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
num_bytes, delalloc);
if (ret == -EAGAIN) {
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- num_bytes);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl.found_offset, num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
@@ -4351,6 +4412,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
+ bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
flags = get_alloc_profile_by_root(root, is_data);
again:
@@ -4374,8 +4436,8 @@ again:
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
- "allocation failed flags %llu, wanted %llu",
- flags, num_bytes);
+ "allocation failed flags %llu, wanted %llu tree-log %d",
+ flags, num_bytes, for_treelog);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
@@ -4448,7 +4510,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
if (ret) {
@@ -4533,7 +4594,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
&extent_key, size);
if (ret) {
@@ -4559,7 +4619,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_SHARED_BLOCK_REF_KEY);
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
@@ -4596,7 +4655,6 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins)
{
struct btrfs_ref generic_ref = { 0 };
- int ret;
BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
@@ -4604,9 +4662,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ins->objectid, ins->offset, 0);
btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
- ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
- ram_bytes, NULL, NULL);
- return ret;
+
+ return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}
/*
@@ -4662,7 +4719,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
if (IS_ERR(buf))
return buf;
@@ -4679,12 +4736,17 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ERR_PTR(-EUCLEAN);
}
+ /*
+ * This needs to stay, because we could allocate a freed block from an
+ * old tree into a new tree, so we need to make sure this new block is
+ * set to the appropriate level and owner.
+ */
btrfs_set_buffer_lockdep_class(owner, buf, level);
__btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
+ clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
- btrfs_set_lock_blocking_write(buf);
set_extent_buffer_uptodate(buf);
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -4794,8 +4856,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
generic_ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&generic_ref, level, root_objectid);
btrfs_ref_tree_mod(fs_info, &generic_ref);
- ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
- extent_op, NULL, NULL);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
goto out_free_delayed;
}
@@ -4905,7 +4966,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- readahead_tree_block(fs_info, bytenr);
+ btrfs_readahead_node_child(eb, slot);
nread++;
}
wc->reada_slot = slot;
@@ -5064,16 +5125,13 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = find_extent_buffer(fs_info, bytenr);
if (!next) {
- next = btrfs_find_create_tree_block(fs_info, bytenr);
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ root->root_key.objectid, level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
-
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
- level - 1);
reada = 1;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
@@ -5124,8 +5182,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, generation, level - 1,
- &first_key);
+ next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
+ generation, level - 1, &first_key);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -5133,7 +5191,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
return -EIO;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
}
level--;
@@ -5145,7 +5202,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
path->nodes[level] = next;
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
wc->level = level;
if (wc->level == 1)
wc->reada_slot = 0;
@@ -5273,8 +5330,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level]) {
BUG_ON(level == 0);
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
@@ -5317,8 +5373,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
}
btrfs_clean_tree_block(eb);
}
@@ -5486,9 +5541,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking_write(path->nodes[level]);
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
memset(&wc->update_progress, 0,
sizeof(wc->update_progress));
} else {
@@ -5496,7 +5550,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
memcpy(&wc->update_progress, &key,
sizeof(wc->update_progress));
- level = root_item->drop_level;
+ level = btrfs_root_drop_level(root_item);
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5516,8 +5570,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
level = btrfs_header_level(root->node);
while (1) {
btrfs_tree_lock(path->nodes[level]);
- btrfs_set_lock_blocking_write(path->nodes[level]);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
@@ -5529,7 +5582,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
}
BUG_ON(wc->refs[level] == 0);
- if (level == root_item->drop_level)
+ if (level == btrfs_root_drop_level(root_item))
break;
btrfs_tree_unlock(path->nodes[level]);
@@ -5574,7 +5627,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
}
btrfs_cpu_key_to_disk(&root_item->drop_progress,
&wc->drop_progress);
- root_item->drop_level = wc->drop_level;
+ btrfs_set_root_drop_level(root_item, wc->drop_level);
BUG_ON(wc->level == 0);
if (btrfs_should_end_transaction(trans) ||
@@ -5596,7 +5649,15 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_free;
}
- trans = btrfs_start_transaction(tree_root, 0);
+ /*
+ * Use join to avoid potential EINTR from transaction
+ * start. See wait_reserve_ticket and the whole
+ * reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
@@ -5704,7 +5765,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
wc->refs[parent_level] = 1;
wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 60f5f68d892d..4dfb3ead1175 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -24,6 +24,9 @@
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
+#include "subpage.h"
+#include "zoned.h"
+#include "block-group.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -142,7 +145,7 @@ struct extent_page_data {
unsigned int sync_io:1;
};
-static int add_extent_changeset(struct extent_state *state, unsigned bits,
+static int add_extent_changeset(struct extent_state *state, u32 bits,
struct extent_changeset *changeset,
int set)
{
@@ -389,16 +392,16 @@ do_insert:
}
/**
- * __etree_search - searche @tree for an entry that contains @offset. Such
- * entry would have entry->start <= offset && entry->end >= offset.
+ * Search @tree for an entry that contains @offset. Such entry would have
+ * entry->start <= offset && entry->end >= offset.
*
- * @tree - the tree to search
- * @offset - offset that should fall within an entry in @tree
- * @next_ret - pointer to the first entry whose range ends after @offset
- * @prev - pointer to the first entry whose range begins before @offset
- * @p_ret - pointer where new node should be anchored (used when inserting an
- * entry in the tree)
- * @parent_ret - points to entry which would have been the parent of the entry,
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ * @p_ret: pointer where new node should be anchored (used when inserting an
+ * entry in the tree)
+ * @parent_ret: points to entry which would have been the parent of the entry,
* containing @offset
*
* This function returns a pointer to the entry that contains @offset byte
@@ -530,7 +533,7 @@ static void merge_state(struct extent_io_tree *tree,
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits,
+ struct extent_state *state, u32 *bits,
struct extent_changeset *changeset);
/*
@@ -547,7 +550,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned *bits, struct extent_changeset *changeset)
+ u32 *bits, struct extent_changeset *changeset)
{
struct rb_node *node;
@@ -628,11 +631,11 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, int wake,
+ u32 *bits, int wake,
struct extent_changeset *changeset)
{
struct extent_state *next;
- unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
int ret;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
@@ -676,9 +679,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
- struct inode *inode = tree->private_data;
-
- btrfs_panic(btrfs_sb(inode->i_sb), err,
+ btrfs_panic(tree->fs_info, err,
"locking error: extent tree was modified by another thread while locked");
}
@@ -695,9 +696,9 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
+ u32 bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
@@ -868,7 +869,7 @@ static void wait_on_state(struct extent_io_tree *tree,
* The tree lock is taken by this function
*/
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits)
+ u32 bits)
{
struct extent_state *state;
struct rb_node *node;
@@ -915,9 +916,9 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, struct extent_changeset *changeset)
+ u32 *bits, struct extent_changeset *changeset)
{
- unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
if (tree->private_data && is_data_inode(tree->private_data))
@@ -961,12 +962,10 @@ static void cache_state(struct extent_state *state,
*
* [start, end] is inclusive This takes the tree lock.
*/
-
-static int __must_check
-__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned exclusive_bits,
- u64 *failed_start, struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ u32 exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask,
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -980,6 +979,10 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+ if (exclusive_bits)
+ ASSERT(failed_start);
+ else
+ ASSERT(failed_start == NULL);
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
@@ -1179,15 +1182,6 @@ out:
}
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 * failed_start,
- struct extent_state **cached_state, gfp_t mask)
-{
- return __set_extent_bit(tree, start, end, bits, 0, failed_start,
- cached_state, mask, NULL);
-}
-
-
/**
* convert_extent_bit - convert all bits in a given range from one bit to
* another
@@ -1207,7 +1201,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* All allocations are done with GFP_NOFS.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
+ u32 bits, u32 clear_bits,
struct extent_state **cached_state)
{
struct extent_state *state;
@@ -1408,7 +1402,7 @@ out:
/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset)
+ u32 bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCKED yet, as current changeset will
@@ -1418,19 +1412,19 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
BUG_ON(bits & EXTENT_LOCKED);
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- changeset);
+ return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
+ changeset);
}
int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits)
+ u32 bits)
{
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
- GFP_NOWAIT, NULL);
+ return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
+ GFP_NOWAIT, NULL);
}
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
+ u32 bits, int wake, int delete,
struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, bits, wake, delete,
@@ -1438,7 +1432,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset)
+ u32 bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCKED case, same reason as
@@ -1461,9 +1455,9 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u64 failed_start;
while (1) {
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
- EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS, NULL);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -1479,8 +1473,8 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
int err;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS, NULL);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, GFP_NOFS, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
@@ -1526,8 +1520,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
* nothing was found after 'start'
*/
static struct extent_state *
-find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, unsigned bits)
+find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
{
struct rb_node *node;
struct extent_state *state;
@@ -1554,14 +1547,15 @@ out:
}
/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
+ * Find the first offset in the io tree with one or more @bits set.
*
- * If nothing was found, 1 is returned. If found something, return 0.
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
+ u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state)
{
struct extent_state *state;
@@ -1597,12 +1591,13 @@ out:
}
/**
- * find_contiguous_extent_bit: find a contiguous area of bits
- * @tree - io tree to check
- * @start - offset to start the search from
- * @start_ret - the first offset we found with the bits set
- * @end_ret - the final contiguous range of the bits that were set
- * @bits - bits to look for
+ * Find a contiguous area of bits
+ *
+ * @tree: io tree to check
+ * @start: offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret: the final contiguous range of the bits that were set
+ * @bits: bits to look for
*
* set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
* to set bits appropriately, and then merge them again. During this time it
@@ -1612,7 +1607,7 @@ out:
* returned will be the full contiguous area with the bits set.
*/
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits)
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
int ret = 1;
@@ -1634,14 +1629,14 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
}
/**
- * find_first_clear_extent_bit - find the first range that has @bits not set.
- * This range could start before @start.
+ * Find the first range that has @bits not set. This range could start before
+ * @start.
*
- * @tree - the tree to search
- * @start - the offset at/after which the found extent should start
- * @start_ret - records the beginning of the range
- * @end_ret - records the end of the range (inclusive)
- * @bits - the set of bits which must be unset
+ * @tree: the tree to search
+ * @start: offset at/after which the found extent should start
+ * @start_ret: records the beginning of the range
+ * @end_ret: records the end of the range (inclusive)
+ * @bits: the set of bits which must be unset
*
* Since unallocated range is also considered one which doesn't have the bits
* set it's possible that @end_ret contains -1, this happens in case the range
@@ -1649,7 +1644,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
* trim @end_ret to the appropriate size.
*/
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits)
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
struct rb_node *node, *prev = NULL, *next;
@@ -1946,7 +1941,7 @@ static int __process_pages_contig(struct address_space *mapping,
unsigned long page_ops, pgoff_t *index_ret)
{
unsigned long nr_pages = end_index - start_index + 1;
- unsigned long pages_locked = 0;
+ unsigned long pages_processed = 0;
pgoff_t index = start_index;
struct page *pages[16];
unsigned ret;
@@ -1981,13 +1976,13 @@ static int __process_pages_contig(struct address_space *mapping,
if (locked_page && pages[i] == locked_page) {
put_page(pages[i]);
- pages_locked++;
+ pages_processed++;
continue;
}
- if (page_ops & PAGE_CLEAR_DIRTY)
+ if (page_ops & PAGE_START_WRITEBACK) {
clear_page_dirty_for_io(pages[i]);
- if (page_ops & PAGE_SET_WRITEBACK)
set_page_writeback(pages[i]);
+ }
if (page_ops & PAGE_SET_ERROR)
SetPageError(pages[i]);
if (page_ops & PAGE_END_WRITEBACK)
@@ -2006,7 +2001,7 @@ static int __process_pages_contig(struct address_space *mapping,
}
}
put_page(pages[i]);
- pages_locked++;
+ pages_processed++;
}
nr_pages -= ret;
index += ret;
@@ -2014,14 +2009,13 @@ static int __process_pages_contig(struct address_space *mapping,
}
out:
if (err && index_ret)
- *index_ret = start_index + pages_locked - 1;
+ *index_ret = start_index + pages_processed - 1;
return err;
}
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned clear_bits,
- unsigned long page_ops)
+ u32 clear_bits, unsigned long page_ops)
{
clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
@@ -2037,7 +2031,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- unsigned bits, int contig)
+ u32 bits, int contig)
{
struct rb_node *node;
struct extent_state *state;
@@ -2157,7 +2151,7 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled, struct extent_state *cached)
+ u32 bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
@@ -2266,6 +2260,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_repair_one_zone(fs_info, logical);
+
bio = btrfs_io_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
@@ -2642,7 +2639,7 @@ static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
}
blk_status_t btrfs_submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u64 phy_offset,
+ struct bio *failed_bio, u32 bio_offset,
struct page *page, unsigned int pgoff,
u64 start, u64 end, int failed_mirror,
submit_bio_hook_t *submit_bio_hook)
@@ -2652,7 +2649,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
- const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits;
+ const int icsum = bio_offset >> fs_info->sectorsize_bits;
bool need_validation;
struct bio *repair_bio;
struct btrfs_io_bio *repair_io_bio;
@@ -2685,7 +2682,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
repair_bio->bi_private = failed_bio->bi_private;
if (failed_io_bio->csum) {
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
repair_io_bio->csum = repair_io_bio->csum_inline;
memcpy(repair_io_bio->csum,
@@ -2742,6 +2739,7 @@ static void end_bio_extent_writepage(struct bio *bio)
u64 start;
u64 end;
struct bvec_iter_all iter_all;
+ bool first_bvec = true;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
@@ -2768,6 +2766,11 @@ static void end_bio_extent_writepage(struct bio *bio)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
+ if (first_bvec) {
+ btrfs_record_physical_zoned(inode, start, bio);
+ first_bvec = false;
+ }
+
end_extent_writepage(page, error, start, end);
end_page_writeback(page);
}
@@ -2775,16 +2778,111 @@ static void end_bio_extent_writepage(struct bio *bio)
bio_put(bio);
}
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
- int uptodate)
+/*
+ * Record previously processed extent range
+ *
+ * For endio_readpage_release_extent() to handle a full extent range, reducing
+ * the extent io operations.
+ */
+struct processed_extent {
+ struct btrfs_inode *inode;
+ /* Start of the range in @inode */
+ u64 start;
+ /* End of the range in @inode */
+ u64 end;
+ bool uptodate;
+};
+
+/*
+ * Try to release processed extent range
+ *
+ * May not release the extent range right now if the current range is
+ * contiguous to processed extent.
+ *
+ * Will release the processed extent when @inode or @uptodate changes, or when
+ * the range is no longer contiguous to the processed range.
+ *
+ * Passing @inode == NULL will force processed extent to be released.
+ */
+static void endio_readpage_release_extent(struct processed_extent *processed,
+ struct btrfs_inode *inode, u64 start, u64 end,
+ bool uptodate)
{
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ struct extent_io_tree *tree;
- if (uptodate && tree->track_uptodate)
- set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(tree, start, end, &cached);
+ /* The first extent, initialize @processed */
+ if (!processed->inode)
+ goto update;
+
+ /*
+	 * Contiguous to processed extent, just update the end.
+	 *
+	 * Several things to notice:
+	 *
+	 * - bio can be merged as long as the on-disk bytenr is contiguous
+	 *   This means we can have pages belonging to other inodes, thus we
+	 *   need to check if the inode still matches.
+	 * - bvec can contain a range beyond the current page for multi-page bvec
+	 *   Thus we need to do the processed->end + 1 >= start check
+ */
+ if (processed->inode == inode && processed->uptodate == uptodate &&
+ processed->end + 1 >= start && end >= processed->end) {
+ processed->end = end;
+ return;
+ }
+
+ tree = &processed->inode->io_tree;
+ /*
+ * Now we don't have range contiguous to the processed range, release
+ * the processed range now.
+ */
+ if (processed->uptodate && tree->track_uptodate)
+ set_extent_uptodate(tree, processed->start, processed->end,
+ &cached, GFP_ATOMIC);
+ unlock_extent_cached_atomic(tree, processed->start, processed->end,
+ &cached);
+
+update:
+ /* Update processed to current range */
+ processed->inode = inode;
+ processed->start = start;
+ processed->end = end;
+ processed->uptodate = uptodate;
+}
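
/*
 * A minimal userspace model (hypothetical names, not part of the patch) of the
 * batching rule above: ranges with the same inode and uptodate status are
 * merged while processed->end + 1 >= start; otherwise the accumulated range is
 * released and tracking restarts from the new range.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct processed {
	int inode_id;		/* stand-in for the btrfs_inode pointer */
	uint64_t start;
	uint64_t end;
	bool uptodate;
	bool valid;
};

static void flush(struct processed *p)
{
	if (p->valid)
		printf("release inode %d range [%llu, %llu] uptodate=%d\n",
		       p->inode_id, (unsigned long long)p->start,
		       (unsigned long long)p->end, p->uptodate);
	p->valid = false;
}

static void account(struct processed *p, int inode_id, uint64_t start,
		    uint64_t end, bool uptodate)
{
	if (p->valid && p->inode_id == inode_id && p->uptodate == uptodate &&
	    p->end + 1 >= start && end >= p->end) {
		p->end = end;	/* contiguous: just extend the tracked range */
		return;
	}
	flush(p);
	p->inode_id = inode_id;
	p->start = start;
	p->end = end;
	p->uptodate = uptodate;
	p->valid = true;
}

int main(void)
{
	struct processed p = { 0 };

	account(&p, 1, 0, 4095, true);
	account(&p, 1, 4096, 8191, true);	/* merged with previous range */
	account(&p, 1, 16384, 20479, true);	/* gap: previous range released */
	flush(&p);				/* release the last range */
	return 0;
}
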
+
+static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
+{
+ ASSERT(PageLocked(page));
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page));
+ btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+}
+
+static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+
+ if (uptodate) {
+ btrfs_page_set_uptodate(fs_info, page, start, len);
+ } else {
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
+ }
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ unlock_page(page);
+ else if (is_data_inode(page->mapping->host))
+ /*
+ * For subpage data, unlock the page if we're the last reader.
+ * For subpage metadata, page lock is not utilized for read.
+ */
+ btrfs_subpage_end_reader(fs_info, page, start, len);
}
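
/*
 * A minimal userspace sketch (hypothetical names, not part of the patch) of
 * the subpage read accounting used above: each sector-sized range takes a
 * reader reference, and only the reader that drops the count to zero
 * "unlocks" the page. It models btrfs_subpage_{start,end}_reader() only at
 * the counting level and assumes single-threaded use.
 */
#include <stdio.h>

struct subpage_model {
	int readers;
	int page_locked;
};

static void start_reader(struct subpage_model *sp, int nsectors)
{
	sp->readers += nsectors;
}

static void end_reader(struct subpage_model *sp, int nsectors)
{
	sp->readers -= nsectors;
	if (sp->readers == 0 && sp->page_locked) {
		sp->page_locked = 0;
		printf("last reader finished, page unlocked\n");
	}
}

int main(void)
{
	struct subpage_model sp = { .readers = 0, .page_locked = 1 };

	start_reader(&sp, 16);	/* e.g. 16 x 4K sectors in a 64K page */
	end_reader(&sp, 8);	/* first half completes, page stays locked */
	end_reader(&sp, 8);	/* second half completes, page is unlocked */
	return 0;
}
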
/*
@@ -2804,12 +2902,12 @@ static void end_bio_extent_readpage(struct bio *bio)
int uptodate = !bio->bi_status;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree, *failure_tree;
- u64 offset = 0;
- u64 start;
- u64 end;
- u64 len;
- u64 extent_start = 0;
- u64 extent_len = 0;
+ struct processed_extent processed = { 0 };
+ /*
+ * The offset to the beginning of a bio, since one bio can never be
+	 * The offset from the beginning of the bio; since one bio can never be
+	 * larger than UINT_MAX, u32 here is enough.
+ u32 bio_offset = 0;
int mirror;
int ret;
struct bvec_iter_all iter_all;
@@ -2819,42 +2917,48 @@ static void end_bio_extent_readpage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 start;
+ u64 end;
+ u32 len;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- (u64)bio->bi_iter.bi_sector, bio->bi_status,
+ bio->bi_iter.bi_sector, bio->bi_status,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
- /* We always issue full-page reads, but if some block
- * in a page fails to read, blk_update_request() will
- * advance bv_offset and adjust bv_len to compensate.
- * Print a warning for nonzero offsets, and an error
- * if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
- btrfs_err(fs_info,
- "partial page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else
- btrfs_info(fs_info,
- "incomplete page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- }
-
- start = page_offset(page);
- end = start + bvec->bv_offset + bvec->bv_len - 1;
+ /*
+ * We always issue full-sector reads, but if some block in a
+ * page fails to read, blk_update_request() will advance
+ * bv_offset and adjust bv_len to compensate. Print a warning
+ * for unaligned offsets, and an error if they don't add up to
+ * a full sector.
+ */
+ if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ btrfs_err(fs_info,
+ "partial page read in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
+ sectorsize))
+ btrfs_info(fs_info,
+ "incomplete page read with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+
+ start = page_offset(page) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
len = bvec->bv_len;
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
if (is_data_inode(inode))
- ret = btrfs_verify_data_csum(io_bio, offset, page,
- start, end, mirror);
+ ret = btrfs_verify_data_csum(io_bio,
+ bio_offset, page, start, end,
+ mirror);
else
ret = btrfs_validate_metadata_buffer(io_bio,
- offset, page, start, end, mirror);
+ page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2879,12 +2983,14 @@ static void end_bio_extent_readpage(struct bio *bio)
* If it can't handle the error it will return -EIO and
* we remain responsible for that page.
*/
- if (!btrfs_submit_read_repair(inode, bio, offset, page,
+ if (!btrfs_submit_read_repair(inode, bio, bio_offset,
+ page,
start - page_offset(page),
start, end, mirror,
btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
- offset += len;
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
continue;
}
} else {
@@ -2908,40 +3014,17 @@ readpage_ok:
off = offset_in_page(i_size);
if (page->index == end_index && off)
zero_user_segment(page, off, PAGE_SIZE);
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
}
- unlock_page(page);
- offset += len;
-
- if (unlikely(!uptodate)) {
- if (extent_len) {
- endio_readpage_release_extent(tree,
- extent_start,
- extent_len, 1);
- extent_start = 0;
- extent_len = 0;
- }
- endio_readpage_release_extent(tree, start,
- end - start + 1, 0);
- } else if (!extent_len) {
- extent_start = start;
- extent_len = end + 1 - start;
- } else if (extent_start + extent_len == start) {
- extent_len += end + 1 - start;
- } else {
- endio_readpage_release_extent(tree, extent_start,
- extent_len, uptodate);
- extent_start = start;
- extent_len = end + 1 - start;
- }
- }
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
- if (extent_len)
- endio_readpage_release_extent(tree, extent_start, extent_len,
- uptodate);
+ /* Update page status and unlock */
+ end_page_read(page, uptodate, start, len);
+ endio_readpage_release_extent(&processed, BTRFS_I(inode),
+ start, end, uptodate);
+ }
+ /* Release the last extent */
+ endio_readpage_release_extent(&processed, NULL, 0, 0, false);
btrfs_io_bio_free_csum(io_bio);
bio_put(bio);
}
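
/*
 * A minimal userspace sketch (hypothetical names, not part of the patch) of
 * the alignment rule in the end_bio_extent_readpage() hunk above: bv_offset
 * and bv_offset + bv_len must both be sector aligned, otherwise a partial or
 * incomplete read is reported. Assumes sectorsize is a power of two.
 */
#include <stdint.h>
#include <stdio.h>

#define IS_ALIGNED_TO(x, a) (((x) & ((a) - 1)) == 0)

static void check_bvec(uint32_t bv_offset, uint32_t bv_len, uint32_t sectorsize)
{
	if (!IS_ALIGNED_TO(bv_offset, sectorsize))
		printf("partial read: offset %u length %u\n", bv_offset, bv_len);
	else if (!IS_ALIGNED_TO(bv_offset + bv_len, sectorsize))
		printf("incomplete read: offset %u length %u\n", bv_offset, bv_len);
	else
		printf("ok: offset %u length %u\n", bv_offset, bv_len);
}

int main(void)
{
	check_bvec(0, 4096, 4096);	/* ok */
	check_bvec(512, 4096, 4096);	/* partial: unaligned offset */
	check_bvec(0, 2048, 4096);	/* incomplete: unaligned end */
	return 0;
}
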
@@ -3011,14 +3094,67 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
return bio;
}
+/**
+ * Attempt to add a page to bio
+ *
+ * @bio: destination bio
+ * @page: page to add to the bio
+ * @disk_bytenr: offset of the new bio or to check whether we are adding
+ * a contiguous page to the previous one
+ * @pg_offset: starting offset in the page
+ * @size: portion of page that we want to write
+ * @prev_bio_flags: flags of previous bio to see if we can merge the current one
+ * @bio_flags: flags of the current bio to see if we can merge them
+ * @return: true if page was added, false otherwise
+ *
+ * Attempt to add a page to bio considering stripe alignment etc.
+ *
+ * Return true if the page was successfully added, false otherwise.
+ */
+static bool btrfs_bio_add_page(struct bio *bio, struct page *page,
+ u64 disk_bytenr, unsigned int size,
+ unsigned int pg_offset,
+ unsigned long prev_bio_flags,
+ unsigned long bio_flags)
+{
+ const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
+ bool contig;
+ int ret;
+
+ if (prev_bio_flags != bio_flags)
+ return false;
+
+ if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
+ contig = bio->bi_iter.bi_sector == sector;
+ else
+ contig = bio_end_sector(bio) == sector;
+ if (!contig)
+ return false;
+
+ if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags))
+ return false;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct page *first_page = bio_first_bvec_all(bio)->bv_page;
+
+ if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size))
+ return false;
+ ret = bio_add_zone_append_page(bio, page, size, pg_offset);
+ } else {
+ ret = bio_add_page(bio, page, size, pg_offset);
+ }
+
+ return ret == size;
+}
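
/*
 * A minimal userspace model (hypothetical names, not part of the patch) of
 * the contiguity test in btrfs_bio_add_page() above: compressed bios compare
 * against the bio's start sector, uncompressed bios against its current end
 * sector.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT_MODEL 9

struct bio_model {
	uint64_t start_sector;
	uint64_t size;		/* bytes already in the bio */
	bool compressed;
};

static bool is_contiguous(const struct bio_model *bio, uint64_t disk_bytenr)
{
	uint64_t sector = disk_bytenr >> SECTOR_SHIFT_MODEL;

	if (bio->compressed)
		return bio->start_sector == sector;
	return bio->start_sector + (bio->size >> SECTOR_SHIFT_MODEL) == sector;
}

int main(void)
{
	struct bio_model bio = { .start_sector = 1024, .size = 8192,
				 .compressed = false };

	printf("append at 532480: %d\n", is_contiguous(&bio, 532480)); /* end sector 1040 */
	printf("append at 540672: %d\n", is_contiguous(&bio, 540672)); /* not contiguous  */
	return 0;
}
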
+
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
* @page: page to add to the bio
+ * @disk_bytenr: logical bytenr where the write will be
+ * @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
- * @size: portion of page that we want to write
- * @offset: starting offset in the page
* @bio_ret: must be valid pointer, newly allocated bio will be stored there
* @end_io_func: end_io callback for new bio
* @mirror_num: desired mirror to read/write
@@ -3027,7 +3163,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
*/
static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
- struct page *page, u64 offset,
+ struct page *page, u64 disk_bytenr,
size_t size, unsigned long pg_offset,
struct bio **bio_ret,
bio_end_io_t end_io_func,
@@ -3038,28 +3174,18 @@ static int submit_extent_page(unsigned int opf,
{
int ret = 0;
struct bio *bio;
- size_t page_size = min_t(size_t, size, PAGE_SIZE);
- sector_t sector = offset >> 9;
- struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+ size_t io_size = min_t(size_t, size, PAGE_SIZE);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct extent_io_tree *tree = &inode->io_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
ASSERT(bio_ret);
if (*bio_ret) {
- bool contig;
- bool can_merge = true;
-
bio = *bio_ret;
- if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
- contig = bio->bi_iter.bi_sector == sector;
- else
- contig = bio_end_sector(bio) == sector;
-
- if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
- can_merge = false;
-
- if (prev_bio_flags != bio_flags || !contig || !can_merge ||
- force_bio_submit ||
- bio_add_page(bio, page, page_size, pg_offset) < page_size) {
+ if (force_bio_submit ||
+ !btrfs_bio_add_page(bio, page, disk_bytenr, io_size,
+ pg_offset, prev_bio_flags, bio_flags)) {
ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
if (ret < 0) {
*bio_ret = NULL;
@@ -3068,13 +3194,13 @@ static int submit_extent_page(unsigned int opf,
bio = NULL;
} else {
if (wbc)
- wbc_account_cgroup_owner(wbc, page, page_size);
+ wbc_account_cgroup_owner(wbc, page, io_size);
return 0;
}
}
- bio = btrfs_bio_alloc(offset);
- bio_add_page(bio, page, page_size, pg_offset);
+ bio = btrfs_bio_alloc(disk_bytenr);
+ bio_add_page(bio, page, io_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
bio->bi_write_hint = page->mapping->host->i_write_hint;
@@ -3082,10 +3208,25 @@ static int submit_extent_page(unsigned int opf,
if (wbc) {
struct block_device *bdev;
- bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
+ bdev = fs_info->fs_devices->latest_bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, page, page_size);
+ wbc_account_cgroup_owner(wbc, page, io_size);
+ }
+ if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct extent_map *em;
+ struct map_lookup *map;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ /* We only support single profile for now */
+ ASSERT(map->num_stripes == 1);
+ btrfs_io_bio(bio)->device = map->stripes[0].dev;
+
+ free_extent_map(em);
}
*bio_ret = bio;
@@ -3093,19 +3234,78 @@ static int submit_extent_page(unsigned int opf,
return ret;
}
-static void attach_extent_buffer_page(struct extent_buffer *eb,
- struct page *page)
+static int attach_extent_buffer_page(struct extent_buffer *eb,
+ struct page *page,
+ struct btrfs_subpage *prealloc)
{
- if (!PagePrivate(page))
- attach_page_private(page, eb);
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ int ret = 0;
+
+ /*
+ * If the page is mapped to btree inode, we should hold the private
+ * lock to prevent race.
+ * For cloned or dummy extent buffers, their pages are not mapped and
+ * will not race with any other ebs.
+ */
+ if (page->mapping)
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ if (!PagePrivate(page))
+ attach_page_private(page, eb);
+ else
+ WARN_ON(page->private != (unsigned long)eb);
+ return 0;
+ }
+
+ /* Already mapped, just free prealloc */
+ if (PagePrivate(page)) {
+ btrfs_free_subpage(prealloc);
+ return 0;
+ }
+
+ if (prealloc)
+ /* Has preallocated memory for subpage */
+ attach_page_private(page, prealloc);
else
- WARN_ON(page->private != (unsigned long)eb);
+ /* Do new allocation to attach subpage */
+ ret = btrfs_attach_subpage(fs_info, page,
+ BTRFS_SUBPAGE_METADATA);
+ return ret;
+}
+
+int set_page_extent_mapped(struct page *page)
+{
+ struct btrfs_fs_info *fs_info;
+
+ ASSERT(page->mapping);
+
+ if (PagePrivate(page))
+ return 0;
+
+ fs_info = btrfs_sb(page->mapping->host->i_sb);
+
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
+
+ attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ return 0;
}
-void set_page_extent_mapped(struct page *page)
+void clear_page_extent_mapped(struct page *page)
{
+ struct btrfs_fs_info *fs_info;
+
+ ASSERT(page->mapping);
+
if (!PagePrivate(page))
- attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ return;
+
+ fs_info = btrfs_sb(page->mapping->host->i_sb);
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return btrfs_detach_subpage(fs_info, page);
+
+ detach_page_private(page);
}
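
/*
 * A minimal userspace sketch (hypothetical names, not part of the patch) of
 * the decision made by set_page_extent_mapped()/clear_page_extent_mapped()
 * above: a page gets its private data attached only once, and sectorsize
 * smaller than the page size routes to the subpage helpers instead of the
 * plain private marker.
 */
#include <stdio.h>

enum private_kind { PRIV_NONE, PRIV_SIMPLE, PRIV_SUBPAGE };

static enum private_kind attach_private(enum private_kind cur,
					unsigned int sectorsize,
					unsigned int page_size)
{
	if (cur != PRIV_NONE)		/* already attached: idempotent */
		return cur;
	if (sectorsize < page_size)
		return PRIV_SUBPAGE;	/* would go through the subpage helper */
	return PRIV_SIMPLE;		/* plain private marker */
}

int main(void)
{
	enum private_kind k = PRIV_NONE;

	k = attach_private(k, 4096, 65536);
	printf("first attach:  %d\n", k);	/* subpage case */
	k = attach_private(k, 4096, 65536);
	printf("second attach: %d\n", k);	/* unchanged */
	return 0;
}
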
static struct extent_map *
@@ -3146,6 +3346,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 start = page_offset(page);
const u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
@@ -3158,17 +3359,23 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
int nr = 0;
size_t pg_offset = 0;
size_t iosize;
- size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
unsigned long this_bio_flag = 0;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- set_page_extent_mapped(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_extent(tree, start, end);
+ btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
+ unlock_page(page);
+ goto out;
+ }
if (!PageUptodate(page)) {
if (cleancache_get_page(page) == 0) {
BUG_ON(blocksize != PAGE_SIZE);
unlock_extent(tree, start, end);
+ unlock_page(page);
goto out;
}
}
@@ -3185,9 +3392,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
kunmap_atomic(userpage);
}
}
+ begin_page_read(fs_info, page);
while (cur <= end) {
bool force_bio_submit = false;
- u64 offset;
+ u64 disk_bytenr;
if (cur >= last_byte) {
char *userpage;
@@ -3202,13 +3410,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
cur + iosize - 1, &cached);
+ end_page_read(page, true, cur, iosize);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
- SetPageError(page);
unlock_extent(tree, cur, end);
+ end_page_read(page, false, cur, end + 1 - cur);
break;
}
extent_offset = cur - em->start;
@@ -3224,13 +3433,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
- if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
- disk_io_size = em->block_len;
- offset = em->block_start;
- } else {
- offset = em->block_start + extent_offset;
- disk_io_size = iosize;
- }
+ if (this_bio_flag & EXTENT_BIO_COMPRESSED)
+ disk_bytenr = em->block_start;
+ else
+ disk_bytenr = em->block_start + extent_offset;
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
@@ -3294,6 +3500,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
cur + iosize - 1, &cached);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3303,6 +3510,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3311,15 +3519,15 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* to date. Error out
*/
if (block_start == EXTENT_MAP_INLINE) {
- SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1);
+ end_page_read(page, false, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- page, offset, disk_io_size,
+ page, disk_bytenr, iosize,
pg_offset, bio,
end_bio_extent_readpage, 0,
*bio_flags,
@@ -3329,19 +3537,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
nr++;
*bio_flags = this_bio_flag;
} else {
- SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1);
+ end_page_read(page, false, cur, iosize);
goto out;
}
cur = cur + iosize;
pg_offset += iosize;
}
out:
- if (!nr) {
- if (!PageError(page))
- SetPageUptodate(page);
- unlock_page(page);
- }
return ret;
}
@@ -3461,23 +3664,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
unsigned long nr_written,
int *nr_ret)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
- u64 end;
+ u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
u64 extent_offset;
u64 block_start;
- u64 iosize;
struct extent_map *em;
- size_t pg_offset = 0;
- size_t blocksize;
int ret = 0;
int nr = 0;
+ u32 opf = REQ_OP_WRITE;
const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;
- ret = btrfs_writepage_cow_fixup(page, start, page_end);
+ ret = btrfs_writepage_cow_fixup(page, start, end);
if (ret) {
/* Fixup worker will requeue */
redirty_page_for_writepage(wbc, page);
@@ -3492,16 +3693,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
*/
update_nr_written(wbc, nr_written + 1);
- end = page_end;
- blocksize = inode->vfs_inode.i_sb->s_blocksize;
-
while (cur <= end) {
+ u64 disk_bytenr;
u64 em_end;
- u64 offset;
+ u32 iosize;
if (cur >= i_size) {
- btrfs_writepage_endio_finish_ordered(page, cur,
- page_end, 1);
+ btrfs_writepage_endio_finish_ordered(page, cur, end, 1);
break;
}
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
@@ -3513,13 +3711,20 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
extent_offset = cur - em->start;
em_end = extent_map_end(em);
- BUG_ON(em_end <= cur);
- BUG_ON(end < cur);
- iosize = min(em_end - cur, end - cur + 1);
- iosize = ALIGN(iosize, blocksize);
- offset = em->block_start + extent_offset;
+ ASSERT(cur <= em_end);
+ ASSERT(cur < end);
+ ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
+ ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
block_start = em->block_start;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ disk_bytenr = em->block_start + extent_offset;
+
+ /* Note that em_end from extent_map_end() is exclusive */
+ iosize = min(em_end, end + 1) - cur;
+
+ if (btrfs_use_zone_append(inode, em))
+ opf = REQ_OP_ZONE_APPEND;
+
free_extent_map(em);
em = NULL;
@@ -3535,7 +3740,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
btrfs_writepage_endio_finish_ordered(page, cur,
cur + iosize - 1, 1);
cur += iosize;
- pg_offset += iosize;
continue;
}
@@ -3546,9 +3750,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
page->index, cur, end);
}
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- page, offset, iosize, pg_offset,
- &epd->bio,
+ ret = submit_extent_page(opf | write_flags, wbc, page,
+ disk_bytenr, iosize,
+ cur - page_offset(page), &epd->bio,
end_bio_extent_writepage,
0, 0, 0, false);
if (ret) {
@@ -3557,8 +3761,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
end_page_writeback(page);
}
- cur = cur + iosize;
- pg_offset += iosize;
+ cur += iosize;
nr++;
}
*nr_ret = nr;
@@ -3611,7 +3814,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
flush_dcache_page(page);
}
- set_page_extent_mapped(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ SetPageError(page);
+ goto done;
+ }
if (!epd->extent_locked) {
ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
@@ -3656,11 +3863,14 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
}
/*
- * Lock eb pages and flush the bio if we can't the locks
+ * Lock extent buffer status and pages for writeback.
+ *
+ * May try to flush write bio if we can't get the lock.
*
- * Return 0 if nothing went wrong
- * Return >0 is same as 0, except bio is not submitted
- * Return <0 if something went wrong, no page is locked
+ * Return 0 if the extent buffer doesn't need to be submitted.
+ * (E.g. the extent buffer is not dirty)
+ * Return >0 if the extent buffer is submitted to bio.
+ * Return <0 if something went wrong, no page is locked.
*/
static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
struct extent_page_data *epd)
@@ -3868,7 +4078,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
- u64 offset = eb->start;
+ u64 disk_bytenr = eb->start;
u32 nritems;
int i, num_pages;
unsigned long start, end;
@@ -3901,7 +4111,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- p, offset, PAGE_SIZE, 0,
+ p, disk_bytenr, PAGE_SIZE, 0,
&epd->bio,
end_bio_extent_buffer_writepage,
0, 0, 0, false);
@@ -3914,7 +4124,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
ret = -EIO;
break;
}
- offset += PAGE_SIZE;
+ disk_bytenr += PAGE_SIZE;
update_nr_written(wbc, 1);
unlock_page(p);
}
@@ -3930,10 +4140,100 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
return ret;
}
+/*
+ * Submit all page(s) of one extent buffer.
+ *
+ * @page: the page of one extent buffer
+ * @eb_context: to determine if we need to submit this page; if the current
+ *              page belongs to this eb, we don't need to submit it again
+ *
+ * The caller should pass each page in bytenr order, and here we use
+ * @eb_context to determine if we have submitted pages of one extent buffer.
+ *
+ * If we have, we just skip until we hit a new page that doesn't belong to
+ * current @eb_context.
+ *
+ * If not, we submit all the page(s) of the extent buffer.
+ *
+ * Return >0 if we have submitted the extent buffer successfully.
+ * Return 0 if we don't need to submit the page, as it's already submitted by
+ * previous call.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_page(struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ struct extent_buffer **eb_context)
+{
+ struct address_space *mapping = page->mapping;
+ struct btrfs_block_group *cache = NULL;
+ struct extent_buffer *eb;
+ int ret;
+
+ if (!PagePrivate(page))
+ return 0;
+
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * Shouldn't happen and normally this would be a BUG_ON but no point
+ * crashing the machine for something we can survive anyway.
+ */
+ if (WARN_ON(!eb)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ if (eb == *eb_context) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
+ return 0;
+
+ if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
+ /*
+ * If for_sync, this hole will be filled with
+ * If for_sync, this hole will be filled with a
+ * transaction commit.
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ ret = -EAGAIN;
+ else
+ ret = 0;
+ free_extent_buffer(eb);
+ return ret;
+ }
+
+ *eb_context = eb;
+
+ ret = lock_extent_buffer_for_io(eb, epd);
+ if (ret <= 0) {
+ btrfs_revert_meta_write_pointer(cache, eb);
+ if (cache)
+ btrfs_put_block_group(cache);
+ free_extent_buffer(eb);
+ return ret;
+ }
+ if (cache)
+ btrfs_put_block_group(cache);
+ ret = write_one_eb(eb, wbc, epd);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
+ return 1;
+}
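
/*
 * A minimal userspace model (hypothetical names, not part of the patch) of
 * the eb_context skip logic in submit_eb_page() above: while walking pages in
 * bytenr order, only the first page of each extent buffer triggers a
 * submission, later pages of the same buffer are skipped.
 */
#include <stdio.h>

struct eb_model { int id; };

static int submit_eb_page_model(struct eb_model *eb, struct eb_model **ctx)
{
	if (eb == *ctx)
		return 0;	/* already submitted with an earlier page */
	*ctx = eb;
	printf("submitting extent buffer %d\n", eb->id);
	return 1;		/* submitted */
}

int main(void)
{
	struct eb_model a = { 1 }, b = { 2 };
	struct eb_model *pages[] = { &a, &a, &a, &a, &b, &b };
	struct eb_model *ctx = NULL;

	for (int i = 0; i < 6; i++)
		submit_eb_page_model(pages[i], &ctx);
	return 0;
}
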
+
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_buffer *eb, *prev_eb = NULL;
+ struct extent_buffer *eb_context = NULL;
struct extent_page_data epd = {
.bio = NULL,
.extent_locked = 0,
@@ -3968,6 +4268,7 @@ int btree_write_cache_pages(struct address_space *mapping,
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
+ btrfs_zoned_meta_io_lock(fs_info);
retry:
if (wbc->sync_mode == WB_SYNC_ALL)
tag_pages_for_writeback(mapping, index, end);
@@ -3979,55 +4280,13 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (!PagePrivate(page))
+ ret = submit_eb_page(page, wbc, &epd, &eb_context);
+ if (ret == 0)
continue;
-
- spin_lock(&mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- eb = (struct extent_buffer *)page->private;
-
- /*
- * Shouldn't happen and normally this would be a BUG_ON
- * but no sense in crashing the users box for something
- * we can survive anyway.
- */
- if (WARN_ON(!eb)) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- if (eb == prev_eb) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->private_lock);
- if (!ret)
- continue;
-
- prev_eb = eb;
- ret = lock_extent_buffer_for_io(eb, &epd);
- if (!ret) {
- free_extent_buffer(eb);
- continue;
- } else if (ret < 0) {
- done = 1;
- free_extent_buffer(eb);
- break;
- }
-
- ret = write_one_eb(eb, wbc, &epd);
- if (ret) {
+ if (ret < 0) {
done = 1;
- free_extent_buffer(eb);
break;
}
- free_extent_buffer(eb);
/*
* the filesystem may choose to bump up nr_to_write.
@@ -4048,10 +4307,9 @@ retry:
index = 0;
goto retry;
}
- ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
- return ret;
+ goto out;
}
/*
* If something went wrong, don't allow any metadata write bio to be
@@ -4086,14 +4344,17 @@ retry:
ret = -EROFS;
end_write_bio(&epd, ret);
}
+out:
+ btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
/**
- * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * Walk the list of dirty pages of the given address space and write all of them.
+ *
* @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @data: data passed to __extent_writepage function
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @epd: holds context for the write, namely the bio
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
@@ -4382,14 +4643,22 @@ int extent_invalidatepage(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+ /* This function is only called for the btree inode */
+ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
+
start += ALIGN(offset, blocksize);
if (start > end)
return 0;
lock_extent_bits(tree, start, end, &cached_state);
wait_on_page_writeback(page);
- clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
+
+ /*
+ * Currently for btree io tree, only EXTENT_LOCKED is utilized,
+ * so here we only need to unlock the extent range to free any
+ * existing extent state.
+ */
+ unlock_extent_cached(tree, start, end, &cached_state);
return 0;
}
@@ -4409,12 +4678,14 @@ static int try_release_extent_state(struct extent_io_tree *tree,
ret = 0;
} else {
/*
- * at this point we can safely clear everything except the
- * locked bit and the nodatasum bit
+ * At this point we can safely clear everything except the
+ * locked bit, the nodatasum bit and the delalloc new bit.
+ * The delalloc new bit will be cleared by ordered extent
+ * completion.
*/
ret = __clear_extent_bit(tree, start, end,
- ~(EXTENT_LOCKED | EXTENT_NODATASUM),
- 0, 0, NULL, mask, NULL);
+ ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
+ 0, 0, NULL, mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -4691,7 +4962,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
roots = ulist_alloc(GFP_KERNEL);
tmp_ulist = ulist_alloc(GFP_KERNEL);
@@ -4883,25 +5153,39 @@ int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-/*
- * Release all pages attached to the extent buffer.
- */
-static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
{
- int i;
- int num_pages;
- int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+ struct btrfs_subpage *subpage;
- BUG_ON(extent_buffer_under_io(eb));
+ lockdep_assert_held(&page->mapping->private_lock);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *page = eb->pages[i];
+ if (PagePrivate(page)) {
+ subpage = (struct btrfs_subpage *)page->private;
+ if (atomic_read(&subpage->eb_refs))
+ return true;
+ }
+ return false;
+}
- if (!page)
- continue;
+static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+
+ /*
+ * For mapped eb, we're going to change the page private, which should
+ * be done under the private_lock.
+ */
+ if (mapped)
+ spin_lock(&page->mapping->private_lock);
+
+ if (!PagePrivate(page)) {
if (mapped)
- spin_lock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->private_lock);
+ return;
+ }
+
+ if (fs_info->sectorsize == PAGE_SIZE) {
/*
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
@@ -4920,9 +5204,49 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
*/
detach_page_private(page);
}
-
if (mapped)
spin_unlock(&page->mapping->private_lock);
+ return;
+ }
+
+ /*
+ * For subpage, we can have dummy eb with page private. In this case,
+ * we can directly detach the private as such page is only attached to
+ * one dummy eb, no sharing.
+ */
+ if (!mapped) {
+ btrfs_detach_subpage(fs_info, page);
+ return;
+ }
+
+ btrfs_page_dec_eb_refs(fs_info, page);
+
+ /*
+ * We can only detach the page private if there are no other ebs in the
+ * page range.
+ */
+ if (!page_range_has_eb(fs_info, page))
+ btrfs_detach_subpage(fs_info, page);
+
+ spin_unlock(&page->mapping->private_lock);
+}
+
+/* Release all pages attached to the extent buffer */
+static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+{
+ int i;
+ int num_pages;
+
+ ASSERT(!extent_buffer_under_io(eb));
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ struct page *page = eb->pages[i];
+
+ if (!page)
+ continue;
+
+ detach_extent_buffer_page(eb, page);
/* One for when we allocated the page */
put_page(page);
@@ -4950,33 +5274,17 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->len = len;
eb->fs_info = fs_info;
eb->bflags = 0;
- rwlock_init(&eb->lock);
- atomic_set(&eb->blocking_readers, 0);
- eb->blocking_writers = 0;
- eb->lock_recursed = false;
- init_waitqueue_head(&eb->write_lock_wq);
- init_waitqueue_head(&eb->read_lock_wq);
+ init_rwsem(&eb->lock);
btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
&fs_info->allocated_ebs);
+ INIT_LIST_HEAD(&eb->release_list);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
atomic_set(&eb->io_pages, 0);
- /*
- * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
- */
- BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
- > MAX_INLINE_EXTENT_BUFFER_SIZE);
- BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
-
-#ifdef CONFIG_BTRFS_DEBUG
- eb->spinning_writers = 0;
- atomic_set(&eb->spinning_readers, 0);
- atomic_set(&eb->read_locks, 0);
- eb->write_locks = 0;
-#endif
+ ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
@@ -4992,21 +5300,32 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
if (new == NULL)
return NULL;
+ /*
+ * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
+	 * btrfs_release_extent_buffer() has different behavior for
+ * UNMAPPED subpage extent buffer.
+ */
+ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+
for (i = 0; i < num_pages; i++) {
+ int ret;
+
p = alloc_page(GFP_NOFS);
if (!p) {
btrfs_release_extent_buffer(new);
return NULL;
}
- attach_extent_buffer_page(new, p);
+ ret = attach_extent_buffer_page(new, p, NULL);
+ if (ret < 0) {
+ put_page(p);
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
WARN_ON(PageDirty(p));
- SetPageUptodate(p);
new->pages[i] = p;
copy_page(page_address(p), page_address(src->pages[i]));
}
-
- set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
- set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+ set_extent_buffer_uptodate(new);
return new;
}
@@ -5024,9 +5343,14 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
+ int ret;
+
eb->pages[i] = alloc_page(GFP_NOFS);
if (!eb->pages[i])
goto err;
+ ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
+ if (ret < 0)
+ goto err;
}
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
@@ -5034,8 +5358,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
err:
- for (; i > 0; i--)
+ for (; i > 0; i--) {
+ detach_extent_buffer_page(eb, eb->pages[i - 1]);
__free_page(eb->pages[i - 1]);
+ }
__free_extent_buffer(eb);
return NULL;
}
@@ -5105,7 +5431,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> PAGE_SHIFT);
+ start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
/*
@@ -5157,7 +5483,7 @@ again:
}
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_SHIFT, eb);
+ start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5177,8 +5503,40 @@ free_eb:
}
#endif
+static struct extent_buffer *grab_extent_buffer(
+ struct btrfs_fs_info *fs_info, struct page *page)
+{
+ struct extent_buffer *exists;
+
+ /*
+ * For subpage case, we completely rely on radix tree to ensure we
+ * don't try to insert two ebs for the same bytenr. So here we always
+ * return NULL and just continue.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return NULL;
+
+ /* Page not yet attached to an extent buffer */
+ if (!PagePrivate(page))
+ return NULL;
+
+ /*
+ * We could have already allocated an eb for this page and attached one
+ * so lets see if we can get a ref on the existing eb, and if we can we
+ * know it's good and we can just return that one, else we know we can
+ * just overwrite page->private.
+ */
+ exists = (struct extent_buffer *)page->private;
+ if (atomic_inc_not_zero(&exists->refs))
+ return exists;
+
+ WARN_ON(PageDirty(page));
+ detach_page_private(page);
+ return NULL;
+}
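
/*
 * A minimal userspace sketch (hypothetical names, not part of the patch) of
 * the "grab or overwrite" rule in grab_extent_buffer() above: if the existing
 * buffer still has a nonzero refcount we take a reference and reuse it,
 * otherwise the stale private pointer may simply be replaced.
 */
#include <stdio.h>

struct eb_ref { int refs; };

static int inc_not_zero(struct eb_ref *eb)
{
	if (eb->refs == 0)
		return 0;	/* dying buffer, caller must not reuse it */
	eb->refs++;
	return 1;
}

int main(void)
{
	struct eb_ref live = { .refs = 2 }, dying = { .refs = 0 };

	printf("live:  grabbed=%d refs=%d\n", inc_not_zero(&live), live.refs);
	printf("dying: grabbed=%d refs=%d\n", inc_not_zero(&dying), dying.refs);
	return 0;
}
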
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+ u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
int num_pages;
@@ -5196,6 +5554,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return ERR_PTR(-EINVAL);
}
+ if (fs_info->sectorsize < PAGE_SIZE &&
+ offset_in_page(start) + len > PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "tree block crosses page boundary, start %llu nodesize %lu",
+ start, len);
+ return ERR_PTR(-EINVAL);
+ }
+
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
@@ -5203,44 +5569,62 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return ERR_PTR(-ENOMEM);
+ btrfs_set_buffer_lockdep_class(owner_root, eb, level);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++, index++) {
+ struct btrfs_subpage *prealloc = NULL;
+
p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
if (!p) {
exists = ERR_PTR(-ENOMEM);
goto free_eb;
}
- spin_lock(&mapping->private_lock);
- if (PagePrivate(p)) {
- /*
- * We could have already allocated an eb for this page
- * and attached one so lets see if we can get a ref on
- * the existing eb, and if we can we know it's good and
- * we can just return that one, else we know we can just
- * overwrite page->private.
- */
- exists = (struct extent_buffer *)p->private;
- if (atomic_inc_not_zero(&exists->refs)) {
- spin_unlock(&mapping->private_lock);
- unlock_page(p);
- put_page(p);
- mark_extent_buffer_accessed(exists, p);
- goto free_eb;
- }
- exists = NULL;
+ /*
+		 * Preallocate page->private for the subpage case, so that we
+		 * won't allocate memory with private_lock held. The memory will
+		 * be freed by attach_extent_buffer_page() or freed manually if
+		 * we exit earlier.
+		 *
+		 * Although we have ensured one subpage eb can only have one
+		 * page, that may change in the future for 16K page size
+		 * support, so we still preallocate the memory in the loop.
+ */
+ ret = btrfs_alloc_subpage(fs_info, &prealloc,
+ BTRFS_SUBPAGE_METADATA);
+ if (ret < 0) {
+ unlock_page(p);
+ put_page(p);
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
- /*
- * Do this so attach doesn't complain and we need to
- * drop the ref the old guy had.
- */
- ClearPagePrivate(p);
- WARN_ON(PageDirty(p));
+ spin_lock(&mapping->private_lock);
+ exists = grab_extent_buffer(fs_info, p);
+ if (exists) {
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
put_page(p);
+ mark_extent_buffer_accessed(exists, p);
+ btrfs_free_subpage(prealloc);
+ goto free_eb;
}
- attach_extent_buffer_page(eb, p);
+ /* Should not fail, as we have preallocated the memory */
+ ret = attach_extent_buffer_page(eb, p, prealloc);
+ ASSERT(!ret);
+ /*
+		 * To inform that we have an extra eb under allocation, so that
+		 * detach_extent_buffer_page() won't release the page private
+		 * when the eb hasn't yet been inserted into the radix tree.
+		 *
+		 * The ref will be decreased when the eb releases the page, in
+		 * detach_extent_buffer_page().
+		 * Thus it needs no special handling in the error path.
+ */
+ btrfs_page_inc_eb_refs(fs_info, p);
spin_unlock(&mapping->private_lock);
+
WARN_ON(PageDirty(p));
eb->pages[i] = p;
if (!PageUptodate(p))
@@ -5265,7 +5649,7 @@ again:
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_SHIFT, eb);
+ start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5321,7 +5705,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_lock(&fs_info->buffer_lock);
radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> PAGE_SHIFT);
+ eb->start >> fs_info->sectorsize_bits);
spin_unlock(&fs_info->buffer_lock);
} else {
spin_unlock(&eb->refs_lock);
@@ -5446,33 +5830,103 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
- int i;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page;
int num_pages;
+ int i;
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (page)
- ClearPageUptodate(page);
+ btrfs_page_clear_uptodate(fs_info, page,
+ eb->start, eb->len);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
- int i;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page;
int num_pages;
+ int i;
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- SetPageUptodate(page);
+ btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
}
}
+static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct extent_io_tree *io_tree;
+ struct page *page = eb->pages[0];
+ struct bio *bio = NULL;
+ int ret = 0;
+
+ ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
+ ASSERT(PagePrivate(page));
+ io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+
+ if (wait == WAIT_NONE) {
+ ret = try_lock_extent(io_tree, eb->start,
+ eb->start + eb->len - 1);
+ if (ret <= 0)
+ return ret;
+ } else {
+ ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = 0;
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
+ PageUptodate(page) ||
+ btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ return ret;
+ }
+
+ clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = 0;
+ atomic_set(&eb->io_pages, 1);
+ check_buffer_tree_ref(eb);
+ btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
+
+ ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start,
+ eb->len, eb->start - page_offset(page), &bio,
+ end_bio_extent_readpage, mirror_num, 0, 0,
+ true);
+ if (ret) {
+ /*
+ * In the endio function, if we hit something wrong we will
+ * increase the io_pages, so here we need to decrease it for
+ * error path.
+ */
+ atomic_dec(&eb->io_pages);
+ }
+ if (bio) {
+ int tmp;
+
+ tmp = submit_one_bio(bio, mirror_num, 0);
+ if (tmp < 0)
+ return tmp;
+ }
+ if (ret || wait != WAIT_COMPLETE)
+ return ret;
+
+ wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ ret = -EIO;
+ return ret;
+}
+
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
{
int i;
@@ -5489,10 +5943,20 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
+ if (eb->fs_info->sectorsize < PAGE_SIZE)
+ return read_extent_buffer_subpage(eb, wait, mirror_num);
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (wait == WAIT_NONE) {
+ /*
+ * WAIT_NONE is only utilized by readahead. If we can't
+ * acquire the lock atomically it means the eb is either
+ * being read or under modification.
+ * Either way the eb will be or has been cached, so
+ * readahead can exit safely.
+ */
if (!trylock_page(page))
goto unlock_exit;
} else {
@@ -5622,12 +6086,12 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5652,13 +6116,13 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5687,13 +6151,13 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
int ret = 0;
if (check_eb_range(eb, start, len))
return -EINVAL;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5719,7 +6183,7 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
char *kaddr;
WARN_ON(!PageUptodate(eb->pages[0]));
- kaddr = page_address(eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
BTRFS_FSID_SIZE);
}
@@ -5729,7 +6193,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
char *kaddr;
WARN_ON(!PageUptodate(eb->pages[0]));
- kaddr = page_address(eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
BTRFS_FSID_SIZE);
}
@@ -5742,12 +6206,14 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
+
+ WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5771,12 +6237,12 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
size_t offset;
struct page *page;
char *kaddr;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5800,10 +6266,20 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
ASSERT(dst->len == src->len);
- num_pages = num_extent_pages(dst);
- for (i = 0; i < num_pages; i++)
- copy_page(page_address(dst->pages[i]),
- page_address(src->pages[i]));
+ if (dst->fs_info->sectorsize == PAGE_SIZE) {
+ num_pages = num_extent_pages(dst);
+ for (i = 0; i < num_pages; i++)
+ copy_page(page_address(dst->pages[i]),
+ page_address(src->pages[i]));
+ } else {
+ size_t src_offset = get_eb_offset_in_page(src, 0);
+ size_t dst_offset = get_eb_offset_in_page(dst, 0);
+
+ ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
+ memcpy(page_address(dst->pages[0]) + dst_offset,
+ page_address(src->pages[0]) + src_offset,
+ src->len);
+ }
}
void copy_extent_buffer(const struct extent_buffer *dst,
@@ -5816,7 +6292,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
size_t offset;
struct page *page;
char *kaddr;
- unsigned long i = dst_offset >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(dst_offset);
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(src, src_offset, len))
@@ -5824,7 +6300,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
WARN_ON(src->len != dst_len);
- offset = offset_in_page(dst_offset);
+ offset = get_eb_offset_in_page(dst, dst_offset);
while (len > 0) {
page = dst->pages[i];
@@ -5868,7 +6344,7 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start + byte_offset;
+ offset = start + offset_in_page(eb->start) + byte_offset;
*page_index = offset >> PAGE_SHIFT;
*page_offset = offset_in_page(offset);
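A small compilable sketch (not kernel code) of the offset math above: for a subpage eb, the eb's own offset inside its page must be added before splitting into a page index and an in-page offset. The eb start and byte offset below are assumptions for illustration.

#include <stdio.h>

#define PAGE_SIZE  65536UL  /* assumed 64K page */
#define PAGE_SHIFT 16

static unsigned long offset_in_page(unsigned long long off)
{
	return off & (PAGE_SIZE - 1);
}

int main(void)
{
	unsigned long long eb_start = 0x40004000ULL; /* hypothetical eb, 16K into its page */
	unsigned long byte_offset = 100;             /* byte we want inside the eb */
	unsigned long long offset = byte_offset + offset_in_page(eb_start);

	printf("page_index=%llu page_offset=%lu\n",
	       offset >> PAGE_SHIFT, offset_in_page(offset));
	return 0;
}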
@@ -6022,11 +6498,11 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
return;
while (len > 0) {
- dst_off_in_page = offset_in_page(dst_offset);
- src_off_in_page = offset_in_page(src_offset);
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
+ src_off_in_page = get_eb_offset_in_page(dst, src_offset);
- dst_i = dst_offset >> PAGE_SHIFT;
- src_i = src_offset >> PAGE_SHIFT;
+ dst_i = get_eb_page_index(dst_offset);
+ src_i = get_eb_page_index(src_offset);
cur = min(len, (unsigned long)(PAGE_SIZE -
src_off_in_page));
@@ -6062,11 +6538,11 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
return;
}
while (len > 0) {
- dst_i = dst_end >> PAGE_SHIFT;
- src_i = src_end >> PAGE_SHIFT;
+ dst_i = get_eb_page_index(dst_end);
+ src_i = get_eb_page_index(src_end);
- dst_off_in_page = offset_in_page(dst_end);
- src_off_in_page = offset_in_page(src_end);
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
+ src_off_in_page = get_eb_offset_in_page(dst, src_end);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
@@ -6080,13 +6556,115 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
+static struct extent_buffer *get_next_extent_buffer(
+ struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+ struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
+ struct extent_buffer *found = NULL;
+ u64 page_start = page_offset(page);
+ int ret;
+ int i;
+
+ ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
+ ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
+ lockdep_assert_held(&fs_info->buffer_lock);
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
+ bytenr >> fs_info->sectorsize_bits,
+ PAGE_SIZE / fs_info->nodesize);
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ break;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ break;
+ }
+ }
+ return found;
+}
+
+static int try_release_subpage_extent_buffer(struct page *page)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ u64 cur = page_offset(page);
+ const u64 end = page_offset(page) + PAGE_SIZE;
+ int ret;
+
+ while (cur < end) {
+ struct extent_buffer *eb = NULL;
+
+ /*
+ * Unlike try_release_extent_buffer(), which uses page->private
+ * to grab the buffer, in the subpage case we rely on the radix
+ * tree, thus we need to ensure radix tree consistency.
+ *
+ * We also want an atomic snapshot of the radix tree, so we go
+ * with a spinlock rather than RCU.
+ */
+ spin_lock(&fs_info->buffer_lock);
+ eb = get_next_extent_buffer(fs_info, page, cur);
+ if (!eb) {
+ /* No more ebs in the page range at or after cur */
+ spin_unlock(&fs_info->buffer_lock);
+ break;
+ }
+ cur = eb->start + eb->len;
+
+ /*
+ * The same as try_release_extent_buffer(), to ensure the eb
+ * won't disappear out from under us.
+ */
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&fs_info->buffer_lock);
+ break;
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ /*
+ * If the tree ref isn't set then we know the ref on this eb is
+ * a real ref, so just stop iterating; this eb will likely be
+ * freed soon anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ break;
+ }
+
+ /*
+ * Here we don't care about the return value, as we will always
+ * check the page private at the end, and
+ * release_extent_buffer() will release the refs_lock.
+ */
+ release_extent_buffer(eb);
+ }
+ /*
+ * Finally, check if we have cleared the page private. If we have
+ * released all ebs in the page, the page private should be cleared by now.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page))
+ ret = 1;
+ else
+ ret = 0;
+ spin_unlock(&page->mapping->private_lock);
+ return ret;
+
+}
+
int try_release_extent_buffer(struct page *page)
{
struct extent_buffer *eb;
+ if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ return try_release_subpage_extent_buffer(page);
+
/*
- * We need to make sure nobody is attaching this page to an eb right
- * now.
+ * We need to make sure nobody is changing page->private, as we rely on
+ * page->private as the pointer to the extent buffer.
*/
spin_lock(&page->mapping->private_lock);
if (!PagePrivate(page)) {
@@ -6121,3 +6699,54 @@ int try_release_extent_buffer(struct page *page)
return release_extent_buffer(eb);
}
+
+/*
+ * btrfs_readahead_tree_block - attempt to readahead a child block
+ * @fs_info: the fs_info
+ * @bytenr: bytenr to read
+ * @owner_root: objectid of the root that owns this eb
+ * @gen: generation for the uptodate check, can be 0
+ * @level: level for the eb
+ *
+ * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
+ * normal uptodate check of the eb, without checking the generation. If we have
+ * to read the block we will not block on anything.
+ */
+void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root, u64 gen, int level)
+{
+ struct extent_buffer *eb;
+ int ret;
+
+ eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ if (IS_ERR(eb))
+ return;
+
+ if (btrfs_buffer_uptodate(eb, gen, 1)) {
+ free_extent_buffer(eb);
+ return;
+ }
+
+ ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
+ if (ret < 0)
+ free_extent_buffer_stale(eb);
+ else
+ free_extent_buffer(eb);
+}
+
+/*
+ * btrfs_readahead_node_child - readahead a node's child block
+ * @node: parent node we're reading from
+ * @slot: slot in the parent node for the child we want to read
+ *
+ * A helper for btrfs_readahead_tree_block(); we simply read the bytenr pointed
+ * at by the given slot in the node provided.
+ */
+void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
+{
+ btrfs_readahead_tree_block(node->fs_info,
+ btrfs_node_blockptr(node, slot),
+ btrfs_header_owner(node),
+ btrfs_node_ptr_generation(node, slot),
+ btrfs_header_level(node) - 1);
+}
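A hedged usage sketch, not taken from this patch: a caller could readahead every child of a node before walking it, using the helper added above. btrfs_header_nritems() is an existing accessor; the loop itself is purely illustrative.

static void readahead_all_children(struct extent_buffer *node)
{
	int nritems = btrfs_header_nritems(node);
	int slot;

	/* Kick off non-blocking reads for all children of @node */
	for (slot = 0; slot < nritems; slot++)
		btrfs_readahead_node_child(node, slot);
}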
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f39d02e7f7ef..824640cb0ace 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -6,6 +6,7 @@
#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/fiemap.h>
+#include <linux/btrfs_tree.h>
#include "ulist.h"
/*
@@ -30,16 +31,17 @@ enum {
EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
+ EXTENT_BUFFER_NO_CHECK,
};
/* these are flags for __process_pages_contig */
#define PAGE_UNLOCK (1 << 0)
-#define PAGE_CLEAR_DIRTY (1 << 1)
-#define PAGE_SET_WRITEBACK (1 << 2)
-#define PAGE_END_WRITEBACK (1 << 3)
-#define PAGE_SET_PRIVATE2 (1 << 4)
-#define PAGE_SET_ERROR (1 << 5)
-#define PAGE_LOCK (1 << 6)
+/* Page starts writeback, clear dirty bit and set writeback bit */
+#define PAGE_START_WRITEBACK (1 << 1)
+#define PAGE_END_WRITEBACK (1 << 2)
+#define PAGE_SET_PRIVATE2 (1 << 3)
+#define PAGE_SET_ERROR (1 << 4)
+#define PAGE_LOCK (1 << 5)
/*
* page->private values. Every page that is controlled by the extent
@@ -71,11 +73,10 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
unsigned long bio_flags);
-typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
- struct bio *bio, u64 bio_offset);
+typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
+ struct bio *bio, u64 dio_file_offset);
-#define INLINE_EXTENT_BUFFER_PAGES 16
-#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
+#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
@@ -87,31 +88,14 @@ struct extent_buffer {
int read_mirror;
struct rcu_head rcu_head;
pid_t lock_owner;
-
- int blocking_writers;
- atomic_t blocking_readers;
- bool lock_recursed;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
- short log_index;
-
- /* protects write locks */
- rwlock_t lock;
+ s8 log_index;
- /* readers use lock_wq while they wait for the write
- * lock holders to unlock
- */
- wait_queue_head_t write_lock_wq;
+ struct rw_semaphore lock;
- /* writers use read_lock_wq while they wait for readers
- * to unlock
- */
- wait_queue_head_t read_lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+ struct list_head release_list;
#ifdef CONFIG_BTRFS_DEBUG
- int spinning_writers;
- atomic_t spinning_readers;
- atomic_t read_locks;
- int write_locks;
struct list_head leak_list;
#endif
};
@@ -196,10 +180,11 @@ int btree_write_cache_pages(struct address_space *mapping,
void extent_readahead(struct readahead_control *rac);
int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
-void set_page_extent_mapped(struct page *page);
+int set_page_extent_mapped(struct page *page);
+void clear_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start, u64 owner_root, int level);
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -215,11 +200,20 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
int mirror_num);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
+void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root, u64 gen, int level);
+void btrfs_readahead_node_child(struct extent_buffer *node, int slot);
static inline int num_extent_pages(const struct extent_buffer *eb)
{
- return (round_up(eb->start + eb->len, PAGE_SIZE) >> PAGE_SHIFT) -
- (eb->start >> PAGE_SHIFT);
+ /*
+ * For the sectorsize == PAGE_SIZE case, since nodesize is always aligned
+ * to sectorsize, it's just eb->len >> PAGE_SHIFT.
+ *
+ * For the sectorsize < PAGE_SIZE case, we could have nodesize < PAGE_SIZE,
+ * thus we have to ensure we get at least one page.
+ */
+ return (eb->len >> PAGE_SHIFT) ?: 1;
}
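For illustration, a small standalone sketch of the same clamping (the GNU ?: shorthand is expanded here): a 16K node on 4K pages spans four pages, while on assumed 64K pages the shift yields zero and is clamped to one page.

#include <stdio.h>

/* Mirror of the new helper: at least one page, even if len < PAGE_SIZE */
static int num_pages(unsigned long len, unsigned int page_shift)
{
	unsigned long n = len >> page_shift;

	return n ? (int)n : 1;
}

int main(void)
{
	/* assumed setups: 16K nodesize with 4K and 64K pages */
	printf("%d\n", num_pages(16384, 12)); /* 4 pages */
	printf("%d\n", num_pages(16384, 16)); /* subpage: clamps to 1 */
	return 0;
}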
static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
@@ -270,8 +264,7 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned bits_to_clear,
- unsigned long page_ops);
+ u32 bits_to_clear, unsigned long page_ops);
struct bio *btrfs_bio_alloc(u64 first_byte);
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
@@ -307,7 +300,7 @@ struct io_failure_record {
blk_status_t btrfs_submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u64 phy_offset,
+ struct bio *failed_bio, u32 bio_offset,
struct page *page, unsigned int pgoff,
u64 start, u64 end, int failed_mirror,
submit_bio_hook_t *submit_bio_hook);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index bd6229fb2b6f..4a8e02f7b6c7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -385,9 +385,12 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
}
/**
- * add_extent_mapping - add new extent map to the extent tree
+ * Add new extent map to the extent tree
+ *
* @tree: tree to insert new map in
* @em: map to insert
+ * @modified: indicate whether the given @em should be added to the
+ * modified list, which indicates the extent needs to be logged
*
* Insert @em into @tree or perform a simple forward/backward merge with
* existing mappings. The extent_map struct passed in will be inserted
@@ -574,12 +577,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
}
/**
- * btrfs_add_extent_mapping - add extent mapping into em_tree
- * @fs_info - used for tracepoint
- * @em_tree - the extent tree into which we want to insert the extent mapping
- * @em_in - extent we are inserting
- * @start - start of the logical range btrfs_get_extent() is requesting
- * @len - length of the logical range btrfs_get_extent() is requesting
+ * Add extent mapping into em_tree
+ *
+ * @fs_info: the filesystem
+ * @em_tree: extent tree into which we want to insert the extent mapping
+ * @em_in: extent we are inserting
+ * @start: start of the logical range btrfs_get_extent() is requesting
+ * @len: length of the logical range btrfs_get_extent() is requesting
*
* Note that @em_in's range may be different from [start, start+len),
* but they must be overlapped.
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 8f4f2bd6d9b9..47cd3a6dc635 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -24,8 +24,10 @@
PAGE_SIZE))
/**
- * @inode - the inode we want to update the disk_i_size for
- * @new_i_size - the i_size we want to set to, 0 if we use i_size
+ * Set inode's size according to filesystem options
+ *
+ * @inode: inode we want to update the disk_i_size for
+ * @new_i_size: i_size we want to set to, 0 if we use i_size
*
 * With NO_HOLES set this simply sets the disk_i_size to whatever i_size_read()
* returns as it is perfectly fine with a file that has holes without hole file
@@ -38,33 +40,35 @@
* Finally new_i_size should only be set in the case of truncate where we're not
* ready to use i_size_read() as the limiter yet.
*/
-void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size)
+void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start, end, i_size;
int ret;
- i_size = new_i_size ?: i_size_read(inode);
+ i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- BTRFS_I(inode)->disk_i_size = i_size;
+ inode->disk_i_size = i_size;
return;
}
- spin_lock(&BTRFS_I(inode)->lock);
- ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0,
- &start, &end, EXTENT_DIRTY);
+ spin_lock(&inode->lock);
+ ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
+ &end, EXTENT_DIRTY);
if (!ret && start == 0)
i_size = min(i_size, end + 1);
else
i_size = 0;
- BTRFS_I(inode)->disk_i_size = i_size;
- spin_unlock(&BTRFS_I(inode)->lock);
+ inode->disk_i_size = i_size;
+ spin_unlock(&inode->lock);
}
/**
- * @inode - the inode we're modifying
- * @start - the start file offset of the file extent we've inserted
- * @len - the logical length of the file extent item
+ * Mark range within a file as having a new extent inserted
+ *
+ * @inode: inode being modified
+ * @start: start file offset of the file extent we've inserted
+ * @len: logical length of the file extent item
*
* Call when we are inserting a new file extent where there was none before.
* Does not need to call this in the case where we're replacing an existing file
@@ -88,9 +92,11 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
}
/**
- * @inode - the inode we're modifying
- * @start - the start file offset of the file extent we've inserted
- * @len - the logical length of the file extent item
+ * Marks an inode range as not having a backing extent
+ *
+ * @inode: inode being modified
+ * @start: start file offset of the file extent we've dropped
+ * @len: logical length of the file extent item
*
* Called when we drop a file extent, for example when we truncate. Doesn't
* need to be called for cases where we're replacing a file extent, like when
@@ -142,7 +148,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
file_key.offset = pos;
file_key.type = BTRFS_EXTENT_DATA_KEY;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
@@ -181,7 +186,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_csum_item *item;
struct extent_buffer *leaf;
u64 csum_offset = 0;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -201,7 +206,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
goto fail;
csum_offset = (bytenr - found_key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ fs_info->sectorsize_bits;
csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
csums_in_item /= csum_size;
@@ -239,12 +244,117 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * Find checksums for the logical bytenr range [disk_bytenr, disk_bytenr + len)
+ * and store the result in @dst.
+ *
+ * Return >0 for the number of sectors we found.
+ * Return 0 if the range [disk_bytenr, disk_bytenr + sectorsize) has no csum.
+ * The caller may want to try the next sector until one range is hit.
+ * Return <0 for fatal error.
+ */
+static int search_csum_tree(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 disk_bytenr,
+ u64 len, u8 *dst)
+{
+ struct btrfs_csum_item *item = NULL;
+ struct btrfs_key key;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 csum_size = fs_info->csum_size;
+ u32 itemsize;
+ int ret;
+ u64 csum_start;
+ u64 csum_len;
+
+ ASSERT(IS_ALIGNED(disk_bytenr, sectorsize) &&
+ IS_ALIGNED(len, sectorsize));
+
+ /* Check if the current csum item covers disk_bytenr */
+ if (path->nodes[0]) {
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+
+ csum_start = key.offset;
+ csum_len = (itemsize / csum_size) * sectorsize;
+
+ if (in_range(disk_bytenr, csum_start, csum_len))
+ goto found;
+ }
+
+ /* Current item doesn't contain the desired range, search again */
+ btrfs_release_path(path);
+ item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0);
+ if (IS_ERR(item)) {
+ ret = PTR_ERR(item);
+ goto out;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+
+ csum_start = key.offset;
+ csum_len = (itemsize / csum_size) * sectorsize;
+ ASSERT(in_range(disk_bytenr, csum_start, csum_len));
+
+found:
+ ret = (min(csum_start + csum_len, disk_bytenr + len) -
+ disk_bytenr) >> fs_info->sectorsize_bits;
+ read_extent_buffer(path->nodes[0], dst, (unsigned long)item,
+ ret * csum_size);
+out:
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
+}
+
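As an aside, a compilable sketch (not kernel code) of the coverage math used above: one csum item starting at csum_start covers csum_len disk bytes, and the number of sectors it can answer for a request is the overlap with [disk_bytenr, disk_bytenr + len) shifted by sectorsize_bits. The csum size, item size and offsets below are assumptions.

#include <stdio.h>

static unsigned long long min_u64(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

int main(void)
{
	const unsigned int sectorsize = 4096, sectorsize_bits = 12;
	const unsigned int csum_size = 4;        /* assumed CRC32C */
	unsigned long long csum_start = 1 << 20; /* hypothetical item key.offset */
	unsigned int itemsize = 256;             /* item bytes -> 64 csums */
	unsigned long long csum_len = (unsigned long long)(itemsize / csum_size) * sectorsize;

	unsigned long long disk_bytenr = csum_start + 8 * sectorsize;
	unsigned long long len = 1024 * sectorsize; /* request longer than the item */

	/* Number of sectors this one item can answer for the request */
	unsigned long long count =
		(min_u64(csum_start + csum_len, disk_bytenr + len) - disk_bytenr)
		>> sectorsize_bits;
	printf("count=%llu of %llu requested sectors\n", count, len >> sectorsize_bits);
	return 0;
}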
+/*
+ * Locate the file_offset of @disk_bytenr of a @bio.
+ *
+ * A btrfs bio represents the read range
+ * [bi_sector << 9, bi_sector << 9 + bi_size).
+ * Knowing this, we can iterate through each bvec to locate the page belonging
+ * to @disk_bytenr and get the file offset.
+ *
+ * @inode is used to determine if the bvec page really belongs to @inode.
+ *
+ * Return 0 if we can't find the file offset.
+ * Return >0 if we find the file offset and store it in @file_offset_ret.
+ */
+static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
+ u64 disk_bytenr, u64 *file_offset_ret)
+{
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ int ret = 0;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ struct page *page = bvec.bv_page;
+
+ if (cur > disk_bytenr)
+ break;
+ if (cur + bvec.bv_len <= disk_bytenr) {
+ cur += bvec.bv_len;
+ continue;
+ }
+ ASSERT(in_range(disk_bytenr, cur, bvec.bv_len));
+ if (page->mapping && page->mapping->host &&
+ page->mapping->host == inode) {
+ ret = 1;
+ *file_offset_ret = page_offset(page) + bvec.bv_offset +
+ disk_bytenr - cur;
+ break;
+ }
+ }
+ return ret;
+}
+
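A minimal userspace sketch of the same bvec walk, assuming each segment's starting file offset is already known (the kernel derives it from page_offset() + bv_offset). The structures and values are invented for illustration only.

#include <stdio.h>

/* Simplified stand-ins for bio_vec entries: each covers bv_len disk bytes and
 * starts at a known file offset. */
struct fake_bvec {
	unsigned int bv_len;
	unsigned long long file_offset;
};

/* Walk the "bio" from its starting disk bytenr until the segment that
 * contains disk_bytenr is found, then translate into a file offset. */
static int find_file_offset(const struct fake_bvec *vecs, int nr,
			    unsigned long long bio_disk_start,
			    unsigned long long disk_bytenr,
			    unsigned long long *file_offset_ret)
{
	unsigned long long cur = bio_disk_start;
	int i;

	for (i = 0; i < nr; i++) {
		if (cur + vecs[i].bv_len <= disk_bytenr) {
			cur += vecs[i].bv_len;
			continue;
		}
		*file_offset_ret = vecs[i].file_offset + (disk_bytenr - cur);
		return 1;
	}
	return 0;
}

int main(void)
{
	struct fake_bvec vecs[] = {
		{ 4096, 0 },      /* first 4K of the file */
		{ 8192, 4096 },   /* next 8K */
	};
	unsigned long long file_offset;

	if (find_file_offset(vecs, 2, 1048576, 1048576 + 6144, &file_offset))
		printf("file offset %llu\n", file_offset); /* prints 6144 */
	return 0;
}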
/**
- * btrfs_lookup_bio_sums - Look up checksums for a bio.
+ * Look up the checksums for a read bio in the csum tree.
+ *
* @inode: inode that the bio is for.
* @bio: bio to look up.
- * @offset: Unless (u64)-1, look up checksums for this offset in the file.
- * If (u64)-1, use the page offsets from the bio instead.
* @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
* checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
* NULL, the checksum buffer is allocated and returned in
@@ -252,31 +362,40 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u64 offset, u8 *dst)
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct bio_vec bvec;
- struct bvec_iter iter;
- struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_path *path;
- const bool page_offsets = (offset == (u64)-1);
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 csum_size = fs_info->csum_size;
+ u32 orig_len = bio->bi_iter.bi_size;
+ u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_bytenr;
u8 *csum;
- u64 item_start_offset = 0;
- u64 item_last_offset = 0;
- u64 disk_bytenr;
- u64 page_bytes_left;
- u32 diff;
- int nblocks;
+ const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
int count = 0;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+ return BLK_STS_OK;
+
+ /*
+ * This function is only called for read bios.
+ *
+ * This means two things:
+ * - All our csums should only be in the csum tree
+ * No ordered extent csums, as ordered extents are only for the
+ * write path.
+ * - No need to bother with any other info from the bvecs
+ * Since we're looking up csums, the only important info is the
+ * disk_bytenr and the length, which can be extracted from bi_iter
+ * directly.
+ */
+ ASSERT(bio_op(bio) == REQ_OP_READ);
path = btrfs_alloc_path();
if (!path)
return BLK_STS_RESOURCE;
- nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
@@ -295,7 +414,11 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
csum = dst;
}
- if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
+ /*
+ * If requested number of sectors is larger than one leaf can contain,
+ * kick the readahead for csum tree.
+ */
+ if (nblocks > fs_info->csums_per_leaf)
path->reada = READA_FORWARD;
/*
@@ -309,85 +432,62 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
path->skip_locking = 1;
}
- disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
-
- bio_for_each_segment(bvec, bio, iter) {
- page_bytes_left = bvec.bv_len;
- if (count)
- goto next;
+ for (cur_disk_bytenr = orig_disk_bytenr;
+ cur_disk_bytenr < orig_disk_bytenr + orig_len;
+ cur_disk_bytenr += (count * sectorsize)) {
+ u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr;
+ unsigned int sector_offset;
+ u8 *csum_dst;
- if (page_offsets)
- offset = page_offset(bvec.bv_page) + bvec.bv_offset;
- count = btrfs_find_ordered_sum(BTRFS_I(inode), offset,
- disk_bytenr, csum, nblocks);
- if (count)
- goto found;
+ /*
+ * Although both cur_disk_bytenr and orig_disk_bytenr are u64,
+ * we're only calculating the offset from the bio start.
+ *
+ * Bio size is limited to UINT_MAX, thus unsigned int is large
+ * enough to contain the raw result, not to mention the right
+ * shifted result.
+ */
+ ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX);
+ sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >>
+ fs_info->sectorsize_bits;
+ csum_dst = csum + sector_offset * csum_size;
+
+ count = search_csum_tree(fs_info, path, cur_disk_bytenr,
+ search_len, csum_dst);
+ if (count <= 0) {
+ /*
+ * Either we hit a critical error or we didn't find
+ * the csum.
+ * Either way, we put zero into the csums dst, and skip
+ * to the next sector.
+ */
+ memset(csum_dst, 0, csum_size);
+ count = 1;
- if (!item || disk_bytenr < item_start_offset ||
- disk_bytenr >= item_last_offset) {
- struct btrfs_key found_key;
- u32 item_size;
-
- if (item)
- btrfs_release_path(path);
- item = btrfs_lookup_csum(NULL, fs_info->csum_root,
- path, disk_bytenr, 0);
- if (IS_ERR(item)) {
- count = 1;
- memset(csum, 0, csum_size);
- if (BTRFS_I(inode)->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
- set_extent_bits(io_tree, offset,
- offset + fs_info->sectorsize - 1,
+ /*
+ * For the data reloc inode, we need to mark the range
+ * NODATASUM so that balance won't report false csum
+ * errors.
+ */
+ if (BTRFS_I(inode)->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ u64 file_offset;
+ int ret;
+
+ ret = search_file_offset_in_bio(bio, inode,
+ cur_disk_bytenr, &file_offset);
+ if (ret)
+ set_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
EXTENT_NODATASUM);
- } else {
- btrfs_info_rl(fs_info,
- "no csum found for inode %llu start %llu",
- btrfs_ino(BTRFS_I(inode)), offset);
- }
- item = NULL;
- btrfs_release_path(path);
- goto found;
+ } else {
+ btrfs_warn_rl(fs_info,
+ "csum hole found for disk bytenr range [%llu, %llu)",
+ cur_disk_bytenr, cur_disk_bytenr + sectorsize);
}
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
-
- item_start_offset = found_key.offset;
- item_size = btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
- item_last_offset = item_start_offset +
- (item_size / csum_size) *
- fs_info->sectorsize;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_csum_item);
- }
- /*
- * this byte range must be able to fit inside
- * a single leaf so it will also fit inside a u32
- */
- diff = disk_bytenr - item_start_offset;
- diff = diff / fs_info->sectorsize;
- diff = diff * csum_size;
- count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >>
- inode->i_sb->s_blocksize_bits);
- read_extent_buffer(path->nodes[0], csum,
- ((unsigned long)item) + diff,
- csum_size * count);
-found:
- csum += count * csum_size;
- nblocks -= count;
-next:
- while (count > 0) {
- count--;
- disk_bytenr += fs_info->sectorsize;
- offset += fs_info->sectorsize;
- page_bytes_left -= fs_info->sectorsize;
- if (!page_bytes_left)
- break; /* move to next bio */
}
}
- WARN_ON_ONCE(count);
btrfs_free_path(path);
return BLK_STS_OK;
}
@@ -406,7 +506,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
int ret;
size_t size;
u64 csum_end;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
@@ -433,8 +533,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
key.type == BTRFS_EXTENT_CSUM_KEY) {
- offset = (start - key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ offset = (start - key.offset) >> fs_info->sectorsize_bits;
if (offset * csum_size <
btrfs_item_size_nr(leaf, path->slots[0] - 1))
path->slots[0]--;
@@ -484,10 +583,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
sums->bytenr = start;
sums->len = (int)size;
- offset = (start - key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ offset = (start - key.offset) >> fs_info->sectorsize_bits;
offset *= csum_size;
- size >>= fs_info->sb->s_blocksize_bits;
+ size >>= fs_info->sectorsize_bits;
read_extent_buffer(path->nodes[0],
sums->sums,
@@ -539,7 +637,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
int i;
u64 offset;
unsigned nofs_flag;
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
nofs_flag = memalloc_nofs_save();
sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
@@ -557,7 +654,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
else
offset = 0; /* shut up gcc */
- sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
+ sums->bytenr = bio->bi_iter.bi_sector << 9;
index = 0;
shash->tfm = fs_info->csum_shash;
@@ -596,7 +693,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
ordered = btrfs_lookup_ordered_extent(inode,
offset);
ASSERT(ordered); /* Logic error */
- sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9)
+ sums->bytenr = (bio->bi_iter.bi_sector << 9)
+ total_bytes;
index = 0;
}
@@ -607,7 +704,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
fs_info->sectorsize,
sums->sums + index);
kunmap_atomic(data);
- index += csum_size;
+ index += fs_info->csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
total_bytes += fs_info->sectorsize;
@@ -637,14 +734,14 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
u64 csum_end;
u64 end_byte = bytenr + len;
- u32 blocksize_bits = fs_info->sb->s_blocksize_bits;
+ u32 blocksize_bits = fs_info->sectorsize_bits;
leaf = path->nodes[0];
csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
- csum_end <<= fs_info->sb->s_blocksize_bits;
+ csum_end <<= blocksize_bits;
csum_end += key->offset;
if (key->offset < bytenr && csum_end <= end_byte) {
@@ -691,8 +788,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 csum_end;
struct extent_buffer *leaf;
int ret;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int blocksize_bits = fs_info->sb->s_blocksize_bits;
+ const u32 csum_size = fs_info->csum_size;
+ u32 blocksize_bits = fs_info->sectorsize_bits;
ASSERT(root == fs_info->csum_root ||
root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
@@ -706,7 +803,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
- path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
if (path->slots[0] == 0)
@@ -846,7 +942,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
int index = 0;
int found_next;
int ret;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
path = btrfs_alloc_path();
if (!path)
@@ -921,13 +1017,15 @@ again:
if (btrfs_leaf_free_space(leaf) >= csum_size) {
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
csum_offset = (bytenr - found_key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ fs_info->sectorsize_bits;
goto extend_csum;
}
btrfs_release_path(path);
+ path->search_for_extension = 1;
ret = btrfs_search_slot(trans, root, &file_key, path,
csum_size, 1);
+ path->search_for_extension = 0;
if (ret < 0)
goto out;
@@ -939,8 +1037,7 @@ again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- csum_offset = (bytenr - found_key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits;
if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
@@ -956,7 +1053,7 @@ extend_csum:
u32 diff;
tmp = sums->len - total_bytes;
- tmp >>= fs_info->sb->s_blocksize_bits;
+ tmp >>= fs_info->sectorsize_bits;
WARN_ON(tmp < 1);
extend_nr = max_t(int, 1, (int)tmp);
@@ -981,9 +1078,9 @@ insert:
u64 tmp;
tmp = sums->len - total_bytes;
- tmp >>= fs_info->sb->s_blocksize_bits;
+ tmp >>= fs_info->sectorsize_bits;
tmp = min(tmp, (next_offset - file_key.offset) >>
- fs_info->sb->s_blocksize_bits);
+ fs_info->sectorsize_bits);
tmp = max_t(u64, 1, tmp);
tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size));
@@ -991,10 +1088,8 @@ insert:
} else {
ins_size = csum_size;
}
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
ins_size);
- path->leave_spinning = 0;
if (ret < 0)
goto out;
if (WARN_ON(ret != 0))
@@ -1007,8 +1102,7 @@ csum:
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
found:
- ins_size = (u32)(sums->len - total_bytes) >>
- fs_info->sb->s_blocksize_bits;
+ ins_size = (u32)(sums->len - total_bytes) >> fs_info->sectorsize_bits;
ins_size *= csum_size;
ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
ins_size);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0ff659455b1e..be9e3900cce8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -452,57 +452,16 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
}
}
-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
- const u64 start,
- const u64 len,
- struct extent_state **cached_state)
-{
- u64 search_start = start;
- const u64 end = start + len - 1;
-
- while (search_start < end) {
- const u64 search_len = end - search_start + 1;
- struct extent_map *em;
- u64 em_len;
- int ret = 0;
-
- em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- if (em->block_start != EXTENT_MAP_HOLE)
- goto next;
-
- em_len = em->len;
- if (em->start < search_start)
- em_len -= search_start - em->start;
- if (em_len > search_len)
- em_len = search_len;
-
- ret = set_extent_bit(&inode->io_tree, search_start,
- search_start + em_len - 1,
- EXTENT_DELALLOC_NEW,
- NULL, cached_state, GFP_NOFS);
-next:
- search_start = extent_map_end(em);
- free_extent_map(em);
- if (ret)
- return ret;
- }
- return 0;
-}
-
/*
- * after copy_from_user, pages need to be dirtied and we need to make
- * sure holes are created between the current EOF and the start of
- * any next extents (if required).
- *
- * this also makes the decision about creating an inline extent vs
- * doing real data extents, marking pages dirty and delalloc as required.
+ * After btrfs_copy_from_user(), update the following things for delalloc:
+ * - Mark newly dirtied pages as DELALLOC in the io tree.
+ * Used to advise which range is to be written back.
+ * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
+ * - Update inode size for past EOF write
*/
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached)
+ struct extent_state **cached, bool noreserve)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int err = 0;
@@ -514,7 +473,13 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
- start_pos = pos & ~((u64) fs_info->sectorsize - 1);
+ if (write_bytes == 0)
+ return 0;
+
+ if (noreserve)
+ extra_bits |= EXTENT_NORESERVE;
+
+ start_pos = round_down(pos, fs_info->sectorsize);
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
@@ -528,23 +493,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, cached);
- if (!btrfs_is_free_space_inode(inode)) {
- if (start_pos >= isize &&
- !(inode->flags & BTRFS_INODE_PREALLOC)) {
- /*
- * There can't be any extents following eof in this case
- * so just set the delalloc new bit for the range
- * directly.
- */
- extra_bits |= EXTENT_DELALLOC_NEW;
- } else {
- err = btrfs_find_new_delalloc_bytes(inode, start_pos,
- num_bytes, cached);
- if (err)
- return err;
- }
- }
-
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
if (err)
@@ -728,14 +676,16 @@ next:
* If an extent intersects the range but is not entirely inside the range
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
+ *
+ * Note: the VFS inode's number of bytes is not updated; it's up to the caller
+ * to deal with that. We set the field 'bytes_found' of the arguments structure
+ * with the number of allocated bytes found in the target range, so that the
+ * caller can update the inode's number of bytes in an atomic way when
+ * replacing extents in a range to avoid races with stat(2).
*/
-int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
- struct btrfs_path *path, u64 start, u64 end,
- u64 *drop_end, int drop_cache,
- int replace_extent,
- u32 extent_item_size,
- int *key_inserted)
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_drop_extents_args *args)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
@@ -743,14 +693,13 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
- struct inode *vfs_inode = &inode->vfs_inode;
u64 ino = btrfs_ino(inode);
- u64 search_start = start;
+ u64 search_start = args->start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
u64 extent_offset = 0;
u64 extent_end = 0;
- u64 last_end = start;
+ u64 last_end = args->start;
int del_nr = 0;
int del_slot = 0;
int extent_type;
@@ -760,11 +709,26 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int update_refs;
int found = 0;
int leafs_visited = 0;
+ struct btrfs_path *path = args->path;
+
+ args->bytes_found = 0;
+ args->extent_inserted = false;
- if (drop_cache)
- btrfs_drop_extent_cache(inode, start, end - 1, 0);
+ /* Must always have a path if ->replace_extent is true */
+ ASSERT(!(args->replace_extent && !args->path));
- if (start >= inode->disk_i_size && !replace_extent)
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (args->drop_cache)
+ btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
+
+ if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
@@ -775,7 +739,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
search_start, modify_tree);
if (ret < 0)
break;
- if (ret > 0 && path->slots[0] > 0 && search_start == start) {
+ if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
if (key.objectid == ino &&
@@ -810,7 +774,7 @@ next_slot:
path->slots[0]++;
goto next_slot;
}
- if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
break;
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -852,7 +816,7 @@ next_slot:
}
found = 1;
- search_start = max(key.offset, start);
+ search_start = max(key.offset, args->start);
if (recow || !modify_tree) {
modify_tree = -1;
btrfs_release_path(path);
@@ -863,7 +827,7 @@ next_slot:
* | - range to drop - |
* | -------- extent -------- |
*/
- if (start > key.offset && end < extent_end) {
+ if (args->start > key.offset && args->end < extent_end) {
BUG_ON(del_nr > 0);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
@@ -871,7 +835,7 @@ next_slot:
}
memcpy(&new_key, &key, sizeof(new_key));
- new_key.offset = start;
+ new_key.offset = args->start;
ret = btrfs_duplicate_item(trans, root, path,
&new_key);
if (ret == -EAGAIN) {
@@ -885,15 +849,15 @@ next_slot:
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_num_bytes(leaf, fi,
- start - key.offset);
+ args->start - key.offset);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- extent_offset += start - key.offset;
+ extent_offset += args->start - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_end - start);
+ extent_end - args->start);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0) {
@@ -903,11 +867,11 @@ next_slot:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset);
+ args->start - extent_offset);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
- key.offset = start;
+ key.offset = args->start;
}
/*
* From here on out we will have actually dropped something, so
@@ -919,23 +883,23 @@ next_slot:
* | ---- range to drop ----- |
* | -------- extent -------- |
*/
- if (start <= key.offset && end < extent_end) {
+ if (args->start <= key.offset && args->end < extent_end) {
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
}
memcpy(&new_key, &key, sizeof(new_key));
- new_key.offset = end;
+ new_key.offset = args->end;
btrfs_set_item_key_safe(fs_info, path, &new_key);
- extent_offset += end - key.offset;
+ extent_offset += args->end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_end - end);
+ extent_end - args->end);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(vfs_inode, end - key.offset);
+ args->bytes_found += args->end - key.offset;
break;
}
@@ -944,7 +908,7 @@ next_slot:
* | ---- range to drop ----- |
* | -------- extent -------- |
*/
- if (start > key.offset && end >= extent_end) {
+ if (args->start > key.offset && args->end >= extent_end) {
BUG_ON(del_nr > 0);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
@@ -952,11 +916,11 @@ next_slot:
}
btrfs_set_file_extent_num_bytes(leaf, fi,
- start - key.offset);
+ args->start - key.offset);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(vfs_inode, extent_end - start);
- if (end == extent_end)
+ args->bytes_found += extent_end - args->start;
+ if (args->end == extent_end)
break;
path->slots[0]++;
@@ -967,7 +931,7 @@ next_slot:
* | ---- range to drop ----- |
* | ------ extent ------ |
*/
- if (start <= key.offset && end >= extent_end) {
+ if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
if (del_nr == 0) {
del_slot = path->slots[0];
@@ -979,8 +943,7 @@ delete_extent_item:
if (update_refs &&
extent_type == BTRFS_FILE_EXTENT_INLINE) {
- inode_sub_bytes(vfs_inode,
- extent_end - key.offset);
+ args->bytes_found += extent_end - key.offset;
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
@@ -993,11 +956,10 @@ delete_extent_item:
key.offset - extent_offset);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
- inode_sub_bytes(vfs_inode,
- extent_end - key.offset);
+ args->bytes_found += extent_end - key.offset;
}
- if (end == extent_end)
+ if (args->end == extent_end)
break;
if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
@@ -1027,7 +989,7 @@ delete_extent_item:
* Set path->slots[0] to first slot, so that after the delete
* if items are move off from our leaf to its immediate left or
* right neighbor leafs, we end up with a correct and adjusted
- * path->slots[0] for our insertion (if replace_extent != 0).
+ * path->slots[0] for our insertion (if args->replace_extent).
*/
path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
@@ -1041,15 +1003,14 @@ delete_extent_item:
* which case it unlocked our path, so check path->locks[0] matches a
* write lock.
*/
- if (!ret && replace_extent && leafs_visited == 1 &&
- (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
- path->locks[0] == BTRFS_WRITE_LOCK) &&
+ if (!ret && args->replace_extent && leafs_visited == 1 &&
+ path->locks[0] == BTRFS_WRITE_LOCK &&
btrfs_leaf_free_space(leaf) >=
- sizeof(struct btrfs_item) + extent_item_size) {
+ sizeof(struct btrfs_item) + args->extent_item_size) {
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
+ key.offset = args->start;
if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
struct btrfs_key slot_key;
@@ -1057,30 +1018,18 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key, &extent_item_size, 1);
- *key_inserted = 1;
+ setup_items_for_insert(root, path, &key,
+ &args->extent_item_size, 1);
+ args->extent_inserted = true;
}
- if (!replace_extent || !(*key_inserted))
+ if (!args->path)
+ btrfs_free_path(path);
+ else if (!args->extent_inserted)
btrfs_release_path(path);
- if (drop_end)
- *drop_end = found ? min(end, last_end) : end;
- return ret;
-}
-
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode, u64 start,
- u64 end, int drop_cache)
-{
- struct btrfs_path *path;
- int ret;
+out:
+ args->drop_end = found ? min(args->end, last_end) : args->end;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start,
- end, NULL, drop_cache, 0, 0, NULL);
- btrfs_free_path(path);
return ret;
}
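A hedged sketch of how a caller might use the new argument structure; the field names (start, end, drop_cache, path, bytes_found) come from this patch, while the surrounding function and the way the byte count is applied are illustrative assumptions, not a real call site.

static int drop_range_example(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_drop_extents_args args = { 0 };
	int ret;

	args.start = start;
	args.end = end;
	args.drop_cache = true;
	/* args.path stays NULL, so btrfs_drop_extents() allocates its own */

	ret = btrfs_drop_extents(trans, root, inode, &args);
	if (ret)
		return ret;

	/*
	 * args.bytes_found now holds the allocated bytes dropped in the range;
	 * the caller is expected to subtract it from the inode's byte count
	 * (the exact helper used for that is outside this hunk).
	 */
	return 0;
}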
@@ -1420,6 +1369,12 @@ again:
goto fail;
}
+ err = set_page_extent_mapped(pages[i]);
+ if (err < 0) {
+ faili = i;
+ goto fail;
+ }
+
if (i == 0)
err = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
@@ -1504,23 +1459,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
/*
- * It's possible the pages are dirty right now, but we don't want
- * to clean them yet because copy_from_user may catch a page fault
- * and we might have to fall back to one page at a time. If that
- * happens, we'll unlock these pages and we'd have a window where
- * reclaim could sneak in and drop the once-dirty page on the floor
- * without writing it.
- *
- * We have the pages locked and the extent range locked, so there's
- * no way someone can start IO on any dirty pages in this range.
- *
- * We'll call btrfs_dirty_pages() later on, and that will flip around
- * delalloc bits and dirty the pages as required.
+ * We should be called after prepare_pages() which should have locked
+ * all pages in the range.
*/
- for (i = 0; i < num_pages; i++) {
- set_page_extent_mapped(pages[i]);
+ for (i = 0; i < num_pages; i++)
WARN_ON(!PageLocked(pages[i]));
- }
return ret;
}
@@ -1615,11 +1558,84 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec64 now;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_time(inode);
+ if (!timespec64_equal(&inode->i_mtime, &now))
+ inode->i_mtime = now;
+
+ if (!timespec64_equal(&inode->i_ctime, &now))
+ inode->i_ctime = now;
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
+static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
+ size_t count)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ loff_t pos = iocb->ki_pos;
+ int ret;
+ loff_t oldsize;
+ loff_t start_pos;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ size_t nocow_bytes = count;
+
+ /* We will allocate space in case nodatacow is not set, so bail */
+ if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
+ return -EAGAIN;
+ /*
+ * There are holes in the range or parts of the range that must
+ * be COWed (shared extents, RO block groups, etc), so just bail
+ * out.
+ */
+ if (nocow_bytes < count)
+ return -EAGAIN;
+ }
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ /*
+ * We reserve space for updating the inode when we reserve space for the
+ * extent we are going to write, so we will enospc out there. We don't
+ * need to start yet another transaction to update the inode as we will
+ * update the inode when we finish writing whatever data we write.
+ */
+ update_time_for_write(inode);
+
+ start_pos = round_down(pos, fs_info->sectorsize);
+ oldsize = i_size_read(inode);
+ if (start_pos > oldsize) {
+ /* Expand hole size to cover write data, preventing empty gap */
+ loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
+
+ ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
+ if (ret) {
+ current->backing_dev_info = NULL;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct iov_iter *i)
{
struct file *file = iocb->ki_filp;
- loff_t pos = iocb->ki_pos;
+ loff_t pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct page **pages = NULL;
@@ -1629,17 +1645,37 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
u64 lockend;
size_t num_written = 0;
int nrptrs;
- int ret = 0;
+ ssize_t ret;
bool only_release_metadata = false;
bool force_page_uptodate = false;
+ loff_t old_isize = i_size_read(inode);
+ unsigned int ilock_flags = 0;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+
+ ret = btrfs_inode_lock(inode, ilock_flags);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_write_checks(iocb, i);
+ if (ret <= 0)
+ goto out;
+
+ ret = btrfs_write_check(iocb, i, ret);
+ if (ret < 0)
+ goto out;
+ pos = iocb->ki_pos;
nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
PAGE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- return -ENOMEM;
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
while (iov_iter_count(i) > 0) {
struct extent_state *cached_state = NULL;
@@ -1648,8 +1684,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_SIZE -
offset);
- size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
- PAGE_SIZE);
+ size_t num_pages;
size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1657,8 +1692,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
size_t num_sectors;
int extents_locked;
- WARN_ON(num_pages > nrptrs);
-
/*
* Fault pages before locking them in prepare_pages
* to avoid recursive lock
@@ -1670,35 +1703,28 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
only_release_metadata = false;
sector_offset = pos & (fs_info->sectorsize - 1);
- reserve_bytes = round_up(write_bytes + sector_offset,
- fs_info->sectorsize);
extent_changeset_release(data_reserved);
ret = btrfs_check_data_free_space(BTRFS_I(inode),
&data_reserved, pos,
write_bytes);
if (ret < 0) {
+ /*
+ * If we don't have to COW at the offset, reserve
+ * metadata only. write_bytes may get smaller than
+ * requested here.
+ */
if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
- &write_bytes) > 0) {
- /*
- * For nodata cow case, no need to reserve
- * data space.
- */
+ &write_bytes) > 0)
only_release_metadata = true;
- /*
- * our prealloc extent may be smaller than
- * write_bytes, so scale down.
- */
- num_pages = DIV_ROUND_UP(write_bytes + offset,
- PAGE_SIZE);
- reserve_bytes = round_up(write_bytes +
- sector_offset,
- fs_info->sectorsize);
- } else {
+ else
break;
- }
}
+ num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
+ WARN_ON(num_pages > nrptrs);
+ reserve_bytes = round_up(write_bytes + sector_offset,
+ fs_info->sectorsize);
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
reserve_bytes);
@@ -1767,8 +1793,7 @@ again:
if (num_sectors > dirty_sectors) {
/* release everything except the sectors we dirtied */
- release_bytes -= dirty_sectors <<
- fs_info->sb->s_blocksize_bits;
+ release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
if (only_release_metadata) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
@@ -1787,10 +1812,9 @@ again:
release_bytes = round_up(copied + sector_offset,
fs_info->sectorsize);
- if (copied > 0)
- ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
- dirty_pages, pos, copied,
- &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
+ dirty_pages, pos, copied,
+ &cached_state, only_release_metadata);
/*
* If we have not locked the extent range, because the range's
@@ -1815,17 +1839,6 @@ again:
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
- if (only_release_metadata && copied > 0) {
- lockstart = round_down(pos,
- fs_info->sectorsize);
- lockend = round_up(pos + copied,
- fs_info->sectorsize) - 1;
-
- set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_NORESERVE, NULL,
- NULL, GFP_NOFS);
- }
-
btrfs_drop_pages(pages, num_pages);
cond_resched();
@@ -1852,24 +1865,102 @@ again:
}
extent_changeset_free(data_reserved);
+ if (num_written > 0) {
+ pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
+ iocb->ki_pos += num_written;
+ }
+out:
+ btrfs_inode_unlock(inode, ilock_flags);
return num_written ? num_written : ret;
}
-static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ const u32 blocksize_mask = fs_info->sectorsize - 1;
+
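+	/* Both the file offset and the memory buffers must be sector aligned */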
+ if (offset & blocksize_mask)
+ return -EINVAL;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
loff_t pos;
- ssize_t written;
+ ssize_t written = 0;
ssize_t written_buffered;
loff_t endbyte;
- int err;
+ ssize_t err;
+ unsigned int ilock_flags = 0;
+ struct iomap_dio *dio = NULL;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+
+ /* If the write DIO is within EOF, use a shared lock */
+ if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
+ ilock_flags |= BTRFS_ILOCK_SHARED;
+
+relock:
+ err = btrfs_inode_lock(inode, ilock_flags);
+ if (err < 0)
+ return err;
+
+ err = generic_write_checks(iocb, from);
+ if (err <= 0) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ return err;
+ }
+
+ err = btrfs_write_check(iocb, from, err);
+ if (err < 0) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ /*
+	 * Re-check since the file size may have changed just before taking the
+	 * lock, or pos may have changed because of O_APPEND in generic_write_checks()
+ */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
+ pos + iov_iter_count(from) > i_size_read(inode)) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
- written = btrfs_direct_IO(iocb, from);
+ if (check_direct_IO(fs_info, from, pos)) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ goto buffered;
+ }
+
+ dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ 0);
+
+ btrfs_inode_unlock(inode, ilock_flags);
- if (written < 0 || !iov_iter_count(from))
- return written;
+ if (IS_ERR_OR_NULL(dio)) {
+ err = PTR_ERR_OR_ZERO(dio);
+ if (err < 0 && err != -ENOTBLK)
+ goto out;
+ } else {
+ written = iomap_dio_complete(dio);
+ }
+ if (written < 0 || !iov_iter_count(from)) {
+ err = written;
+ goto out;
+ }
+
+buffered:
pos = iocb->ki_pos;
written_buffered = btrfs_buffered_write(iocb, from);
if (written_buffered < 0) {
@@ -1895,190 +1986,50 @@ out:
return written ? written : err;
}
-static void update_time_for_write(struct inode *inode)
-{
- struct timespec64 now;
-
- if (IS_NOCMTIME(inode))
- return;
-
- now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
- inode->i_mtime = now;
-
- if (!timespec64_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
-
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
-}
-
static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 start_pos;
- u64 end_pos;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(file));
ssize_t num_written = 0;
const bool sync = iocb->ki_flags & IOCB_DSYNC;
- ssize_t err;
- loff_t pos;
- size_t count;
- loff_t oldsize;
- int clean_page = 0;
-
- if (!(iocb->ki_flags & IOCB_DIRECT) &&
- (iocb->ki_flags & IOCB_NOWAIT))
- return -EOPNOTSUPP;
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock(inode))
- return -EAGAIN;
- } else {
- inode_lock(inode);
- }
-
- err = generic_write_checks(iocb, from);
- if (err <= 0) {
- inode_unlock(inode);
- return err;
- }
-
- pos = iocb->ki_pos;
- count = iov_iter_count(from);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- size_t nocow_bytes = count;
-
- /*
- * We will allocate space in case nodatacow is not set,
- * so bail
- */
- if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes)
- <= 0) {
- inode_unlock(inode);
- return -EAGAIN;
- }
- /*
- * There are holes in the range or parts of the range that must
- * be COWed (shared extents, RO block groups, etc), so just bail
- * out.
- */
- if (nocow_bytes < count) {
- inode_unlock(inode);
- return -EAGAIN;
- }
- }
-
- current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_privs(file);
- if (err) {
- inode_unlock(inode);
- goto out;
- }
/*
- * If BTRFS flips readonly due to some impossible error
- * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
- * although we have opened a file as writable, we have
- * to stop this write operation to ensure FS consistency.
+ * If the fs flips readonly due to some impossible error, although we
+ * have opened a file as writable, we have to stop this write operation
+ * to ensure consistency.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
- inode_unlock(inode);
- err = -EROFS;
- goto out;
- }
-
- /*
- * We reserve space for updating the inode when we reserve space for the
- * extent we are going to write, so we will enospc out there. We don't
- * need to start yet another transaction to update the inode as we will
- * update the inode when we finish writing whatever data we write.
- */
- update_time_for_write(inode);
+ if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
+ return -EROFS;
- start_pos = round_down(pos, fs_info->sectorsize);
- oldsize = i_size_read(inode);
- if (start_pos > oldsize) {
- /* Expand hole size to cover write data, preventing empty gap */
- end_pos = round_up(pos + count,
- fs_info->sectorsize);
- err = btrfs_cont_expand(inode, oldsize, end_pos);
- if (err) {
- inode_unlock(inode);
- goto out;
- }
- if (start_pos > round_up(oldsize, fs_info->sectorsize))
- clean_page = 1;
- }
+ if (!(iocb->ki_flags & IOCB_DIRECT) &&
+ (iocb->ki_flags & IOCB_NOWAIT))
+ return -EOPNOTSUPP;
if (sync)
- atomic_inc(&BTRFS_I(inode)->sync_writers);
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- /*
- * 1. We must always clear IOCB_DSYNC in order to not deadlock
- * in iomap, as it calls generic_write_sync() in this case.
- * 2. If we are async, we can call iomap_dio_complete() either
- * in
- *
- * 2.1. A worker thread from the last bio completed. In this
- * case we need to mark the btrfs_dio_data that it is
- * async in order to call generic_write_sync() properly.
- * This is handled by setting BTRFS_DIO_SYNC_STUB in the
- * current->journal_info.
- * 2.2 The submitter context, because all IO completed
- * before we exited iomap_dio_rw(). In this case we can
- * just re-set the IOCB_DSYNC on the iocb and we'll do
- * the sync below. If our ->end_io() gets called and
- * current->journal_info is set, then we know we're in
- * our current context and we will clear
- * current->journal_info to indicate that we need to
- * sync below.
- */
- if (sync) {
- ASSERT(current->journal_info == NULL);
- iocb->ki_flags &= ~IOCB_DSYNC;
- current->journal_info = BTRFS_DIO_SYNC_STUB;
- }
- num_written = __btrfs_direct_write(iocb, from);
+ atomic_inc(&inode->sync_writers);
- /*
- * As stated above, we cleared journal_info, so we need to do
- * the sync ourselves.
- */
- if (sync && current->journal_info == NULL)
- iocb->ki_flags |= IOCB_DSYNC;
- current->journal_info = NULL;
- } else {
+ if (iocb->ki_flags & IOCB_DIRECT)
+ num_written = btrfs_direct_write(iocb, from);
+ else
num_written = btrfs_buffered_write(iocb, from);
- if (num_written > 0)
- iocb->ki_pos = pos + num_written;
- if (clean_page)
- pagecache_isize_extended(inode, oldsize,
- i_size_read(inode));
- }
-
- inode_unlock(inode);
/*
* We also have to set last_sub_trans to the current log transid,
* otherwise subsequent syncs to a file that's been synced in this
* transaction will appear to have already occurred.
*/
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ inode->last_sub_trans = inode->root->log_transid;
+ spin_unlock(&inode->lock);
if (num_written > 0)
num_written = generic_write_sync(iocb, num_written);
if (sync)
- atomic_dec(&BTRFS_I(inode)->sync_writers);
-out:
+ atomic_dec(&inode->sync_writers);
+
current->backing_dev_info = NULL;
- return num_written ? num_written : err;
+ return num_written;
}
int btrfs_release_file(struct inode *inode, struct file *filp)
@@ -2173,13 +2124,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
inode_lock(inode);
- /*
- * We take the dio_sem here because the tree log stuff can race with
- * lockless dio writes and get an extent map logged for an extent we
- * never waited on. We need it this high up for lockdep reasons.
- */
- down_write(&BTRFS_I(inode)->dio_sem);
-
atomic_inc(&root->log_batch);
/*
@@ -2210,7 +2154,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2225,8 +2168,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* commit waits for their completion, to avoid data loss if we fsync,
* the current transaction commits before the ordered extents complete
* and a power failure happens right after that.
+ *
+	 * For a zoned filesystem, if a write IO uses a ZONE_APPEND command, the
+ * logical address recorded in the ordered extent may change. We need
+ * to wait for the IO to stabilize the logical address.
*/
- if (full_sync) {
+ if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
} else {
/*
@@ -2289,6 +2236,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = PTR_ERR(trans);
goto out_release_extents;
}
+ trans->in_fsync = true;
ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
btrfs_release_log_ctx_extents(&ctx);
@@ -2307,7 +2255,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
if (ret != BTRFS_NO_LOG_SYNC) {
@@ -2338,7 +2285,6 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2500,13 +2446,13 @@ out:
* em->start + em->len > start)
* When a hole extent is found, return 1 and modify start/len.
*/
-static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
+static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map *em;
int ret = 0;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+ em = btrfs_get_extent(inode, NULL, 0,
round_down(*start, fs_info->sectorsize),
round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
@@ -2566,13 +2512,14 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
}
static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
- struct inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_replace_extent_info *extent_info,
- const u64 replace_len)
+ const u64 replace_len,
+ const u64 bytes_to_drop)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -2584,10 +2531,12 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
return 0;
if (extent_info->disk_offset == 0 &&
- btrfs_fs_incompat(fs_info, NO_HOLES))
+ btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
return 0;
+ }
- key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = extent_info->file_offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -2608,23 +2557,25 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
- extent_info->file_offset, replace_len);
+ ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
+ replace_len);
if (ret)
return ret;
/* If it's a hole, nothing more needs to be done. */
- if (extent_info->disk_offset == 0)
+ if (extent_info->disk_offset == 0) {
+ btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
return 0;
+ }
- inode_add_bytes(inode, replace_len);
+ btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
if (extent_info->is_new_extent && extent_info->insertions == 0) {
key.objectid = extent_info->disk_offset;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = extent_info->disk_len;
ret = btrfs_alloc_reserved_file_extent(trans, root,
- btrfs_ino(BTRFS_I(inode)),
+ btrfs_ino(inode),
extent_info->file_offset,
extent_info->qgroup_reserved,
&key);
@@ -2636,7 +2587,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
extent_info->disk_len, 0);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), ref_offset);
+ btrfs_ino(inode), ref_offset);
ret = btrfs_inc_extent_ref(trans, &ref);
}
@@ -2659,6 +2610,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out)
{
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
@@ -2667,7 +2619,6 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
struct btrfs_block_rsv *rsv;
unsigned int rsv_count;
u64 cur_offset;
- u64 drop_end;
u64 len = end - start;
int ret = 0;
@@ -2706,10 +2657,16 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
trans->block_rsv = rsv;
cur_offset = start;
+ drop_args.path = path;
+ drop_args.end = end + 1;
+ drop_args.drop_cache = true;
while (cur_offset < end) {
- ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path,
- cur_offset, end + 1, &drop_end,
- 1, 0, 0, NULL);
+ drop_args.start = cur_offset;
+ ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
+		/* If we are punching a hole, decrement the inode's byte count */
+ if (!extent_info)
+ btrfs_update_inode_bytes(BTRFS_I(inode), 0,
+ drop_args.bytes_found);
if (ret != -ENOSPC) {
/*
* When cloning we want to avoid transaction aborts when
@@ -2726,10 +2683,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
trans->block_rsv = &fs_info->trans_block_rsv;
- if (!extent_info && cur_offset < drop_end &&
+ if (!extent_info && cur_offset < drop_args.drop_end &&
cur_offset < ino_size) {
ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_end);
+ cur_offset, drop_args.drop_end);
if (ret) {
/*
* If we failed then we didn't insert our hole
@@ -2740,7 +2697,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
- } else if (!extent_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_args.drop_end) {
/*
* We are past the i_size here, but since we didn't
* insert holes we need to clear the mapped area so we
@@ -2748,7 +2705,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
* file extent is inserted here.
*/
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
- cur_offset, drop_end - cur_offset);
+ cur_offset,
+ drop_args.drop_end - cur_offset);
if (ret) {
/*
* We couldn't clear our area, so we could
@@ -2760,11 +2718,14 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
}
}
- if (extent_info && drop_end > extent_info->file_offset) {
- u64 replace_len = drop_end - extent_info->file_offset;
+ if (extent_info &&
+ drop_args.drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_args.drop_end -
+ extent_info->file_offset;
- ret = btrfs_insert_replace_extent(trans, inode, path,
- extent_info, replace_len);
+ ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode),
+ path, extent_info, replace_len,
+ drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
@@ -2774,9 +2735,9 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
extent_info->file_offset += replace_len;
}
- cur_offset = drop_end;
+ cur_offset = drop_args.drop_end;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
break;
@@ -2796,7 +2757,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
trans->block_rsv = rsv;
if (!extent_info) {
- ret = find_first_non_hole(inode, &cur_offset, &len);
+ ret = find_first_non_hole(BTRFS_I(inode), &cur_offset,
+ &len);
if (unlikely(ret < 0))
break;
if (ret && !len) {
@@ -2833,25 +2795,26 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
* will not record the existence of the hole region
* [existing_hole_start, lockend].
*/
- if (drop_end <= end)
- drop_end = end + 1;
+ if (drop_args.drop_end <= end)
+ drop_args.drop_end = end + 1;
/*
* Don't insert file hole extent item if it's for a range beyond eof
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
*/
- if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
+ if (!extent_info && cur_offset < ino_size &&
+ cur_offset < drop_args.drop_end) {
ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_end);
+ cur_offset, drop_args.drop_end);
if (ret) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
- } else if (!extent_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_args.drop_end) {
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
- cur_offset, drop_end - cur_offset);
+ cur_offset, drop_args.drop_end - cur_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2859,8 +2822,9 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
}
if (extent_info) {
- ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
- extent_info->data_len);
+ ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path,
+ extent_info, extent_info->data_len,
+ drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2906,7 +2870,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
ino_size = round_up(inode->i_size, fs_info->sectorsize);
- ret = find_first_non_hole(inode, &offset, &len);
+ ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
goto out_only_mutex;
if (ret && !len) {
@@ -2931,7 +2895,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (same_block && len < fs_info->sectorsize) {
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(inode, offset, len, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
+ 0);
} else {
ret = 0;
}
@@ -2941,7 +2906,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
/* zero back part of the first block */
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(inode, offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret) {
inode_unlock(inode);
return ret;
@@ -2956,7 +2921,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
/* after truncate page, check hole again */
len = offset + len - lockstart;
offset = lockstart;
- ret = find_first_non_hole(inode, &offset, &len);
+ ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
goto out_only_mutex;
if (ret && !len) {
@@ -2970,14 +2935,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
tail_start = lockend + 1;
tail_len = offset + len - tail_start;
if (tail_len) {
- ret = find_first_non_hole(inode, &tail_start, &tail_len);
+ ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
if (unlikely(ret < 0))
goto out_only_mutex;
if (!ret) {
/* zero the front end of the last page */
if (tail_start + tail_len < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(inode,
+ ret = btrfs_truncate_block(BTRFS_I(inode),
tail_start + tail_len,
0, 1);
if (ret)
@@ -3011,7 +2976,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
ASSERT(trans != NULL);
inode_inc_iversion(inode);
inode->i_mtime = inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -3038,7 +3003,7 @@ out_only_mutex:
} else {
int ret2;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
if (!ret)
ret = ret2;
@@ -3106,8 +3071,8 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
inode->i_ctime = current_time(inode);
i_size_write(inode, end);
- btrfs_inode_safe_disk_i_size_write(inode, 0);
- ret = btrfs_update_inode(trans, root, inode);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
return ret ? ret : ret2;
@@ -3219,7 +3184,8 @@ static int btrfs_zero_range(struct inode *inode,
}
if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
free_extent_map(em);
- ret = btrfs_truncate_block(inode, offset, len, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
+ 0);
if (!ret)
ret = btrfs_fallocate_update_isize(inode,
offset + len,
@@ -3250,7 +3216,7 @@ static int btrfs_zero_range(struct inode *inode,
alloc_start = round_down(offset, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(inode, offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret)
goto out;
} else {
@@ -3267,7 +3233,8 @@ static int btrfs_zero_range(struct inode *inode,
alloc_end = round_up(offset + len, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
+ 0, 1);
if (ret)
goto out;
} else {
@@ -3337,6 +3304,10 @@ static long btrfs_fallocate(struct file *file, int mode,
int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
+ /* Do not allow fallocate in ZONED mode */
+ if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
+ return -EOPNOTSUPP;
+
alloc_start = round_down(offset, blocksize);
alloc_end = round_up(offset + len, blocksize);
cur_offset = alloc_start;
@@ -3361,7 +3332,7 @@ static long btrfs_fallocate(struct file *file, int mode,
return ret;
}
- inode_lock(inode);
+ btrfs_inode_lock(inode, 0);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
@@ -3377,7 +3348,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* But that's a minor problem and won't do much harm BTW.
*/
if (alloc_start > inode->i_size) {
- ret = btrfs_cont_expand(inode, i_size_read(inode),
+ ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
alloc_start);
if (ret)
goto out;
@@ -3387,7 +3358,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* need to zero out the end of the block if i_size lands in the
* middle of a block.
*/
- ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
if (ret)
goto out;
}
@@ -3600,9 +3571,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
- inode_lock_shared(inode);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
offset = find_desired_extent(inode, offset, whence);
- inode_unlock_shared(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
break;
}
@@ -3618,17 +3589,48 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
+static int check_direct_read(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int ret;
+ int i, seg;
+
+ ret = check_direct_IO(fs_info, iter, offset);
+ if (ret < 0)
+ return ret;
+
+ if (!iter_is_iovec(iter))
+ return 0;
+
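+	/* Reject iovecs where two segments point at the same base address */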
+ for (seg = 0; seg < iter->nr_segs; seg++)
+ for (i = seg + 1; i < iter->nr_segs; i++)
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+ return -EINVAL;
+ return 0;
+}
+
+static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
+ return 0;
+
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ return ret;
+}
+
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret = 0;
if (iocb->ki_flags & IOCB_DIRECT) {
- struct inode *inode = file_inode(iocb->ki_filp);
-
- inode_lock_shared(inode);
- ret = btrfs_direct_IO(iocb, to);
- inode_unlock_shared(inode);
- if (ret < 0)
+ ret = btrfs_direct_read(iocb, to);
+ if (ret < 0 || !iov_iter_count(to) ||
+ iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
return ret;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index af0013d3df63..5400294bd271 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,7 +16,6 @@
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
-#include "inode-map.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
@@ -33,16 +32,18 @@ struct btrfs_trim_range {
struct list_head list;
};
-static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *bitmap_info);
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
-static int btrfs_wait_cache_io_root(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_io_ctl *io_ctl,
- struct btrfs_path *path);
+static int search_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info, u64 *offset,
+ u64 *bytes, bool for_alloc);
+static void free_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info);
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes);
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
@@ -141,17 +142,15 @@ static int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header *header;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
+ /* We inline CRCs for the free disk space cache */
+ const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
+ BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
int ret;
ret = btrfs_insert_empty_inode(trans, root, path, ino);
if (ret)
return ret;
- /* We inline crc's for the free disk space cache */
- if (ino != BTRFS_FREE_INO_OBJECTID)
- flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
-
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
@@ -199,7 +198,7 @@ int create_free_space_inode(struct btrfs_trans_handle *trans,
int ret;
u64 ino;
- ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
+ ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
if (ret < 0)
return ret;
@@ -207,6 +206,65 @@ int create_free_space_inode(struct btrfs_trans_handle *trans,
ino, block_group->start);
}
+/*
+ * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
+ * handles lookup, otherwise it takes ownership and iputs the inode.
+ * Don't reuse an inode pointer after passing it into this function.
+ */
+int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (!inode)
+ inode = lookup_free_space_inode(block_group, path);
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) != -ENOENT)
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
+ clear_nlink(inode);
+ /* One for the block groups ref */
+ spin_lock(&block_group->lock);
+ if (block_group->iref) {
+ block_group->iref = 0;
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ /* One for the lookup ref */
+ btrfs_add_delayed_iput(inode);
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.type = 0;
+ key.offset = block_group->start;
+ ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
+ -1, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ goto out;
+ }
+ ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
@@ -267,12 +325,12 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
* We skip the throttling logic for free space cache inodes, so we don't
* need to check for -EAGAIN.
*/
- ret = btrfs_truncate_inode_items(trans, root, inode,
+ ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
0, BTRFS_EXTENT_DATA_KEY);
if (ret)
goto fail;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
fail:
if (locked)
@@ -304,16 +362,11 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
int write)
{
int num_pages;
- int check_crcs = 0;
num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
- check_crcs = 1;
-
/* Make sure we can fit our crcs and generation into the first page */
- if (write && check_crcs &&
- (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
+ if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
return -ENOSPC;
memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
@@ -324,7 +377,6 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
io_ctl->num_pages = num_pages;
io_ctl->fs_info = btrfs_sb(inode->i_sb);
- io_ctl->check_crcs = check_crcs;
io_ctl->inode = inode;
return 0;
@@ -379,11 +431,22 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
int i;
for (i = 0; i < io_ctl->num_pages; i++) {
+ int ret;
+
page = find_or_create_page(inode->i_mapping, i, mask);
if (!page) {
io_ctl_drop_pages(io_ctl);
return -ENOMEM;
}
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ io_ctl_drop_pages(io_ctl);
+ return ret;
+ }
+
io_ctl->pages[i] = page;
if (uptodate && !PageUptodate(page)) {
btrfs_readpage(NULL, page);
@@ -403,10 +466,8 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
}
}
- for (i = 0; i < io_ctl->num_pages; i++) {
+ for (i = 0; i < io_ctl->num_pages; i++)
clear_page_dirty_for_io(io_ctl->pages[i]);
- set_page_extent_mapped(io_ctl->pages[i]);
- }
return 0;
}
@@ -419,13 +480,8 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
* Skip the csum areas. If we don't check crcs then we just have a
* 64bit chunk at the front of the first page.
*/
- if (io_ctl->check_crcs) {
- io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
- io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
- } else {
- io_ctl->cur += sizeof(u64);
- io_ctl->size -= sizeof(u64) * 2;
- }
+ io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
put_unaligned_le64(generation, io_ctl->cur);
io_ctl->cur += sizeof(u64);
@@ -439,14 +495,8 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
* Skip the crc area. If we don't check crcs then we just have a 64bit
* chunk at the front of the first page.
*/
- if (io_ctl->check_crcs) {
- io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
- io_ctl->size -= sizeof(u64) +
- (sizeof(u32) * io_ctl->num_pages);
- } else {
- io_ctl->cur += sizeof(u64);
- io_ctl->size -= sizeof(u64) * 2;
- }
+ io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
cache_gen = get_unaligned_le64(io_ctl->cur);
if (cache_gen != generation) {
@@ -466,11 +516,6 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
u32 crc = ~(u32)0;
unsigned offset = 0;
- if (!io_ctl->check_crcs) {
- io_ctl_unmap_page(io_ctl);
- return;
- }
-
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
@@ -488,11 +533,6 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
u32 crc = ~(u32)0;
unsigned offset = 0;
- if (!io_ctl->check_crcs) {
- io_ctl_map_page(io_ctl, 0);
- return 0;
- }
-
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
@@ -625,42 +665,42 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
return 0;
}
-/*
- * Since we attach pinned extents after the fact we can have contiguous sections
- * of free space that are split up in entries. This poses a problem with the
- * tree logging stuff since it could have allocated across what appears to be 2
- * entries since we would have merged the entries when adding the pinned extents
- * back to the free space cache. So run through the space cache that we just
- * loaded and merge contiguous entries. This will make the log replay stuff not
- * blow up and it will make for nicer allocator behavior.
- */
-static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
- struct btrfs_free_space *e, *prev = NULL;
- struct rb_node *n;
+ struct btrfs_block_group *block_group = ctl->private;
+ u64 max_bytes;
+ u64 bitmap_bytes;
+ u64 extent_bytes;
+ u64 size = block_group->length;
+ u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
-again:
- spin_lock(&ctl->tree_lock);
- for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
- e = rb_entry(n, struct btrfs_free_space, offset_index);
- if (!prev)
- goto next;
- if (e->bitmap || prev->bitmap)
- goto next;
- if (prev->offset + prev->bytes == e->offset) {
- unlink_free_space(ctl, prev);
- unlink_free_space(ctl, e);
- prev->bytes += e->bytes;
- kmem_cache_free(btrfs_free_space_cachep, e);
- link_free_space(ctl, prev);
- prev = NULL;
- spin_unlock(&ctl->tree_lock);
- goto again;
- }
-next:
- prev = e;
- }
- spin_unlock(&ctl->tree_lock);
+ max_bitmaps = max_t(u64, max_bitmaps, 1);
+
+ ASSERT(ctl->total_bitmaps <= max_bitmaps);
+
+ /*
+ * We are trying to keep the total amount of memory used per 1GiB of
+ * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
+ * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
+ * bitmaps, we may end up using more memory than this.
+ */
+ if (size < SZ_1G)
+ max_bytes = MAX_CACHE_BYTES_PER_GIG;
+ else
+ max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
+
+ bitmap_bytes = ctl->total_bitmaps * ctl->unit;
+
+ /*
+	 * Limit the space used by extent entries to whichever is smaller:
+	 * what is left over after the bitmaps, or half of max_bytes.
+ */
+ extent_bytes = max_bytes - bitmap_bytes;
+ extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
+
+ ctl->extents_thresh =
+ div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}
static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
@@ -744,8 +784,10 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
while (num_entries) {
e = kmem_cache_zalloc(btrfs_free_space_cachep,
GFP_NOFS);
- if (!e)
+ if (!e) {
+ ret = -ENOMEM;
goto free_cache;
+ }
ret = io_ctl_read_entry(&io_ctl, e, &type);
if (ret) {
@@ -753,17 +795,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
goto free_cache;
}
- /*
- * Sync discard ensures that the free space cache is always
- * trimmed. So when reading this in, the state should reflect
- * that. We also do this for async as a stop gap for lack of
- * persistence.
- */
- if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
- btrfs_test_opt(fs_info, DISCARD_ASYNC))
- e->trim_state = BTRFS_TRIM_STATE_TRIMMED;
-
if (!e->bytes) {
+ ret = -1;
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
@@ -784,6 +817,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
e->bitmap = kmem_cache_zalloc(
btrfs_free_space_bitmap_cachep, GFP_NOFS);
if (!e->bitmap) {
+ ret = -ENOMEM;
kmem_cache_free(
btrfs_free_space_cachep, e);
goto free_cache;
@@ -791,7 +825,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
spin_lock(&ctl->tree_lock);
ret = link_free_space(ctl, e);
ctl->total_bitmaps++;
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
spin_unlock(&ctl->tree_lock);
if (ret) {
btrfs_err(fs_info,
@@ -816,19 +850,11 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ret = io_ctl_read_bitmap(&io_ctl, e);
if (ret)
goto free_cache;
- e->bitmap_extents = count_bitmap_extents(ctl, e);
- if (!btrfs_free_space_trimmed(e)) {
- ctl->discardable_extents[BTRFS_STAT_CURR] +=
- e->bitmap_extents;
- ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes;
- }
}
io_ctl_drop_pages(&io_ctl);
- merge_space_tree(ctl);
ret = 1;
out:
- btrfs_discard_update_discardable(ctl->private, ctl);
io_ctl_free(&io_ctl);
return ret;
free_cache:
@@ -837,10 +863,46 @@ free_cache:
goto out;
}
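+/*
+ * Move every entry (extents and bitmaps) from the temporary ctl that the
+ * on-disk cache was loaded into over to the block group's live free space ctl.
+ */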
+static int copy_free_space_cache(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int ret = 0;
+
+ while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info);
+ ret = btrfs_add_free_space(block_group, info->offset,
+ info->bytes);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ u64 offset = info->offset;
+ u64 bytes = ctl->unit;
+
+ while (search_bitmap(ctl, info, &offset, &bytes,
+ false) == 0) {
+ ret = btrfs_add_free_space(block_group, offset,
+ bytes);
+ if (ret)
+ break;
+ bitmap_clear_bits(ctl, info, offset, bytes);
+ offset = info->offset;
+ bytes = ctl->unit;
+ }
+ free_bitmap(ctl, info);
+ }
+ cond_resched();
+ }
+ return ret;
+}
+
int load_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space_ctl tmp_ctl = {};
struct inode *inode;
struct btrfs_path *path;
int ret = 0;
@@ -848,6 +910,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
u64 used = block_group->used;
/*
+ * Because we could potentially discard our loaded free space, we want
+	 * to load everything into a temporary structure first, and then, if it's
+	 * valid, copy it all into the actual free space ctl.
+ */
+ btrfs_init_free_space_ctl(block_group, &tmp_ctl);
+
+ /*
* If this block group has been marked to be cleared for one reason or
* another then we can't trust the on disk cache, so just return.
*/
@@ -898,19 +967,25 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
}
spin_unlock(&block_group->lock);
- ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
+ ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
path, block_group->start);
btrfs_free_path(path);
if (ret <= 0)
goto out;
- spin_lock(&ctl->tree_lock);
- matched = (ctl->free_space == (block_group->length - used -
- block_group->bytes_super));
- spin_unlock(&ctl->tree_lock);
+ matched = (tmp_ctl.free_space == (block_group->length - used -
+ block_group->bytes_super));
- if (!matched) {
- __btrfs_remove_free_space_cache(ctl);
+ if (matched) {
+ ret = copy_free_space_cache(block_group, &tmp_ctl);
+ /*
+		 * copy_free_space_cache() returns 0 on success while our callers
+		 * expect 1 to mean the cache was loaded successfully, so re-set
+		 * ret here.
+ */
+ if (ret == 0)
+ ret = 1;
+ } else {
+ __btrfs_remove_free_space_cache(&tmp_ctl);
btrfs_warn(fs_info,
"block group %llu has wrong amount of free space",
block_group->start);
@@ -929,6 +1004,9 @@ out:
block_group->start);
}
+ spin_lock(&ctl->tree_lock);
+ btrfs_discard_update_discardable(block_group);
+ spin_unlock(&ctl->tree_lock);
iput(inode);
return ret;
}
@@ -1191,7 +1269,7 @@ out:
"failed to write free space cache for block group %llu error %d",
block_group->start, ret);
}
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
if (block_group) {
/* the dirty list is protected by the dirty_bgs_lock */
@@ -1220,14 +1298,6 @@ out:
}
-static int btrfs_wait_cache_io_root(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_io_ctl *io_ctl,
- struct btrfs_path *path)
-{
- return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
-}
-
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
@@ -1238,11 +1308,14 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
}
/**
- * __btrfs_write_out_cache - write out cached info to an inode
- * @root - the root the inode belongs to
- * @ctl - the free space cache we are going to write out
- * @block_group - the block_group for this cache if it belongs to a block_group
- * @trans - the trans handle
+ * Write out cached info to an inode
+ *
+ * @root: root the inode belongs to
+ * @inode: freespace inode we are writing out
+ * @ctl: free space cache we are going to write out
+ * @block_group: block_group for this cache if it belongs to a block_group
+ * @io_ctl: holds context for the io
+ * @trans: the trans handle
*
* This function writes out a free space cache struct to disk for quick recovery
* on mount. This will return 0 if it was successful in writing the cache out,
@@ -1332,7 +1405,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
/* Everything is written out, now we dirty the pages in the file. */
ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
io_ctl->num_pages, 0, i_size_read(inode),
- &cached_state);
+ &cached_state, false);
if (ret)
goto out_nospc;
@@ -1381,7 +1454,7 @@ out:
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
if (must_iput)
iput(inode);
return ret;
@@ -1672,44 +1745,6 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
return ret;
}
-static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
-{
- struct btrfs_block_group *block_group = ctl->private;
- u64 max_bytes;
- u64 bitmap_bytes;
- u64 extent_bytes;
- u64 size = block_group->length;
- u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
- u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
-
- max_bitmaps = max_t(u64, max_bitmaps, 1);
-
- ASSERT(ctl->total_bitmaps <= max_bitmaps);
-
- /*
- * We are trying to keep the total amount of memory used per 1GiB of
- * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
- * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
- * bitmaps, we may end up using more memory than this.
- */
- if (size < SZ_1G)
- max_bytes = MAX_CACHE_BYTES_PER_GIG;
- else
- max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
-
- bitmap_bytes = ctl->total_bitmaps * ctl->unit;
-
- /*
- * we want the extent entry threshold to always be at most 1/2 the max
- * bytes we can have, or whatever is less than that.
- */
- extent_bytes = max_bytes - bitmap_bytes;
- extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
-
- ctl->extents_thresh =
- div_u64(extent_bytes, sizeof(struct btrfs_free_space));
-}
-
static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
u64 offset, u64 bytes)
@@ -1912,29 +1947,6 @@ out:
return NULL;
}
-static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *bitmap_info)
-{
- struct btrfs_block_group *block_group = ctl->private;
- u64 bytes = bitmap_info->bytes;
- unsigned int rs, re;
- int count = 0;
-
- if (!block_group || !bytes)
- return count;
-
- bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0,
- BITS_PER_BITMAP) {
- bytes -= (rs - re) * ctl->unit;
- count++;
-
- if (!bytes)
- break;
- }
-
- return count;
-}
-
static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset)
{
@@ -1944,8 +1956,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
INIT_LIST_HEAD(&info->list);
link_free_space(ctl, info);
ctl->total_bitmaps++;
-
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
}
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
@@ -1967,7 +1978,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
ctl->total_bitmaps--;
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
}
static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
@@ -2134,7 +2145,6 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
}
static const struct btrfs_free_space_op free_space_op = {
- .recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -2467,6 +2477,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
int ret = 0;
u64 filter_bytes = bytes;
+ ASSERT(!btrfs_is_zoned(fs_info));
+
info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
if (!info)
return -ENOMEM;
@@ -2508,7 +2520,7 @@ link:
if (ret)
kmem_cache_free(btrfs_free_space_cachep, info);
out:
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
if (ret) {
@@ -2524,11 +2536,49 @@ out:
return ret;
}
+static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size, bool used)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ u64 offset = bytenr - block_group->start;
+ u64 to_free, to_unusable;
+
+ spin_lock(&ctl->tree_lock);
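+	/*
+	 * For used space, only the part of the range at or beyond the zone
+	 * allocation offset is free; anything below it has already been
+	 * written and is accounted as zone_unusable.
+	 */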
+ if (!used)
+ to_free = size;
+ else if (offset >= block_group->alloc_offset)
+ to_free = size;
+ else if (offset + size <= block_group->alloc_offset)
+ to_free = 0;
+ else
+ to_free = offset + size - block_group->alloc_offset;
+ to_unusable = size - to_free;
+
+ ctl->free_space += to_free;
+ block_group->zone_unusable += to_unusable;
+ spin_unlock(&ctl->tree_lock);
+ if (!used) {
+ spin_lock(&block_group->lock);
+ block_group->alloc_offset -= size;
+ spin_unlock(&block_group->lock);
+ }
+
+	/*
+	 * If the whole region is now unusable, mark the block group as unused
+	 * so it can be reclaimed.
+	 */
+ if (block_group->zone_unusable == block_group->length)
+ btrfs_mark_bg_unused(block_group);
+
+ return 0;
+}
+
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size)
{
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ true);
+
if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
@@ -2537,6 +2587,16 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group,
bytenr, size, trim_state);
}
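+/*
+ * Return free space that was never used. On a zoned filesystem this rewinds
+ * the zone allocation offset rather than marking the range zone_unusable.
+ */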
+int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size)
+{
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ false);
+
+ return btrfs_add_free_space(block_group, bytenr, size);
+}
+
/*
* This is a subtle distinction because when adding free space back in general,
* we want it to be added as untrimmed for async. But in the case where we add
@@ -2547,6 +2607,10 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
{
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ true);
+
if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
@@ -2564,6 +2628,23 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group,
int ret;
bool re_search = false;
+ if (btrfs_is_zoned(block_group->fs_info)) {
+ /*
+		 * This can happen with conventional zones when replaying the log.
+		 * Since the allocation info of tree-log nodes is not recorded in
+		 * the extent tree, calculate_alloc_pointer() fails to advance the
+		 * allocation pointer past the last allocated tree-log node blocks.
+ *
+ * This function is called from
+ * btrfs_pin_extent_for_log_replay() when replaying the log.
+		 * Advance the pointer here so that the tree-log nodes are not overwritten.
+ */
+ if (block_group->alloc_offset < offset + bytes)
+ block_group->alloc_offset = offset + bytes;
+ return 0;
+ }
+
spin_lock(&ctl->tree_lock);
again:
@@ -2643,7 +2724,7 @@ again:
goto again;
}
out_lock:
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
out:
return ret;
@@ -2658,6 +2739,16 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
struct rb_node *n;
int count = 0;
+ /*
+	 * Zoned btrfs does not use the free space tree or clusters. Just print
+	 * the free space after the allocation offset.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ btrfs_info(fs_info, "free space %llu",
+ block_group->length - block_group->alloc_offset);
+ return;
+ }
+
spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
@@ -2674,10 +2765,10 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
"%d blocks of free space at or bigger than bytes is", count);
}
-void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
spin_lock_init(&ctl->tree_lock);
ctl->unit = fs_info->sectorsize;
@@ -2779,7 +2870,7 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache_locked(ctl);
if (ctl->private)
- btrfs_discard_update_discardable(ctl->private, ctl);
+ btrfs_discard_update_discardable(ctl->private);
spin_unlock(&ctl->tree_lock);
}
@@ -2801,7 +2892,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
cond_resched_lock(&ctl->tree_lock);
}
__btrfs_remove_free_space_cache_locked(ctl);
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
}
@@ -2851,6 +2942,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 align_gap_len = 0;
enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
spin_lock(&ctl->tree_lock);
entry = find_free_space(ctl, &offset, &bytes_search,
block_group->full_stripe_len, max_extent_size);
@@ -2885,7 +2978,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
link_free_space(ctl, entry);
}
out:
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
if (align_gap_len)
@@ -2982,6 +3075,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct rb_node *node;
u64 ret = 0;
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
spin_lock(&cluster->lock);
if (bytes > cluster->max_size)
goto out;
@@ -3054,7 +3149,7 @@ out:
kmem_cache_free(btrfs_free_space_bitmap_cachep,
entry->bitmap);
ctl->total_bitmaps--;
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
} else if (!btrfs_free_space_trimmed(entry)) {
ctl->discardable_extents[BTRFS_STAT_CURR]--;
}
@@ -3758,6 +3853,8 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
int ret;
u64 rem = 0;
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
*trimmed = 0;
spin_lock(&block_group->lock);
@@ -3828,166 +3925,62 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
return ret;
}
-/*
- * Find the left-most item in the cache tree, and then return the
- * smallest inode number in the item.
- *
- * Note: the returned inode number may not be the smallest one in
- * the tree, if the left-most item is a bitmap.
- */
-u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
+bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
- struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
- struct btrfs_free_space *entry = NULL;
- u64 ino = 0;
-
- spin_lock(&ctl->tree_lock);
-
- if (RB_EMPTY_ROOT(&ctl->free_space_offset))
- goto out;
-
- entry = rb_entry(rb_first(&ctl->free_space_offset),
- struct btrfs_free_space, offset_index);
-
- if (!entry->bitmap) {
- ino = entry->offset;
-
- unlink_free_space(ctl, entry);
- entry->offset++;
- entry->bytes--;
- if (!entry->bytes)
- kmem_cache_free(btrfs_free_space_cachep, entry);
- else
- link_free_space(ctl, entry);
- } else {
- u64 offset = 0;
- u64 count = 1;
- int ret;
-
- ret = search_bitmap(ctl, entry, &offset, &count, true);
- /* Logic error; Should be empty if it can't find anything */
- ASSERT(!ret);
-
- ino = offset;
- bitmap_clear_bits(ctl, entry, offset, 1);
- if (entry->bytes == 0)
- free_bitmap(ctl, entry);
- }
-out:
- spin_unlock(&ctl->tree_lock);
-
- return ino;
+ return btrfs_super_cache_generation(fs_info->super_copy);
}
-struct inode *lookup_free_ino_inode(struct btrfs_root *root,
- struct btrfs_path *path)
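+/*
+ * Remove the v1 free space cache inode and its item for every block group.
+ */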
+static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
{
- struct inode *inode = NULL;
-
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_inode)
- inode = igrab(root->ino_cache_inode);
- spin_unlock(&root->ino_cache_lock);
- if (inode)
- return inode;
-
- inode = __lookup_free_space_inode(root, path, 0);
- if (IS_ERR(inode))
- return inode;
-
- spin_lock(&root->ino_cache_lock);
- if (!btrfs_fs_closing(root->fs_info))
- root->ino_cache_inode = igrab(inode);
- spin_unlock(&root->ino_cache_lock);
-
- return inode;
-}
-
-int create_free_ino_inode(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path)
-{
- return __create_free_space_inode(root, trans, path,
- BTRFS_FREE_INO_OBJECTID, 0);
-}
-
-int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
-{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_path *path;
- struct inode *inode;
- int ret = 0;
- u64 root_gen = btrfs_root_generation(&root->root_item);
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
-
- /*
- * If we're unmounting then just return, since this does a search on the
- * normal root and not the commit root and we could deadlock.
- */
- if (btrfs_fs_closing(fs_info))
- return 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return 0;
-
- inode = lookup_free_ino_inode(root, path);
- if (IS_ERR(inode))
- goto out;
-
- if (root_gen != BTRFS_I(inode)->generation)
- goto out_put;
+ struct btrfs_block_group *block_group;
+ struct rb_node *node;
+ int ret;
- ret = __load_free_space_cache(root, inode, ctl, path, 0);
+ btrfs_info(fs_info, "cleaning free space cache v1");
- if (ret < 0)
- btrfs_err(fs_info,
- "failed to load free ino cache for root %llu",
- root->root_key.objectid);
-out_put:
- iput(inode);
+ node = rb_first(&fs_info->block_group_cache_tree);
+ while (node) {
+ block_group = rb_entry(node, struct btrfs_block_group, cache_node);
+ ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
+ if (ret)
+ goto out;
+ node = rb_next(node);
+ }
out:
- btrfs_free_path(path);
return ret;
}
-int btrfs_write_out_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct inode *inode)
+int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_trans_handle *trans;
int ret;
- struct btrfs_io_ctl io_ctl;
- bool release_metadata = true;
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
+ /*
+ * update_super_roots will appropriately set or unset
+ * super_copy->cache_generation based on SPACE_CACHE and
+ * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
+ * transaction commit whether we are enabling space cache v1 and don't
+ * have any other work to do, or are disabling it and removing free
+ * space inodes.
+ */
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- memset(&io_ctl, 0, sizeof(io_ctl));
- ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
- if (!ret) {
- /*
- * At this point writepages() didn't error out, so our metadata
- * reservation is released when the writeback finishes, at
- * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
- * with or without an error.
- */
- release_metadata = false;
- ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
+ if (!active) {
+ set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
+ ret = cleanup_free_space_cache_v1(fs_info, trans);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
}
- if (ret) {
- if (release_metadata)
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- inode->i_size, true);
- btrfs_debug(fs_info,
- "failed to write free ino cache for root %llu error %d",
- root->root_key.objectid, ret);
- }
+ ret = btrfs_commit_transaction(trans);
+out:
+ clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
return ret;
}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e3d5e0ad8f8e..1f23088d43f9 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -60,7 +60,6 @@ struct btrfs_free_space_ctl {
};
struct btrfs_free_space_op {
- void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
};
@@ -76,7 +75,6 @@ struct btrfs_io_ctl {
int num_pages;
int entries;
int bitmaps;
- unsigned check_crcs:1;
};
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
@@ -84,6 +82,9 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
int create_free_space_inode(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
+int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_block_group *block_group);
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
@@ -97,25 +98,17 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
-struct inode *lookup_free_ino_inode(struct btrfs_root *root,
- struct btrfs_path *path);
-int create_free_ino_inode(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path);
-int load_free_ino_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_root *root);
-int btrfs_write_out_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct inode *inode);
-void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group);
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl);
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_free_space_ctl *ctl,
u64 bytenr, u64 size,
enum btrfs_trim_state trim_state);
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
+int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size);
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
@@ -126,7 +119,6 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size);
-u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
void btrfs_dump_free_space(struct btrfs_block_group *block_group,
u64 bytes);
int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
@@ -148,6 +140,8 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen,
u64 maxlen, bool async);
+bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info);
+int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);
/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int test_add_free_space_entry(struct btrfs_block_group *cache,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 6b9faf3b0e96..a33bca94d133 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -136,9 +136,10 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
return 0;
}
-static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
+static inline u32 free_space_bitmap_size(const struct btrfs_fs_info *fs_info,
+ u64 size)
{
- return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
+ return DIV_ROUND_UP(size >> fs_info->sectorsize_bits, BITS_PER_BYTE);
}
static unsigned long *alloc_bitmap(u32 bitmap_size)
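The hunk above replaces a 64-bit division by the sector size with a right shift by the cached fs_info->sectorsize_bits, which is equivalent because the sector size is always a power of two. A minimal userspace sketch of that equivalence follows; the 4K sector size and 128M block group length are assumptions chosen for illustration, not values taken from the patch.

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t length = 128ULL << 20;	/* assumed block group length: 128M */
	const uint32_t sectorsize = 4096;	/* assumed sector size */
	const uint32_t sectorsize_bits = 12;	/* ilog2(4096), as cached in fs_info */

	/* Division and shift agree for any power-of-two sector size. */
	assert((length / sectorsize) == (length >> sectorsize_bits));

	/* One bit per sector, rounded up to whole bytes, as in the helper. */
	assert((((length >> sectorsize_bits) + 7) / 8) == 4096);
	return 0;
}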
@@ -200,8 +201,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->length,
- fs_info->sectorsize);
+ bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
@@ -290,8 +290,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
u32 data_size;
extent_size = min(end - i, bitmap_range);
- data_size = free_space_bitmap_size(extent_size,
- fs_info->sectorsize);
+ data_size = free_space_bitmap_size(fs_info, extent_size);
key.objectid = i;
key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
@@ -339,8 +338,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->length,
- fs_info->sectorsize);
+ bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
@@ -383,8 +381,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
fs_info->sectorsize *
BITS_PER_BYTE);
bitmap_cursor = ((char *)bitmap) + bitmap_pos;
- data_size = free_space_bitmap_size(found_key.offset,
- fs_info->sectorsize);
+ data_size = free_space_bitmap_size(fs_info,
+ found_key.offset);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
read_extent_buffer(leaf, bitmap_cursor, ptr,
@@ -416,7 +414,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- nrbits = div_u64(block_group->length, block_group->fs_info->sectorsize);
+ nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
@@ -540,8 +538,8 @@ static void free_space_set_bits(struct btrfs_block_group *block_group,
end = found_end;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- first = div_u64(*start - found_start, fs_info->sectorsize);
- last = div_u64(end - found_start, fs_info->sectorsize);
+ first = (*start - found_start) >> fs_info->sectorsize_bits;
+ last = (end - found_start) >> fs_info->sectorsize_bits;
if (bit)
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
@@ -1152,6 +1150,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
return PTR_ERR(trans);
set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
free_space_root = btrfs_create_tree(trans,
BTRFS_FREE_SPACE_TREE_OBJECTID);
if (IS_ERR(free_space_root)) {
@@ -1173,11 +1172,18 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ ret = btrfs_commit_transaction(trans);
- return btrfs_commit_transaction(trans);
+ /*
+ * Now that we've committed the transaction, any read of our commit
+ * root will be safe, so we can cache from the free space tree.
+ */
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
+ return ret;
abort:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1195,8 +1201,6 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
key.objectid = 0;
key.type = 0;
key.offset = 0;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 668701832845..37f36ffdaf6b 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -119,8 +119,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
ret = -ENOENT;
@@ -193,8 +191,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = -ENOENT;
@@ -270,7 +266,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
@@ -327,7 +322,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
deleted file mode 100644
index 76d2e43817ea..000000000000
--- a/fs/btrfs/inode-map.c
+++ /dev/null
@@ -1,582 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- */
-
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-
-#include "ctree.h"
-#include "disk-io.h"
-#include "free-space-cache.h"
-#include "inode-map.h"
-#include "transaction.h"
-#include "delalloc-space.h"
-
-static void fail_caching_thread(struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
-
- btrfs_warn(fs_info, "failed to start inode caching task");
- btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
- "disabling inode map caching");
- spin_lock(&root->ino_cache_lock);
- root->ino_cache_state = BTRFS_CACHE_ERROR;
- spin_unlock(&root->ino_cache_lock);
- wake_up(&root->ino_cache_wait);
-}
-
-static int caching_kthread(void *data)
-{
- struct btrfs_root *root = data;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_key key;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- u64 last = (u64)-1;
- int slot;
- int ret;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
-
- path = btrfs_alloc_path();
- if (!path) {
- fail_caching_thread(root);
- return -ENOMEM;
- }
-
- /* Since the commit root is read-only, we can safely skip locking. */
- path->skip_locking = 1;
- path->search_commit_root = 1;
- path->reada = READA_FORWARD;
-
- key.objectid = BTRFS_FIRST_FREE_OBJECTID;
- key.offset = 0;
- key.type = BTRFS_INODE_ITEM_KEY;
-again:
- /* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->commit_root_sem);
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- if (btrfs_fs_closing(fs_info))
- goto out;
-
- leaf = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
-
- if (need_resched() ||
- btrfs_transaction_in_commit(fs_info)) {
- leaf = path->nodes[0];
-
- if (WARN_ON(btrfs_header_nritems(leaf) == 0))
- break;
-
- /*
- * Save the key so we can advances forward
- * in the next search.
- */
- btrfs_item_key_to_cpu(leaf, &key, 0);
- btrfs_release_path(path);
- root->ino_cache_progress = last;
- up_read(&fs_info->commit_root_sem);
- schedule_timeout(1);
- goto again;
- } else
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
-
- if (key.type != BTRFS_INODE_ITEM_KEY)
- goto next;
-
- if (key.objectid >= root->highest_objectid)
- break;
-
- if (last != (u64)-1 && last + 1 != key.objectid) {
- __btrfs_add_free_space(fs_info, ctl, last + 1,
- key.objectid - last - 1, 0);
- wake_up(&root->ino_cache_wait);
- }
-
- last = key.objectid;
-next:
- path->slots[0]++;
- }
-
- if (last < root->highest_objectid - 1) {
- __btrfs_add_free_space(fs_info, ctl, last + 1,
- root->highest_objectid - last - 1, 0);
- }
-
- spin_lock(&root->ino_cache_lock);
- root->ino_cache_state = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->ino_cache_lock);
-
- root->ino_cache_progress = (u64)-1;
- btrfs_unpin_free_ino(root);
-out:
- wake_up(&root->ino_cache_wait);
- up_read(&fs_info->commit_root_sem);
-
- btrfs_free_path(path);
-
- return ret;
-}
-
-static void start_caching(struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct task_struct *tsk;
- int ret;
- u64 objectid;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return;
-
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_state != BTRFS_CACHE_NO) {
- spin_unlock(&root->ino_cache_lock);
- return;
- }
-
- root->ino_cache_state = BTRFS_CACHE_STARTED;
- spin_unlock(&root->ino_cache_lock);
-
- ret = load_free_ino_cache(fs_info, root);
- if (ret == 1) {
- spin_lock(&root->ino_cache_lock);
- root->ino_cache_state = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->ino_cache_lock);
- wake_up(&root->ino_cache_wait);
- return;
- }
-
- /*
- * It can be quite time-consuming to fill the cache by searching
- * through the extent tree, and this can keep ino allocation path
- * waiting. Therefore at start we quickly find out the highest
- * inode number and we know we can use inode numbers which fall in
- * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
- */
- ret = btrfs_find_free_objectid(root, &objectid);
- if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
- __btrfs_add_free_space(fs_info, ctl, objectid,
- BTRFS_LAST_FREE_OBJECTID - objectid + 1,
- 0);
- wake_up(&root->ino_cache_wait);
- }
-
- tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
- root->root_key.objectid);
- if (IS_ERR(tsk))
- fail_caching_thread(root);
-}
-
-int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
-{
- if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
- return btrfs_find_free_objectid(root, objectid);
-
-again:
- *objectid = btrfs_find_ino_for_alloc(root);
-
- if (*objectid != 0)
- return 0;
-
- start_caching(root);
-
- wait_event(root->ino_cache_wait,
- root->ino_cache_state == BTRFS_CACHE_FINISHED ||
- root->ino_cache_state == BTRFS_CACHE_ERROR ||
- root->free_ino_ctl->free_space > 0);
-
- if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
- root->free_ino_ctl->free_space == 0)
- return -ENOSPC;
- else if (root->ino_cache_state == BTRFS_CACHE_ERROR)
- return btrfs_find_free_objectid(root, objectid);
- else
- goto again;
-}
-
-void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return;
-again:
- if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
- __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
- } else {
- down_write(&fs_info->commit_root_sem);
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
- spin_unlock(&root->ino_cache_lock);
- up_write(&fs_info->commit_root_sem);
- goto again;
- }
- spin_unlock(&root->ino_cache_lock);
-
- start_caching(root);
-
- __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
-
- up_write(&fs_info->commit_root_sem);
- }
-}
-
-/*
- * When a transaction is committed, we'll move those inode numbers which are
- * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
- * others will just be dropped, because the commit root we were searching has
- * changed.
- *
- * Must be called with root->fs_info->commit_root_sem held
- */
-void btrfs_unpin_free_ino(struct btrfs_root *root)
-{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
- spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
- struct btrfs_free_space *info;
- struct rb_node *n;
- u64 count;
-
- if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
- return;
-
- while (1) {
- spin_lock(rbroot_lock);
- n = rb_first(rbroot);
- if (!n) {
- spin_unlock(rbroot_lock);
- break;
- }
-
- info = rb_entry(n, struct btrfs_free_space, offset_index);
- BUG_ON(info->bitmap); /* Logic error */
-
- if (info->offset > root->ino_cache_progress)
- count = 0;
- else
- count = min(root->ino_cache_progress - info->offset + 1,
- info->bytes);
-
- rb_erase(&info->offset_index, rbroot);
- spin_unlock(rbroot_lock);
- if (count)
- __btrfs_add_free_space(root->fs_info, ctl,
- info->offset, count, 0);
- kmem_cache_free(btrfs_free_space_cachep, info);
- }
-}
-
-#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space))
-#define INODES_PER_BITMAP (PAGE_SIZE * 8)
-
-/*
- * The goal is to keep the memory used by the free_ino tree won't
- * exceed the memory if we use bitmaps only.
- */
-static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
-{
- struct btrfs_free_space *info;
- struct rb_node *n;
- int max_ino;
- int max_bitmaps;
-
- n = rb_last(&ctl->free_space_offset);
- if (!n) {
- ctl->extents_thresh = INIT_THRESHOLD;
- return;
- }
- info = rb_entry(n, struct btrfs_free_space, offset_index);
-
- /*
- * Find the maximum inode number in the filesystem. Note we
- * ignore the fact that this can be a bitmap, because we are
- * not doing precise calculation.
- */
- max_ino = info->bytes - 1;
-
- max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
- if (max_bitmaps <= ctl->total_bitmaps) {
- ctl->extents_thresh = 0;
- return;
- }
-
- ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
- PAGE_SIZE / sizeof(*info);
-}
-
-/*
- * We don't fall back to bitmap, if we are below the extents threshold
- * or this chunk of inode numbers is a big one.
- */
-static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
-{
- if (ctl->free_extents < ctl->extents_thresh ||
- info->bytes > INODES_PER_BITMAP / 10)
- return false;
-
- return true;
-}
-
-static const struct btrfs_free_space_op free_ino_op = {
- .recalc_thresholds = recalculate_thresholds,
- .use_bitmap = use_bitmap,
-};
-
-static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
-{
-}
-
-static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
-{
- /*
- * We always use extents for two reasons:
- *
- * - The pinned tree is only used during the process of caching
- * work.
- * - Make code simpler. See btrfs_unpin_free_ino().
- */
- return false;
-}
-
-static const struct btrfs_free_space_op pinned_free_ino_op = {
- .recalc_thresholds = pinned_recalc_thresholds,
- .use_bitmap = pinned_use_bitmap,
-};
-
-void btrfs_init_free_ino_ctl(struct btrfs_root *root)
-{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
-
- spin_lock_init(&ctl->tree_lock);
- ctl->unit = 1;
- ctl->start = 0;
- ctl->private = NULL;
- ctl->op = &free_ino_op;
- INIT_LIST_HEAD(&ctl->trimming_ranges);
- mutex_init(&ctl->cache_writeout_mutex);
-
- /*
- * Initially we allow to use 16K of ram to cache chunks of
- * inode numbers before we resort to bitmaps. This is somewhat
- * arbitrary, but it will be adjusted in runtime.
- */
- ctl->extents_thresh = INIT_THRESHOLD;
-
- spin_lock_init(&pinned->tree_lock);
- pinned->unit = 1;
- pinned->start = 0;
- pinned->private = NULL;
- pinned->extents_thresh = 0;
- pinned->op = &pinned_free_ino_op;
-}
-
-int btrfs_save_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_path *path;
- struct inode *inode;
- struct btrfs_block_rsv *rsv;
- struct extent_changeset *data_reserved = NULL;
- u64 num_bytes;
- u64 alloc_hint = 0;
- int ret;
- int prealloc;
- bool retry = false;
-
- /* only fs tree and subvol/snap needs ino cache */
- if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
- (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
- root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
- return 0;
-
- /* Don't save inode cache if we are deleting this root */
- if (btrfs_root_refs(&root->root_item) == 0)
- return 0;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- rsv = trans->block_rsv;
- trans->block_rsv = &fs_info->trans_block_rsv;
-
- num_bytes = trans->bytes_reserved;
- /*
- * 1 item for inode item insertion if need
- * 4 items for inode item update (in the worst case)
- * 1 items for slack space if we need do truncation
- * 1 item for free space object
- * 3 items for pre-allocation
- */
- trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10);
- ret = btrfs_block_rsv_add(root, trans->block_rsv,
- trans->bytes_reserved,
- BTRFS_RESERVE_NO_FLUSH);
- if (ret)
- goto out;
- trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
- trans->bytes_reserved, 1);
-again:
- inode = lookup_free_ino_inode(root, path);
- if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
- ret = PTR_ERR(inode);
- goto out_release;
- }
-
- if (IS_ERR(inode)) {
- BUG_ON(retry); /* Logic error */
- retry = true;
-
- ret = create_free_ino_inode(root, trans, path);
- if (ret)
- goto out_release;
- goto again;
- }
-
- BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_put;
- }
-
- if (i_size_read(inode) > 0) {
- ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
- if (ret) {
- if (ret != -ENOSPC)
- btrfs_abort_transaction(trans, ret);
- goto out_put;
- }
- }
-
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
- ret = -1;
- spin_unlock(&root->ino_cache_lock);
- goto out_put;
- }
- spin_unlock(&root->ino_cache_lock);
-
- spin_lock(&ctl->tree_lock);
- prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
- prealloc = ALIGN(prealloc, PAGE_SIZE);
- prealloc += ctl->total_bitmaps * PAGE_SIZE;
- spin_unlock(&ctl->tree_lock);
-
- /* Just to make sure we have enough space */
- prealloc += 8 * PAGE_SIZE;
-
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0,
- prealloc);
- if (ret)
- goto out_put;
-
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
- prealloc, prealloc, &alloc_hint);
- if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
- btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true);
- goto out_put;
- }
-
- ret = btrfs_write_out_ino_cache(root, trans, path, inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
-out_put:
- iput(inode);
-out_release:
- trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
- trans->bytes_reserved, 0);
- btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved, NULL);
-out:
- trans->block_rsv = rsv;
- trans->bytes_reserved = num_bytes;
-
- btrfs_free_path(path);
- extent_changeset_free(data_reserved);
- return ret;
-}
-
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
-{
- struct btrfs_path *path;
- int ret;
- struct extent_buffer *l;
- struct btrfs_key search_key;
- struct btrfs_key found_key;
- int slot;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
- search_key.type = -1;
- search_key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret < 0)
- goto error;
- BUG_ON(ret == 0); /* Corruption */
- if (path->slots[0] > 0) {
- slot = path->slots[0] - 1;
- l = path->nodes[0];
- btrfs_item_key_to_cpu(l, &found_key, slot);
- *objectid = max_t(u64, found_key.objectid,
- BTRFS_FIRST_FREE_OBJECTID - 1);
- } else {
- *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
- }
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
-}
-
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
-{
- int ret;
- mutex_lock(&root->objectid_mutex);
-
- if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
- btrfs_warn(root->fs_info,
- "the objectid of root %llu reaches its highest value",
- root->root_key.objectid);
- ret = -ENOSPC;
- goto out;
- }
-
- *objectid = ++root->highest_objectid;
- ret = 0;
-out:
- mutex_unlock(&root->objectid_mutex);
- return ret;
-}
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
deleted file mode 100644
index 7a962811dffe..000000000000
--- a/fs/btrfs/inode-map.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef BTRFS_INODE_MAP_H
-#define BTRFS_INODE_MAP_H
-
-void btrfs_init_free_ino_ctl(struct btrfs_root *root);
-void btrfs_unpin_free_ino(struct btrfs_root *root);
-void btrfs_return_ino(struct btrfs_root *root, u64 objectid);
-int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid);
-int btrfs_save_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans);
-
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
-
-#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 936c3137c646..535abf898225 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,12 +45,12 @@
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
-#include "inode-map.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
+#include "zoned.h"
struct btrfs_iget_args {
u64 ino;
@@ -62,7 +62,6 @@ struct btrfs_dio_data {
loff_t length;
ssize_t submitted;
struct extent_changeset *data_reserved;
- bool sync;
};
static const struct inode_operations btrfs_dir_inode_operations;
@@ -96,6 +95,51 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
const bool uptodate);
/*
+ * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
+ *
+ * ilock_flags can have the following bits set:
+ *
+ * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
+ * BTRFS_ILOCK_TRY - try to acquire the lock; if it cannot be taken on the
+ * first attempt, return -EAGAIN
+ */
+int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
+{
+ if (ilock_flags & BTRFS_ILOCK_SHARED) {
+ if (ilock_flags & BTRFS_ILOCK_TRY) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ else
+ return 0;
+ }
+ inode_lock_shared(inode);
+ } else {
+ if (ilock_flags & BTRFS_ILOCK_TRY) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ else
+ return 0;
+ }
+ inode_lock(inode);
+ }
+ return 0;
+}
+
+/*
+ * btrfs_inode_unlock - unlock inode i_rwsem
+ *
+ * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
+ * to decide whether the lock acquired is shared or exclusive.
+ */
+void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
+{
+ if (ilock_flags & BTRFS_ILOCK_SHARED)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+}
+
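The two helpers added above wrap inode_lock()/inode_unlock() so callers can express shared vs. exclusive and try-lock semantics through a single flags argument. The following sketch shows how a caller might use them; the function name and the fallback policy are hypothetical and not part of this patch, only btrfs_inode_lock()/btrfs_inode_unlock() and the BTRFS_ILOCK_* flags come from the code above.

/*
 * Illustrative sketch only, assuming the btrfs headers that declare the
 * helpers added above.
 */
static ssize_t example_nowait_read(struct inode *inode, bool nowait)
{
	unsigned int ilock_flags = BTRFS_ILOCK_SHARED;
	ssize_t ret;

	if (nowait)
		ilock_flags |= BTRFS_ILOCK_TRY;

	ret = btrfs_inode_lock(inode, ilock_flags);
	if (ret)
		return ret;	/* -EAGAIN: caller retries in a blocking context */

	/* ... perform the read under the shared i_rwsem ... */
	ret = 0;

	btrfs_inode_unlock(inode, ilock_flags);
	return ret;
}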
+/*
* Cleanup all submitted ordered extents in specified range to handle errors
* from the btrfs_run_delalloc_range() callback.
*
@@ -158,7 +202,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* no overlapping inline items exist in the btree
*/
static int insert_inline_extent(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, int extent_inserted,
+ struct btrfs_path *path, bool extent_inserted,
struct btrfs_root *root, struct inode *inode,
u64 start, size_t size, size_t compressed_size,
int compress_type,
@@ -179,8 +223,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
if (compressed_size && compressed_pages)
cur_size = compressed_size;
- inode_add_bytes(inode, size);
-
if (!extent_inserted) {
struct btrfs_key key;
size_t datasize;
@@ -190,7 +232,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (ret)
@@ -256,8 +297,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
* could end up racing with unlink.
*/
BTRFS_I(inode)->disk_i_size = inode->i_size;
- ret = btrfs_update_inode(trans, root, inode);
-
fail:
return ret;
}
@@ -273,6 +312,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
int compress_type,
struct page **compressed_pages)
{
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
@@ -283,8 +323,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
u64 data_len = inline_len;
int ret;
struct btrfs_path *path;
- int extent_inserted = 0;
- u32 extent_item_size;
if (compressed_size)
data_len = compressed_size;
@@ -310,16 +348,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
}
trans->block_rsv = &inode->block_rsv;
+ drop_args.path = path;
+ drop_args.start = start;
+ drop_args.end = aligned_end;
+ drop_args.drop_cache = true;
+ drop_args.replace_extent = true;
+
if (compressed_size && compressed_pages)
- extent_item_size = btrfs_file_extent_calc_inline_size(
+ drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
compressed_size);
else
- extent_item_size = btrfs_file_extent_calc_inline_size(
+ drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
inline_len);
- ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end,
- NULL, 1, 1, extent_item_size,
- &extent_inserted);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -327,7 +369,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end);
- ret = insert_inline_extent(trans, path, extent_inserted,
+ ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
root, &inode->vfs_inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
@@ -339,8 +381,17 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
goto out;
}
+ btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ } else if (ret == -ENOSPC) {
+ ret = 1;
+ goto out;
+ }
+
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
- btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
@@ -642,8 +693,7 @@ cont:
NULL,
clear_flags,
PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
@@ -867,7 +917,6 @@ retry:
ins.objectid,
async_extent->ram_size,
ins.offset,
- BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
if (ret) {
btrfs_drop_extent_cache(inode, async_extent->start,
@@ -884,8 +933,7 @@ retry:
async_extent->start +
async_extent->ram_size - 1,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK);
+ PAGE_UNLOCK | PAGE_START_WRITEBACK);
if (btrfs_submit_compressed_write(inode, async_extent->start,
async_extent->ram_size,
ins.objectid,
@@ -921,9 +969,8 @@ out_free:
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
- PAGE_SET_ERROR);
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
kfree(async_extent);
goto again;
@@ -1021,8 +1068,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
- PAGE_END_WRITEBACK);
+ PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
@@ -1077,7 +1123,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
free_extent_map(em);
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
- ram_size, cur_alloc_size, 0);
+ ram_size, cur_alloc_size,
+ BTRFS_ORDERED_REGULAR);
if (ret)
goto out_drop_extent_cache;
@@ -1144,8 +1191,7 @@ out_reserve:
out_unlock:
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
- PAGE_END_WRITEBACK;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
* If we reserved an extent for our delalloc range (or a subrange) and
* failed to create the respective ordered extent, then it means that
@@ -1270,9 +1316,8 @@ static int cow_file_range_async(struct btrfs_inode *inode,
unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING;
- unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
- PAGE_SET_ERROR;
+ unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR;
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits, page_ops);
@@ -1349,6 +1394,29 @@ static int cow_file_range_async(struct btrfs_inode *inode,
return 0;
}
+static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
+ struct page *locked_page, u64 start,
+ u64 end, int *page_started,
+ unsigned long *nr_written)
+{
+ int ret;
+
+ ret = cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 0);
+ if (ret)
+ return ret;
+
+ if (*page_started)
+ return 0;
+
+ __set_page_dirty_nobuffers(locked_page);
+ account_page_redirty(locked_page);
+ extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+ *page_started = 1;
+
+ return 0;
+}
+
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes)
{
@@ -1469,8 +1537,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
return -ENOMEM;
}
@@ -1598,6 +1665,15 @@ next_slot:
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
goto out_check;
+
+ /*
+ * The following checks can be expensive, as they need to
+ * take other locks and do btree or rbtree searches, so
+ * release the path to avoid blocking other tasks for too
+ * long.
+ */
+ btrfs_release_path(path);
+
/* If extent is RO, we must COW it */
if (btrfs_extent_readonly(fs_info, disk_bytenr))
goto out_check;
@@ -1673,12 +1749,12 @@ out_check:
cur_offset = extent_end;
if (cur_offset > end)
break;
+ if (!path->nodes[0])
+ continue;
path->slots[0]++;
goto next_slot;
}
- btrfs_release_path(path);
-
/*
* COW range from cow_start to found_key.offset - 1. As the key
* will contain the beginning of the first extent that can be
@@ -1783,8 +1859,7 @@ error:
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
btrfs_free_path(path);
return ret;
@@ -1819,17 +1894,24 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
{
int ret;
int force_cow = need_force_cow(inode, start, end);
+ const bool zoned = btrfs_is_zoned(inode->root->fs_info);
if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
+ ASSERT(!zoned);
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
} else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+ ASSERT(!zoned);
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
} else if (!inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
- ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ if (zoned)
+ ret = run_delalloc_zoned(inode, locked_page, start, end,
+ page_started, nr_written);
+ else
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1);
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
@@ -2098,6 +2180,8 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
spin_lock(&inode->lock);
ASSERT(inode->new_delalloc_bytes >= len);
inode->new_delalloc_bytes -= len;
+ if (*bits & EXTENT_ADD_INODE_BYTES)
+ inode_add_bytes(&inode->vfs_inode, len);
spin_unlock(&inode->lock);
}
}
@@ -2121,10 +2205,11 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+ u64 logical = bio->bi_iter.bi_sector << 9;
+ struct extent_map *em;
u64 length = 0;
u64 map_length;
- int ret;
+ int ret = 0;
struct btrfs_io_geometry geom;
if (bio_flags & EXTENT_BIO_COMPRESSED)
@@ -2132,14 +2217,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
length = bio->bi_iter.bi_size;
map_length = length;
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
- &geom);
+ em = btrfs_get_chunk_map(fs_info, logical, map_length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical,
+ map_length, &geom);
if (ret < 0)
- return ret;
+ goto out;
if (geom.len < length + size)
- return 1;
- return 0;
+ ret = 1;
+out:
+ free_extent_map(em);
+ return ret;
}
/*
@@ -2150,14 +2240,125 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
* At IO completion time the csums attached to the ordered extent record
* are inserted into the btree
*/
-static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
- u64 bio_offset)
+static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+ u64 dio_file_offset)
{
- struct inode *inode = private_data;
-
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
+bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
+ unsigned int size)
+{
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_ordered_extent *ordered;
+ u64 len = bio->bi_iter.bi_size + size;
+ bool ret = true;
+
+ ASSERT(btrfs_is_zoned(fs_info));
+ ASSERT(fs_info->max_zone_append_size > 0);
+ ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND);
+
+ /* Ordered extent not yet created, so we're good */
+ ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
+ if (!ordered)
+ return ret;
+
+ if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len >
+ ordered->disk_bytenr + ordered->disk_num_bytes)
+ ret = false;
+
+ btrfs_put_ordered_extent(ordered);
+
+ return ret;
+}
+
+static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
+ struct bio *bio, loff_t file_offset)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct extent_map *em = NULL, *em_new = NULL;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 len = bio->bi_iter.bi_size;
+ u64 end = start + len;
+ u64 ordered_end;
+ u64 pre, post;
+ int ret = 0;
+
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+ if (WARN_ON_ONCE(!ordered))
+ return BLK_STS_IOERR;
+
+ /* No need to split */
+ if (ordered->disk_num_bytes == len)
+ goto out;
+
+ /* We cannot split an ordered extent that has already been end_bio'd */
+ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* We cannot split a compressed ordered extent */
+ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
+ /* bio must be in one ordered extent */
+ if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Checksum list should be empty */
+ if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pre = start - ordered->disk_bytenr;
+ post = ordered_end - end;
+
+ ret = btrfs_split_ordered_extent(ordered, pre, post);
+ if (ret)
+ goto out;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ ret = -EIO;
+ goto out;
+ }
+ read_unlock(&em_tree->lock);
+
+ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ /*
+ * We cannot reuse the extent map (em) looked up above but have to create
+ * a new one, as unpin_extent_cache() expects the start of the extent map
+ * to be the logical offset of the file, which no longer holds after
+ * splitting.
+ */
+ em_new = create_io_em(inode, em->start + pre, len,
+ em->start + pre, em->block_start + pre, len,
+ len, len, BTRFS_COMPRESS_NONE,
+ BTRFS_ORDERED_REGULAR);
+ if (IS_ERR(em_new)) {
+ ret = PTR_ERR(em_new);
+ goto out;
+ }
+ free_extent_map(em_new);
+
+out:
+ free_extent_map(em);
+ btrfs_put_ordered_extent(ordered);
+
+ return errno_to_blk_status(ret);
+}
+
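extract_ordered_extent() above trims an oversized ordered extent so that exactly the range covered by the zone append bio remains, splitting off a "pre" piece in front of the bio and a "post" piece behind it. A small standalone sketch of the arithmetic, with all byte values made up purely for illustration:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t disk_bytenr = 1024 * 1024;	/* ordered extent start */
	const uint64_t disk_num_bytes = 256 * 1024;	/* ordered extent length */
	const uint64_t ordered_end = disk_bytenr + disk_num_bytes;

	const uint64_t start = disk_bytenr + 64 * 1024;	/* zone append bio start */
	const uint64_t len = 128 * 1024;		/* zone append bio length */
	const uint64_t end = start + len;

	const uint64_t pre = start - disk_bytenr;	/* split off in front: 64K */
	const uint64_t post = ordered_end - end;	/* split off behind: 64K */

	/* The three pieces together cover the original ordered extent. */
	assert(pre + len + post == disk_num_bytes);
	return 0;
}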
/*
* extent_io.c submission hook. This does the right thing for csum calculation
* on write, or reading the csums from the tree before a read.
@@ -2187,12 +2388,22 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int skip_sum;
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
- skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
+ !fs_info->csum_root;
if (btrfs_is_free_space_inode(BTRFS_I(inode)))
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
- if (bio_op(bio) != REQ_OP_WRITE) {
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct page *page = bio_first_bvec_all(bio)->bv_page;
+ loff_t file_offset = page_offset(page);
+
+ ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+ if (ret)
+ goto out;
+ }
+
+ if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
if (ret)
goto out;
@@ -2202,8 +2413,13 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
mirror_num,
bio_flags);
goto out;
- } else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL);
+ } else {
+ /*
+ * Lookup bio sums does extra checks around whether we
+ * need to csum or not, which is why we ignore skip_sum
+ * here.
+ */
+ ret = btrfs_lookup_bio_sums(inode, bio, NULL);
if (ret)
goto out;
}
@@ -2213,8 +2429,8 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
- 0, inode, btrfs_submit_bio_start);
+ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
+ 0, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
@@ -2253,11 +2469,69 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
return 0;
}
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+ const u64 start,
+ const u64 len,
+ struct extent_state **cached_state)
+{
+ u64 search_start = start;
+ const u64 end = start + len - 1;
+
+ while (search_start < end) {
+ const u64 search_len = end - search_start + 1;
+ struct extent_map *em;
+ u64 em_len;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start != EXTENT_MAP_HOLE)
+ goto next;
+
+ em_len = em->len;
+ if (em->start < search_start)
+ em_len -= search_start - em->start;
+ if (em_len > search_len)
+ em_len = search_len;
+
+ ret = set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
+ GFP_NOFS, NULL);
+next:
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state)
{
WARN_ON(PAGE_ALIGNED(end));
+
+ if (start >= i_size_read(&inode->vfs_inode) &&
+ !(inode->flags & BTRFS_INODE_PREALLOC)) {
+ /*
+ * There can't be any extents following eof in this case so just
+ * set the delalloc new bit for the range directly.
+ */
+ extra_bits |= EXTENT_DELALLOC_NEW;
+ } else {
+ int ret;
+
+ ret = btrfs_find_new_delalloc_bytes(inode, start,
+ end + 1 - start,
+ cached_state);
+ if (ret)
+ return ret;
+ }
+
return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
cached_state);
}
@@ -2453,9 +2727,11 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 file_pos,
struct btrfs_file_extent_item *stack_fi,
+ const bool update_inode_bytes,
u64 qgroup_reserved)
{
struct btrfs_root *root = inode->root;
+ const u64 sectorsize = root->fs_info->sectorsize;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
@@ -2463,7 +2739,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
- int extent_inserted = 0;
+ struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
path = btrfs_alloc_path();
@@ -2479,18 +2755,20 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* the caller is expected to unpin it and allow it to be merged
* with the others.
*/
- ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
- file_pos + num_bytes, NULL, 0,
- 1, sizeof(*stack_fi), &extent_inserted);
+ drop_args.path = path;
+ drop_args.start = file_pos;
+ drop_args.end = file_pos + num_bytes;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(*stack_fi);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
- if (!extent_inserted) {
+ if (!drop_args.extent_inserted) {
ins.objectid = btrfs_ino(inode);
ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*stack_fi));
if (ret)
@@ -2505,7 +2783,24 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- inode_add_bytes(&inode->vfs_inode, num_bytes);
+ /*
+ * If we dropped an inline extent here, we know the range it covered was
+ * not marked with the EXTENT_DELALLOC_NEW bit, so we update the number
+ * of bytes only for the range containing the inline extent. The
+ * remainder of the range will be processed when clearing the
+ * EXTENT_DELALLOC bit through the ordered extent completion.
+ */
+ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
+ u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
+
+ inline_size = drop_args.bytes_found - inline_size;
+ btrfs_update_inode_bytes(inode, sectorsize, inline_size);
+ drop_args.bytes_found -= inline_size;
+ num_bytes -= sectorsize;
+ }
+
+ if (update_inode_bytes)
+ btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
@@ -2543,6 +2838,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_file_extent_item stack_fi;
u64 logical_len;
+ bool update_inode_bytes;
memset(&stack_fi, 0, sizeof(stack_fi));
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
@@ -2558,9 +2854,18 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
+ /*
+ * For delalloc, when completing an ordered extent we update the inode's
+ * bytes when clearing the range in the inode's io tree, so pass false
+ * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
+ * except if the ordered extent was truncated.
+ */
+ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
+ test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
+
return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
oe->file_offset, &stack_fi,
- oe->qgroup_rsv);
+ update_inode_bytes, oe->qgroup_rsv);
}
/*
@@ -2570,11 +2875,11 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
*/
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
- struct inode *inode = ordered_extent->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans = NULL;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
u64 start, end;
int compress_type = 0;
@@ -2582,10 +2887,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
u64 logical_len = ordered_extent->num_bytes;
bool freespace_inode;
bool truncated = false;
- bool range_locked = false;
- bool clear_new_delalloc_bytes = false;
bool clear_reserved_extent = true;
- unsigned int clear_bits;
+ unsigned int clear_bits = EXTENT_DEFRAG;
start = ordered_extent->file_offset;
end = start + ordered_extent->num_bytes - 1;
@@ -2593,16 +2896,19 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
- clear_new_delalloc_bytes = true;
+ clear_bits |= EXTENT_DELALLOC_NEW;
- freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
+ freespace_inode = btrfs_is_free_space_inode(inode);
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
goto out;
}
- btrfs_free_io_failure_record(BTRFS_I(inode), start, end);
+ if (ordered_extent->disk)
+ btrfs_rewrite_logical_zoned(ordered_extent);
+
+ btrfs_free_io_failure_record(inode, start, end);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
@@ -2625,14 +2931,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans = NULL;
goto out;
}
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+ trans->block_rsv = &inode->block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
}
- range_locked = true;
+ clear_bits |= EXTENT_LOCKED;
lock_extent_bits(io_tree, start, end, &cached_state);
if (freespace_inode)
@@ -2645,13 +2951,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+ trans->block_rsv = &inode->block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compress_type);
- ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
+ ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
@@ -2665,8 +2971,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered_extent->file_offset,
+ unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -2679,6 +2984,17 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ /*
+ * If this is a new delalloc range, clear its new delalloc flag to
+ * update the inode's number of bytes. This needs to be done first
+ * before updating the inode item.
+ */
+ if ((clear_bits & EXTENT_DELALLOC_NEW) &&
+ !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
+ clear_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
+ 0, 0, &cached_state);
+
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
@@ -2687,12 +3003,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
}
ret = 0;
out:
- clear_bits = EXTENT_DEFRAG;
- if (range_locked)
- clear_bits |= EXTENT_LOCKED;
- if (clear_new_delalloc_bytes)
- clear_bits |= EXTENT_DELALLOC_NEW;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits,
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits,
(clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
&cached_state);
@@ -2707,7 +3018,7 @@ out:
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
/* Drop the cache for the part of the extent we didn't write. */
- btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0);
+ btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
/*
* If the ordered extent had an IOERR or something else went
@@ -2742,7 +3053,7 @@ out:
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
- btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent);
+ btrfs_remove_ordered_extent(inode, ordered_extent);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
@@ -2783,18 +3094,32 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
btrfs_queue_work(wq, &ordered_extent->work);
}
+/*
+ * check_data_csum - verify checksum of one sector of uncompressed data
+ * @inode: inode
+ * @io_bio: btrfs_io_bio which contains the csum
+ * @bio_offset: offset relative to the beginning of the bio (in bytes)
+ * @page: page containing the data to be verified
+ * @pgoff: offset inside the page
+ *
+ * The length of the verified range is always one sector.
+ */
static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
- int icsum, struct page *page, int pgoff, u64 start,
- size_t len)
+ u32 bio_offset, struct page *page, u32 pgoff)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ u32 len = fs_info->sectorsize;
+ const u32 csum_size = fs_info->csum_size;
+ unsigned int offset_sectors;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
- csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
+ ASSERT(pgoff + len <= PAGE_SIZE);
+
+ offset_sectors = bio_offset >> fs_info->sectorsize_bits;
+ csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
kaddr = kmap_atomic(page);
shash->tfm = fs_info->csum_shash;
@@ -2807,8 +3132,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
kunmap_atomic(kaddr);
return 0;
zeroit:
- btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- io_bio->mirror_num);
+ btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff,
+ csum, csum_expected, io_bio->mirror_num);
if (io_bio->device)
btrfs_dev_stat_inc_and_print(io_bio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -2819,17 +3144,23 @@ zeroit:
}
/*
- * when reads are done, we need to check csums to verify the data is correct
+ * When reads are done, we need to check csums to verify the data is correct.
* If there's a match, we allow the bio to finish. If not, the code in
* extent_io.c will try to find good copies for us.
+ *
+ * @bio_offset: offset from the beginning of the bio (in bytes)
+ * @start: file offset of the range start
+ * @end: file offset of the range end (inclusive)
+ * @mirror: mirror number
*/
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
struct page *page, u64 start, u64 end, int mirror)
{
- size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ const u32 sectorsize = root->fs_info->sectorsize;
+ u32 pg_off;
if (PageChecked(page)) {
ClearPageChecked(page);
@@ -2839,15 +3170,27 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
+ if (!root->fs_info->csum_root)
+ return 0;
+
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
return 0;
}
- phy_offset >>= inode->i_sb->s_blocksize_bits;
- return check_data_csum(inode, io_bio, phy_offset, page, offset, start,
- (size_t)(end - start + 1));
+ ASSERT(page_offset(page) <= start &&
+ end <= page_offset(page) + PAGE_SIZE - 1);
+ for (pg_off = offset_in_page(start);
+ pg_off < offset_in_page(end);
+ pg_off += sectorsize, bio_offset += sectorsize) {
+ int ret;
+
+ ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off);
+ if (ret < 0)
+ return -EIO;
+ }
+ return 0;
}
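
The per-sector loop above replaces the old icsum counter with a byte offset into the bio: the expected checksum for a sector is found by shifting bio_offset down to a sector count and indexing the csum array by csum_size. A minimal userspace sketch of that index arithmetic follows; the 4 KiB sector size, 4-byte checksum size and buffer are made-up example values, not kernel state.

/* Standalone model of the csum lookup arithmetic used above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint32_t sectorsize_bits = 12;	/* 4 KiB sectors (example) */
	const uint32_t csum_size = 4;		/* e.g. a CRC32C checksum */
	uint8_t csums[64] = { 0 };		/* stand-in for io_bio->csum */
	uint32_t bio_offset;

	/* Walk a 16 KiB bio one sector at a time, as the loop in
	 * btrfs_verify_data_csum() does. */
	for (bio_offset = 0; bio_offset < 16384; bio_offset += 4096) {
		uint32_t offset_sectors = bio_offset >> sectorsize_bits;
		uint8_t *csum_expected = csums + offset_sectors * csum_size;

		printf("bio_offset %u -> csum index %u (byte %td)\n",
		       bio_offset, offset_sectors, csum_expected - csums);
	}
	return 0;
}
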
/*
@@ -2914,14 +3257,16 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
}
/**
- * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
- * @fs_info - the fs_info for this fs
- * @return - EINTR if we were killed, 0 if nothing's pending
+ * Wait for all delayed iputs to be flushed
+ *
+ * @fs_info: the filesystem
*
* This will wait on any delayed iputs that are currently running with KILLABLE
* set. Once they are all done running we will return, unless we are killed in
* which case we return EINTR. This helps in user operations like fallocate etc
* that might get blocked on the iputs.
+ *
+ * Return: EINTR if we were killed, 0 if nothing is pending
*/
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
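
The comment above describes the run-then-wait pattern: queue the pending iputs, then block (killably) until they have all run. A hypothetical caller is sketched below; the two btrfs functions are real, but example_flush_iputs and its surrounding context are invented for illustration only.

/*
 * Hypothetical in-kernel caller, shown only to illustrate the intended
 * run-then-wait pattern; it is not a function from this patch.
 */
static int example_flush_iputs(struct btrfs_fs_info *fs_info)
{
	/* Queue up the pending delayed iputs for processing. */
	btrfs_run_delayed_iputs(fs_info);

	/* Wait for them to finish; per the comment above this returns
	 * EINTR if we were killed, 0 if nothing is pending. */
	return btrfs_wait_on_delayed_iputs(fs_info);
}
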
@@ -3457,7 +3802,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* copy everything in the in-memory inode into the btree.
*/
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
struct btrfs_path *path;
@@ -3468,9 +3814,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
- ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
- 1);
+ ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -3481,9 +3825,9 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
- fill_inode_item(trans, leaf, inode_item, inode);
+ fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
btrfs_mark_buffer_dirty(leaf);
- btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
+ btrfs_set_inode_last_trans(trans, inode);
ret = 0;
failed:
btrfs_free_path(path);
@@ -3494,7 +3838,8 @@ failed:
* copy everything in the in-memory inode into the btree.
*/
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3506,23 +3851,22 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* The data relocation inode should also be directly updated
* without delay
*/
- if (!btrfs_is_free_space_inode(BTRFS_I(inode))
+ if (!btrfs_is_free_space_inode(inode)
&& root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
ret = btrfs_delayed_update_inode(trans, root, inode);
if (!ret)
- btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
+ btrfs_set_inode_last_trans(trans, inode);
return ret;
}
return btrfs_update_inode_item(trans, root, inode);
}
-noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode)
{
int ret;
@@ -3557,7 +3901,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto out;
}
- path->leave_spinning = 1;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR_OR_NULL(di)) {
@@ -3637,7 +3980,7 @@ err:
inode_inc_iversion(&dir->vfs_inode);
inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
+ ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
}
@@ -3651,7 +3994,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, inode);
}
return ret;
}
@@ -3800,7 +4143,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_ctime = current_time(dir);
- ret = btrfs_update_inode_fallback(trans, root, dir);
+ ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -3937,7 +4280,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
struct btrfs_block_rsv block_rsv;
u64 root_flags;
int ret;
- int err;
/*
* Don't allow to delete a subvolume with send in progress. This is
@@ -3959,8 +4301,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
down_write(&fs_info->subvol_sem);
- err = may_destroy_subvol(dest);
- if (err)
+ ret = may_destroy_subvol(dest);
+ if (ret)
goto out_up_write;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
@@ -3969,13 +4311,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
* two for dir entries,
* two for root ref/backref.
*/
- err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
- if (err)
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
+ if (ret)
goto out_up_write;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_release;
}
trans->block_rsv = &block_rsv;
@@ -3985,7 +4327,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
ret = btrfs_unlink_subvol(trans, dir, dentry);
if (ret) {
- err = ret;
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -3994,7 +4335,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
memset(&dest->root_item.drop_progress, 0,
sizeof(dest->root_item.drop_progress));
- dest->root_item.drop_level = 0;
+ btrfs_set_root_drop_level(&dest->root_item, 0);
btrfs_set_root_refs(&dest->root_item, 0);
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
@@ -4003,7 +4344,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
}
@@ -4013,7 +4353,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
@@ -4023,7 +4362,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
}
@@ -4034,14 +4372,12 @@ out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
ret = btrfs_end_transaction(trans);
- if (ret && !err)
- err = ret;
inode->i_flags |= S_DEAD;
out_release:
btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
up_write(&fs_info->subvol_sem);
- if (err) {
+ if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
@@ -4051,15 +4387,9 @@ out_up_write:
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
-
- /* the last ref */
- if (dest->ino_cache_inode) {
- iput(dest->ino_cache_inode);
- dest->ino_cache_inode = NULL;
- }
}
- return err;
+ return ret;
}
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -4136,7 +4466,7 @@ out:
*/
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct inode *inode,
+ struct btrfs_inode *inode,
u64 new_size, u32 min_type)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4157,7 +4487,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int pending_del_slot = 0;
int extent_type = -1;
int ret;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ u64 ino = btrfs_ino(inode);
u64 bytes_deleted = 0;
bool be_nice = false;
bool should_throttle = false;
@@ -4171,7 +4501,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* off from time to time. This means all inodes in subvolume roots,
* reloc roots, and data reloc roots.
*/
- if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
+ if (!btrfs_is_free_space_inode(inode) &&
test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
be_nice = true;
@@ -4181,7 +4511,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
path->reada = READA_BACK;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
&cached_state);
/*
@@ -4189,7 +4519,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* new size is not block aligned since we will be keeping the
* last block of the extent just the way it is.
*/
- btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
+ btrfs_drop_extent_cache(inode, ALIGN(new_size,
fs_info->sectorsize),
(u64)-1, 0);
}
@@ -4200,8 +4530,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* it is used to drop the logged items. So we shouldn't kill the delayed
* items.
*/
- if (min_type == 0 && root == BTRFS_I(inode)->root)
- btrfs_kill_delayed_inode_items(BTRFS_I(inode));
+ if (min_type == 0 && root == inode->root)
+ btrfs_kill_delayed_inode_items(inode);
key.objectid = ino;
key.offset = (u64)-1;
@@ -4257,14 +4587,13 @@ search_again:
btrfs_file_extent_num_bytes(leaf, fi);
trace_btrfs_truncate_show_fi_regular(
- BTRFS_I(inode), leaf, fi,
- found_key.offset);
+ inode, leaf, fi, found_key.offset);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
item_end += btrfs_file_extent_ram_bytes(leaf,
fi);
trace_btrfs_truncate_show_fi_inline(
- BTRFS_I(inode), leaf, fi, path->slots[0],
+ inode, leaf, fi, path->slots[0],
found_key.offset);
}
item_end--;
@@ -4303,7 +4632,8 @@ search_again:
if (test_bit(BTRFS_ROOT_SHAREABLE,
&root->state) &&
extent_start != 0)
- inode_sub_bytes(inode, num_dec);
+ inode_sub_bytes(&inode->vfs_inode,
+ num_dec);
btrfs_mark_buffer_dirty(leaf);
} else {
extent_num_bytes =
@@ -4318,7 +4648,8 @@ search_again:
found_extent = 1;
if (test_bit(BTRFS_ROOT_SHAREABLE,
&root->state))
- inode_sub_bytes(inode, num_dec);
+ inode_sub_bytes(&inode->vfs_inode,
+ num_dec);
}
}
clear_len = num_dec;
@@ -4353,7 +4684,8 @@ search_again:
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- inode_sub_bytes(inode, item_end + 1 - new_size);
+ inode_sub_bytes(&inode->vfs_inode,
+ item_end + 1 - new_size);
}
delete:
/*
@@ -4361,8 +4693,8 @@ delete:
* multiple fsyncs, and in this case we don't want to clear the
* file extent range because it's just the log.
*/
- if (root == BTRFS_I(inode)->root) {
- ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ if (root == inode->root) {
+ ret = btrfs_inode_clear_file_extent_range(inode,
clear_start, clear_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -4471,8 +4803,8 @@ out:
if (!ret && last_size > new_size)
last_size = new_size;
btrfs_inode_safe_disk_i_size_write(inode, last_size);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
- (u64)-1, &cached_state);
+ unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
+ &cached_state);
}
btrfs_free_path(path);
@@ -4490,12 +4822,12 @@ out:
* This will find the block for the "from" offset and cow the block and zero the
* part we want to zero. This is used with truncate and hole punching.
*/
-int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
- int front)
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+ int front)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct address_space *mapping = inode->i_mapping;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
@@ -4518,33 +4850,35 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
block_start = round_down(from, blocksize);
block_end = block_start + blocksize - 1;
- ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved,
- block_start, blocksize);
+ ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
+ blocksize);
if (ret < 0) {
- if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start,
- &write_bytes) > 0) {
+ if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
/* For nocow case, no need to reserve data space */
only_release_metadata = true;
} else {
goto out;
}
}
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
if (ret < 0) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, block_start, blocksize);
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ block_start, blocksize);
goto out;
}
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- block_start, blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+ btrfs_delalloc_release_space(inode, data_reserved, block_start,
+ blocksize, true);
+ btrfs_delalloc_release_extents(inode, blocksize);
ret = -ENOMEM;
goto out;
}
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
if (!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
@@ -4562,9 +4896,8 @@ again:
wait_on_page_writeback(page);
lock_extent_bits(io_tree, block_start, block_end, &cached_state);
- set_page_extent_mapped(page);
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start);
+ ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
unlock_extent_cached(io_tree, block_start, block_end,
&cached_state);
@@ -4575,11 +4908,11 @@ again:
goto again;
}
- clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
+ clear_extent_bit(&inode->io_tree, block_start, block_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state);
- ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0,
+ ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
@@ -4605,34 +4938,33 @@ again:
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
- set_extent_bit(&BTRFS_I(inode)->io_tree, block_start,
- block_end, EXTENT_NORESERVE, NULL, NULL,
- GFP_NOFS);
+ set_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
out_unlock:
if (ret) {
if (only_release_metadata)
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- blocksize, true);
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
else
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
+ btrfs_delalloc_release_space(inode, data_reserved,
block_start, blocksize, true);
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+ btrfs_delalloc_release_extents(inode, blocksize);
unlock_page(page);
put_page(page);
out:
if (only_release_metadata)
- btrfs_check_nocow_unlock(BTRFS_I(inode));
+ btrfs_check_nocow_unlock(inode);
extent_changeset_free(data_reserved);
return ret;
}
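
btrfs_truncate_block() zeroes only the tail (or head, with front set) of the single block containing 'from'; block_start and block_end come from rounding 'from' down to the block size. A worked userspace example of that rounding follows, with an invented 4096-byte block size and offset.

/* Standalone model of the block rounding done in btrfs_truncate_block(). */
#include <stdio.h>
#include <stdint.h>

static uint64_t round_down_u64(uint64_t x, uint64_t align)
{
	return x - (x % align);
}

int main(void)
{
	const uint64_t blocksize = 4096;
	uint64_t from = 10000;	/* e.g. a new i_size after truncate */

	uint64_t block_start = round_down_u64(from, blocksize);	/* 8192 */
	uint64_t block_end = block_start + blocksize - 1;		/* 12287 */

	/* Bytes from 'from' to the end of its block get zeroed. */
	printf("zero [%llu, %llu] within block [%llu, %llu]\n",
	       (unsigned long long)from, (unsigned long long)block_end,
	       (unsigned long long)block_start, (unsigned long long)block_end);
	return 0;
}
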
-static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
u64 offset, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
+ struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
/*
@@ -4640,9 +4972,9 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
* that any holes get logged if we fsync.
*/
if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- BTRFS_I(inode)->last_trans = fs_info->generation;
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
- BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+ inode->last_trans = fs_info->generation;
+ inode->last_sub_trans = root->log_transid;
+ inode->last_log_commit = root->last_log_commit;
return 0;
}
@@ -4655,19 +4987,25 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+ drop_args.start = offset;
+ drop_args.end = offset + len;
+ drop_args.drop_cache = true;
+
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
- ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)),
+ ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
offset, 0, 0, len, 0, len, 0, 0, 0);
- if (ret)
+ if (ret) {
btrfs_abort_transaction(trans, ret);
- else
+ } else {
+ btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
btrfs_update_inode(trans, root, inode);
+ }
btrfs_end_transaction(trans);
return ret;
}
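
This hunk shows the reworked btrfs_drop_extents() calling convention: a btrfs_drop_extents_args struct replaces the old start/end/drop_cache parameters, and bytes_found reports how many bytes of pre-existing extents were dropped so the inode byte count can be fixed up via btrfs_update_inode_bytes(). The sketch below generalizes that pattern with a hypothetical caller; only the struct fields and helpers visible in this patch are used.

/*
 * Hypothetical caller sketch of the reworked btrfs_drop_extents()
 * interface; example_replace_range() is invented for illustration.
 */
static int example_replace_range(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_inode *inode,
				 u64 start, u64 len)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	int ret;

	drop_args.start = start;
	drop_args.end = start + len;
	drop_args.drop_cache = true;

	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret)
		return ret;

	/*
	 * bytes_found tells us how many bytes of existing extents were
	 * dropped, so the inode's byte count can be adjusted in one step
	 * (0 bytes added here, as when punching a hole).
	 */
	btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
	return btrfs_update_inode(trans, root, inode);
}
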
@@ -4678,14 +5016,14 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
* these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
* the range between oldsize and size
*/
-int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
u64 block_end = ALIGN(size, fs_info->sectorsize);
u64 last_byte;
@@ -4705,11 +5043,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (size <= hole_start)
return 0;
- btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start,
- block_end - 1, &cached_state);
+ btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
+ &cached_state);
cur_offset = hole_start;
while (1) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
block_end - cur_offset);
if (IS_ERR(em)) {
err = PTR_ERR(em);
@@ -4728,17 +5066,17 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (err)
break;
- err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ err = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
if (err)
break;
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
+ btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
if (!hole_em) {
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ &inode->runtime_flags);
goto next;
}
hole_em->start = cur_offset;
@@ -4758,14 +5096,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
write_unlock(&em_tree->lock);
if (err != -EEXIST)
break;
- btrfs_drop_extent_cache(BTRFS_I(inode),
- cur_offset,
+ btrfs_drop_extent_cache(inode, cur_offset,
cur_offset +
hole_size - 1, 0);
}
free_extent_map(hole_em);
} else {
- err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ err = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
if (err)
break;
@@ -4813,7 +5150,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* this truncation.
*/
btrfs_drew_write_lock(&root->snapshot_lock);
- ret = btrfs_cont_expand(inode, oldsize, newsize);
+ ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
if (ret) {
btrfs_drew_write_unlock(&root->snapshot_lock);
return ret;
@@ -4826,12 +5163,21 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
}
i_size_write(inode, newsize);
- btrfs_inode_safe_disk_i_size_write(inode, 0);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
pagecache_isize_extended(inode, oldsize, newsize);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+ if (btrfs_is_zoned(fs_info)) {
+ ret = btrfs_wait_ordered_range(inode,
+ ALIGN(newsize, fs_info->sectorsize),
+ (u64)-1);
+ if (ret)
+ return ret;
+ }
/*
* We're truncating a file that used to have good data down to
@@ -5099,7 +5445,8 @@ void btrfs_evict_inode(struct inode *inode)
trans->block_rsv = rsv;
- ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+ ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
+ 0, 0);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -5126,10 +5473,6 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans);
}
- if (!(root == fs_info->tree_root ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
- btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
-
free_rsv:
btrfs_free_block_rsv(fs_info, rsv);
no_delete:
@@ -5739,7 +6082,7 @@ static int btrfs_dirty_inode(struct inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret && ret == -ENOSPC) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
@@ -5747,7 +6090,7 @@ static int btrfs_dirty_inode(struct inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
}
btrfs_end_transaction(trans);
if (BTRFS_I(inode)->delayed_node)
@@ -6010,7 +6353,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
goto fail;
}
- path->leave_spinning = 1;
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
if (ret != 0)
goto fail_unlock;
@@ -6136,7 +6478,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
parent_inode->vfs_inode.i_mtime = now;
parent_inode->vfs_inode.i_ctime = now;
}
- ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
return ret;
@@ -6196,7 +6538,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -6227,7 +6569,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock;
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
d_instantiate_new(dentry, inode);
out_unlock:
@@ -6260,7 +6602,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -6286,7 +6628,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock;
- err = btrfs_update_inode(trans, root, inode);
+ err = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (err)
goto out_unlock;
@@ -6358,7 +6700,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
} else {
struct dentry *parent = dentry->d_parent;
- err = btrfs_update_inode(trans, root, inode);
+ err = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (err)
goto fail;
if (inode->i_nlink == 1) {
@@ -6404,7 +6746,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_fail;
@@ -6426,7 +6768,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_fail;
btrfs_i_size_write(BTRFS_I(inode), 0);
- err = btrfs_update_inode(trans, root, inode);
+ err = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (err)
goto out_fail;
@@ -6563,12 +6905,14 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
path->reada = READA_FORWARD;
/*
- * Unless we're going to uncompress the inline extent, no sleep would
- * happen.
+ * The same explanation in load_free_space_cache applies here as well,
+ * we only read when we're loading the free space cache, and at that
+ * point the commit_root has everything we need.
*/
- path->leave_spinning = 1;
-
- path->recurse = btrfs_is_free_space_inode(inode);
+ if (btrfs_is_free_space_inode(inode)) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
@@ -6670,7 +7014,6 @@ next:
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
- btrfs_set_path_blocking(path);
if (!PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
@@ -6927,9 +7270,6 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
* @strict: if true, omit optimizations that might force us into unnecessary
* cow. e.g., don't trust generation number.
*
- * This function will flush ordered extents in the range to ensure proper
- * nocow checks for (nowait == false) case.
- *
* Return:
* >0 and update @len if we can do nocow write
* 0 if we can't do nocow write
@@ -7319,17 +7659,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
int ret = 0;
u64 len = length;
bool unlock_extents = false;
- bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
-
- /*
- * We used current->journal_info here to see if we were sync, but
- * there's a lot of tests in the enospc machinery to not do flushing if
- * we have a journal_info set, so we need to clear this out and re-set
- * it in iomap_end.
- */
- ASSERT(current->journal_info == NULL ||
- current->journal_info == BTRFS_DIO_SYNC_STUB);
- current->journal_info = NULL;
if (!write)
len = min_t(u64, len, fs_info->sectorsize);
@@ -7355,7 +7684,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (!dio_data)
return -ENOMEM;
- dio_data->sync = sync;
dio_data->length = length;
if (write) {
dio_data->reserve = round_up(length, fs_info->sectorsize);
@@ -7449,6 +7777,9 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->bdev = fs_info->fs_devices->latest_bdev;
iomap->length = len;
+ if (write && btrfs_use_zone_append(BTRFS_I(inode), em))
+ iomap->flags |= IOMAP_F_ZONE_APPEND;
+
free_extent_map(em);
return 0;
@@ -7503,14 +7834,6 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
extent_changeset_free(dio_data->data_reserved);
}
out:
- /*
- * We're all done, we can re-set the current->journal_info now safely
- * for our endio.
- */
- if (dio_data->sync) {
- ASSERT(current->journal_info == NULL);
- current->journal_info = BTRFS_DIO_SYNC_STUB;
- }
kfree(dio_data);
iomap->private = NULL;
@@ -7526,7 +7849,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
if (!refcount_dec_and_test(&dip->refs))
return;
- if (bio_op(dip->dio_bio) == REQ_OP_WRITE) {
+ if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
dip->logical_offset,
dip->bytes,
@@ -7574,7 +7897,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
struct bio_vec bvec;
struct bvec_iter iter;
u64 start = io_bio->logical;
- int icsum = 0;
+ u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
__bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
@@ -7585,9 +7908,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
- (!csum || !check_data_csum(inode, io_bio, icsum,
- bvec.bv_page, pgoff,
- start, sectorsize))) {
+ (!csum || !check_data_csum(inode, io_bio,
+ bio_offset, bvec.bv_page, pgoff))) {
clean_io_failure(fs_info, failure_tree, io_tree,
start, bvec.bv_page,
btrfs_ino(BTRFS_I(inode)),
@@ -7595,6 +7917,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
} else {
blk_status_t status;
+ ASSERT((start - io_bio->logical) < UINT_MAX);
status = btrfs_submit_read_repair(inode,
&io_bio->bio,
start - io_bio->logical,
@@ -7607,7 +7930,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
err = status;
}
start += sectorsize;
- icsum++;
+ ASSERT(bio_offset + sectorsize > bio_offset);
+ bio_offset += sectorsize;
pgoff += sectorsize;
}
}
@@ -7640,10 +7964,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
NULL);
btrfs_queue_work(wq, &ordered->work);
}
- /*
- * If btrfs_dec_test_ordered_pending does not find any ordered
- * extent in the range, we can exit.
- */
+
+ /* No ordered extent found in the range, exit */
if (ordered_offset == last_offset)
return;
/*
@@ -7657,12 +7979,11 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
}
}
-static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
- struct bio *bio, u64 offset)
+static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
+ struct bio *bio,
+ u64 dio_file_offset)
{
- struct inode *inode = private_data;
-
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1);
}
static void btrfs_end_dio_bio(struct bio *bio)
@@ -7674,8 +7995,7 @@ static void btrfs_end_dio_bio(struct bio *bio)
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
"direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
- bio->bi_opf,
- (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
if (bio_op(bio) == REQ_OP_READ) {
@@ -7686,6 +8006,8 @@ static void btrfs_end_dio_bio(struct bio *bio)
if (err)
dip->dio_bio->bi_status = err;
+ btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+
bio_put(bio);
btrfs_dio_private_put(dip);
}
@@ -7695,7 +8017,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
- bool write = bio_op(bio) == REQ_OP_WRITE;
+ bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
blk_status_t ret;
/* Check btrfs_submit_bio_hook() for rules about async submit. */
@@ -7712,8 +8034,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
goto map;
if (write && async_submit) {
- ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
- file_offset, inode,
+ ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
btrfs_submit_bio_start_direct_io);
goto err;
} else if (write) {
@@ -7728,8 +8049,8 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
u64 csum_offset;
csum_offset = file_offset - dip->logical_offset;
- csum_offset >>= inode->i_sb->s_blocksize_bits;
- csum_offset *= btrfs_super_csum_size(fs_info->super_copy);
+ csum_offset >>= fs_info->sectorsize_bits;
+ csum_offset *= fs_info->csum_size;
btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
}
map:
@@ -7746,7 +8067,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
struct inode *inode,
loff_t file_offset)
{
- const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
size_t dip_size;
struct btrfs_dio_private *dip;
@@ -7754,11 +8075,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip_size = sizeof(*dip);
if (!write && csum) {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
size_t nblocks;
- nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
- dip_size += csum_size * nblocks;
+ nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits;
+ dip_size += fs_info->csum_size * nblocks;
}
dip = kzalloc(dip_size, GFP_NOFS);
@@ -7768,7 +8088,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip->inode = inode;
dip->logical_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
- dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
+ dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
refcount_set(&dip->refs, 1);
return dip;
@@ -7777,8 +8097,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
struct bio *dio_bio, loff_t file_offset)
{
- const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
- const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
+ const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
BTRFS_BLOCK_GROUP_RAID56_MASK);
@@ -7789,10 +8108,12 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
u64 submit_len;
int clone_offset = 0;
int clone_len;
+ u64 logical;
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
struct btrfs_dio_data *dio_data = iomap->private;
+ struct extent_map *em = NULL;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
@@ -7805,13 +8126,14 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
return BLK_QC_T_NONE;
}
- if (!write && csum) {
+ if (!write) {
/*
* Load the csums up front to reduce csum tree searches and
* contention when submitting bios.
+ *
+ * If we have csums disabled this will do nothing.
*/
- status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset,
- dip->csums);
+ status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
if (status != BLK_STS_OK)
goto out_err;
}
@@ -7820,12 +8142,18 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
submit_len = dio_bio->bi_iter.bi_size;
do {
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio),
- start_sector << 9, submit_len,
- &geom);
+ logical = start_sector << 9;
+ em = btrfs_get_chunk_map(fs_info, logical, submit_len);
+ if (IS_ERR(em)) {
+ status = errno_to_blk_status(PTR_ERR(em));
+ em = NULL;
+ goto out_err_em;
+ }
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
+ logical, submit_len, &geom);
if (ret) {
status = errno_to_blk_status(ret);
- goto out_err;
+ goto out_err_em;
}
ASSERT(geom.len <= INT_MAX);
@@ -7840,6 +8168,19 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
+ WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) &&
+ fs_info->max_zone_append_size &&
+ bio_op(bio) != REQ_OP_ZONE_APPEND);
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ status = extract_ordered_extent(BTRFS_I(inode), bio,
+ file_offset);
+ if (status) {
+ bio_put(bio);
+ goto out_err;
+ }
+ }
+
ASSERT(submit_len >= clone_len);
submit_len -= clone_len;
@@ -7870,145 +8211,36 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
bio_put(bio);
if (submit_len > 0)
refcount_dec(&dip->refs);
- goto out_err;
+ goto out_err_em;
}
dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
+
+ free_extent_map(em);
} while (submit_len > 0);
return BLK_QC_T_NONE;
+out_err_em:
+ free_extent_map(em);
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
- return BLK_QC_T_NONE;
-}
-
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- int seg;
- int i;
- unsigned int blocksize_mask = fs_info->sectorsize - 1;
- ssize_t retval = -EINVAL;
-
- if (offset & blocksize_mask)
- goto out;
- if (iov_iter_alignment(iter) & blocksize_mask)
- goto out;
-
- /* If this is a write we don't need to check anymore */
- if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
- return 0;
- /*
- * Check to make sure we don't have duplicate iov_base's in this
- * iovec, if so return EINVAL, otherwise we'll get csum errors
- * when reading back.
- */
- for (seg = 0; seg < iter->nr_segs; seg++) {
- for (i = seg + 1; i < iter->nr_segs; i++) {
- if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
- goto out;
- }
- }
- retval = 0;
-out:
- return retval;
-}
-
-static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
- int error, unsigned flags)
-{
- /*
- * Now if we're still in the context of our submitter we know we can't
- * safely run generic_write_sync(), so clear our flag here so that the
- * caller knows to follow up with a sync.
- */
- if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
- current->journal_info = NULL;
- return error;
- }
-
- if (error)
- return error;
-
- if (size) {
- iocb->ki_flags |= IOCB_DSYNC;
- return generic_write_sync(iocb, size);
- }
-
- return 0;
+ return BLK_QC_T_NONE;
}
-static const struct iomap_ops btrfs_dio_iomap_ops = {
+const struct iomap_ops btrfs_dio_iomap_ops = {
.iomap_begin = btrfs_dio_iomap_begin,
.iomap_end = btrfs_dio_iomap_end,
};
-static const struct iomap_dio_ops btrfs_dio_ops = {
- .submit_io = btrfs_submit_direct,
-};
-
-static const struct iomap_dio_ops btrfs_sync_dops = {
+const struct iomap_dio_ops btrfs_dio_ops = {
.submit_io = btrfs_submit_direct,
- .end_io = btrfs_maybe_fsync_end_io,
};
-ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_changeset *data_reserved = NULL;
- loff_t offset = iocb->ki_pos;
- size_t count = 0;
- bool relock = false;
- ssize_t ret;
-
- if (check_direct_IO(fs_info, iter, offset))
- return 0;
-
- count = iov_iter_count(iter);
- if (iov_iter_rw(iter) == WRITE) {
- /*
- * If the write DIO is beyond the EOF, we need update
- * the isize, but it is protected by i_mutex. So we can
- * not unlock the i_mutex at this case.
- */
- if (offset + count <= inode->i_size) {
- inode_unlock(inode);
- relock = true;
- }
- down_read(&BTRFS_I(inode)->dio_sem);
- }
-
- /*
- * We have are actually a sync iocb, so we need our fancy endio to know
- * if we need to sync.
- */
- if (current->journal_info)
- ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
- &btrfs_sync_dops, is_sync_kiocb(iocb));
- else
- ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
- &btrfs_dio_ops, is_sync_kiocb(iocb));
-
- if (ret == -ENOTBLK)
- ret = 0;
-
- if (iov_iter_rw(iter) == WRITE)
- up_read(&BTRFS_I(inode)->dio_sem);
-
- if (relock)
- inode_lock(inode);
-
- extent_changeset_free(data_reserved);
- return ret;
-}
-
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -8078,7 +8310,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags);
if (ret == 1)
- detach_page_private(page);
+ clear_page_extent_mapped(page);
return ret;
}
@@ -8128,6 +8360,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
u64 start;
u64 end;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
+ bool found_ordered = false;
+ bool completed_ordered = false;
/*
* we have the page locked, so new writeback can't start,
@@ -8145,19 +8379,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
-again:
+
start = page_start;
+again:
ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordered) {
+ found_ordered = true;
end = min(page_end,
ordered->file_offset + ordered->num_bytes - 1);
/*
- * IO on this page will never be started, so we need
- * to account for any ordered extents now
+ * IO on this page will never be started, so we need to account
+ * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
+ * here, must leave that up for the ordered extent completion.
*/
if (!inode_evicting)
clear_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state);
/*
@@ -8179,8 +8416,10 @@ again:
if (btrfs_dec_test_ordered_pending(inode, &ordered,
start,
- end - start + 1, 1))
+ end - start + 1, 1)) {
btrfs_finish_ordered_io(ordered);
+ completed_ordered = true;
+ }
}
btrfs_put_ordered_extent(ordered);
if (!inode_evicting) {
@@ -8209,16 +8448,29 @@ again:
*/
btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
+ bool delete = true;
+
+ /*
+ * If there's an ordered extent for this range and we have not
+ * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set
+ * in the range for the ordered extent completion. We must also
+ * not delete the range, otherwise we would lose that bit (and
+ * any other bits set in the range). Make sure EXTENT_UPTODATE
+ * is cleared if we don't delete, otherwise it can lead to
+ * corruption if the i_size is extended later.
+ */
+ if (found_ordered && !completed_ordered)
+ delete = false;
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
- &cached_state);
+ EXTENT_DELALLOC | EXTENT_UPTODATE |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
+ delete, &cached_state);
__btrfs_releasepage(page, GFP_NOFS);
}
ClearPageChecked(page);
- detach_page_private(page);
+ clear_page_extent_mapped(page);
}
/*
@@ -8297,7 +8549,12 @@ again:
wait_on_page_writeback(page);
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
- set_page_extent_mapped(page);
+ ret2 = set_page_extent_mapped(page);
+ if (ret2 < 0) {
+ ret = vmf_error(ret2);
+ unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ goto out_unlock;
+ }
/*
* we can't set the delalloc bits if there are pending ordered
@@ -8461,14 +8718,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
trans->block_rsv = rsv;
while (1) {
- ret = btrfs_truncate_inode_items(trans, root, inode,
+ ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
break;
@@ -8499,7 +8756,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 1);
@@ -8507,14 +8764,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
ret = PTR_ERR(trans);
goto out;
}
- btrfs_inode_safe_disk_i_size_write(inode, 0);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
if (trans) {
int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
- ret2 = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret2 && !ret)
ret = ret2;
@@ -8534,15 +8791,18 @@ out:
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- u64 new_dirid)
+ struct btrfs_root *parent_root)
{
struct inode *inode;
int err;
u64 index = 0;
+ u64 ino;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
- new_dirid, new_dirid,
+ err = btrfs_get_free_objectid(new_root, &ino);
+ if (err < 0)
+ return err;
+
+ inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
S_IFDIR | (~current_umask() & S_IRWXUGO),
&index);
if (IS_ERR(inode))
@@ -8560,7 +8820,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
"error inheriting subvolume %llu properties: %d",
new_root->root_key.objectid, err);
- err = btrfs_update_inode(trans, new_root, inode);
+ err = btrfs_update_inode(trans, new_root, BTRFS_I(inode));
iput(inode);
return err;
@@ -8622,7 +8882,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
- init_rwsem(&ei->dio_sem);
return inode;
}
@@ -8762,6 +9021,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
u64 delalloc_bytes;
+ u64 inode_bytes;
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
u32 bi_flags = BTRFS_I(inode)->flags;
@@ -8788,8 +9048,9 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
+ inode_bytes = inode_get_bytes(inode);
spin_unlock(&BTRFS_I(inode)->lock);
- stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
+ stat->blocks = (ALIGN(inode_bytes, blocksize) +
ALIGN(delalloc_bytes, blocksize)) >> 9;
return 0;
}
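
The stat->blocks computation above adds the bytes already accounted to the inode and the new delalloc bytes not yet flushed, each rounded up to the filesystem block size, then converts to 512-byte units. A worked numeric example follows; the byte counts and 4096-byte block size are invented values.

/* Standalone model of the stat->blocks computation in btrfs_getattr(). */
#include <stdio.h>
#include <stdint.h>

static uint64_t align_up(uint64_t x, uint64_t a)
{
	return (x + a - 1) / a * a;
}

int main(void)
{
	const uint64_t blocksize = 4096;
	uint64_t inode_bytes = 6000;	/* bytes already accounted on disk */
	uint64_t delalloc_bytes = 100;	/* new delalloc not yet flushed */

	/* (ALIGN(6000, 4096) + ALIGN(100, 4096)) >> 9 = (8192 + 4096) / 512 */
	uint64_t blocks = (align_up(inode_bytes, blocksize) +
			   align_up(delalloc_bytes, blocksize)) >> 9;

	printf("st_blocks = %llu\n", (unsigned long long)blocks);	/* 24 */
	return 0;
}
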
@@ -8915,7 +9176,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
old_dentry->d_name.name,
old_dentry->d_name.len);
if (!ret)
- ret = btrfs_update_inode(trans, root, old_inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -8931,7 +9192,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
new_dentry->d_name.name,
new_dentry->d_name.len);
if (!ret)
- ret = btrfs_update_inode(trans, dest, new_inode);
+ ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9020,7 +9281,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
u64 objectid;
u64 index;
- ret = btrfs_find_free_ino(root, &objectid);
+ ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
return ret;
@@ -9051,7 +9312,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
unlock_new_inode(inode);
if (ret)
@@ -9185,7 +9446,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dentry->d_name.name,
old_dentry->d_name.len);
if (!ret)
- ret = btrfs_update_inode(trans, root, old_inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9331,7 +9592,9 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root,
+ struct writeback_control *wbc, bool snapshot,
+ bool in_reclaim_context)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -9339,6 +9602,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
struct list_head works;
struct list_head splice;
int ret = 0;
+ bool full_flush = wbc->nr_to_write == LONG_MAX;
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
@@ -9352,6 +9616,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
list_move_tail(&binode->delalloc_inodes,
&root->delalloc_inodes);
+
+ if (in_reclaim_context &&
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+ continue;
+
inode = igrab(&binode->vfs_inode);
if (!inode) {
cond_resched_lock(&root->delalloc_lock);
@@ -9362,18 +9631,24 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
if (snapshot)
set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
&binode->runtime_flags);
- work = btrfs_alloc_delalloc_work(inode);
- if (!work) {
- iput(inode);
- ret = -ENOMEM;
- goto out;
- }
- list_add_tail(&work->list, &works);
- btrfs_queue_work(root->fs_info->flush_workers,
- &work->work);
- if (*nr != U64_MAX) {
- (*nr)--;
- if (*nr == 0)
+ if (full_flush) {
+ work = btrfs_alloc_delalloc_work(inode);
+ if (!work) {
+ iput(inode);
+ ret = -ENOMEM;
+ goto out;
+ }
+ list_add_tail(&work->list, &works);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ } else {
+ ret = sync_inode(inode, wbc);
+ if (!ret &&
+ test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = sync_inode(inode, wbc);
+ btrfs_add_delayed_iput(inode);
+ if (ret || wbc->nr_to_write <= 0)
goto out;
}
cond_resched();
@@ -9399,17 +9674,29 @@ out:
int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
+ struct writeback_control wbc = {
+ .nr_to_write = LONG_MAX,
+ .sync_mode = WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 nr = U64_MAX;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- return start_delalloc_inodes(root, &nr, true);
+ return start_delalloc_inodes(root, &wbc, true, false);
}
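
start_delalloc_inodes() now takes a writeback_control: a full flush passes nr_to_write = LONG_MAX and goes through the async work queue, while a limited flush calls sync_inode() per inode and stops once the nr_to_write budget is spent. The sketch below shows how a throttled caller might set that up; the caller function is invented, but the wbc fields and the start_delalloc_inodes() signature mirror the hunks above.

/*
 * Hypothetical caller showing a throttled (non-full) delalloc flush;
 * only fields and helpers that appear in this patch are used.
 */
static int example_flush_some_delalloc(struct btrfs_root *root, long nr_pages)
{
	struct writeback_control wbc = {
		.nr_to_write = nr_pages,	/* anything below LONG_MAX */
		.sync_mode = WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	/*
	 * With nr_to_write < LONG_MAX, start_delalloc_inodes() syncs
	 * inodes directly and returns once the budget is consumed.
	 */
	return start_delalloc_inodes(root, &wbc, false, false);
}
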
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
+ bool in_reclaim_context)
{
+ struct writeback_control wbc = {
+ .nr_to_write = nr,
+ .sync_mode = WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
struct btrfs_root *root;
struct list_head splice;
int ret;
@@ -9422,7 +9709,14 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
- while (!list_empty(&splice) && nr) {
+ while (!list_empty(&splice)) {
+ /*
+ * Reset nr_to_write here so we know that we're doing a full
+ * flush.
+ */
+ if (nr == LONG_MAX)
+ wbc.nr_to_write = LONG_MAX;
+
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_root(root);
@@ -9431,9 +9725,9 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, &nr, false);
+ ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
btrfs_put_root(root);
- if (ret < 0)
+ if (ret < 0 || wbc.nr_to_write <= 0)
goto out;
spin_lock(&fs_info->delalloc_root_lock);
}
@@ -9483,7 +9777,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -9545,7 +9839,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode_nohighmem(inode);
inode_set_bytes(inode, name_len);
btrfs_i_size_write(BTRFS_I(inode), name_len);
- err = btrfs_update_inode(trans, root, inode);
+ err = btrfs_update_inode(trans, root, BTRFS_I(inode));
/*
* Last step, add directory indexes for our symlink inode. This is the
* last step to avoid extra cleanup of these indexes if an error happens
@@ -9571,7 +9865,8 @@ out_unlock:
static struct btrfs_trans_handle *insert_prealloc_file_extent(
struct btrfs_trans_handle *trans_in,
- struct inode *inode, struct btrfs_key *ins,
+ struct btrfs_inode *inode,
+ struct btrfs_key *ins,
u64 file_offset)
{
struct btrfs_file_extent_item stack_fi;
@@ -9592,13 +9887,14 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
/* Encryption and other encoding is reserved and all 0 */
- ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
+ ret = btrfs_qgroup_release_data(inode, file_offset, len);
if (ret < 0)
return ERR_PTR(ret);
if (trans) {
- ret = insert_reserved_file_extent(trans, BTRFS_I(inode),
- file_offset, &stack_fi, ret);
+ ret = insert_reserved_file_extent(trans, inode,
+ file_offset, &stack_fi,
+ true, ret);
if (ret)
return ERR_PTR(ret);
return trans;
@@ -9618,7 +9914,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
if (!path)
return ERR_PTR(-ENOMEM);
- ret = btrfs_replace_file_extents(inode, path, file_offset,
+ ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
file_offset + len - 1, &extent_info,
&trans);
btrfs_free_path(path);
@@ -9672,10 +9968,17 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
* clear_offset by our extent size.
*/
clear_offset += ins.offset;
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
- trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
+ trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
+ &ins, cur_offset);
+ /*
+ * Now that we inserted the prealloc extent we can finally
+ * decrement the number of reservations in the block group.
+ * If we did it before, we could race with relocation and have
+ * relocation miss the reserved extent, making it fail later.
+ */
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
@@ -9730,10 +10033,10 @@ next:
else
i_size = cur_offset;
i_size_write(inode, i_size);
- btrfs_inode_safe_disk_i_size_write(inode, 0);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9808,7 +10111,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_find_free_ino(root, &objectid);
+ ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
goto out;
@@ -9829,7 +10132,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (ret)
goto out;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
goto out;
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
@@ -10208,6 +10511,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
}
#endif
+/*
+ * Update the number of bytes used in the VFS' inode. When we replace extents in
+ * a range (clone, dedupe, fallocate's zero range), we must update the number of
+ * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
+ * always get a correct value.
+ */
+void btrfs_update_inode_bytes(struct btrfs_inode *inode,
+ const u64 add_bytes,
+ const u64 del_bytes)
+{
+ if (add_bytes == del_bytes)
+ return;
+
+ spin_lock(&inode->lock);
+ if (del_bytes > 0)
+ inode_sub_bytes(&inode->vfs_inode, del_bytes);
+ if (add_bytes > 0)
+ inode_add_bytes(&inode->vfs_inode, add_bytes);
+ spin_unlock(&inode->lock);
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
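
The btrfs_update_inode_bytes() helper added above is easiest to see with a caller sketch. The helper itself is defined in the hunk above; the caller below is hypothetical and only illustrates that both byte counts are applied under inode->lock in one step, so a concurrent stat(2) never observes a partially updated value.

	/* Hypothetical caller: an extent-replacing path swapping old_len bytes
	 * of on-disk extents for new_len bytes in the same file range. */
	static void example_replace_extent_bytes(struct btrfs_inode *inode,
						 u64 old_len, u64 new_len)
	{
		/* One atomic update instead of separate sub/add calls. */
		btrfs_update_inode_bytes(inode, new_len, old_len);
	}
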
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ab408a23ba32..a8c60d46d19c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -34,7 +34,6 @@
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
-#include "inode-map.h"
#include "backref.h"
#include "rcu-string.h"
#include "send.h"
@@ -193,6 +192,15 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags)
return 0;
}
+static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
+ unsigned int flags)
+{
+ if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
+ return -EPERM;
+
+ return 0;
+}
+
static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
@@ -230,6 +238,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (ret)
goto out_unlock;
+ ret = check_fsflags_compatible(fs_info, fsflags);
+ if (ret)
+ goto out_unlock;
+
binode_flags = binode->flags;
if (fsflags & FS_SYNC_FL)
binode_flags |= BTRFS_INODE_SYNC;
@@ -336,7 +348,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out_end_trans:
btrfs_end_transaction(trans);
@@ -479,7 +491,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
btrfs_end_transaction(trans);
@@ -516,6 +528,14 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
return -EPERM;
/*
+ * btrfs_trim_block_group() depends on space cache, which is not
+ * available on a zoned filesystem. So, disallow fitrim on a zoned
+ * filesystem for now.
+ */
+ if (btrfs_is_zoned(fs_info))
+ return -EOPNOTSUPP;
+
+ /*
* If the fs is mounted with nologreplay, which requires it to be
* mounted in RO mode as well, we can not allow discard on free space
* inside block groups, because log trees refer to extents that are not
@@ -594,14 +614,13 @@ static noinline int create_subvol(struct inode *dir,
int err;
dev_t anon_dev = 0;
u64 objectid;
- u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
return -ENOMEM;
- ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
+ ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
goto fail_free;
@@ -681,7 +700,7 @@ static noinline int create_subvol(struct inode *dir,
free_extent_buffer(leaf);
leaf = NULL;
- btrfs_set_root_dirid(root_item, new_dirid);
+ btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
key.objectid = objectid;
key.offset = 0;
@@ -704,7 +723,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_record_root_in_trans(trans, new_root);
- ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
+ ret = btrfs_create_subvol_root(trans, new_root, root);
btrfs_put_root(new_root);
if (ret) {
/* We potentially lose an unused inode item here */
@@ -712,10 +731,6 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
- mutex_lock(&new_root->objectid_mutex);
- new_root->highest_objectid = new_dirid;
- mutex_unlock(&new_root->objectid_mutex);
-
/*
* insert the directory item
*/
@@ -733,7 +748,7 @@ static noinline int create_subvol(struct inode *dir,
}
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
- ret = btrfs_update_inode(trans, root, dir);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1274,6 +1289,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
u64 page_start;
u64 page_end;
u64 page_cnt;
+ u64 start = (u64)start_index << PAGE_SHIFT;
+ u64 search_start;
int ret;
int i;
int i_done;
@@ -1290,8 +1307,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- start_index << PAGE_SHIFT,
- page_cnt << PAGE_SHIFT);
+ start, page_cnt << PAGE_SHIFT);
if (ret)
return ret;
i_done = 0;
@@ -1306,6 +1322,13 @@ again:
if (!page)
break;
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ break;
+ }
+
page_start = page_offset(page);
page_end = page_start + PAGE_SIZE - 1;
while (1) {
@@ -1371,6 +1394,40 @@ again:
lock_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end - 1, &cached_state);
+
+ /*
+ * When defragmenting we skip ranges that have holes or inline extents,
+ * (check should_defrag_range()), to avoid unnecessary IO and wasting
+ * space. At btrfs_defrag_file(), we check if a range should be defragged
+ * before locking the inode and then, if it should, we trigger a sync
+ * page cache readahead - we lock the inode only after that to avoid
+ * blocking other tasks that may want to operate on other file ranges
+ * for too long. But before we were able to get the inode lock, some
+ * other task may have punched a hole in the range, or we may now have
+ * an inline extent, in which case we should not defrag. So check
+ * for that here, where we have the inode and the range locked, and bail
+ * out if that happened.
+ */
+ search_start = page_start;
+ while (search_start < page_end) {
+ struct extent_map *em;
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
+ page_end - search_start);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock_range;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ /* Ok, 0 means we did not defrag anything */
+ ret = 0;
+ goto out_unlock_range;
+ }
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ }
+
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 0, 0, &cached_state);
@@ -1380,8 +1437,7 @@ again:
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start_index << PAGE_SHIFT,
- (page_cnt - i_done) << PAGE_SHIFT, true);
+ start, (page_cnt - i_done) << PAGE_SHIFT, true);
}
@@ -1394,7 +1450,6 @@ again:
for (i = 0; i < i_done; i++) {
clear_page_dirty_for_io(pages[i]);
ClearPageChecked(pages[i]);
- set_page_extent_mapped(pages[i]);
set_page_dirty(pages[i]);
unlock_page(pages[i]);
put_page(pages[i]);
@@ -1402,14 +1457,17 @@ again:
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return i_done;
+
+out_unlock_range:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1, &cached_state);
out:
for (i = 0; i < i_done; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
}
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start_index << PAGE_SHIFT,
- page_cnt << PAGE_SHIFT, true);
+ start, page_cnt << PAGE_SHIFT, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return ret;
@@ -1680,7 +1738,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@@ -3323,7 +3381,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
rcu_read_lock();
dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
- NULL, true);
+ NULL);
if (!dev) {
ret = -ENODEV;
@@ -3395,7 +3453,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = -ENOMEM;
goto out_free;
}
- path->leave_spinning = 1;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
@@ -4903,7 +4960,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
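
The hole/inline-extent check added to cluster_pages_for_defrag() above boils down to inspecting the extent map of the locked range. A condensed sketch of that idea (the helper name is made up; the btrfs_get_extent()/EXTENT_MAP_LAST_BYTE usage mirrors the hunk above):

	/* Sketch: a range is worth defragging only if its extent map points at
	 * a real on-disk extent; hole, inline and delalloc maps use special
	 * block_start values at or above EXTENT_MAP_LAST_BYTE. */
	static bool example_range_has_real_extent(struct btrfs_inode *inode,
						  u64 start, u64 len)
	{
		struct extent_map *em;
		bool ret;

		em = btrfs_get_extent(inode, NULL, 0, start, len);
		if (IS_ERR(em))
			return false;
		ret = em->block_start < EXTENT_MAP_LAST_BYTE;
		free_extent_map(em);
		return ret;
	}
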
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66e02ebdd340..5fafc5e89bb7 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -17,404 +17,89 @@
* Extent buffer locking
* =====================
*
- * The locks use a custom scheme that allows to do more operations than are
- * available fromt current locking primitives. The building blocks are still
- * rwlock and wait queues.
- *
- * Required semantics:
+ * We use a rw_semaphore for tree locking, and the semantics are exactly the
+ * same:
*
* - reader/writer exclusion
* - writer/writer exclusion
* - reader/reader sharing
- * - spinning lock semantics
- * - blocking lock semantics
* - try-lock semantics for readers and writers
- * - one level nesting, allowing read lock to be taken by the same thread that
- * already has write lock
- *
- * The extent buffer locks (also called tree locks) manage access to eb data
- * related to the storage in the b-tree (keys, items, but not the individual
- * members of eb).
- * We want concurrency of many readers and safe updates. The underlying locking
- * is done by read-write spinlock and the blocking part is implemented using
- * counters and wait queues.
- *
- * spinning semantics - the low-level rwlock is held so all other threads that
- * want to take it are spinning on it.
- *
- * blocking semantics - the low-level rwlock is not held but the counter
- * denotes how many times the blocking lock was held;
- * sleeping is possible
- *
- * Write lock always allows only one thread to access the data.
- *
- *
- * Debugging
- * ---------
- *
- * There are additional state counters that are asserted in various contexts,
- * removed from non-debug build to reduce extent_buffer size and for
- * performance reasons.
- *
- *
- * Lock recursion
- * --------------
- *
- * A write operation on a tree might indirectly start a look up on the same
- * tree. This can happen when btrfs_cow_block locks the tree and needs to
- * lookup free extents.
- *
- * btrfs_cow_block
- * ..
- * alloc_tree_block_no_bg_flush
- * btrfs_alloc_tree_block
- * btrfs_reserve_extent
- * ..
- * load_free_space_cache
- * ..
- * btrfs_lookup_file_extent
- * btrfs_search_slot
- *
- *
- * Locking pattern - spinning
- * --------------------------
- *
- * The simple locking scenario, the +--+ denotes the spinning section.
- *
- * +- btrfs_tree_lock
- * | - extent_buffer::rwlock is held
- * | - no heavy operations should happen, eg. IO, memory allocations, large
- * | structure traversals
- * +- btrfs_tree_unock
-*
-*
- * Locking pattern - blocking
- * --------------------------
- *
- * The blocking write uses the following scheme. The +--+ denotes the spinning
- * section.
- *
- * +- btrfs_tree_lock
- * |
- * +- btrfs_set_lock_blocking_write
- *
- * - allowed: IO, memory allocations, etc.
- *
- * -- btrfs_tree_unlock - note, no explicit unblocking necessary
- *
- *
- * Blocking read is similar.
- *
- * +- btrfs_tree_read_lock
- * |
- * +- btrfs_set_lock_blocking_read
- *
- * - heavy operations allowed
- *
- * +- btrfs_tree_read_unlock_blocking
- * |
- * +- btrfs_tree_read_unlock
- *
- */
-
-#ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
-{
- WARN_ON(eb->spinning_writers);
- eb->spinning_writers++;
-}
-
-static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
-{
- WARN_ON(eb->spinning_writers != 1);
- eb->spinning_writers--;
-}
-
-static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
-{
- WARN_ON(eb->spinning_writers);
-}
-
-static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
-{
- atomic_inc(&eb->spinning_readers);
-}
-
-static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
-{
- WARN_ON(atomic_read(&eb->spinning_readers) == 0);
- atomic_dec(&eb->spinning_readers);
-}
-
-static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
-{
- atomic_inc(&eb->read_locks);
-}
-
-static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
-{
- atomic_dec(&eb->read_locks);
-}
-
-static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
-{
- BUG_ON(!atomic_read(&eb->read_locks));
-}
-
-static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
-{
- eb->write_locks++;
-}
-
-static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
-{
- eb->write_locks--;
-}
-
-#else
-static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
-static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
-#endif
-
-/*
- * Mark already held read lock as blocking. Can be nested in write lock by the
- * same thread.
- *
- * Use when there are potentially long operations ahead so other thread waiting
- * on the lock will not actively spin but sleep instead.
- *
- * The rwlock is released and blocking reader counter is increased.
- */
-void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
-{
- trace_btrfs_set_lock_blocking_read(eb);
- /*
- * No lock is required. The lock owner may change if we have a read
- * lock, but it won't change to or away from us. If we have the write
- * lock, we are the owner and it'll never change.
- */
- if (eb->lock_recursed && current->pid == eb->lock_owner)
- return;
- btrfs_assert_tree_read_locked(eb);
- atomic_inc(&eb->blocking_readers);
- btrfs_assert_spinning_readers_put(eb);
- read_unlock(&eb->lock);
-}
-
-/*
- * Mark already held write lock as blocking.
- *
- * Use when there are potentially long operations ahead so other threads
- * waiting on the lock will not actively spin but sleep instead.
*
- * The rwlock is released and blocking writers is set.
+ * The rwsem implementation does opportunistic spinning which reduces the
+ * number of times the locking task needs to sleep.
*/
-void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
-{
- trace_btrfs_set_lock_blocking_write(eb);
- /*
- * No lock is required. The lock owner may change if we have a read
- * lock, but it won't change to or away from us. If we have the write
- * lock, we are the owner and it'll never change.
- */
- if (eb->lock_recursed && current->pid == eb->lock_owner)
- return;
- if (eb->blocking_writers == 0) {
- btrfs_assert_spinning_writers_put(eb);
- btrfs_assert_tree_locked(eb);
- WRITE_ONCE(eb->blocking_writers, 1);
- write_unlock(&eb->lock);
- }
-}
/*
- * Lock the extent buffer for read. Wait for any writers (spinning or blocking).
- * Can be nested in write lock by the same thread.
+ * __btrfs_tree_read_lock - lock extent buffer for read
+ * @eb: the eb to be locked
+ * @nest: the nesting level to be used for lockdep
*
- * Use when the locked section does only lightweight actions and busy waiting
- * would be cheaper than making other threads do the wait/wake loop.
- *
- * The rwlock is held upon exit.
+ * This takes the read lock on the extent buffer, using the specified nesting
+ * level for lockdep purposes.
*/
-void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
- bool recurse)
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
u64 start_ns = 0;
if (trace_btrfs_tree_read_lock_enabled())
start_ns = ktime_get_ns();
-again:
- read_lock(&eb->lock);
- BUG_ON(eb->blocking_writers == 0 &&
- current->pid == eb->lock_owner);
- if (eb->blocking_writers) {
- if (current->pid == eb->lock_owner) {
- /*
- * This extent is already write-locked by our thread.
- * We allow an additional read lock to be added because
- * it's for the same thread. btrfs_find_all_roots()
- * depends on this as it may be called on a partly
- * (write-)locked tree.
- */
- WARN_ON(!recurse);
- BUG_ON(eb->lock_recursed);
- eb->lock_recursed = true;
- read_unlock(&eb->lock);
- trace_btrfs_tree_read_lock(eb, start_ns);
- return;
- }
- read_unlock(&eb->lock);
- wait_event(eb->write_lock_wq,
- READ_ONCE(eb->blocking_writers) == 0);
- goto again;
- }
- btrfs_assert_tree_read_locks_get(eb);
- btrfs_assert_spinning_readers_get(eb);
+
+ down_read_nested(&eb->lock, nest);
+ eb->lock_owner = current->pid;
trace_btrfs_tree_read_lock(eb, start_ns);
}
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
- __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
-}
-
-/*
- * Lock extent buffer for read, optimistically expecting that there are no
- * contending blocking writers. If there are, don't wait.
- *
- * Return 1 if the rwlock has been taken, 0 otherwise
- */
-int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
-{
- if (READ_ONCE(eb->blocking_writers))
- return 0;
-
- read_lock(&eb->lock);
- /* Refetch value after lock */
- if (READ_ONCE(eb->blocking_writers)) {
- read_unlock(&eb->lock);
- return 0;
- }
- btrfs_assert_tree_read_locks_get(eb);
- btrfs_assert_spinning_readers_get(eb);
- trace_btrfs_tree_read_lock_atomic(eb);
- return 1;
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
}
/*
- * Try-lock for read. Don't block or wait for contending writers.
+ * Try-lock for read.
*
 * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
- if (READ_ONCE(eb->blocking_writers))
- return 0;
-
- if (!read_trylock(&eb->lock))
- return 0;
-
- /* Refetch value after lock */
- if (READ_ONCE(eb->blocking_writers)) {
- read_unlock(&eb->lock);
- return 0;
+ if (down_read_trylock(&eb->lock)) {
+ eb->lock_owner = current->pid;
+ trace_btrfs_try_tree_read_lock(eb);
+ return 1;
}
- btrfs_assert_tree_read_locks_get(eb);
- btrfs_assert_spinning_readers_get(eb);
- trace_btrfs_try_tree_read_lock(eb);
- return 1;
+ return 0;
}
/*
- * Try-lock for write. May block until the lock is uncontended, but does not
- * wait until it is free.
+ * Try-lock for write.
*
 * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
- if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers))
- return 0;
-
- write_lock(&eb->lock);
- /* Refetch value after lock */
- if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) {
- write_unlock(&eb->lock);
- return 0;
+ if (down_write_trylock(&eb->lock)) {
+ eb->lock_owner = current->pid;
+ trace_btrfs_try_tree_write_lock(eb);
+ return 1;
}
- btrfs_assert_tree_write_locks_get(eb);
- btrfs_assert_spinning_writers_get(eb);
- eb->lock_owner = current->pid;
- trace_btrfs_try_tree_write_lock(eb);
- return 1;
+ return 0;
}
/*
- * Release read lock. Must be used only if the lock is in spinning mode. If
- * the read lock is nested, must pair with read lock before the write unlock.
- *
- * The rwlock is not held upon exit.
+ * Release read lock.
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_read_unlock(eb);
- /*
- * if we're nested, we have the write lock. No new locking
- * is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_recursed
- * field only matters to the lock owner.
- */
- if (eb->lock_recursed && current->pid == eb->lock_owner) {
- eb->lock_recursed = false;
- return;
- }
- btrfs_assert_tree_read_locked(eb);
- btrfs_assert_spinning_readers_put(eb);
- btrfs_assert_tree_read_locks_put(eb);
- read_unlock(&eb->lock);
-}
-
-/*
- * Release read lock, previously set to blocking by a pairing call to
- * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same
- * thread.
- *
- * State of rwlock is unchanged, last reader wakes waiting threads.
- */
-void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
-{
- trace_btrfs_tree_read_unlock_blocking(eb);
- /*
- * if we're nested, we have the write lock. No new locking
- * is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_recursed
- * field only matters to the lock owner.
- */
- if (eb->lock_recursed && current->pid == eb->lock_owner) {
- eb->lock_recursed = false;
- return;
- }
- btrfs_assert_tree_read_locked(eb);
- WARN_ON(atomic_read(&eb->blocking_readers) == 0);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_readers))
- cond_wake_up_nomb(&eb->read_lock_wq);
- btrfs_assert_tree_read_locks_put(eb);
+ eb->lock_owner = 0;
+ up_read(&eb->lock);
}
/*
- * Lock for write. Wait for all blocking and spinning readers and writers. This
- * starts context where reader lock could be nested by the same thread.
+ * __btrfs_tree_lock - lock eb for write
+ * @eb: the eb to lock
+ * @nest: the nesting to use for the lock
*
- * The rwlock is held for write upon exit.
+ * Returns with the eb->lock write locked.
*/
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
@@ -424,19 +109,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
if (trace_btrfs_tree_lock_enabled())
start_ns = ktime_get_ns();
- WARN_ON(eb->lock_owner == current->pid);
-again:
- wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
- wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0);
- write_lock(&eb->lock);
- /* Refetch value after lock */
- if (atomic_read(&eb->blocking_readers) ||
- READ_ONCE(eb->blocking_writers)) {
- write_unlock(&eb->lock);
- goto again;
- }
- btrfs_assert_spinning_writers_get(eb);
- btrfs_assert_tree_write_locks_get(eb);
+ down_write_nested(&eb->lock, nest);
eb->lock_owner = current->pid;
trace_btrfs_tree_lock(eb, start_ns);
}
@@ -447,68 +120,13 @@ void btrfs_tree_lock(struct extent_buffer *eb)
}
/*
- * Release the write lock, either blocking or spinning (ie. there's no need
- * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
- * This also ends the context for nesting, the read lock must have been
- * released already.
- *
- * Tasks blocked and waiting are woken, rwlock is not held upon exit.
+ * Release the write lock.
*/
void btrfs_tree_unlock(struct extent_buffer *eb)
{
- /*
- * This is read both locked and unlocked but always by the same thread
- * that already owns the lock so we don't need to use READ_ONCE
- */
- int blockers = eb->blocking_writers;
-
- BUG_ON(blockers > 1);
-
- btrfs_assert_tree_locked(eb);
trace_btrfs_tree_unlock(eb);
eb->lock_owner = 0;
- btrfs_assert_tree_write_locks_put(eb);
-
- if (blockers) {
- btrfs_assert_no_spinning_writers(eb);
- /* Unlocked write */
- WRITE_ONCE(eb->blocking_writers, 0);
- /*
- * We need to order modifying blocking_writers above with
- * actually waking up the sleepers to ensure they see the
- * updated value of blocking_writers
- */
- cond_wake_up(&eb->write_lock_wq);
- } else {
- btrfs_assert_spinning_writers_put(eb);
- write_unlock(&eb->lock);
- }
-}
-
-/*
- * Set all locked nodes in the path to blocking locks. This should be done
- * before scheduling
- */
-void btrfs_set_path_blocking(struct btrfs_path *p)
-{
- int i;
-
- for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- if (!p->nodes[i] || !p->locks[i])
- continue;
- /*
- * If we currently have a spinning reader or writer lock this
- * will bump the count of blocking holders and drop the
- * spinlock.
- */
- if (p->locks[i] == BTRFS_READ_LOCK) {
- btrfs_set_lock_blocking_read(p->nodes[i]);
- p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
- btrfs_set_lock_blocking_write(p->nodes[i]);
- p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
- }
- }
+ up_write(&eb->lock);
}
/*
@@ -564,14 +182,13 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
*
* Return: root extent buffer with read lock held
*/
-struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
- bool recurse)
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
- __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse);
+ btrfs_tree_read_lock(eb);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
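
For existing callers, the switch to a rw_semaphore mostly deletes code: the spinning/blocking split is gone, so the blocking conversions and the blocking unlock disappear. A before/after sketch of a hypothetical read-side caller (not taken from this patch):

	/* Before: spin, convert to blocking before heavy work, blocking unlock. */
	btrfs_tree_read_lock(eb);
	btrfs_set_lock_blocking_read(eb);
	/* ... IO, memory allocations ... */
	btrfs_tree_read_unlock_blocking(eb);

	/* After: one sleepable lock/unlock pair around the same work. */
	btrfs_tree_read_lock(eb);
	/* ... IO, memory allocations ... */
	btrfs_tree_read_unlock(eb);
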
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 3ea81ed3320b..a2e1f1f5c6e3 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -13,8 +13,6 @@
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
-#define BTRFS_WRITE_LOCK_BLOCKING 3
-#define BTRFS_READ_LOCK_BLOCKING 4
/*
* We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
@@ -89,42 +87,28 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
-void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
- bool recurse);
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
-void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
-int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
-struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
- bool recurse);
-
-static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
-{
- return __btrfs_read_lock_root_node(root, false);
-}
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
- BUG_ON(!eb->write_locks);
+ lockdep_assert_held(&eb->lock);
}
#else
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
#endif
-void btrfs_set_path_blocking(struct btrfs_path *p);
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
{
- if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
+ if (rw == BTRFS_WRITE_LOCK)
btrfs_tree_unlock(eb);
- else if (rw == BTRFS_READ_LOCK_BLOCKING)
- btrfs_tree_read_unlock_blocking(eb);
else if (rw == BTRFS_READ_LOCK)
btrfs_tree_read_unlock(eb);
else
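
With the BLOCKING states removed, a path's lock tracking collapses to two values, which is what the simplified btrfs_tree_unlock_rw() above relies on. A small usage sketch, assuming the usual btrfs_path layout where locks[] records the lock type per level:

	/* Drop whatever lock is recorded for a given level of the path. */
	if (path->locks[level]) {
		btrfs_tree_unlock_rw(path->nodes[level], path->locks[level]);
		path->locks[level] = 0;
	}
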
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 87bac9ecdf4c..985a21558437 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -199,14 +199,21 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = ret;
- if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
- set_bit(type, &entry->flags);
+ entry->physical = (u64)-1;
+ entry->disk = NULL;
+ entry->partno = (u8)-1;
- if (dio) {
- percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes,
- fs_info->delalloc_batch);
+ ASSERT(type == BTRFS_ORDERED_REGULAR ||
+ type == BTRFS_ORDERED_NOCOW ||
+ type == BTRFS_ORDERED_PREALLOC ||
+ type == BTRFS_ORDERED_COMPRESSED);
+ set_bit(type, &entry->flags);
+
+ percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
+ fs_info->delalloc_batch);
+
+ if (dio)
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
- }
/* one ref for the tree */
refcount_set(&entry->refs, 1);
@@ -256,6 +263,9 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type)
{
+ ASSERT(type == BTRFS_ORDERED_REGULAR ||
+ type == BTRFS_ORDERED_NOCOW ||
+ type == BTRFS_ORDERED_PREALLOC);
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
num_bytes, disk_num_bytes, type, 0,
BTRFS_COMPRESS_NONE);
@@ -265,6 +275,9 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type)
{
+ ASSERT(type == BTRFS_ORDERED_REGULAR ||
+ type == BTRFS_ORDERED_NOCOW ||
+ type == BTRFS_ORDERED_PREALLOC);
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
num_bytes, disk_num_bytes, type, 1,
BTRFS_COMPRESS_NONE);
@@ -272,11 +285,12 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type,
- int compress_type)
+ u64 disk_num_bytes, int compress_type)
{
+ ASSERT(compress_type != BTRFS_COMPRESS_NONE);
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 0,
+ num_bytes, disk_num_bytes,
+ BTRFS_ORDERED_COMPRESSED, 0,
compress_type);
}
@@ -297,26 +311,33 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
}
/*
- * this is used to account for finished IO across a given range
- * of the file. The IO may span ordered extents. If
- * a given ordered_extent is completely done, 1 is returned, otherwise
- * 0.
+ * Finish IO for one ordered extent across a given range. The range can
+ * contain several ordered extents.
*
- * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
- * to make sure this function only returns 1 once for a given ordered extent.
+ * @found_ret: Return the finished ordered extent
+ * @file_offset: File offset for the finished IO
+ * Will also be updated to one byte past the range that is
+ * recorded as finished. This allows the caller to walk forward.
+ * @io_size: Length of the finished IO range
+ * @uptodate: If the IO finished without problem
*
- * file_offset is updated to one byte past the range that is recorded as
- * complete. This allows you to walk forward in the file.
+ * Return true if any ordered extent is finished in the range, and update
+ * @found_ret and @file_offset.
+ * Return false otherwise.
+ *
+ * NOTE: Although the range can cross multiple ordered extents, only one
+ * ordered extent will be updated during one call. The caller is responsible
+ * for iterating over all ordered extents in the range.
*/
-int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **cached,
+bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **finished_ret,
u64 *file_offset, u64 io_size, int uptodate)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- int ret;
+ bool finished = false;
unsigned long flags;
u64 dec_end;
u64 dec_start;
@@ -324,16 +345,12 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
- if (!node) {
- ret = 1;
+ if (!node)
goto out;
- }
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!offset_in_entry(entry, *file_offset)) {
- ret = 1;
+ if (!offset_in_entry(entry, *file_offset))
goto out;
- }
dec_start = max(*file_offset, entry->file_offset);
dec_end = min(*file_offset + io_size,
@@ -354,39 +371,50 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
if (entry->bytes_left == 0) {
- ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Ensure only one caller can set the flag and finished_ret
+ * accordingly
+ */
+ finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
/* test_and_set_bit implies a barrier */
cond_wake_up_nomb(&entry->wait);
- } else {
- ret = 1;
}
out:
- if (!ret && cached && entry) {
- *cached = entry;
+ if (finished && finished_ret && entry) {
+ *finished_ret = entry;
refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
- return ret == 0;
+ return finished;
}
/*
- * this is used to account for finished IO across a given range
- * of the file. The IO should not span ordered extents. If
- * a given ordered_extent is completely done, 1 is returned, otherwise
- * 0.
+ * Finish IO for one ordered extent across a given range. The range can only
+ * contain one ordered extent.
*
- * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
- * to make sure this function only returns 1 once for a given ordered extent.
+ * @cached: The cached ordered extent. If not NULL, we can skip the tree
+ * search and use the ordered extent directly.
+ * Will also be used to store the finished ordered extent.
+ * @file_offset: File offset for the finished IO
+ * @io_size: Length of the finished IO range
+ * @uptodate: If the IO finishes without problem
+ *
+ * Return true if the ordered extent is finished in the range, and update
+ * @cached.
+ * Return false otherwise.
+ *
+ * NOTE: The range can NOT cross multiple ordered extents.
+ * Thus the caller should ensure the range doesn't cross ordered extents.
*/
-int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size, int uptodate)
+bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
- int ret;
+ bool finished = false;
spin_lock_irqsave(&tree->lock, flags);
if (cached && *cached) {
@@ -395,41 +423,39 @@ int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
}
node = tree_search(tree, file_offset);
- if (!node) {
- ret = 1;
+ if (!node)
goto out;
- }
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
- if (!offset_in_entry(entry, file_offset)) {
- ret = 1;
+ if (!offset_in_entry(entry, file_offset))
goto out;
- }
- if (io_size > entry->bytes_left) {
+ if (io_size > entry->bytes_left)
btrfs_crit(inode->root->fs_info,
"bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
- }
+
entry->bytes_left -= io_size;
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
if (entry->bytes_left == 0) {
- ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Ensure only one caller can set the flag and finished_ret
+ * accordingly
+ */
+ finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
/* test_and_set_bit implies a barrier */
cond_wake_up_nomb(&entry->wait);
- } else {
- ret = 1;
}
out:
- if (!ret && cached && entry) {
+ if (finished && cached && entry) {
*cached = entry;
refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
- return ret == 0;
+ return finished;
}
/*
@@ -480,9 +506,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
false);
- if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes,
- fs_info->delalloc_batch);
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
+ fs_info->delalloc_batch);
tree = &btrfs_inode->ordered_tree;
spin_lock_irq(&tree->lock);
@@ -745,9 +770,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
tree = &inode->ordered_tree;
- spin_lock_irq(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -758,7 +784,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
if (entry)
refcount_inc(&entry->refs);
out:
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return entry;
}
@@ -855,51 +881,6 @@ out:
}
/*
- * search the ordered extents for one corresponding to 'offset' and
- * try to find a checksum. This is used because we allow pages to
- * be reclaimed before their checksum is actually put into the btree
- */
-int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
- u64 disk_bytenr, u8 *sum, int len)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_sum *ordered_sum;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
- unsigned long num_sectors;
- unsigned long i;
- u32 sectorsize = btrfs_inode_sectorsize(inode);
- const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits;
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int index = 0;
-
- ordered = btrfs_lookup_ordered_extent(inode, offset);
- if (!ordered)
- return 0;
-
- spin_lock_irq(&tree->lock);
- list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
- if (disk_bytenr >= ordered_sum->bytenr &&
- disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
- i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits;
- num_sectors = ordered_sum->len >> blocksize_bits;
- num_sectors = min_t(int, len - index, num_sectors - i);
- memcpy(sum + index, ordered_sum->sums + i * csum_size,
- num_sectors * csum_size);
-
- index += (int)num_sectors * csum_size;
- if (index == len)
- goto out;
- disk_bytenr += num_sectors * sectorsize;
- }
- }
-out:
- spin_unlock_irq(&tree->lock);
- btrfs_put_ordered_extent(ordered);
- return index;
-}
-
-/*
* btrfs_flush_ordered_range - Lock the passed range and ensures all pending
* ordered extents in it are run to completion.
*
@@ -943,6 +924,84 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
}
}
+static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
+ u64 len)
+{
+ struct inode *inode = ordered->inode;
+ u64 file_offset = ordered->file_offset + pos;
+ u64 disk_bytenr = ordered->disk_bytenr + pos;
+ u64 num_bytes = len;
+ u64 disk_num_bytes = len;
+ int type;
+ unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT);
+ int compress_type = ordered->compress_type;
+ unsigned long weight;
+ int ret;
+
+ weight = hweight_long(flags_masked);
+ WARN_ON_ONCE(weight > 1);
+ if (!weight)
+ type = 0;
+ else
+ type = __ffs(flags_masked);
+
+ if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
+ WARN_ON_ONCE(1);
+ ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode),
+ file_offset, disk_bytenr, num_bytes,
+ disk_num_bytes, compress_type);
+ } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+ ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset,
+ disk_bytenr, num_bytes, disk_num_bytes, type);
+ } else {
+ ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset,
+ disk_bytenr, num_bytes, disk_num_bytes, type);
+ }
+
+ return ret;
+}
+
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
+ u64 post)
+{
+ struct inode *inode = ordered->inode;
+ struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct rb_node *node;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ int ret = 0;
+
+ spin_lock_irq(&tree->lock);
+ /* Remove from tree once */
+ node = &ordered->rb_node;
+ rb_erase(node, &tree->tree);
+ RB_CLEAR_NODE(node);
+ if (tree->last == node)
+ tree->last = NULL;
+
+ ordered->file_offset += pre;
+ ordered->disk_bytenr += pre;
+ ordered->num_bytes -= (pre + post);
+ ordered->disk_num_bytes -= (pre + post);
+ ordered->bytes_left -= (pre + post);
+
+ /* Re-insert the node */
+ node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
+ if (node)
+ btrfs_panic(fs_info, -EEXIST,
+ "zoned: inconsistency in ordered tree at offset %llu",
+ ordered->file_offset);
+
+ spin_unlock_irq(&tree->lock);
+
+ if (pre)
+ ret = clone_ordered_extent(ordered, 0, pre);
+ if (post)
+ ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
+ post);
+
+ return ret;
+}
+
int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
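
The new btrfs_split_ordered_extent() above trims an ordered extent down to its middle part and re-adds the leading and trailing remainders as clones. A hypothetical caller that carves an ordered extent around the range a bio actually covered might compute the pre/post sizes like this (illustration only, not code from this patch):

	/* Hypothetical: split @ordered so that only
	 * [written_start, written_start + written_len) remains in it. */
	static int example_split_around_written(struct btrfs_ordered_extent *ordered,
						u64 written_start, u64 written_len)
	{
		u64 start = ordered->file_offset;
		u64 end = start + ordered->num_bytes;
		u64 pre = written_start - start;
		u64 post = end - (written_start + written_len);

		/* The pre and post tails are cloned back into the tree. */
		return btrfs_split_ordered_extent(ordered, pre, post);
	}
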
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c3a2325e64a4..99e0853e4d3b 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -27,7 +27,7 @@ struct btrfs_ordered_sum {
};
/*
- * bits for the flags field:
+ * Bits for btrfs_ordered_extent::flags.
*
* BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
* It is used to make sure metadata is inserted into the tree only once
@@ -38,24 +38,36 @@ struct btrfs_ordered_sum {
* IO is done and any metadata is inserted into the tree.
*/
enum {
+ /*
+ * Different types for ordered extents, one and only one of the 4 types can
+ * be set when creating an ordered extent.
+ *
+ * REGULAR: For regular non-compressed COW write
+ * NOCOW: For NOCOW write into existing non-hole extent
+ * PREALLOC: For NOCOW write into preallocated extent
+ * COMPRESSED: For compressed COW write
+ */
+ BTRFS_ORDERED_REGULAR,
+ BTRFS_ORDERED_NOCOW,
+ BTRFS_ORDERED_PREALLOC,
+ BTRFS_ORDERED_COMPRESSED,
+
+ /*
+ * Extra bit for direct io, can only be set for
+ * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent.
+ */
+ BTRFS_ORDERED_DIRECT,
+
+ /* Extra status bits for ordered extents */
+
/* set when all the pages are written */
BTRFS_ORDERED_IO_DONE,
/* set when removed from the tree */
BTRFS_ORDERED_COMPLETE,
- /* set when we want to write in place */
- BTRFS_ORDERED_NOCOW,
- /* writing a zlib compressed extent */
- BTRFS_ORDERED_COMPRESSED,
- /* set when writing to preallocated extent */
- BTRFS_ORDERED_PREALLOC,
- /* set when we're doing DIO with this extent */
- BTRFS_ORDERED_DIRECT,
/* We had an io error when writing this out */
BTRFS_ORDERED_IOERR,
/* Set when we have to truncate an extent */
BTRFS_ORDERED_TRUNCATED,
- /* Regular IO for COW */
- BTRFS_ORDERED_REGULAR,
/* Used during fsync to track already logged extents */
BTRFS_ORDERED_LOGGED,
/* We have already logged all the csums of the ordered extent */
@@ -127,6 +139,14 @@ struct btrfs_ordered_extent {
struct completion completion;
struct btrfs_work flush_work;
struct list_head work_list;
+
+ /*
+ * Used to reverse-map physical address returned from ZONE_APPEND write
+ * command in a workqueue context
+ */
+ u64 physical;
+ struct gendisk *disk;
+ u8 partno;
};
/*
@@ -137,9 +157,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
unsigned long bytes)
{
int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
- int csum_size = btrfs_super_csum_size(fs_info->super_copy);
- return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size;
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
}
static inline void
@@ -153,11 +172,11 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size, int uptodate);
-int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **cached,
+bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size, int uptodate);
+bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **finished_ret,
u64 *file_offset, u64 io_size,
int uptodate);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
@@ -168,8 +187,7 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_num_bytes, int type);
int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type,
- int compress_type);
+ u64 disk_num_bytes, int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
@@ -184,8 +202,6 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
u64 len);
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct list_head *list);
-int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
- u64 disk_bytenr, u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
@@ -193,6 +209,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
+ u64 post);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
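
As a worked example of the btrfs_ordered_sum_size() change above, with a 4 KiB sectorsize and a 4-byte crc32c checksum (values assumed for illustration; the size now comes from fs_info->csum_size instead of the superblock copy), a 16 KiB range gives:

	/*
	 * num_sectors = DIV_ROUND_UP(16384, 4096) = 4
	 * size = sizeof(struct btrfs_ordered_sum) + 4 * 4 bytes
	 */
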
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 7695c4783d33..aae1027bd76a 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -26,22 +26,22 @@ static const struct root_name_map root_map[] = {
{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
};
-const char *btrfs_root_name(u64 objectid, char *buf)
+const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
{
int i;
- if (objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (key->objectid == BTRFS_TREE_RELOC_OBJECTID) {
snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN,
- "TREE_RELOC offset=%llu", objectid);
+ "TREE_RELOC offset=%llu", key->offset);
return buf;
}
for (i = 0; i < ARRAY_SIZE(root_map); i++) {
- if (root_map[i].id == objectid)
+ if (root_map[i].id == key->objectid)
return root_map[i].name;
}
- snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid);
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", key->objectid);
return buf;
}
@@ -177,8 +177,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
__le64 subvol_id;
read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id));
- pr_info("\t\tsubvol_id %llu\n",
- (unsigned long long)le64_to_cpu(subvol_id));
+ pr_info("\t\tsubvol_id %llu\n", le64_to_cpu(subvol_id));
item_size -= sizeof(u64);
offset += sizeof(u64);
}
@@ -191,15 +190,8 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
static void print_eb_refs_lock(struct extent_buffer *eb)
{
#ifdef CONFIG_BTRFS_DEBUG
- btrfs_info(eb->fs_info,
-"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u",
- atomic_read(&eb->refs), eb->write_locks,
- atomic_read(&eb->read_locks),
- eb->blocking_writers,
- atomic_read(&eb->blocking_readers),
- eb->spinning_writers,
- atomic_read(&eb->spinning_readers),
- eb->lock_owner, current->pid);
+ btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u",
+ atomic_read(&eb->refs), eb->lock_owner, current->pid);
#endif
}
@@ -398,6 +390,7 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
btrfs_node_key_to_cpu(c, &first_key, i);
next = read_tree_block(fs_info, btrfs_node_blockptr(c, i),
+ btrfs_header_owner(c),
btrfs_node_ptr_generation(c, i),
level - 1, &first_key);
if (IS_ERR(next)) {
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 78b99385a503..8c3e9319ec4e 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -11,6 +11,6 @@
void btrfs_print_leaf(struct extent_buffer *l);
void btrfs_print_tree(struct extent_buffer *c, bool follow);
-const char *btrfs_root_name(u64 objectid, char *buf);
+const char *btrfs_root_name(const struct btrfs_key *key, char *buf);
#endif
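
With btrfs_root_name() now taking the whole key, a reloc tree is identified by its key offset rather than its (constant) objectid. An illustrative call, using assumed values:

	char buf[BTRFS_ROOT_NAME_BUF_LEN];
	struct btrfs_key key = {
		.objectid = BTRFS_TREE_RELOC_OBJECTID,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = 257,	/* subvolume id the reloc tree shadows */
	};

	pr_info("root %s\n", btrfs_root_name(&key, buf));
	/* prints: root TREE_RELOC offset=257 */
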
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 580899bdb991..808370ada888 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "transaction.h"
@@ -497,13 +498,13 @@ next2:
break;
}
out:
+ btrfs_free_path(path);
fs_info->qgroup_flags |= flags;
if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
ret >= 0)
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
- btrfs_free_path(path);
if (ret < 0) {
ulist_free(fs_info->qgroup_ulist);
@@ -893,8 +894,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
key.objectid = 0;
key.offset = 0;
key.type = 0;
@@ -936,6 +935,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_key found_key;
struct btrfs_qgroup *qgroup = NULL;
struct btrfs_trans_handle *trans = NULL;
+ struct ulist *ulist = NULL;
int ret = 0;
int slot;
@@ -943,8 +943,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
if (fs_info->quota_root)
goto out;
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
+ ulist = ulist_alloc(GFP_KERNEL);
+ if (!ulist) {
ret = -ENOMEM;
goto out;
}
@@ -952,6 +952,22 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
ret = btrfs_sysfs_add_qgroups(fs_info);
if (ret < 0)
goto out;
+
+ /*
+ * Unlock qgroup_ioctl_lock before starting the transaction. This is to
+ * avoid lock acquisition inversion problems (reported by lockdep) between
+ * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
+ * start a transaction.
+ * After we start the transaction, lock qgroup_ioctl_lock again and
+ * check if someone else created the quota root in the meanwhile. If so,
+ * just return success and release the transaction handle.
+ *
+ * Also we don't need to worry about someone else calling
+ * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
+ * that function returns 0 (success) when the sysfs entries already exist.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
@@ -961,12 +977,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
* would be a lot of overkill.
*/
trans = btrfs_start_transaction(tree_root, 2);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
+ if (fs_info->quota_root)
+ goto out;
+
+ fs_info->qgroup_ulist = ulist;
+ ulist = NULL;
+
/*
* initially create the quota tree
*/
@@ -1026,6 +1050,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_ROOT_REF_KEY) {
+
+ /* Release locks on tree_root before we access quota_root */
+ btrfs_release_path(path);
+
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
if (ret) {
@@ -1044,6 +1072,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ ret = btrfs_search_slot_for_read(tree_root, &found_key,
+ path, 1, 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ if (ret > 0) {
+ /*
+ * Shouldn't happen, but in case it does we
+ * don't need to do the btrfs_next_item, just
+ * continue.
+ */
+ continue;
+ }
}
ret = btrfs_next_item(tree_root, path);
if (ret < 0) {
@@ -1106,11 +1148,14 @@ out:
if (ret) {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
- if (trans)
- btrfs_end_transaction(trans);
btrfs_sysfs_del_qgroups(fs_info);
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+ ulist_free(ulist);
return ret;
}
@@ -1123,19 +1168,29 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
* btrfs_clean_quota_tree but this is not done.
+ *
+ * Also, we must always start a transaction without holding the mutex
+ * qgroup_ioctl_lock, see btrfs_quota_enable().
*/
trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ trans = NULL;
goto out;
}
+ if (!fs_info->quota_root)
+ goto out;
+
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
spin_lock(&fs_info->qgroup_lock);
@@ -1149,13 +1204,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
ret = btrfs_clean_quota_tree(trans, quota_root);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto end_trans;
+ goto out;
}
ret = btrfs_del_root(trans, &quota_root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto end_trans;
+ goto out;
}
list_del(&quota_root->dirty_list);
@@ -1167,10 +1222,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_put_root(quota_root);
-end_trans:
- ret = btrfs_end_transaction(trans);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+
return ret;
}
@@ -1306,13 +1364,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
struct ulist *tmp;
+ unsigned int nofs_flag;
int ret = 0;
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
+ /* We hold a transaction handle open, must do a NOFS allocation. */
+ nofs_flag = memalloc_nofs_save();
tmp = ulist_alloc(GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!tmp)
return -ENOMEM;
@@ -1369,10 +1431,14 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup_list *list;
struct ulist *tmp;
bool found = false;
+ unsigned int nofs_flag;
int ret = 0;
int ret2;
+ /* We hold a transaction handle open, must do a NOFS allocation. */
+ nofs_flag = memalloc_nofs_save();
tmp = ulist_alloc(GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!tmp)
return -ENOMEM;
@@ -1876,34 +1942,22 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
struct btrfs_key dst_key;
if (src_path->nodes[cur_level] == NULL) {
- struct btrfs_key first_key;
struct extent_buffer *eb;
int parent_slot;
- u64 child_gen;
- u64 child_bytenr;
eb = src_path->nodes[cur_level + 1];
parent_slot = src_path->slots[cur_level + 1];
- child_bytenr = btrfs_node_blockptr(eb, parent_slot);
- child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
- eb = read_tree_block(fs_info, child_bytenr, child_gen,
- cur_level, &first_key);
+ eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- ret = -EIO;
- goto out;
}
src_path->nodes[cur_level] = eb;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ src_path->locks[cur_level] = BTRFS_READ_LOCK;
}
src_path->slots[cur_level] = dst_path->slots[cur_level];
@@ -1998,10 +2052,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
/* Read the tree block if needed */
if (dst_path->nodes[cur_level] == NULL) {
- struct btrfs_key first_key;
int parent_slot;
u64 child_gen;
- u64 child_bytenr;
/*
* dst_path->nodes[root_level] must be initialized before
@@ -2020,31 +2072,23 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
*/
eb = dst_path->nodes[cur_level + 1];
parent_slot = dst_path->slots[cur_level + 1];
- child_bytenr = btrfs_node_blockptr(eb, parent_slot);
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
/* This node is old, no need to trace */
if (child_gen < last_snapshot)
goto out;
- eb = read_tree_block(fs_info, child_bytenr, child_gen,
- cur_level, &first_key);
+ eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- ret = -EIO;
- goto out;
}
dst_path->nodes[cur_level] = eb;
dst_path->slots[cur_level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ dst_path->locks[cur_level] = BTRFS_READ_LOCK;
need_cleanup = true;
}
@@ -2188,38 +2232,28 @@ walk_down:
level = root_level;
while (level >= 0) {
if (path->nodes[level] == NULL) {
- struct btrfs_key first_key;
int parent_slot;
- u64 child_gen;
u64 child_bytenr;
/*
- * We need to get child blockptr/gen from parent before
- * we can read it.
+ * We need to get child blockptr from parent before we
+ * can read it.
*/
eb = path->nodes[level + 1];
parent_slot = path->slots[level + 1];
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
- child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
- eb = read_tree_block(fs_info, child_bytenr, child_gen,
- level, &first_key);
+ eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- ret = -EIO;
- goto out;
}
path->nodes[level] = eb;
path->slots[level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_READ_LOCK;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
fs_info->nodesize,
@@ -3156,6 +3190,12 @@ out:
return ret;
}
+static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_fs_closing(fs_info) ||
+ test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
{
struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
@@ -3164,6 +3204,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
struct btrfs_trans_handle *trans = NULL;
int err = -ENOMEM;
int ret = 0;
+ bool stopped = false;
path = btrfs_alloc_path();
if (!path)
@@ -3176,7 +3217,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path->skip_locking = 1;
err = 0;
- while (!err && !btrfs_fs_closing(fs_info)) {
+ while (!err && !(stopped = rescan_should_stop(fs_info))) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
@@ -3219,7 +3260,7 @@ out:
}
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (!btrfs_fs_closing(fs_info))
+ if (!stopped)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (trans) {
ret = update_qgroup_status_item(trans);
@@ -3238,7 +3279,7 @@ out:
btrfs_end_transaction(trans);
- if (btrfs_fs_closing(fs_info)) {
+ if (stopped) {
btrfs_info(fs_info, "qgroup scan paused");
} else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
@@ -3417,24 +3458,20 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
{
struct rb_node *node;
struct rb_node *next;
- struct ulist_node *entry = NULL;
+ struct ulist_node *entry;
int ret = 0;
node = reserved->range_changed.root.rb_node;
+ if (!node)
+ return 0;
while (node) {
entry = rb_entry(node, struct ulist_node, rb_node);
if (entry->val < start)
node = node->rb_right;
- else if (entry)
- node = node->rb_left;
else
- break;
+ node = node->rb_left;
}
- /* Empty changeset */
- if (!entry)
- return 0;
-
if (entry->val > start && rb_prev(&entry->rb_node))
entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
rb_node);
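The corrected walk above is a standard rb-tree descent that keeps the last node visited and then steps back one entry if it overshot the requested start offset. A self-contained sketch of the same walk, assuming struct ulist_node (with its val and rb_node members) as the element type; the function name is made up for illustration:

#include <linux/rbtree.h>
#include <linux/types.h>
#include "ulist.h"

/* Hypothetical helper: first node whose range may overlap @start. */
static struct ulist_node *find_first_candidate(struct rb_root *root, u64 start)
{
	struct rb_node *node = root->rb_node;
	struct ulist_node *entry = NULL;

	/* Empty changeset, nothing to unreserve. */
	if (!node)
		return NULL;

	/* Descend, remembering the last node visited. */
	while (node) {
		entry = rb_entry(node, struct ulist_node, rb_node);
		if (entry->val < start)
			node = node->rb_right;
		else
			node = node->rb_left;
	}

	/* If the descent overshot @start, step back one entry. */
	if (entry->val > start && rb_prev(&entry->rb_node))
		entry = rb_entry(rb_prev(&entry->rb_node),
				 struct ulist_node, rb_node);

	return entry;
}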
@@ -3498,12 +3535,37 @@ static int try_flush_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
int ret;
+ bool can_commit = true;
+
+ /*
+ * If current process holds a transaction, we shouldn't flush, as we
+ * assume all space reservation happens before a transaction handle is
+ * held.
+ *
+ * But there are cases like btrfs_delayed_item_reserve_metadata() where
+ * we try to reserve space with one transaction handle already held.
+ * In that case we can't commit transaction, but at least try to end it
+ * and hope the started data writes can free some space.
+ */
+ if (current->journal_info &&
+ current->journal_info != BTRFS_SEND_TRANS_STUB)
+ can_commit = false;
/*
* We don't want to run flush again and again, so if there is a running
* one, we won't try to start a new flush, but exit directly.
*/
if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
+ /*
+ * We already hold a transaction handle, which can block the other
+ * thread's flush from completing, so exit right now instead of
+ * waiting. This increases the chance of EDQUOT for heavy-load and
+ * near-limit cases, but if we are already near the limit, EDQUOT is
+ * unavoidable anyway.
+ */
+ if (!can_commit)
+ return 0;
+
wait_event(root->qgroup_flush_wait,
!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
return 0;
@@ -3520,7 +3582,10 @@ static int try_flush_qgroup(struct btrfs_root *root)
goto out;
}
- ret = btrfs_commit_transaction(trans);
+ if (can_commit)
+ ret = btrfs_commit_transaction(trans);
+ else
+ ret = btrfs_end_transaction(trans);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
wake_up(&root->qgroup_flush_wait);
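The decision above hinges on whether the current task already owns a transaction handle, which btrfs tracks in current->journal_info (send installs the BTRFS_SEND_TRANS_STUB sentinel there, which is not a real handle). A compact sketch of that predicate; the function name is hypothetical:

#include <linux/sched.h>
#include <linux/types.h>
#include "transaction.h"

/* Hypothetical predicate: does this task hold a real transaction handle? */
static bool task_holds_btrfs_trans_handle(void)
{
	/*
	 * current->journal_info carries the btrfs_trans_handle of the
	 * running task; BTRFS_SEND_TRANS_STUB is a placeholder used by
	 * send and must not be treated as a handle we could commit under.
	 */
	return current->journal_info &&
	       current->journal_info != BTRFS_SEND_TRANS_STUB;
}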
@@ -4160,7 +4225,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
spin_unlock(&blocks->lock);
/* Read out reloc subtree root */
- reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
block->reloc_generation, block->level,
&block->first_key);
if (IS_ERR(reloc_eb)) {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 255490f42b5d..8ec34ecb6d68 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -233,8 +233,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
}
x = cmpxchg(&info->stripe_hash_table, NULL, table);
- if (x)
- kvfree(x);
+ kvfree(x);
return 0;
}
@@ -1097,7 +1096,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
/* see if we can add this page onto our existing bio */
if (last) {
- u64 last_end = (u64)last->bi_iter.bi_sector << 9;
+ u64 last_end = last->bi_iter.bi_sector << 9;
last_end += last->bi_iter.bi_size;
/*
@@ -1105,8 +1104,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
* devices or if they are not contiguous
*/
if (last_end == disk_start && !last->bi_status &&
- last->bi_disk == stripe->dev->bdev->bd_disk &&
- last->bi_partno == stripe->dev->bdev->bd_partno) {
+ last->bi_bdev == stripe->dev->bdev) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
if (ret == PAGE_SIZE)
return 0;
@@ -1163,7 +1161,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
struct bvec_iter iter;
int i = 0;
- start = (u64)bio->bi_iter.bi_sector << 9;
+ start = bio->bi_iter.bi_sector << 9;
stripe_offset = start - rbio->bbio->raid_map[0];
page_index = stripe_offset >> PAGE_SHIFT;
@@ -1357,9 +1355,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->bbio->num_stripes; i++) {
stripe = &rbio->bbio->stripes[i];
if (in_range(physical, stripe->physical, rbio->stripe_len) &&
- stripe->dev->bdev &&
- bio->bi_disk == stripe->dev->bdev->bd_disk &&
- bio->bi_partno == stripe->dev->bdev->bd_partno) {
+ stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
}
}
@@ -1374,7 +1370,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
- u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+ u64 logical = bio->bi_iter.bi_sector << 9;
int i;
for (i = 0; i < rbio->nr_data; i++) {
@@ -2150,7 +2146,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
if (rbio->faila == -1) {
btrfs_warn(fs_info,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
- __func__, (u64)bio->bi_iter.bi_sector << 9,
+ __func__, bio->bi_iter.bi_sector << 9,
(u64)bio->bi_iter.bi_size, bbio->map_type);
if (generic_io)
btrfs_put_bbio(bbio);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 9d4f5316a7e8..20fd4aa48a8c 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -52,6 +52,7 @@ struct reada_extctl {
struct reada_extent {
u64 logical;
+ u64 owner_root;
struct btrfs_key top;
struct list_head extctl;
int refcnt;
@@ -59,6 +60,7 @@ struct reada_extent {
struct reada_zone *zones[BTRFS_MAX_MIRRORS];
int nzones;
int scheduled;
+ int level;
};
struct reada_zone {
@@ -87,7 +89,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info);
static void __reada_start_machine(struct btrfs_fs_info *fs_info);
static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, u64 generation);
+ struct btrfs_key *top, u64 owner_root,
+ u64 generation, int level);
/* recurses */
/* in case of err, eb might be NULL */
@@ -165,7 +168,9 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info,
if (rec->generation == generation &&
btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
- reada_add_block(rc, bytenr, &next_key, n_gen);
+ reada_add_block(rc, bytenr, &next_key,
+ btrfs_header_owner(eb), n_gen,
+ btrfs_header_level(eb) - 1);
}
}
@@ -298,7 +303,8 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
u64 logical,
- struct btrfs_key *top)
+ struct btrfs_key *top,
+ u64 owner_root, int level)
{
int ret;
struct reada_extent *re = NULL;
@@ -331,6 +337,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&re->extctl);
spin_lock_init(&re->lock);
re->refcnt = 1;
+ re->owner_root = owner_root;
+ re->level = level;
/*
* map block
@@ -421,6 +429,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!dev->bdev)
continue;
+ if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
+ continue;
+
if (dev_replace_is_ongoing &&
dev == fs_info->dev_replace.tgtdev) {
/*
@@ -445,6 +456,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
}
have_zone = 1;
}
+ if (!have_zone)
+ radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
up_read(&fs_info->dev_replace.rwsem);
@@ -526,6 +539,8 @@ static void reada_zone_release(struct kref *kref)
{
struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
+ lockdep_assert_held(&zone->device->fs_info->reada_lock);
+
radix_tree_delete(&zone->device->reada_zones,
zone->end >> PAGE_SHIFT);
@@ -541,14 +556,15 @@ static void reada_control_release(struct kref *kref)
}
static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, u64 generation)
+ struct btrfs_key *top, u64 owner_root,
+ u64 generation, int level)
{
struct btrfs_fs_info *fs_info = rc->fs_info;
struct reada_extent *re;
struct reada_extctl *rec;
/* takes one ref */
- re = reada_find_extent(fs_info, logical, top);
+ re = reada_find_extent(fs_info, logical, top, owner_root, level);
if (!re)
return -1;
@@ -640,12 +656,13 @@ static int reada_pick_zone(struct btrfs_device *dev)
}
static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
- int mirror_num, struct extent_buffer **eb)
+ u64 owner_root, int level, int mirror_num,
+ struct extent_buffer **eb)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
if (IS_ERR(buf))
return 0;
@@ -733,7 +750,8 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
logical = re->logical;
atomic_inc(&dev->reada_in_flight);
- ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb);
+ ret = reada_tree_block_flagged(fs_info, logical, re->owner_root,
+ re->level, mirror_num, &eb);
if (ret)
__readahead_hook(fs_info, re, NULL, ret);
else if (eb)
@@ -940,6 +958,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
u64 start;
u64 generation;
int ret;
+ int level;
struct extent_buffer *node;
static struct btrfs_key max_key = {
.objectid = (u64)-1,
@@ -962,9 +981,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
node = btrfs_root_node(root);
start = node->start;
generation = btrfs_header_generation(node);
+ level = btrfs_header_level(node);
free_extent_buffer(node);
- ret = reada_add_block(rc, start, &max_key, generation);
+ ret = reada_add_block(rc, start, &max_key, root->root_key.objectid,
+ generation, level);
if (ret) {
kfree(rc);
return ERR_PTR(ret);
@@ -1020,3 +1041,45 @@ void btrfs_reada_detach(void *handle)
kref_put(&rc->refcnt, reada_control_release);
}
+
+/*
+ * Before removing a device (device replace or device remove ioctls), call this
+ * function to wait for all existing readahead requests on the device and to
+ * make sure no one queues more readahead requests for the device.
+ *
+ * Must be called while holding neither the device list mutex nor the device
+ * replace semaphore, otherwise it will deadlock.
+ */
+void btrfs_reada_remove_dev(struct btrfs_device *dev)
+{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+
+ /* Serialize with readahead extent creation at reada_find_extent(). */
+ spin_lock(&fs_info->reada_lock);
+ set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
+ spin_unlock(&fs_info->reada_lock);
+
+ /*
+ * There might be readahead requests added to the radix trees which
+ * were not yet added to the readahead work queue. We need to start
+ * them and wait for their completion, otherwise we can end up with
+ * use-after-free problems when dropping the last reference on the
+ * readahead extents and their zones, as they need to access the
+ * device structure.
+ */
+ reada_start_machine(fs_info);
+ btrfs_flush_workqueue(fs_info->readahead_workers);
+}
+
+/*
+ * If when removing a device (device replace or device remove ioctls) an error
+ * happens after calling btrfs_reada_remove_dev(), call this to undo what that
+ * function did. This is safe to call even if btrfs_reada_remove_dev() was not
+ * called before.
+ */
+void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
+{
+ spin_lock(&dev->fs_info->reada_lock);
+ clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
+ spin_unlock(&dev->fs_info->reada_lock);
+}
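The two helpers above are meant to bracket device removal: stop and drain readahead before the device goes away, and re-enable it only if the removal fails. A hedged usage sketch; the wrapper function and the do_removal callback are hypothetical stand-ins for the real device-remove/replace paths:

#include "ctree.h"
#include "volumes.h"

/* Hypothetical wrapper showing the intended pairing of the helpers above. */
static int remove_device_sketch(struct btrfs_device *dev,
				int (*do_removal)(struct btrfs_device *dev))
{
	int ret;

	/* Block new readahead for @dev and drain what is already queued. */
	btrfs_reada_remove_dev(dev);

	ret = do_removal(dev);
	if (ret)
		/* Removal failed, the device stays: allow readahead again. */
		btrfs_reada_undo_remove_dev(dev);

	return ret;
}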
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 7f03dbe5b609..2b490becbe67 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -495,14 +495,15 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
}
static int process_leaf(struct btrfs_root *root,
- struct btrfs_path *path, u64 *bytenr, u64 *num_bytes)
+ struct btrfs_path *path, u64 *bytenr, u64 *num_bytes,
+ int *tree_block_level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
u32 count;
- int i = 0, tree_block_level = 0, ret = 0;
+ int i = 0, ret = 0;
struct btrfs_key key;
int nritems = btrfs_header_nritems(leaf);
@@ -515,15 +516,15 @@ static int process_leaf(struct btrfs_root *root,
case BTRFS_METADATA_ITEM_KEY:
*bytenr = key.objectid;
ret = process_extent_item(fs_info, path, &key, i,
- &tree_block_level);
+ tree_block_level);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
ret = add_tree_block(fs_info, key.offset, 0,
- key.objectid, tree_block_level);
+ key.objectid, *tree_block_level);
break;
case BTRFS_SHARED_BLOCK_REF_KEY:
ret = add_tree_block(fs_info, 0, key.offset,
- key.objectid, tree_block_level);
+ key.objectid, *tree_block_level);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = btrfs_item_ptr(leaf, i,
@@ -549,38 +550,25 @@ static int process_leaf(struct btrfs_root *root,
/* Walk down to the leaf from the given level */
static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
- int level, u64 *bytenr, u64 *num_bytes)
+ int level, u64 *bytenr, u64 *num_bytes,
+ int *tree_block_level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *eb;
- u64 block_bytenr, gen;
int ret = 0;
while (level >= 0) {
if (level) {
- struct btrfs_key first_key;
-
- block_bytenr = btrfs_node_blockptr(path->nodes[level],
- path->slots[level]);
- gen = btrfs_node_ptr_generation(path->nodes[level],
- path->slots[level]);
- btrfs_node_key_to_cpu(path->nodes[level], &first_key,
- path->slots[level]);
- eb = read_tree_block(fs_info, block_bytenr, gen,
- level - 1, &first_key);
+ eb = btrfs_read_node_slot(path->nodes[level],
+ path->slots[level]);
if (IS_ERR(eb))
return PTR_ERR(eb);
- if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- return -EIO;
- }
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
path->nodes[level-1] = eb;
path->slots[level-1] = 0;
- path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
+ path->locks[level-1] = BTRFS_READ_LOCK;
} else {
- ret = process_leaf(root, path, bytenr, num_bytes);
+ ret = process_leaf(root, path, bytenr, num_bytes,
+ tree_block_level);
if (ret)
break;
}
@@ -681,18 +669,18 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root;
- u64 owner;
- u64 offset;
+ u64 ref_root = 0;
+ u64 owner = 0;
+ u64 offset = 0;
if (!btrfs_test_opt(fs_info, REF_VERIFY))
return 0;
if (generic_ref->type == BTRFS_REF_METADATA) {
- ref_root = generic_ref->tree_ref.root;
+ if (!parent)
+ ref_root = generic_ref->tree_ref.root;
owner = generic_ref->tree_ref.level;
- offset = 0;
- } else {
+ } else if (!parent) {
ref_root = generic_ref->data_ref.ref_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
@@ -708,13 +696,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
goto out;
}
- if (parent) {
- ref->parent = parent;
- } else {
- ref->root_objectid = ref_root;
- ref->owner = owner;
- ref->offset = offset;
- }
+ ref->parent = parent;
+ ref->owner = owner;
+ ref->root_objectid = ref_root;
+ ref->offset = offset;
ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1;
memcpy(&ra->ref, ref, sizeof(struct ref_entry));
@@ -799,8 +784,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (!be) {
btrfs_err(fs_info,
"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!",
- action, (unsigned long long)bytenr,
- (unsigned long long)num_bytes);
+ action, bytenr, num_bytes);
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
@@ -860,6 +844,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"dropping a ref for a root that doesn't have a ref on the block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
kfree(ra);
goto out_unlock;
}
@@ -989,6 +974,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_path *path;
struct extent_buffer *eb;
+ int tree_block_level = 0;
u64 bytenr = 0, num_bytes = 0;
int ret, level;
@@ -1000,11 +986,10 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
return -ENOMEM;
eb = btrfs_read_lock_root_node(fs_info->extent_root);
- btrfs_set_lock_blocking_read(eb);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
- path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_READ_LOCK;
while (1) {
/*
@@ -1014,7 +999,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
* different leaf from the original extent item.
*/
ret = walk_down_tree(fs_info->extent_root, path, level,
- &bytenr, &num_bytes);
+ &bytenr, &num_bytes, &tree_block_level);
if (ret)
break;
ret = walk_up_tree(path, &level);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 99aa87c08912..b24396cf2f99 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -31,10 +31,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
endoff = destoff + olen;
if (endoff > inode->i_size) {
i_size_write(inode, endoff);
- btrfs_inode_safe_disk_i_size_write(inode, 0);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -81,7 +81,10 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
goto out_unlock;
}
- set_page_extent_mapped(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
+
clear_extent_bit(&inode->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
@@ -89,6 +92,19 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (ret)
goto out_unlock;
+ /*
+ * After dirtying the page our caller will need to start a transaction,
+ * and if we are low on metadata free space, that can cause flushing of
+ * delalloc for all inodes in order to get metadata space released.
+ * However we are holding the range locked for the whole duration of
+ * the clone/dedupe operation, so we may deadlock if that happens and no
+ * other task releases enough space. So mark this inode as not flushable
+ * to avoid such a deadlock. We will clear that flag
+ * when we finish cloning all extents, since a transaction is started
+ * after finding each extent to clone.
+ */
+ set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
+
if (comp_type == BTRFS_COMPRESS_NONE) {
char *map;
@@ -163,6 +179,7 @@ static int clone_copy_inline_extent(struct inode *dst,
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
struct btrfs_key key;
@@ -252,7 +269,11 @@ copy_inline_extent:
trans = NULL;
goto out;
}
- ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+ drop_args.path = path;
+ drop_args.start = drop_start;
+ drop_args.end = aligned_end;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
if (ret)
goto out;
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
@@ -263,7 +284,7 @@ copy_inline_extent:
btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]),
size);
- inode_add_bytes(dst, datal);
+ btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
out:
@@ -347,7 +368,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 drop_start;
/* Note the key will change type as we walk through the tree */
- path->leave_spinning = 1;
ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
0, 0);
if (ret < 0)
@@ -417,7 +437,6 @@ process_slot:
size);
btrfs_release_path(path);
- path->leave_spinning = 0;
memcpy(&new_key, &key, sizeof(new_key));
new_key.objectid = btrfs_ino(BTRFS_I(inode));
@@ -533,7 +552,6 @@ process_slot:
* mixing buffered and direct IO writes against this file.
*/
btrfs_release_path(path);
- path->leave_spinning = 0;
ret = btrfs_replace_file_extents(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
@@ -547,6 +565,8 @@ process_slot:
out:
btrfs_free_path(path);
kvfree(buf);
+ clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
+
return ret;
}
@@ -652,7 +672,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
if (destoff > inode->i_size) {
const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
- ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+ ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
if (ret)
return ret;
/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3602806d71bd..232d5da7b7be 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -18,7 +18,6 @@
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
-#include "inode-map.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
@@ -98,6 +97,7 @@ struct tree_block {
struct rb_node rb_node;
u64 bytenr;
}; /* Use rb_simple_node for search/insert */
+ u64 owner;
struct btrfs_key key;
unsigned int level:8;
unsigned int key_ready:1;
@@ -669,9 +669,7 @@ static void __del_reloc_root(struct btrfs_root *root)
RB_CLEAR_NODE(&node->rb_node);
}
spin_unlock(&rc->reloc_root_tree.lock);
- if (!node)
- return;
- BUG_ON((struct btrfs_root *)node->data != root);
+ ASSERT(!node || (struct btrfs_root *)node->data == root);
}
/*
@@ -783,7 +781,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
btrfs_set_root_refs(root_item, 0);
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
- root_item->drop_level = 0;
+ btrfs_set_root_drop_level(root_item, 0);
}
btrfs_tree_unlock(eb);
@@ -1196,7 +1194,6 @@ again:
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
eb = btrfs_lock_root_node(dest);
- btrfs_set_lock_blocking_write(eb);
level = btrfs_header_level(eb);
if (level < lowest_level) {
@@ -1210,7 +1207,6 @@ again:
BTRFS_NESTING_COW);
BUG_ON(ret);
}
- btrfs_set_lock_blocking_write(eb);
if (next_key) {
next_key->objectid = (u64)-1;
@@ -1220,8 +1216,6 @@ again:
parent = eb;
while (1) {
- struct btrfs_key first_key;
-
level = btrfs_header_level(parent);
BUG_ON(level < lowest_level);
@@ -1237,7 +1231,6 @@ again:
old_bytenr = btrfs_node_blockptr(parent, slot);
blocksize = fs_info->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
- btrfs_node_key_to_cpu(parent, &first_key, slot);
if (level <= max_level) {
eb = path->nodes[level];
@@ -1262,15 +1255,10 @@ again:
break;
}
- eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
- level - 1, &first_key);
+ eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
- } else if (!extent_buffer_uptodate(eb)) {
- ret = -EIO;
- free_extent_buffer(eb);
- break;
}
btrfs_tree_lock(eb);
if (cow) {
@@ -1279,7 +1267,6 @@ again:
BTRFS_NESTING_COW);
BUG_ON(ret);
}
- btrfs_set_lock_blocking_write(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@@ -1418,10 +1405,8 @@ static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
int *level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *eb = NULL;
int i;
- u64 bytenr;
u64 ptr_gen = 0;
u64 last_snapshot;
u32 nritems;
@@ -1429,8 +1414,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
for (i = *level; i > 0; i--) {
- struct btrfs_key first_key;
-
eb = path->nodes[i];
nritems = btrfs_header_nritems(eb);
while (path->slots[i] < nritems) {
@@ -1450,16 +1433,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
return 0;
}
- bytenr = btrfs_node_blockptr(eb, path->slots[i]);
- btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
- eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
- &first_key);
- if (IS_ERR(eb)) {
+ eb = btrfs_read_node_slot(eb, path->slots[i]);
+ if (IS_ERR(eb))
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- return -EIO;
- }
BUG_ON(btrfs_header_level(eb) != i - 1);
path->nodes[i - 1] = eb;
path->slots[i - 1] = 0;
@@ -1575,7 +1551,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
reloc_root_item = &reloc_root->root_item;
memset(&reloc_root_item->drop_progress, 0,
sizeof(reloc_root_item->drop_progress));
- reloc_root_item->drop_level = 0;
+ btrfs_set_root_drop_level(reloc_root_item, 0);
btrfs_set_root_refs(reloc_root_item, 0);
btrfs_update_reloc_root(trans, root);
@@ -1648,11 +1624,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root_item *root_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
+ int reserve_level;
int level;
int max_level;
int replaced = 0;
- int ret;
- int err = 0;
+ int ret = 0;
u32 min_reserved;
path = btrfs_alloc_path();
@@ -1671,7 +1647,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
} else {
btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
- level = root_item->drop_level;
+ level = btrfs_root_drop_level(root_item);
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
@@ -1696,19 +1672,18 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* Thus the needed metadata size is at most root_level * nodesize,
* and * 2 since we have two trees to COW.
*/
- min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2;
+ reserve_level = max_t(int, 1, btrfs_root_level(root_item));
+ min_reserved = fs_info->nodesize * reserve_level * 2;
memset(&next_key, 0, sizeof(next_key));
while (1) {
ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
BTRFS_RESERVE_FLUSH_LIMIT);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
@@ -1730,10 +1705,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
max_level = level;
ret = walk_down_reloc_tree(reloc_root, path, &level);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (ret > 0)
break;
@@ -1744,11 +1717,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
ret = replace_path(trans, rc, root, reloc_root, path,
&next_key, level, max_level);
}
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
-
if (ret > 0) {
level = ret;
btrfs_node_key_to_cpu(path->nodes[level], &key,
@@ -1767,7 +1737,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
*/
btrfs_node_key(path->nodes[level], &root_item->drop_progress,
path->slots[level]);
- root_item->drop_level = level;
+ btrfs_set_root_drop_level(root_item, level);
btrfs_end_transaction_throttle(trans);
trans = NULL;
@@ -1787,12 +1757,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
BTRFS_NESTING_COW);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
- if (ret < 0)
- err = ret;
out:
btrfs_free_path(path);
- if (err == 0)
+ if (ret == 0)
insert_dirty_subvol(trans, rc, root);
if (trans)
@@ -1803,7 +1771,7 @@ out:
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
- return err;
+ return ret;
}
static noinline_for_stack
@@ -2203,7 +2171,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
struct btrfs_key *key,
struct btrfs_path *path, int lowest)
{
- struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_backref_node *upper;
struct btrfs_backref_edge *edge;
struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
@@ -2211,17 +2178,14 @@ static int do_relocation(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
u32 blocksize;
u64 bytenr;
- u64 generation;
int slot;
- int ret;
- int err = 0;
+ int ret = 0;
BUG_ON(lowest && node->eb);
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
- struct btrfs_key first_key;
struct btrfs_ref ref = { 0 };
cond_resched();
@@ -2233,10 +2197,8 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (upper->eb && !upper->locked) {
if (!lowest) {
ret = btrfs_bin_search(upper->eb, key, &slot);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto next;
- }
BUG_ON(ret);
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (node->eb->start == bytenr)
@@ -2248,10 +2210,8 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!upper->eb) {
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret) {
- if (ret < 0)
- err = ret;
- else
- err = -ENOENT;
+ if (ret > 0)
+ ret = -ENOENT;
btrfs_release_path(path);
break;
@@ -2271,10 +2231,8 @@ static int do_relocation(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
} else {
ret = btrfs_bin_search(upper->eb, key, &slot);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto next;
- }
BUG_ON(ret);
}
@@ -2285,7 +2243,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
bytenr, node->bytenr, slot,
upper->eb->start);
- err = -EIO;
+ ret = -EIO;
goto next;
}
} else {
@@ -2294,30 +2252,20 @@ static int do_relocation(struct btrfs_trans_handle *trans,
}
blocksize = root->fs_info->nodesize;
- generation = btrfs_node_ptr_generation(upper->eb, slot);
- btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
- eb = read_tree_block(fs_info, bytenr, generation,
- upper->level - 1, &first_key);
+ eb = btrfs_read_node_slot(upper->eb, slot);
if (IS_ERR(eb)) {
- err = PTR_ERR(eb);
- goto next;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- err = -EIO;
+ ret = PTR_ERR(eb);
goto next;
}
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
slot, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto next;
- }
BUG_ON(node->eb != eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
@@ -2343,19 +2291,19 @@ next:
btrfs_backref_drop_node_buffer(upper);
else
btrfs_backref_unlock_node_buffer(upper);
- if (err)
+ if (ret)
break;
}
- if (!err && node->pending) {
+ if (!ret && node->pending) {
btrfs_backref_drop_node_buffer(node);
list_move_tail(&node->list, &rc->backref_cache.changed);
node->pending = 0;
}
path->lowest_level = 0;
- BUG_ON(err == -ENOSPC);
- return err;
+ BUG_ON(ret == -ENOSPC);
+ return ret;
}
static int link_to_upper(struct btrfs_trans_handle *trans,
@@ -2444,8 +2392,8 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
{
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
- block->level, NULL);
+ eb = read_tree_block(fs_info, block->bytenr, block->owner,
+ block->key.offset, block->level, NULL);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
} else if (!extent_buffer_uptodate(eb)) {
@@ -2544,7 +2492,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Kick in readahead for tree blocks with missing keys */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready)
- readahead_tree_block(fs_info, block->bytenr);
+ btrfs_readahead_tree_block(fs_info, block->bytenr,
+ block->owner, 0,
+ block->level);
}
/* Get first keys */
@@ -2603,6 +2553,31 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
return ret;
+ /*
+ * On a zoned filesystem, we cannot preallocate the file region.
+ * Instead, we dirty the region's pages and write them back later.
+ */
+ if (btrfs_is_zoned(inode->root->fs_info)) {
+ struct btrfs_root *root = inode->root;
+ struct btrfs_trans_handle *trans;
+
+ end = cluster->end - offset + 1;
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+ i_size_write(&inode->vfs_inode, end);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ return btrfs_end_transaction(trans);
+ }
+
inode_lock(&inode->vfs_inode);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
@@ -2665,7 +2640,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
/*
* Allow error injection to test balance cancellation
*/
-int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
{
return atomic_read(&fs_info->balance_cancel_req) ||
fatal_signal_pending(current);
@@ -2729,6 +2704,15 @@ static int relocate_file_extent_cluster(struct inode *inode,
goto out;
}
}
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ PAGE_SIZE, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ unlock_page(page);
+ put_page(page);
+ goto out;
+ }
if (PageReadahead(page)) {
page_cache_async_readahead(inode->i_mapping,
@@ -2756,8 +2740,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
- set_page_extent_mapped(page);
-
if (nr < cluster->nr &&
page_start + offset == cluster->boundary[nr]) {
set_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -2799,6 +2781,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
}
}
WARN_ON(nr != cluster->nr);
+ if (btrfs_is_zoned(fs_info) && !ret)
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
out:
kfree(ra);
return ret;
@@ -2851,21 +2835,58 @@ static int add_tree_block(struct reloc_control *rc,
u32 item_size;
int level = -1;
u64 generation;
+ u64 owner = 0;
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, path->slots[0]);
if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
item_size >= sizeof(*ei) + sizeof(*bi)) {
+ unsigned long ptr = 0, end;
+
ei = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_extent_item);
+ end = (unsigned long)ei + item_size;
if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
bi = (struct btrfs_tree_block_info *)(ei + 1);
level = btrfs_tree_block_level(eb, bi);
+ ptr = (unsigned long)(bi + 1);
} else {
level = (int)extent_key->offset;
+ ptr = (unsigned long)(ei + 1);
}
generation = btrfs_extent_generation(eb, ei);
+
+ /*
+ * We're reading random blocks without knowing their owner ahead
+ * of time. This is ok most of the time, as all reloc roots and
+ * fs roots have the same lock type. However normal trees do
+ * not, and the only way to know ahead of time is to read the
+ * inline ref offset. We know it's an fs root if
+ *
+ * 1. There's more than one ref.
+ * 2. There's a SHARED_DATA_REF_KEY set.
+ * 3. FULL_BACKREF is set on the flags.
+ *
+ * Otherwise it's safe to assume that the ref offset == the
+ * owner of this block, so we can use that when calling
+ * read_tree_block.
+ */
+ if (btrfs_extent_refs(eb, ei) == 1 &&
+ !(btrfs_extent_flags(eb, ei) &
+ BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
+ ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_get_extent_inline_ref_type(eb, iref,
+ BTRFS_REF_TYPE_BLOCK);
+ if (type == BTRFS_REF_TYPE_INVALID)
+ return -EINVAL;
+ if (type == BTRFS_TREE_BLOCK_REF_KEY)
+ owner = btrfs_extent_inline_ref_offset(eb, iref);
+ }
} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
btrfs_print_v0_err(eb->fs_info);
btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
@@ -2887,6 +2908,7 @@ static int add_tree_block(struct reloc_control *rc,
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
+ block->owner = owner;
rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
if (rb_node)
@@ -3025,11 +3047,16 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
return 0;
for (i = 0; i < btrfs_header_nritems(leaf); i++) {
+ u8 type;
+
btrfs_item_key_to_cpu(leaf, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG &&
+ type = btrfs_file_extent_type(leaf, ei);
+
+ if ((type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) &&
btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
found = true;
space_cache_ino = key.objectid;
@@ -3069,7 +3096,7 @@ int add_data_references(struct reloc_control *rc,
while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL);
+ eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
@@ -3434,8 +3461,12 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
+ u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
int ret;
+ if (btrfs_is_zoned(trans->fs_info))
+ flags &= ~BTRFS_INODE_PREALLOC;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -3450,8 +3481,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
- BTRFS_INODE_PREALLOC);
+ btrfs_set_inode_flags(leaf, item, flags);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
@@ -3479,7 +3509,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
return ERR_CAST(trans);
}
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out;
@@ -3692,7 +3722,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
- root->root_item.drop_level = 0;
+ btrfs_set_root_drop_level(&root->root_item, 0);
btrfs_set_root_refs(&root->root_item, 0);
ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index cf63f1e27a27..582df11d298a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -20,6 +20,7 @@
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
+#include "zoned.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -71,11 +72,9 @@ struct scrub_page {
u64 physical;
u64 physical_for_dev_replace;
atomic_t refs;
- struct {
- unsigned int mirror_num:8;
- unsigned int have_csum:1;
- unsigned int io_error:1;
- };
+ u8 mirror_num;
+ int have_csum:1;
+ int io_error:1;
u8 csum[BTRFS_CSUM_SIZE];
struct scrub_recover *recover;
@@ -131,7 +130,7 @@ struct scrub_parity {
int nsectors;
- u64 stripe_len;
+ u32 stripe_len;
refcount_t refs;
@@ -161,13 +160,13 @@ struct scrub_ctx {
atomic_t workers_pending;
spinlock_t list_lock;
wait_queue_head_t list_wait;
- u16 csum_size;
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
int pages_per_rd_bio;
int is_dev_replace;
+ u64 write_pointer;
struct scrub_bio *wr_curr_bio;
struct mutex wr_lock;
@@ -235,15 +234,15 @@ static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum, int force,
+ u64 gen, int mirror_num, u8 *csum,
u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u64 extent_len,
+ u64 extent_logical, u32 extent_len,
u64 *extent_physical,
struct btrfs_device **extent_dev,
int *extent_mirror_num);
@@ -256,10 +255,10 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);
-static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
{
- return page->recover &&
- (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ return spage->recover &&
+ (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -610,7 +609,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
- sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
@@ -859,6 +857,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
have_csum = sblock_to_check->pagev[0]->have_csum;
dev = sblock_to_check->pagev[0]->dev;
+ if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace)
+ return btrfs_repair_one_zone(fs_info, logical);
+
/*
* We must use GFP_NOFS because the scrub task might be waiting for a
* worker task executing this function and in turn a transaction commit
@@ -1092,11 +1093,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
success = 1;
for (page_num = 0; page_num < sblock_bad->page_count;
page_num++) {
- struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
struct scrub_block *sblock_other = NULL;
/* skip no-io-error page in scrub */
- if (!page_bad->io_error && !sctx->is_dev_replace)
+ if (!spage_bad->io_error && !sctx->is_dev_replace)
continue;
if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
@@ -1108,7 +1109,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* sblock_for_recheck array to target device.
*/
sblock_other = NULL;
- } else if (page_bad->io_error) {
+ } else if (spage_bad->io_error) {
/* try to find no-io-error page in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
@@ -1147,7 +1148,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_other,
page_num, 0);
if (0 == ret)
- page_bad->io_error = 0;
+ spage_bad->io_error = 0;
else
success = 0;
}
@@ -1325,13 +1326,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
- struct scrub_page *page;
+ struct scrub_page *spage;
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
- page = kzalloc(sizeof(*page), GFP_NOFS);
- if (!page) {
+ spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ if (!spage) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -1339,17 +1340,17 @@ leave_nomem:
scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
- scrub_page_get(page);
- sblock->pagev[page_index] = page;
- page->sblock = sblock;
- page->flags = flags;
- page->generation = generation;
- page->logical = logical;
- page->have_csum = have_csum;
+ scrub_page_get(spage);
+ sblock->pagev[page_index] = spage;
+ spage->sblock = sblock;
+ spage->flags = flags;
+ spage->generation = generation;
+ spage->logical = logical;
+ spage->have_csum = have_csum;
if (have_csum)
- memcpy(page->csum,
+ memcpy(spage->csum,
original_sblock->pagev[0]->csum,
- sctx->csum_size);
+ sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
bbio->map_type,
@@ -1360,23 +1361,23 @@ leave_nomem:
mirror_index,
&stripe_index,
&stripe_offset);
- page->physical = bbio->stripes[stripe_index].physical +
+ spage->physical = bbio->stripes[stripe_index].physical +
stripe_offset;
- page->dev = bbio->stripes[stripe_index].dev;
+ spage->dev = bbio->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count);
- page->physical_for_dev_replace =
+ spage->physical_for_dev_replace =
original_sblock->pagev[page_index]->
physical_for_dev_replace;
/* for missing devices, dev->bdev is NULL */
- page->mirror_num = mirror_index + 1;
+ spage->mirror_num = mirror_index + 1;
sblock->page_count++;
- page->page = alloc_page(GFP_NOFS);
- if (!page->page)
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page)
goto leave_nomem;
scrub_get_recover(recover);
- page->recover = recover;
+ spage->recover = recover;
}
scrub_put_recover(fs_info, recover);
length -= sublen;
@@ -1394,19 +1395,19 @@ static void scrub_bio_wait_endio(struct bio *bio)
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
- struct scrub_page *page)
+ struct scrub_page *spage)
{
DECLARE_COMPLETION_ONSTACK(done);
int ret;
int mirror_num;
- bio->bi_iter.bi_sector = page->logical >> 9;
+ bio->bi_iter.bi_sector = spage->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
- mirror_num = page->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
- page->recover->map_length,
+ mirror_num = spage->sblock->pagev[0]->mirror_num;
+ ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
+ spage->recover->map_length,
mirror_num, 0);
if (ret)
return ret;
@@ -1431,10 +1432,10 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
bio_set_dev(bio, first_page->dev->bdev);
for (page_num = 0; page_num < sblock->page_count; page_num++) {
- struct scrub_page *page = sblock->pagev[page_num];
+ struct scrub_page *spage = sblock->pagev[page_num];
- WARN_ON(!page->page);
- bio_add_page(bio, page->page, PAGE_SIZE, 0);
+ WARN_ON(!spage->page);
+ bio_add_page(bio, spage->page, PAGE_SIZE, 0);
}
if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
@@ -1475,24 +1476,24 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
- struct scrub_page *page = sblock->pagev[page_num];
+ struct scrub_page *spage = sblock->pagev[page_num];
- if (page->dev->bdev == NULL) {
- page->io_error = 1;
+ if (spage->dev->bdev == NULL) {
+ spage->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
}
- WARN_ON(!page->page);
+ WARN_ON(!spage->page);
bio = btrfs_io_bio_alloc(1);
- bio_set_dev(bio, page->dev->bdev);
+ bio_set_dev(bio, spage->dev->bdev);
- bio_add_page(bio, page->page, PAGE_SIZE, 0);
- bio->bi_iter.bi_sector = page->physical >> 9;
+ bio_add_page(bio, spage->page, PAGE_SIZE, 0);
+ bio->bi_iter.bi_sector = spage->physical >> 9;
bio->bi_opf = REQ_OP_READ;
if (btrfsic_submit_bio_wait(bio)) {
- page->io_error = 1;
+ spage->io_error = 1;
sblock->no_io_error_seen = 0;
}
@@ -1548,36 +1549,36 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int page_num, int force_write)
{
- struct scrub_page *page_bad = sblock_bad->pagev[page_num];
- struct scrub_page *page_good = sblock_good->pagev[page_num];
+ struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
+ struct scrub_page *spage_good = sblock_good->pagev[page_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
- BUG_ON(page_bad->page == NULL);
- BUG_ON(page_good->page == NULL);
+ BUG_ON(spage_bad->page == NULL);
+ BUG_ON(spage_good->page == NULL);
if (force_write || sblock_bad->header_error ||
- sblock_bad->checksum_error || page_bad->io_error) {
+ sblock_bad->checksum_error || spage_bad->io_error) {
struct bio *bio;
int ret;
- if (!page_bad->dev->bdev) {
+ if (!spage_bad->dev->bdev) {
btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
bio = btrfs_io_bio_alloc(1);
- bio_set_dev(bio, page_bad->dev->bdev);
- bio->bi_iter.bi_sector = page_bad->physical >> 9;
+ bio_set_dev(bio, spage_bad->dev->bdev);
+ bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
- ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
+ ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
if (PAGE_SIZE != ret) {
bio_put(bio);
return -EIO;
}
if (btrfsic_submit_bio_wait(bio)) {
- btrfs_dev_stat_inc_and_print(page_bad->dev,
+ btrfs_dev_stat_inc_and_print(spage_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
atomic64_inc(&fs_info->dev_replace.num_write_errors);
bio_put(bio);
@@ -1622,6 +1623,28 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
return scrub_add_page_to_wr_bio(sblock->sctx, spage);
}
+static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
+{
+ int ret = 0;
+ u64 length;
+
+ if (!btrfs_is_zoned(sctx->fs_info))
+ return 0;
+
+ if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
+ return 0;
+
+ if (sctx->write_pointer < physical) {
+ length = physical - sctx->write_pointer;
+
+ ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
+ sctx->write_pointer, length);
+ if (!ret)
+ sctx->write_pointer = physical;
+ }
+ return ret;
+}
+
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
@@ -1644,6 +1667,13 @@ again:
if (sbio->page_count == 0) {
struct bio *bio;
+ ret = fill_writer_pointer_gap(sctx,
+ spage->physical_for_dev_replace);
+ if (ret) {
+ mutex_unlock(&sctx->wr_lock);
+ return ret;
+ }
+
sbio->physical = spage->physical_for_dev_replace;
sbio->logical = spage->logical;
sbio->dev = sctx->wr_tgtdev;
@@ -1698,13 +1728,16 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
sbio = sctx->wr_curr_bio;
sctx->wr_curr_bio = NULL;
- WARN_ON(!sbio->bio->bi_disk);
+ WARN_ON(!sbio->bio->bi_bdev);
scrub_pending_bio_inc(sctx);
/* process all writes in a single worker thread. Then the block layer
* orders the requests before sending them to the driver which
* doubled the write performance on spinning disks when measured
* with Linux 3.5 */
btrfsic_submit_bio(sbio->bio);
+
+ if (btrfs_is_zoned(sctx->fs_info))
+ sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE;
}
static void scrub_wr_bio_end_io(struct bio *bio)
@@ -1798,11 +1831,15 @@ static int scrub_checksum_data(struct scrub_block *sblock)
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
- if (memcmp(csum, spage->csum, sctx->csum_size))
- sblock->checksum_error = 1;
+ /*
+ * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
+ * only contains one sector of data.
+ */
+ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
+ if (memcmp(csum, spage->csum, fs_info->csum_size))
+ sblock->checksum_error = 1;
return sblock->checksum_error;
}
@@ -1814,16 +1851,26 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
- const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
+ /*
+ * This is done in sectorsize steps even for metadata, as nodesize is
+ * constrained to be aligned to sectorsize. This will need
+ * to change so we don't misuse data and metadata units like that.
+ */
+ const u32 sectorsize = sctx->fs_info->sectorsize;
+ const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
int i;
struct scrub_page *spage;
char *kaddr;
BUG_ON(sblock->page_count < 1);
+
+ /* Each member in pagev is just one block, not a full page */
+ ASSERT(sblock->page_count == num_sectors);
+
spage = sblock->pagev[0];
kaddr = page_address(spage->page);
h = (struct btrfs_header *)kaddr;
- memcpy(on_disk_csum, h->csum, sctx->csum_size);
+ memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
/*
* we don't use the getter functions here, as we
@@ -1848,15 +1895,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
- PAGE_SIZE - BTRFS_CSUM_SIZE);
+ sectorsize - BTRFS_CSUM_SIZE);
- for (i = 1; i < num_pages; i++) {
+ for (i = 1; i < num_sectors; i++) {
kaddr = page_address(sblock->pagev[i]->page);
- crypto_shash_update(shash, kaddr, PAGE_SIZE);
+ crypto_shash_update(shash, kaddr, sectorsize);
}
crypto_shash_final(shash, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
sblock->checksum_error = 1;
return sblock->header_error || sblock->checksum_error;
@@ -1893,7 +1940,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
- if (memcmp(calculated_csum, s->csum, sctx->csum_size))
+ if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
++fail_cor;
if (fail_cor + fail_gen) {
@@ -2150,12 +2197,13 @@ bbio_out:
spin_unlock(&sctx->stat_lock);
}
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum, int force,
+ u64 gen, int mirror_num, u8 *csum,
u64 physical_for_dev_replace)
{
struct scrub_block *sblock;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
@@ -2174,7 +2222,12 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
for (index = 0; len > 0; index++) {
struct scrub_page *spage;
- u64 l = min_t(u64, len, PAGE_SIZE);
+ /*
+ * Here we will allocate one page for one sector to scrub.
+ * This is fine if PAGE_SIZE == sectorsize, but will cost
+ * more memory in the PAGE_SIZE > sectorsize case.
+ */
+ u32 l = min(sectorsize, len);
spage = kzalloc(sizeof(*spage), GFP_KERNEL);
if (!spage) {
@@ -2198,7 +2251,7 @@ leave_nomem:
spage->mirror_num = mirror_num;
if (csum) {
spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->csum_size);
+ memcpy(spage->csum, csum, sctx->fs_info->csum_size);
} else {
spage->have_csum = 0;
}
@@ -2231,7 +2284,7 @@ leave_nomem:
}
}
- if (force)
+ if (flags & BTRFS_EXTENT_FLAG_SUPER)
scrub_submit(sctx);
}
@@ -2295,12 +2348,11 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
unsigned long *bitmap,
- u64 start, u64 len)
+ u64 start, u32 len)
{
u64 offset;
- u64 nsectors64;
u32 nsectors;
- int sectorsize = sparity->sctx->fs_info->sectorsize;
+ u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
if (len >= sparity->stripe_len) {
bitmap_set(bitmap, 0, sparity->nsectors);
@@ -2309,11 +2361,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
start -= sparity->logic_start;
start = div64_u64_rem(start, sparity->stripe_len, &offset);
- offset = div_u64(offset, sectorsize);
- nsectors64 = div_u64(len, sectorsize);
-
- ASSERT(nsectors64 < UINT_MAX);
- nsectors = (u32)nsectors64;
+ offset = offset >> sectorsize_bits;
+ nsectors = len >> sectorsize_bits;
if (offset + nsectors <= sparity->nsectors) {
bitmap_set(bitmap, offset, nsectors);
@@ -2325,13 +2374,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
}
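
A quick userspace check of the shift-based conversion above: for a power-of-two sectorsize, right-shifting by sectorsize_bits gives the same result as dividing by sectorsize, without needing 64-bit division helpers (the values below are illustrative).

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t sectorsize = 4096;
	const uint32_t sectorsize_bits = 12;		/* log2(4096) */
	const uint64_t offset_in_stripe = 24576;	/* 6 sectors into the stripe */
	const uint32_t len = 65536;			/* a 64K extent piece */

	uint64_t offset = offset_in_stripe >> sectorsize_bits;
	uint32_t nsectors = len >> sectorsize_bits;

	assert(offset == offset_in_stripe / sectorsize);
	assert(nsectors == len / sectorsize);
	printf("offset=%llu sectors, nsectors=%u\n",
	       (unsigned long long)offset, nsectors);
	return 0;
}
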
static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
- u64 start, u64 len)
+ u64 start, u32 len)
{
__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
}
static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
- u64 start, u64 len)
+ u64 start, u32 len)
{
__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
}
@@ -2359,48 +2408,77 @@ static void scrub_block_complete(struct scrub_block *sblock)
u64 end = sblock->pagev[sblock->page_count - 1]->logical +
PAGE_SIZE;
+ ASSERT(end - start <= U32_MAX);
scrub_parity_mark_sectors_error(sblock->sparity,
start, end - start);
}
}
+static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
+{
+ sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
+ list_del(&sum->list);
+ kfree(sum);
+}
+
+/*
+ * Find the desired csum for range [logical, logical + sectorsize), and store
+ * the csum into @csum.
+ *
+ * The search source is sctx->csum_list, which is a pre-populated list
+ * storing bytenr ordered csum ranges. We're responsible for cleaning up any range
+ * that is before @logical.
+ *
+ * Return 0 if there is no csum for the range.
+ * Return 1 if there is a csum for the range, and it is copied to @csum.
+ */
static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
- struct btrfs_ordered_sum *sum = NULL;
- unsigned long index;
- unsigned long num_sectors;
+ bool found = false;
while (!list_empty(&sctx->csum_list)) {
+ struct btrfs_ordered_sum *sum = NULL;
+ unsigned long index;
+ unsigned long num_sectors;
+
sum = list_first_entry(&sctx->csum_list,
struct btrfs_ordered_sum, list);
+ /* The current csum range is beyond our range, no csum found */
if (sum->bytenr > logical)
- return 0;
- if (sum->bytenr + sum->len > logical)
break;
- ++sctx->stat.csum_discards;
- list_del(&sum->list);
- kfree(sum);
- sum = NULL;
- }
- if (!sum)
- return 0;
+ /*
+ * The current sum is before our bytenr, since scrub is always
+ * done in bytenr order, the csum will never be used anymore,
+ * clean it up so that later calls won't bother with the range,
+ * and continue searching the next range.
+ */
+ if (sum->bytenr + sum->len <= logical) {
+ drop_csum_range(sctx, sum);
+ continue;
+ }
- index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
- ASSERT(index < UINT_MAX);
+ /* Now the csum range covers our bytenr, copy the csum */
+ found = true;
+ index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
+ num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
- num_sectors = sum->len / sctx->fs_info->sectorsize;
- memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
- if (index == num_sectors - 1) {
- list_del(&sum->list);
- kfree(sum);
+ memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
+ sctx->fs_info->csum_size);
+
+ /* Cleanup the range if we're at the end of the csum range */
+ if (index == num_sectors - 1)
+ drop_csum_range(sctx, sum);
+ break;
}
+ if (!found)
+ return 0;
return 1;
}
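
A simplified userspace sketch of the lookup policy implemented above, using a sorted array instead of sctx->csum_list and a toy 4-byte csum (it also omits the end-of-range cleanup the kernel code does). Ranges wholly before the queried logical address are consumed, since scrub proceeds in bytenr order and they can never match again; a covering range yields the csum at index (logical - bytenr) >> sectorsize_bits.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define SECTORSIZE_BITS	12
#define TOY_CSUM_SIZE	4

struct toy_sum_range {
	uint64_t bytenr;	/* logical start of the range */
	uint64_t len;		/* length, a multiple of sectorsize */
	const uint8_t *sums;	/* TOY_CSUM_SIZE bytes per sector */
};

static bool toy_find_csum(struct toy_sum_range *ranges, int nr, int *first,
			  uint64_t logical, uint8_t *csum)
{
	while (*first < nr) {
		struct toy_sum_range *r = &ranges[*first];

		if (r->bytenr > logical)		/* range starts after us: no csum */
			return false;
		if (r->bytenr + r->len <= logical) {	/* stale range: drop and continue */
			(*first)++;
			continue;
		}
		/* Covering range: copy the per-sector csum */
		memcpy(csum, r->sums +
		       ((logical - r->bytenr) >> SECTORSIZE_BITS) * TOY_CSUM_SIZE,
		       TOY_CSUM_SIZE);
		return true;
	}
	return false;
}

int main(void)
{
	static const uint8_t sums[8] = { 1, 1, 1, 1, 2, 2, 2, 2 };
	struct toy_sum_range ranges[] = {
		{ .bytenr = 0, .len = 8192, .sums = sums },	/* two 4K sectors */
	};
	uint8_t csum[TOY_CSUM_SIZE];
	int first = 0;

	if (toy_find_csum(ranges, 1, &first, 4096, csum))
		printf("csum[0] for 4096 = %u\n", csum[0]);	/* prints 2 */
	return 0;
}
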
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
- u64 logical, u64 len,
+ u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u64 physical_for_dev_replace)
{
@@ -2432,7 +2510,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
}
while (len) {
- u64 l = min_t(u64, len, blocksize);
+ u32 l = min(len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
@@ -2442,7 +2520,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
++sctx->stat.no_csum;
}
ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
- mirror_num, have_csum ? csum : NULL, 0,
+ mirror_num, have_csum ? csum : NULL,
physical_for_dev_replace);
if (ret)
return ret;
@@ -2455,14 +2533,17 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
}
static int scrub_pages_for_parity(struct scrub_parity *sparity,
- u64 logical, u64 len,
+ u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num, u8 *csum)
{
struct scrub_ctx *sctx = sparity->sctx;
struct scrub_block *sblock;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
+ ASSERT(IS_ALIGNED(len, sectorsize));
+
sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
if (!sblock) {
spin_lock(&sctx->stat_lock);
@@ -2481,7 +2562,6 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
for (index = 0; len > 0; index++) {
struct scrub_page *spage;
- u64 l = min_t(u64, len, PAGE_SIZE);
spage = kzalloc(sizeof(*spage), GFP_KERNEL);
if (!spage) {
@@ -2508,7 +2588,7 @@ leave_nomem:
spage->mirror_num = mirror_num;
if (csum) {
spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->csum_size);
+ memcpy(spage->csum, csum, sctx->fs_info->csum_size);
} else {
spage->have_csum = 0;
}
@@ -2516,9 +2596,12 @@ leave_nomem:
spage->page = alloc_page(GFP_KERNEL);
if (!spage->page)
goto leave_nomem;
- len -= l;
- logical += l;
- physical += l;
+
+ /* Iterate over the stripe range in sectorsize steps */
+ len -= sectorsize;
+ logical += sectorsize;
+ physical += sectorsize;
}
WARN_ON(sblock->page_count == 0);
@@ -2539,7 +2622,7 @@ leave_nomem:
}
static int scrub_extent_for_parity(struct scrub_parity *sparity,
- u64 logical, u64 len,
+ u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num)
{
@@ -2563,7 +2646,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
}
while (len) {
- u64 l = min_t(u64, len, blocksize);
+ u32 l = min(len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
@@ -2767,7 +2850,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
u64 generation;
u64 extent_logical;
u64 extent_physical;
- u64 extent_len;
+ /* Check the comment in scrub_stripe() for why u32 is enough here */
+ u32 extent_len;
u64 mapped_length;
struct btrfs_device *extent_dev;
struct scrub_parity *sparity;
@@ -2776,7 +2860,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
- nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
+ ASSERT(map->stripe_len <= U32_MAX);
+ nsectors = map->stripe_len >> fs_info->sectorsize_bits;
bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
GFP_NOFS);
@@ -2787,6 +2872,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
return -ENOMEM;
}
+ ASSERT(map->stripe_len <= U32_MAX);
sparity->stripe_len = map->stripe_len;
sparity->nsectors = nsectors;
sparity->sctx = sctx;
@@ -2881,6 +2967,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
}
again:
extent_logical = key.objectid;
+ ASSERT(bytes <= U32_MAX);
extent_len = bytes;
if (extent_logical < logic_start) {
@@ -2959,9 +3046,11 @@ next:
logic_start += map->stripe_len;
}
out:
- if (ret < 0)
+ if (ret < 0) {
+ ASSERT(logic_end - logic_start <= U32_MAX);
scrub_parity_mark_sectors_error(sparity, logic_start,
logic_end - logic_start);
+ }
scrub_parity_put(sparity);
scrub_submit(sctx);
mutex_lock(&sctx->wr_lock);
@@ -2972,6 +3061,46 @@ out:
return ret < 0 ? ret : 0;
}
+static void sync_replace_for_zoned(struct scrub_ctx *sctx)
+{
+ if (!btrfs_is_zoned(sctx->fs_info))
+ return;
+
+ sctx->flush_all_writes = true;
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+}
+
+static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
+ u64 physical, u64 physical_end)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ int ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+
+ mutex_lock(&sctx->wr_lock);
+ if (sctx->write_pointer < physical_end) {
+ ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
+ physical,
+ sctx->write_pointer);
+ if (ret)
+ btrfs_err(fs_info,
+ "zoned: failed to recover write pointer");
+ }
+ mutex_unlock(&sctx->wr_lock);
+ btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
+
+ return ret;
+}
+
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
@@ -3003,7 +3132,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 offset;
u64 extent_logical;
u64 extent_physical;
- u64 extent_len;
+ /*
+ * Unlike chunk length, extent length should never go beyond
+ * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
+ */
+ u32 extent_len;
u64 stripe_logical;
u64 stripe_end;
struct btrfs_device *extent_dev;
@@ -3084,17 +3217,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
key_end.offset = (u64)-1;
reada1 = btrfs_reada_add(root, &key, &key_end);
- key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.type = BTRFS_EXTENT_CSUM_KEY;
- key.offset = logical;
- key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key_end.type = BTRFS_EXTENT_CSUM_KEY;
- key_end.offset = logic_end;
- reada2 = btrfs_reada_add(csum_root, &key, &key_end);
+ if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = logical;
+ key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_end.type = BTRFS_EXTENT_CSUM_KEY;
+ key_end.offset = logic_end;
+ reada2 = btrfs_reada_add(csum_root, &key, &key_end);
+ } else {
+ reada2 = NULL;
+ }
if (!IS_ERR(reada1))
btrfs_reada_wait(reada1);
- if (!IS_ERR(reada2))
+ if (!IS_ERR_OR_NULL(reada2))
btrfs_reada_wait(reada2);
@@ -3104,6 +3241,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
blk_start_plug(&plug);
+ if (sctx->is_dev_replace &&
+ btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
+ mutex_lock(&sctx->wr_lock);
+ sctx->write_pointer = physical;
+ mutex_unlock(&sctx->wr_lock);
+ sctx->flush_all_writes = true;
+ }
+
/*
* now find all extents for each stripe and scrub them
*/
@@ -3248,6 +3393,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
again:
extent_logical = key.objectid;
+ ASSERT(bytes <= U32_MAX);
extent_len = bytes;
/*
@@ -3291,6 +3437,9 @@ again:
if (ret)
goto out;
+ if (sctx->is_dev_replace)
+ sync_replace_for_zoned(sctx);
+
if (extent_logical + extent_len <
key.objectid + bytes) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -3358,6 +3507,17 @@ out:
blk_finish_plug(&plug);
btrfs_free_path(path);
btrfs_free_path(ppath);
+
+ if (sctx->is_dev_replace && ret >= 0) {
+ int ret2;
+
+ ret2 = sync_write_pointer_for_zoned(sctx, base + offset,
+ map->stripes[num].physical,
+ physical_end);
+ if (ret2)
+ ret = ret2;
+ }
+
return ret < 0 ? ret : 0;
}
@@ -3413,6 +3573,25 @@ out:
return ret;
}
+static int finish_extent_writes_for_zoned(struct btrfs_root *root,
+ struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_trans_handle *trans;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ btrfs_wait_block_group_reservations(cache);
+ btrfs_wait_nocow_writers(cache);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ return btrfs_commit_transaction(trans);
+}
+
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev, u64 start, u64 end)
@@ -3499,6 +3678,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
+ spin_lock(&cache->lock);
+ if (!cache->to_copy) {
+ spin_unlock(&cache->lock);
+ ro_set = 0;
+ goto done;
+ }
+ spin_unlock(&cache->lock);
+ }
+
/*
* Make sure that while we are scrubbing the corresponding block
* group doesn't get its logical address and its device extents
@@ -3557,6 +3746,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* group is not RO.
*/
ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
+ if (!ret && sctx->is_dev_replace) {
+ ret = finish_extent_writes_for_zoned(root, cache);
+ if (ret) {
+ btrfs_dec_block_group_ro(cache);
+ scrub_pause_off(fs_info);
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+
if (ret == 0) {
ro_set = 1;
} else if (ret == -ENOSPC && !sctx->is_dev_replace) {
@@ -3630,6 +3829,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
+ if (sctx->is_dev_replace &&
+ !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
+ cache, found_key.offset))
+ ro_set = 0;
+
+done:
down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
@@ -3704,10 +3909,12 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (bytenr + BTRFS_SUPER_INFO_SIZE >
scrub_dev->commit_total_bytes)
break;
+ if (!btrfs_check_super_location(scrub_dev, bytenr))
+ continue;
ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
- NULL, 1, bytenr);
+ NULL, bytenr);
if (ret)
return ret;
}
@@ -3821,14 +4028,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return -EINVAL;
}
- if (fs_info->sectorsize != PAGE_SIZE) {
- /* not supported for data w/o checksums */
- btrfs_err_rl(fs_info,
- "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
- fs_info->sectorsize, PAGE_SIZE);
- return -EINVAL;
- }
-
if (fs_info->nodesize >
PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
@@ -3855,7 +4054,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
goto out_free_ctx;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3866,8 +4065,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
- rcu_str_deref(dev->name));
+ btrfs_err_in_rcu(fs_info,
+ "scrub on devid %llu: filesystem on %s is not writable",
+ devid, rcu_str_deref(dev->name));
ret = -EROFS;
goto out;
}
@@ -4031,7 +4231,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
@@ -4042,7 +4242,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
}
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u64 extent_len,
+ u64 extent_logical, u32 extent_len,
u64 *extent_physical,
struct btrfs_device **extent_dev,
int *extent_mirror_num)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 340c76a12ce1..f87878274e9f 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -236,6 +236,7 @@ struct waiting_dir_move {
* after this directory is moved, we can try to rmdir the ino rmdir_ino.
*/
u64 rmdir_ino;
+ u64 rmdir_gen;
bool orphanized;
};
@@ -316,7 +317,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
static struct waiting_dir_move *
get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
-static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
static int need_send_hole(struct send_ctx *sctx)
{
@@ -1190,9 +1191,6 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
- /* data offset in the file extent item */
- u64 data_offset;
-
/* Just to check for bugs in backref resolving */
int found_itself;
};
@@ -1400,19 +1398,6 @@ static int find_extent_clone(struct send_ctx *sctx,
backref_ctx->cur_offset = data_offset;
backref_ctx->found_itself = 0;
backref_ctx->extent_len = num_bytes;
- /*
- * For non-compressed extents iterate_extent_inodes() gives us extent
- * offsets that already take into account the data offset, but not for
- * compressed extents, since the offset is logical and not relative to
- * the physical extent locations. We must take this into account to
- * avoid sending clone offsets that go beyond the source file's size,
- * which would result in the clone ioctl failing with -EINVAL on the
- * receiving end.
- */
- if (compressed == BTRFS_COMPRESS_NONE)
- backref_ctx->data_offset = 0;
- else
- backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
/*
* The last extent of a file may be too large due to page alignment.
@@ -2299,7 +2284,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
fs_path_reset(name);
- if (is_waiting_for_rm(sctx, ino)) {
+ if (is_waiting_for_rm(sctx, ino, gen)) {
ret = gen_unique_name(sctx, ino, gen, name);
if (ret < 0)
goto out;
@@ -2410,7 +2395,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
sctx->send_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
- le64_to_cpu(sctx->send_root->root_item.ctransid));
+ btrfs_root_ctransid(&sctx->send_root->root_item));
if (parent_root) {
if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
@@ -2419,7 +2404,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
parent_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
- le64_to_cpu(sctx->parent_root->root_item.ctransid));
+ btrfs_root_ctransid(&sctx->parent_root->root_item));
}
ret = send_cmd(sctx);
@@ -2858,8 +2843,8 @@ out:
return ret;
}
-static struct orphan_dir_info *
-add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
+ u64 dir_ino, u64 dir_gen)
{
struct rb_node **p = &sctx->orphan_dirs.rb_node;
struct rb_node *parent = NULL;
@@ -2868,20 +2853,23 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
while (*p) {
parent = *p;
entry = rb_entry(parent, struct orphan_dir_info, node);
- if (dir_ino < entry->ino) {
+ if (dir_ino < entry->ino)
p = &(*p)->rb_left;
- } else if (dir_ino > entry->ino) {
+ else if (dir_ino > entry->ino)
p = &(*p)->rb_right;
- } else {
+ else if (dir_gen < entry->gen)
+ p = &(*p)->rb_left;
+ else if (dir_gen > entry->gen)
+ p = &(*p)->rb_right;
+ else
return entry;
- }
}
odi = kmalloc(sizeof(*odi), GFP_KERNEL);
if (!odi)
return ERR_PTR(-ENOMEM);
odi->ino = dir_ino;
- odi->gen = 0;
+ odi->gen = dir_gen;
odi->last_dir_index_offset = 0;
rb_link_node(&odi->node, parent, p);
@@ -2889,8 +2877,8 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
return odi;
}
-static struct orphan_dir_info *
-get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx,
+ u64 dir_ino, u64 gen)
{
struct rb_node *n = sctx->orphan_dirs.rb_node;
struct orphan_dir_info *entry;
@@ -2901,15 +2889,19 @@ get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
n = n->rb_left;
else if (dir_ino > entry->ino)
n = n->rb_right;
+ else if (gen < entry->gen)
+ n = n->rb_left;
+ else if (gen > entry->gen)
+ n = n->rb_right;
else
return entry;
}
return NULL;
}
-static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
{
- struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+ struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
return odi != NULL;
}
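
An illustration, outside the kernel rbtree API, of the two-level key the hunks above switch to: orphan dir entries are looked up by (inode number, generation), so a reused inode number from a different generation can no longer match a stale entry. The names and comparator below are a sketch, not send.c code.

#include <stdint.h>
#include <stdio.h>

struct toy_orphan_key {
	uint64_t ino;
	uint64_t gen;
};

/* Returns <0, 0 or >0, mirroring how the rb-tree descent above picks left/right */
static int toy_orphan_cmp(const struct toy_orphan_key *a,
			  const struct toy_orphan_key *b)
{
	if (a->ino != b->ino)
		return a->ino < b->ino ? -1 : 1;
	if (a->gen != b->gen)
		return a->gen < b->gen ? -1 : 1;
	return 0;
}

int main(void)
{
	struct toy_orphan_key stale = { .ino = 260, .gen = 7 };
	struct toy_orphan_key fresh = { .ino = 260, .gen = 9 };

	/* Same ino, different gen: distinct keys, so no false match */
	printf("cmp = %d\n", toy_orphan_cmp(&stale, &fresh));
	return 0;
}
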
@@ -2954,7 +2946,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- odi = get_orphan_dir_info(sctx, dir);
+ odi = get_orphan_dir_info(sctx, dir, dir_gen);
if (odi)
key.offset = odi->last_dir_index_offset;
@@ -2985,7 +2977,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
dm = get_waiting_dir_move(sctx, loc.objectid);
if (dm) {
- odi = add_orphan_dir_info(sctx, dir);
+ odi = add_orphan_dir_info(sctx, dir, dir_gen);
if (IS_ERR(odi)) {
ret = PTR_ERR(odi);
goto out;
@@ -2993,12 +2985,13 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
odi->gen = dir_gen;
odi->last_dir_index_offset = found_key.offset;
dm->rmdir_ino = dir;
+ dm->rmdir_gen = dir_gen;
ret = 0;
goto out;
}
if (loc.objectid > send_progress) {
- odi = add_orphan_dir_info(sctx, dir);
+ odi = add_orphan_dir_info(sctx, dir, dir_gen);
if (IS_ERR(odi)) {
ret = PTR_ERR(odi);
goto out;
@@ -3038,6 +3031,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
return -ENOMEM;
dm->ino = ino;
dm->rmdir_ino = 0;
+ dm->rmdir_gen = 0;
dm->orphanized = orphanized;
while (*p) {
@@ -3183,7 +3177,7 @@ static int path_loop(struct send_ctx *sctx, struct fs_path *name,
while (ino != BTRFS_FIRST_FREE_OBJECTID) {
fs_path_reset(name);
- if (is_waiting_for_rm(sctx, ino))
+ if (is_waiting_for_rm(sctx, ino, gen))
break;
if (is_waiting_for_move(sctx, ino)) {
if (*ancestor_ino == 0)
@@ -3223,6 +3217,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
u64 parent_ino, parent_gen;
struct waiting_dir_move *dm = NULL;
u64 rmdir_ino = 0;
+ u64 rmdir_gen;
u64 ancestor;
bool is_orphan;
int ret;
@@ -3237,6 +3232,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
dm = get_waiting_dir_move(sctx, pm->ino);
ASSERT(dm);
rmdir_ino = dm->rmdir_ino;
+ rmdir_gen = dm->rmdir_gen;
is_orphan = dm->orphanized;
free_waiting_dir_move(sctx, dm);
@@ -3273,6 +3269,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
dm = get_waiting_dir_move(sctx, pm->ino);
ASSERT(dm);
dm->rmdir_ino = rmdir_ino;
+ dm->rmdir_gen = rmdir_gen;
}
goto out;
}
@@ -3291,7 +3288,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
struct orphan_dir_info *odi;
u64 gen;
- odi = get_orphan_dir_info(sctx, rmdir_ino);
+ odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen);
if (!odi) {
/* already deleted */
goto finish;
@@ -5101,7 +5098,7 @@ static int send_clone(struct send_ctx *sctx,
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
clone_root->root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
- le64_to_cpu(clone_root->root->root_item.ctransid));
+ btrfs_root_ctransid(&clone_root->root->root_item));
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
clone_root->offset);
@@ -5499,6 +5496,21 @@ static int clone_range(struct send_ctx *sctx,
break;
offset += clone_len;
clone_root->offset += clone_len;
+
+ /*
+ * If we are cloning from the file we are currently processing,
+ * and using the send root as the clone root, we must stop once
+ * the current clone offset reaches the current eof of the file
+ * at the receiver, otherwise we would issue an invalid clone
+ * operation (source range going beyond eof) and cause the
+ * receiver to fail. So if we reach the current eof, bail out
+ * and fallback to a regular write.
+ */
+ if (clone_root->root == sctx->send_root &&
+ clone_root->ino == sctx->cur_ino &&
+ clone_root->offset >= sctx->cur_inode_next_write_offset)
+ break;
+
data_offset += clone_len;
next:
path->slots[0]++;
@@ -6579,10 +6591,9 @@ static int changed_cb(struct btrfs_path *left_path,
struct btrfs_path *right_path,
struct btrfs_key *key,
enum btrfs_compare_tree_result result,
- void *ctx)
+ struct send_ctx *sctx)
{
int ret = 0;
- struct send_ctx *sctx = ctx;
if (result == BTRFS_COMPARE_TREE_SAME) {
if (key->type == BTRFS_INODE_REF_KEY ||
@@ -6787,7 +6798,7 @@ static int tree_compare_item(struct btrfs_path *left_path,
* If it detects a change, it aborts immediately.
*/
static int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root, void *ctx)
+ struct btrfs_root *right_root, struct send_ctx *sctx)
{
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
@@ -6939,7 +6950,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
- ctx);
+ sctx);
if (ret < 0)
goto out;
}
@@ -6950,7 +6961,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
- ctx);
+ sctx);
if (ret < 0)
goto out;
}
@@ -6964,7 +6975,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
- ctx);
+ sctx);
if (ret < 0)
goto out;
advance_left = ADVANCE;
@@ -6972,7 +6983,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
- ctx);
+ sctx);
if (ret < 0)
goto out;
advance_right = ADVANCE;
@@ -6987,7 +6998,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
else
result = BTRFS_COMPARE_TREE_SAME;
ret = changed_cb(left_path, right_path,
- &left_key, result, ctx);
+ &left_key, result, sctx);
if (ret < 0)
goto out;
advance_left = ADVANCE;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 64099565ab8f..2da6177f4b0b 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -140,6 +140,12 @@
* be freed, plus any delayed work we may not have gotten rid of in the case
* of metadata.
*
+ * FORCE_COMMIT_TRANS
+ * For use by the preemptive flusher. We use this to bypass the ticketing
+ * checks in may_commit_transaction, as we have more information about the
+ * overall state of the system and may want to commit the transaction ahead
+ * of actual ENOSPC conditions.
+ *
* OVERCOMMIT
*
* Because we hold so many reservations for metadata we will allow you to
@@ -163,6 +169,7 @@ u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
ASSERT(s_info);
return s_info->bytes_used + s_info->bytes_reserved +
s_info->bytes_pinned + s_info->bytes_readonly +
+ s_info->bytes_zone_unusable +
(may_use_included ? s_info->bytes_may_use : 0);
}
@@ -206,6 +213,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->ro_bgs);
INIT_LIST_HEAD(&space_info->tickets);
INIT_LIST_HEAD(&space_info->priority_tickets);
+ space_info->clamp = 1;
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
@@ -257,7 +265,7 @@ out:
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly,
+ u64 bytes_readonly, u64 bytes_zone_unusable,
struct btrfs_space_info **space_info)
{
struct btrfs_space_info *found;
@@ -273,6 +281,7 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
found->bytes_readonly += bytes_readonly;
+ found->bytes_zone_unusable += bytes_zone_unusable;
if (total_bytes > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
@@ -422,10 +431,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
info->total_bytes - btrfs_space_info_used(info, true),
info->full ? "" : "not ");
btrfs_info(fs_info,
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
+ "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
info->total_bytes, info->bytes_used, info->bytes_pinned,
info->bytes_reserved, info->bytes_may_use,
- info->bytes_readonly);
+ info->bytes_readonly, info->bytes_zone_unusable);
DUMP_BLOCK_RSV(fs_info, global_block_rsv);
DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
@@ -454,9 +463,10 @@ again:
list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
btrfs_info(fs_info,
- "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
+ "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s",
cache->start, cache->length, cache->used, cache->pinned,
- cache->reserved, cache->ro ? "[readonly]" : "");
+ cache->reserved, cache->zone_unusable,
+ cache->ro ? "[readonly]" : "");
spin_unlock(&cache->lock);
btrfs_dump_free_space(cache, bytes);
}
@@ -489,7 +499,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
{
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
- u64 dio_bytes;
+ u64 ordered_bytes;
u64 items;
long time_left;
int loops;
@@ -513,26 +523,24 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
- dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
- if (delalloc_bytes == 0 && dio_bytes == 0) {
- if (trans)
- return;
- if (wait_ordered)
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ if (delalloc_bytes == 0 && ordered_bytes == 0)
return;
- }
/*
* If we are doing more ordered than delalloc we need to just wait on
* ordered extents, otherwise we'll waste time trying to flush delalloc
* that likely won't give us the space back we need.
*/
- if (dio_bytes > delalloc_bytes)
+ if (ordered_bytes > delalloc_bytes)
wait_ordered = true;
loops = 0;
- while ((delalloc_bytes || dio_bytes) && loops < 3) {
- btrfs_start_delalloc_roots(fs_info, items);
+ while ((delalloc_bytes || ordered_bytes) && loops < 3) {
+ u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+ long nr_pages = min_t(u64, temp, LONG_MAX);
+
+ btrfs_start_delalloc_roots(fs_info, nr_pages, true);
loops++;
if (wait_ordered && !trans) {
@@ -553,15 +561,16 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
- dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
+ ordered_bytes = percpu_counter_sum_positive(
+ &fs_info->ordered_bytes);
}
}
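
A userspace sketch of the page-count computation added to the loop above: the number handed to btrfs_start_delalloc_roots() is the smaller of the delalloc backlog and the reclaim target, converted to pages and clamped to LONG_MAX (the 4K page shift below is an assumption for illustration).

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define ILLUSTRATIVE_PAGE_SHIFT 12	/* 4K pages */

static long delalloc_pages_to_flush(uint64_t delalloc_bytes, uint64_t to_reclaim)
{
	uint64_t temp = (delalloc_bytes < to_reclaim ? delalloc_bytes : to_reclaim)
			>> ILLUSTRATIVE_PAGE_SHIFT;

	/* Clamp so the value still fits in a long, like min_t(u64, temp, LONG_MAX) */
	return temp > LONG_MAX ? LONG_MAX : (long)temp;
}

int main(void)
{
	/* 1GiB of dirty delalloc but only a 64MiB reclaim target: flush 16384 pages */
	printf("%ld pages\n", delalloc_pages_to_flush(1ULL << 30, 64ULL << 20));
	return 0;
}
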
/**
- * maybe_commit_transaction - possibly commit the transaction if its ok to
- * @root - the root we're allocating for
- * @bytes - the number of bytes we want to reserve
- * @force - force the commit
+ * Possibly commit the transaction if it's ok to do so
+ *
+ * @fs_info: the filesystem
+ * @space_info: space_info we are checking for commit, either data or metadata
*
* This will check to make sure that committing the transaction will actually
* get us somewhere and then commit the transaction if it does. Otherwise it
@@ -667,7 +676,7 @@ enospc:
*/
static void flush_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 num_bytes,
- int state)
+ enum btrfs_flush_state state, bool for_preempt)
{
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_trans_handle *trans;
@@ -736,13 +745,21 @@ static void flush_space(struct btrfs_fs_info *fs_info,
case COMMIT_TRANS:
ret = may_commit_transaction(fs_info, space_info);
break;
+ case FORCE_COMMIT_TRANS:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_commit_transaction(trans);
+ break;
default:
ret = -ENOSPC;
break;
}
trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
- ret);
+ ret, for_preempt);
return;
}
@@ -752,7 +769,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
{
u64 used;
u64 avail;
- u64 expected;
u64 to_reclaim = space_info->reclaim_size;
lockdep_assert_held(&space_info->lock);
@@ -770,43 +786,88 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
if (space_info->total_bytes + avail < used)
to_reclaim += used - (space_info->total_bytes + avail);
- if (to_reclaim)
- return to_reclaim;
-
- to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL))
- return 0;
-
- used = btrfs_space_info_used(space_info, true);
-
- if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
- BTRFS_RESERVE_FLUSH_ALL))
- expected = div_factor_fine(space_info->total_bytes, 95);
- else
- expected = div_factor_fine(space_info->total_bytes, 90);
-
- if (used > expected)
- to_reclaim = used - expected;
- else
- to_reclaim = 0;
- to_reclaim = min(to_reclaim, space_info->bytes_may_use +
- space_info->bytes_reserved);
return to_reclaim;
}
-static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 used)
+static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
{
+ u64 ordered, delalloc;
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+ u64 used;
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
- return 0;
+ return false;
- if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
- return 0;
+ /*
+ * We have tickets queued, bail so we don't compete with the async
+ * flushers.
+ */
+ if (space_info->reclaim_size)
+ return false;
+
+ /*
+ * If we have over half of the free space occupied by reservations or
+ * pinned then we want to start flushing.
+ *
+ * We do not do the traditional thing here, which is to say
+ *
+ * if (used >= ((total_bytes + avail) / 2))
+ * return 1;
+ *
+ * because this doesn't quite work how we want. If we had more than 50%
+ * of the space_info used by bytes_used and we had 0 available we'd just
+ * constantly run the background flusher. Instead we want it to kick in
+ * if our reclaimable space exceeds our clamped free space.
+ *
+ * Our clamping range is 2^1 -> 2^8. Practically speaking that means
+ * the following:
+ *
+ * Amount of RAM Minimum threshold Maximum threshold
+ *
+ * 256GiB 1GiB 128GiB
+ * 128GiB 512MiB 64GiB
+ * 64GiB 256MiB 32GiB
+ * 32GiB 128MiB 16GiB
+ * 16GiB 64MiB 8GiB
+ *
+ * This is the range our thresholds will fall in, corresponding to how
+ * much delalloc we need for the background flusher to kick in.
+ */
+
+ thresh = calc_available_free_space(fs_info, space_info,
+ BTRFS_RESERVE_FLUSH_ALL);
+ thresh += (space_info->total_bytes - space_info->bytes_used -
+ space_info->bytes_reserved - space_info->bytes_readonly);
+ thresh >>= space_info->clamp;
+
+ used = space_info->bytes_pinned;
+
+ /*
+ * If we have more ordered bytes than delalloc bytes then we're either
+ * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
+ * around. Preemptive flushing is only useful in that it can free up
+ * space before tickets need to wait for things to finish. In the case
+ * of ordered extents, preemptively waiting on ordered extents gets us
+ * nothing; if our reservations are tied up in ordered extents we'll
+ * simply have to slow down writers by forcing them to wait on ordered
+ * extents.
+ *
+ * In the case that ordered is larger than delalloc, only include the
+ * block reserves that we would actually be able to directly reclaim
+ * from. In this case if we're heavy on metadata operations this will
+ * clearly be heavy enough to warrant preemptive flushing. In the case
+ * of heavy DIO or ordered reservations, preemptive flushing will just
+ * waste time and cause us to slow down.
+ */
+ ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ if (ordered >= delalloc)
+ used += fs_info->delayed_refs_rsv.reserved +
+ fs_info->delayed_block_rsv.reserved;
+ else
+ used += space_info->bytes_may_use;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
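
A userspace sketch of the clamped threshold used above: the preemptive flusher kicks in once reclaimable bytes exceed the free space shifted right by clamp. With clamp between 1 and 8 this reproduces the table in the comment, e.g. 256GiB of free space gives thresholds from 128GiB down to 1GiB.

#include <stdint.h>
#include <stdio.h>

static uint64_t preempt_threshold(uint64_t free_bytes, int clamp)
{
	/* Mirrors: thresh >>= space_info->clamp */
	return free_bytes >> clamp;
}

int main(void)
{
	const uint64_t free_bytes = 256ULL << 30;	/* 256GiB of free space */

	for (int clamp = 1; clamp <= 8; clamp++)
		printf("clamp=%d -> threshold %llu MiB\n", clamp,
		       (unsigned long long)(preempt_threshold(free_bytes, clamp) >> 20));
	return 0;
}
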
@@ -920,7 +981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 to_reclaim;
- int flush_state;
+ enum btrfs_flush_state flush_state;
int commit_cycles = 0;
u64 last_tickets_id;
@@ -939,7 +1000,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
flush_state = FLUSH_DELAYED_ITEMS_NR;
do {
- flush_space(fs_info, space_info, to_reclaim, flush_state);
+ flush_space(fs_info, space_info, to_reclaim, flush_state, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
@@ -988,6 +1049,105 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
}
/*
+ * This handles pre-flushing of metadata space before we get to the point that
+ * we need to start blocking threads on tickets. The logic here is different
+ * from the other flush paths because it doesn't rely on tickets to tell us how
+ * much we need to flush, instead it attempts to keep us below the 80% full
+ * watermark of space by flushing whichever reservation pool is currently the
+ * largest.
+ */
+static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *delayed_block_rsv;
+ struct btrfs_block_rsv *delayed_refs_rsv;
+ struct btrfs_block_rsv *global_rsv;
+ struct btrfs_block_rsv *trans_rsv;
+ int loops = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info,
+ preempt_reclaim_work);
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ delayed_block_rsv = &fs_info->delayed_block_rsv;
+ delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ global_rsv = &fs_info->global_block_rsv;
+ trans_rsv = &fs_info->trans_block_rsv;
+
+ spin_lock(&space_info->lock);
+ while (need_preemptive_reclaim(fs_info, space_info)) {
+ enum btrfs_flush_state flush;
+ u64 delalloc_size = 0;
+ u64 to_reclaim, block_rsv_size;
+ u64 global_rsv_size = global_rsv->reserved;
+
+ loops++;
+
+ /*
+ * We don't have a precise counter for the metadata being
+ * reserved for delalloc, so we'll approximate it by subtracting
+ * out the block rsv's space from the bytes_may_use. If that
+ * amount is higher than the individual reserves, then we can
+ * assume it's tied up in delalloc reservations.
+ */
+ block_rsv_size = global_rsv_size +
+ delayed_block_rsv->reserved +
+ delayed_refs_rsv->reserved +
+ trans_rsv->reserved;
+ if (block_rsv_size < space_info->bytes_may_use)
+ delalloc_size = space_info->bytes_may_use - block_rsv_size;
+ spin_unlock(&space_info->lock);
+
+ /*
+ * We don't want to include the global_rsv in our calculation,
+ * because that's space we can't touch. Subtract it from the
+ * block_rsv_size for the next checks.
+ */
+ block_rsv_size -= global_rsv_size;
+
+ /*
+ * We really want to avoid flushing delalloc too much, as it
+ * could result in poor allocation patterns, so only flush it if
+ * it's larger than the rest of the pools combined.
+ */
+ if (delalloc_size > block_rsv_size) {
+ to_reclaim = delalloc_size;
+ flush = FLUSH_DELALLOC;
+ } else if (space_info->bytes_pinned >
+ (delayed_block_rsv->reserved +
+ delayed_refs_rsv->reserved)) {
+ to_reclaim = space_info->bytes_pinned;
+ flush = FORCE_COMMIT_TRANS;
+ } else if (delayed_block_rsv->reserved >
+ delayed_refs_rsv->reserved) {
+ to_reclaim = delayed_block_rsv->reserved;
+ flush = FLUSH_DELAYED_ITEMS_NR;
+ } else {
+ to_reclaim = delayed_refs_rsv->reserved;
+ flush = FLUSH_DELAYED_REFS_NR;
+ }
+
+ /*
+ * We don't want to reclaim everything, just a portion, so scale
+ * down the to_reclaim by 1/4. If it takes us down to 0,
+ * reclaim 1 item's worth.
+ */
+ to_reclaim >>= 2;
+ if (!to_reclaim)
+ to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
+ flush_space(fs_info, space_info, to_reclaim, flush, true);
+ cond_resched();
+ spin_lock(&space_info->lock);
+ }
+
+ /* We only went through once, back off our clamping. */
+ if (loops == 1 && !space_info->reclaim_size)
+ space_info->clamp = max(1, space_info->clamp - 1);
+ trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+}
+
+/*
* FLUSH_DELALLOC_WAIT:
* Space is freed from flushing delalloc in one of two ways.
*
@@ -1052,7 +1212,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 last_tickets_id;
- int flush_state = 0;
+ enum btrfs_flush_state flush_state = 0;
fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
space_info = fs_info->data_sinfo;
@@ -1067,7 +1227,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
spin_unlock(&space_info->lock);
while (!space_info->full) {
- flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
@@ -1080,7 +1240,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
while (flush_state < ARRAY_SIZE(data_flush_states)) {
flush_space(fs_info, space_info, U64_MAX,
- data_flush_states[flush_state]);
+ data_flush_states[flush_state], false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
@@ -1113,6 +1273,8 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
+ INIT_WORK(&fs_info->preempt_reclaim_work,
+ btrfs_preempt_reclaim_metadata_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
@@ -1151,7 +1313,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
flush_state = 0;
do {
- flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
+ flush_space(fs_info, space_info, to_reclaim, states[flush_state],
+ false);
flush_state++;
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
@@ -1167,7 +1330,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
struct reserve_ticket *ticket)
{
while (!space_info->full) {
- flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
@@ -1212,11 +1375,14 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
}
/**
- * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
- * @fs_info - the fs
- * @space_info - the space_info for the reservation
- * @ticket - the ticket for the reservation
- * @flush - how much we can flush
+ * Do the appropriate flushing and waiting for a ticket
+ *
+ * @fs_info: the filesystem
+ * @space_info: space info for the reservation
+ * @ticket: ticket for the reservation
+ * @start_ns: timestamp when the reservation started
+ * @orig_bytes: amount of bytes originally reserved
+ * @flush: how much we can flush
*
* This does the work of figuring out how to flush for the ticket, waiting for
* the reservation, and returning the appropriate error if there is one.
@@ -1224,6 +1390,7 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket,
+ u64 start_ns, u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
@@ -1279,6 +1446,8 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
* space wasn't reserved at all).
*/
ASSERT(!(ticket->bytes == 0 && ticket->error));
+ trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
+ start_ns, flush, ticket->error);
return ret;
}
@@ -1292,12 +1461,31 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
}
+static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+
+ /*
+ * If we're heavy on ordered operations then clamping won't help us. We
+ * need to clamp specifically to keep up with dirtying buffered
+ * writers, because there's not a 1:1 correlation of writing delalloc
+ * and freeing space, like there is with flushing delayed refs or
+ * delayed nodes. If we're already more ordered than delalloc then
+ * we're keeping up, otherwise we aren't and should probably clamp.
+ */
+ if (ordered < delalloc)
+ space_info->clamp = min(space_info->clamp + 1, 8);
+}
+
/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @space_info - the space info we want to allocate from
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
+ * Try to reserve bytes from the block_rsv's space
+ *
+ * @fs_info: the filesystem
+ * @space_info: space info we want to allocate from
+ * @orig_bytes: number of bytes we want
+ * @flush: whether or not we can flush to make our reservation
*
* This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
@@ -1312,6 +1500,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
{
struct work_struct *async_work;
struct reserve_ticket ticket;
+ u64 start_ns = 0;
u64 used;
int ret = 0;
bool pending_tickets;
@@ -1364,6 +1553,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+ if (trace_btrfs_reserve_ticket_enabled())
+ start_ns = ktime_get_ns();
+
if (flush == BTRFS_RESERVE_FLUSH_ALL ||
flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
flush == BTRFS_RESERVE_FLUSH_DATA) {
@@ -1380,6 +1572,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
list_add_tail(&ticket.list,
&space_info->priority_tickets);
}
+
+ /*
+ * We were forced to add a reserve ticket, so our preemptive
+ * flushing is unable to keep up. Clamp down on the threshold
+ * for the preemptive flushing in order to keep up with the
+ * workload.
+ */
+ maybe_clamp_preempt(fs_info, space_info);
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
/*
@@ -1388,27 +1588,29 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
- need_do_async_reclaim(fs_info, space_info, used) &&
- !work_busy(&fs_info->async_reclaim_work)) {
+ need_preemptive_reclaim(fs_info, space_info) &&
+ !work_busy(&fs_info->preempt_reclaim_work)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
+ &fs_info->preempt_reclaim_work);
}
}
spin_unlock(&space_info->lock);
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
return ret;
- return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
+ return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
+ orig_bytes, flush);
}
/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
+ * Try to reserve metadata bytes from the block_rsv's space
+ *
+ * @root: the root we're allocating for
+ * @block_rsv: block_rsv we're allocating for
+ * @orig_bytes: number of bytes we want
+ * @flush: whether or not we can flush to make our reservation
*
* This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
@@ -1446,10 +1648,11 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
}
/**
- * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation
- * @fs_info - the filesystem
- * @bytes - the number of bytes we need
- * @flush - how we are allowed to flush
+ * Try to reserve data bytes for an allocation
+ *
+ * @fs_info: the filesystem
+ * @bytes: number of bytes we need
+ * @flush: how we are allowed to flush
*
* This will reserve bytes from the data space info. If there is not enough
* space then we will attempt to flush space as specified by flush.
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 5646393b928c..b1a8ffb03b3e 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -17,11 +17,17 @@ struct btrfs_space_info {
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
+ u64 bytes_zone_unusable; /* total bytes that are unusable until
+ resetting the device zone */
u64 max_extent_size; /* This will hold the maximum extent size of
the space info if we had an ENOSPC in the
allocator. */
+ int clamp; /* Used to scale our threshold for preemptive
+ flushing. The threshold is shifted right by
+ clamp, i.e. divided by 2^clamp. */
+
unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
@@ -119,7 +125,7 @@ DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly,
+ u64 bytes_readonly, u64 bytes_zone_unusable,
struct btrfs_space_info **space_info);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
@@ -152,4 +158,21 @@ static inline void btrfs_space_info_free_bytes_may_use(
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
+static inline void __btrfs_mod_total_bytes_pinned(
+ struct btrfs_space_info *space_info,
+ s64 mod)
+{
+ percpu_counter_add_batch(&space_info->total_bytes_pinned, mod,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH);
+}
+
+static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info,
+ u64 flags, s64 mod)
+{
+ struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags);
+
+ ASSERT(space_info);
+ __btrfs_mod_total_bytes_pinned(space_info, mod);
+}
+
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c46be27be700..8260f8bb3ff0 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -57,8 +57,9 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = member_offset >> PAGE_SHIFT; \
- const unsigned long oip = offset_in_page(member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ const unsigned long oip = get_eb_offset_in_page(token->eb, \
+ member_offset); \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
const int part = PAGE_SIZE - oip; \
@@ -85,8 +86,8 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = offset_in_page(member_offset); \
- const unsigned long idx = member_offset >> PAGE_SHIFT; \
+ const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
char *kaddr = page_address(eb->pages[idx]); \
const int size = sizeof(u##bits); \
const int part = PAGE_SIZE - oip; \
@@ -106,8 +107,9 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \
u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = member_offset >> PAGE_SHIFT; \
- const unsigned long oip = offset_in_page(member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ const unsigned long oip = get_eb_offset_in_page(token->eb, \
+ member_offset); \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
const int part = PAGE_SIZE - oip; \
@@ -136,8 +138,8 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = offset_in_page(member_offset); \
- const unsigned long idx = member_offset >> PAGE_SHIFT; \
+ const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
char *kaddr = page_address(eb->pages[idx]); \
const int size = sizeof(u##bits); \
const int part = PAGE_SIZE - oip; \
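
The get_eb_page_index()/get_eb_offset_in_page() helpers are not part of this hunk, so the following is only a hedged userspace illustration of the assumption behind them: on a subpage setup a tree block smaller than PAGE_SIZE can start at a non-zero offset inside its page, so the in-page position of a member has to account for the extent buffer's own in-page start instead of being a plain offset_in_page(member_offset).

#include <stdint.h>
#include <stdio.h>

#define ILLUSTRATIVE_PAGE_SIZE 65536UL	/* 64K page with a 4K nodesize scenario */

static unsigned long toy_eb_offset_in_page(uint64_t eb_start,
					   unsigned long member_offset)
{
	/* Fold the block's own in-page start into the member offset */
	return (unsigned long)((eb_start + member_offset) &
			       (ILLUSTRATIVE_PAGE_SIZE - 1));
}

int main(void)
{
	/* A 4K tree block starting 8K into its 64K page */
	uint64_t eb_start = (123ULL * ILLUSTRATIVE_PAGE_SIZE) + 8192;
	unsigned long member_offset = 101;	/* some field inside the block */

	/* Prints 8293, not 101 as a plain offset_in_page() would give */
	printf("in-page offset = %lu\n",
	       toy_eb_offset_in_page(eb_start, member_offset));
	return 0;
}
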
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
new file mode 100644
index 000000000000..c69049e7daa9
--- /dev/null
+++ b/fs/btrfs/subpage.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include "ctree.h"
+#include "subpage.h"
+
+int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page, enum btrfs_subpage_type type)
+{
+ struct btrfs_subpage *subpage = NULL;
+ int ret;
+
+ /*
+ * We have cases like a dummy extent buffer page, which is not mapped
+ * and doesn't need to be locked.
+ */
+ if (page->mapping)
+ ASSERT(PageLocked(page));
+ /* Either not subpage, or the page already has private attached */
+ if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
+ return 0;
+
+ ret = btrfs_alloc_subpage(fs_info, &subpage, type);
+ if (ret < 0)
+ return ret;
+ attach_page_private(page, subpage);
+ return 0;
+}
+
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ /* Either not subpage, or already detached */
+ if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
+ return;
+
+ subpage = (struct btrfs_subpage *)detach_page_private(page);
+ ASSERT(subpage);
+ btrfs_free_subpage(subpage);
+}
+
+int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ struct btrfs_subpage **ret,
+ enum btrfs_subpage_type type)
+{
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return 0;
+
+ *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
+ if (!*ret)
+ return -ENOMEM;
+ spin_lock_init(&(*ret)->lock);
+ if (type == BTRFS_SUBPAGE_METADATA)
+ atomic_set(&(*ret)->eb_refs, 0);
+ else
+ atomic_set(&(*ret)->readers, 0);
+ return 0;
+}
+
+void btrfs_free_subpage(struct btrfs_subpage *subpage)
+{
+ kfree(subpage);
+}
+
+/*
+ * Increase the eb_refs of the current subpage.
+ *
+ * This is important for eb allocation, to prevent a race with the freeing of
+ * the last eb in the same page.
+ * With eb_refs increased before the eb is inserted into the radix tree,
+ * detach_extent_buffer_page() won't detach the page private while we're still
+ * allocating the extent buffer.
+ */
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->mapping);
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ subpage = (struct btrfs_subpage *)page->private;
+ atomic_inc(&subpage->eb_refs);
+}
+
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->mapping);
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(atomic_read(&subpage->eb_refs));
+ atomic_dec(&subpage->eb_refs);
+}
+
+static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ /* Basic checks */
+ ASSERT(PagePrivate(page) && page->private);
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
+ /*
+ * The range check only works for mapped pages, as we can still have
+ * unmapped pages like dummy extent buffer pages.
+ */
+ if (page->mapping)
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+}
+
+void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = len >> fs_info->sectorsize_bits;
+ int ret;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ ret = atomic_add_return(nbits, &subpage->readers);
+ ASSERT(ret == nbits);
+}
+
+void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = len >> fs_info->sectorsize_bits;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+ ASSERT(atomic_read(&subpage->readers) >= nbits);
+ if (atomic_sub_and_test(nbits, &subpage->readers))
+ unlock_page(page);
+}
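A hypothetical read-side flow showing how the reader count above is meant to pair with the page lock (sketch only, not part of this patch; the helper names are made up):

/*
 * Hypothetical: declare the whole locked page as "being read", then drop
 * one sector's worth of readers from the endio path.  The page is unlocked
 * automatically when the last reader goes away.
 */
static void start_full_page_read(const struct btrfs_fs_info *fs_info,
				 struct page *page)
{
	/* Page must already be locked; the readers count starts at 0. */
	btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
}

static void end_sector_read(const struct btrfs_fs_info *fs_info,
			    struct page *page, u64 start)
{
	/* Called once per completed sector; unlocks the page on the last one. */
	btrfs_subpage_end_reader(fs_info, page, start, fs_info->sectorsize);
}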
+
+/*
+ * Convert the [start, start + len) range into a u16 bitmap
+ *
+ * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
+ */
+static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
+ const int nbits = len >> fs_info->sectorsize_bits;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ /*
+ * Here nbits can be 16, which would go beyond the u16 range. Do the first
+ * left shift in unsigned long (at least u32), then truncate the result
+ * to u16.
+ */
+ return (u16)(((1UL << nbits) - 1) << bit_start);
+}
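Working through the math above for the comment's example, assuming a 64K page with 4K sectors: start == page_offset(page) + 16K and len == 16K give bit_start = 16K >> 12 = 4 and nbits = 16K >> 12 = 4, so the result is ((1UL << 4) - 1) << 4 = 0xf << 4 = 0x00f0. For a full 64K page nbits is 16; 1UL << 16 still fits in unsigned long, and the final cast truncates the result to U16_MAX (0xffff).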
+
+void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->uptodate_bitmap |= tmp;
+ if (subpage->uptodate_bitmap == U16_MAX)
+ SetPageUptodate(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->uptodate_bitmap &= ~tmp;
+ ClearPageUptodate(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->error_bitmap |= tmp;
+ SetPageError(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->error_bitmap &= ~tmp;
+ if (subpage->error_bitmap == 0)
+ ClearPageError(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+/*
+ * Unlike set/clear, which depends on each page's status, for test all bits
+ * are tested in the same way.
+ */
+#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \
+bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
+ unsigned long flags; \
+ bool ret; \
+ \
+ spin_lock_irqsave(&subpage->lock, flags); \
+ ret = ((subpage->name##_bitmap & tmp) == tmp); \
+ spin_unlock_irqrestore(&subpage->lock, flags); \
+ return ret; \
+}
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
+
+/*
+ * Note that in selftests (extent-io-tests), a NULL fs_info can be passed in.
+ * We only test sectorsize == PAGE_SIZE cases so far, thus we can fall back
+ * to the regular sectorsize branch.
+ */
+#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \
+ test_page_func) \
+void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ set_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_set_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ clear_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clear_##name(fs_info, page, start, len); \
+} \
+bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
+ return test_page_func(page); \
+ return btrfs_subpage_test_##name(fs_info, page, start, len); \
+}
+IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
+ PageUptodate);
+IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
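A minimal usage sketch of the generated helpers (hypothetical caller, not part of this patch):

/*
 * Hypothetical: mark one sector of a page after its read completes.  The
 * btrfs_page_*() wrappers fall back to the whole-page flag when
 * sectorsize == PAGE_SIZE, so the same call works on regular filesystems.
 */
static void mark_sector_read_done(const struct btrfs_fs_info *fs_info,
				  struct page *page, u64 start, bool uptodate)
{
	const u32 len = fs_info->sectorsize;

	if (uptodate)
		btrfs_page_set_uptodate(fs_info, page, start, len);
	else
		btrfs_page_set_error(fs_info, page, start, len);
}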
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
new file mode 100644
index 000000000000..b86a4881475d
--- /dev/null
+++ b/fs/btrfs/subpage.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SUBPAGE_H
+#define BTRFS_SUBPAGE_H
+
+#include <linux/spinlock.h>
+
+/*
+ * The maximum page size we support is 64K and the minimum sector size is 4K,
+ * so a u16 bitmap is sufficient. The regular bitmap_* API is not used for
+ * size reasons.
+ */
+#define BTRFS_SUBPAGE_BITMAP_SIZE 16
+
+/*
+ * Structure to trace status of each sector inside a page, attached to
+ * page::private for both data and metadata inodes.
+ */
+struct btrfs_subpage {
+ /* Common members for both data and metadata pages */
+ spinlock_t lock;
+ u16 uptodate_bitmap;
+ u16 error_bitmap;
+ union {
+ /*
+ * Structures only used by metadata
+ *
+ * @eb_refs should only be operated under private_lock, as it
+ * manages whether the subpage can be detached.
+ */
+ atomic_t eb_refs;
+ /* Structures only used by data */
+ struct {
+ atomic_t readers;
+ };
+ };
+};
+
+enum btrfs_subpage_type {
+ BTRFS_SUBPAGE_METADATA,
+ BTRFS_SUBPAGE_DATA,
+};
+
+int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
+/* Allocate additional data where page represents more than one sector */
+int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ struct btrfs_subpage **ret,
+ enum btrfs_subpage_type type);
+void btrfs_free_subpage(struct btrfs_subpage *subpage);
+
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
+void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
+/*
+ * Template for subpage related operations.
+ *
+ * btrfs_subpage_*() are for call sites where the page has a subpage structure
+ * attached and the range is ensured to be inside the page.
+ *
+ * btrfs_page_*() are for call sites where the page can be either a subpage
+ * page or a regular page. These functions handle both cases, but the range
+ * still needs to be inside the page.
+ */
+#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
+void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len);
+
+DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
+DECLARE_BTRFS_SUBPAGE_OPS(error);
+
+#endif
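For readability, DECLARE_BTRFS_SUBPAGE_OPS(uptodate) above expands to these six prototypes (expansion shown only for illustration):

void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len);
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
				  struct page *page, u64 start, u32 len);
bool btrfs_subpage_test_uptodate(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len);
void btrfs_page_set_uptodate(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len);
void btrfs_page_clear_uptodate(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len);
bool btrfs_page_test_uptodate(const struct btrfs_fs_info *fs_info,
			      struct page *page, u64 start, u32 len);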
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8840a4fa81eb..f8435641b912 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -44,10 +44,10 @@
#include "backref.h"
#include "space-info.h"
#include "sysfs.h"
+#include "zoned.h"
#include "tests/btrfs-tests.h"
#include "block-group.h"
#include "discard.h"
-
#include "qgroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -174,7 +174,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
btrfs_discard_stop(fs_info);
/* btrfs handle error by forcing the filesystem readonly */
- sb->s_flags |= SB_RDONLY;
+ btrfs_set_sb_rdonly(sb);
btrfs_info(fs_info, "forced readonly");
/*
* Note that a running device replace operation is not canceled here
@@ -240,9 +240,13 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
vaf.fmt = fmt;
vaf.va = &args;
- if (__ratelimit(ratelimit))
- printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
- fs_info ? fs_info->sb->s_id : "<unknown>", &vaf);
+ if (__ratelimit(ratelimit)) {
+ if (fs_info)
+ printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
+ fs_info->sb->s_id, &vaf);
+ else
+ printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ }
va_end(args);
}
@@ -333,7 +337,6 @@ enum {
Opt_device,
Opt_fatal_errors,
Opt_flushoncommit, Opt_noflushoncommit,
- Opt_inode_cache, Opt_noinode_cache,
Opt_max_inline,
Opt_barrier, Opt_nobarrier,
Opt_datacow, Opt_nodatacow,
@@ -360,9 +363,13 @@ enum {
Opt_rescue,
Opt_usebackuproot,
Opt_nologreplay,
+ Opt_ignorebadroots,
+ Opt_ignoredatacsums,
+ Opt_rescue_all,
/* Deprecated options */
Opt_recovery,
+ Opt_inode_cache, Opt_noinode_cache,
/* Debugging options */
Opt_check_integrity,
@@ -455,9 +462,25 @@ static const match_table_t tokens = {
static const match_table_t rescue_tokens = {
{Opt_usebackuproot, "usebackuproot"},
{Opt_nologreplay, "nologreplay"},
+ {Opt_ignorebadroots, "ignorebadroots"},
+ {Opt_ignorebadroots, "ibadroots"},
+ {Opt_ignoredatacsums, "ignoredatacsums"},
+ {Opt_ignoredatacsums, "idatacsums"},
+ {Opt_rescue_all, "all"},
{Opt_err, NULL},
};
+static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
+ const char *opt_name)
+{
+ if (fs_info->mount_opt & opt) {
+ btrfs_err(fs_info, "%s must be used with ro mount option",
+ opt_name);
+ return true;
+ }
+ return false;
+}
+
static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
{
char *opts;
@@ -487,6 +510,23 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
+ case Opt_ignorebadroots:
+ btrfs_set_and_info(info, IGNOREBADROOTS,
+ "ignoring bad roots");
+ break;
+ case Opt_ignoredatacsums:
+ btrfs_set_and_info(info, IGNOREDATACSUMS,
+ "ignoring data csums");
+ break;
+ case Opt_rescue_all:
+ btrfs_info(info, "enabling all of the rescue options");
+ btrfs_set_and_info(info, IGNOREDATACSUMS,
+ "ignoring data csums");
+ btrfs_set_and_info(info, IGNOREBADROOTS,
+ "ignoring bad roots");
+ btrfs_set_and_info(info, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
case Opt_err:
btrfs_info(info, "unrecognized rescue option '%s'", p);
ret = -EINVAL;
@@ -511,7 +551,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
{
substring_t args[MAX_OPT_ARGS];
char *p, *num;
- u64 cache_gen;
int intarg;
int ret = 0;
char *compress_type;
@@ -521,11 +560,17 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
bool saved_compress_force;
int no_compress = 0;
- cache_gen = btrfs_super_cache_generation(info->super_copy);
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
- else if (cache_gen)
- btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ else if (btrfs_free_space_cache_v1_active(info)) {
+ if (btrfs_is_zoned(info)) {
+ btrfs_info(info,
+ "zoned: clearing existing space cache");
+ btrfs_set_super_cache_generation(info->super_copy, 0);
+ } else {
+ btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ }
+ }
/*
* Even the options are empty, we still need to do extra check
@@ -832,14 +877,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
break;
case Opt_inode_cache:
- btrfs_warn(info,
- "the 'inode_cache' option is deprecated and will have no effect from 5.11");
- btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
- "enabling inode map caching");
- break;
case Opt_noinode_cache:
- btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
- "disabling inode map caching");
+ btrfs_warn(info,
+ "the 'inode_cache' option is deprecated and has no effect since 5.11");
break;
case Opt_clear_cache:
btrfs_set_and_info(info, CLEAR_CACHE,
@@ -968,14 +1008,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
}
check:
- /*
- * Extra check for current option against current flag
- */
- if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) {
- btrfs_err(info,
- "nologreplay must be used with ro mount option");
+ /* We're read-only, don't have to check. */
+ if (new_flags & SB_RDONLY)
+ goto out;
+
+ if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+ check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+ check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
ret = -EINVAL;
- }
out:
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
!btrfs_test_opt(info, FREE_SPACE_TREE) &&
@@ -984,6 +1024,8 @@ out:
ret = -EINVAL;
}
+ if (!ret)
+ ret = btrfs_check_mountopts_zoned(info);
if (!ret && btrfs_test_opt(info, SPACE_CACHE))
btrfs_info(info, "disk space caching is enabled");
if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
@@ -1127,7 +1169,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
ret = -ENOMEM;
goto err;
}
- path->leave_spinning = 1;
name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!name) {
@@ -1256,7 +1297,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
/*
* Find the "default" dir item which points to the root item that we
@@ -1383,11 +1423,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return btrfs_commit_transaction(trans);
}
+static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed)
+{
+ seq_printf(seq, "%s%s", (*printed) ? ":" : ",rescue=", s);
+ *printed = true;
+}
+
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
const char *subvol_name;
+ bool printed = false;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1420,7 +1467,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (btrfs_test_opt(info, NOTREELOG))
seq_puts(seq, ",notreelog");
if (btrfs_test_opt(info, NOLOGREPLAY))
- seq_puts(seq, ",rescue=nologreplay");
+ print_rescue_option(seq, "nologreplay", &printed);
+ if (btrfs_test_opt(info, USEBACKUPROOT))
+ print_rescue_option(seq, "usebackuproot", &printed);
+ if (btrfs_test_opt(info, IGNOREBADROOTS))
+ print_rescue_option(seq, "ignorebadroots", &printed);
+ if (btrfs_test_opt(info, IGNOREDATACSUMS))
+ print_rescue_option(seq, "ignoredatacsums", &printed);
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD_SYNC))
@@ -1429,9 +1482,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",discard=async");
if (!(info->sb->s_flags & SB_POSIXACL))
seq_puts(seq, ",noacl");
- if (btrfs_test_opt(info, SPACE_CACHE))
+ if (btrfs_free_space_cache_v1_active(info))
seq_puts(seq, ",space_cache");
- else if (btrfs_test_opt(info, FREE_SPACE_TREE))
+ else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
@@ -1445,8 +1498,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",enospc_debug");
if (btrfs_test_opt(info, AUTO_DEFRAG))
seq_puts(seq, ",autodefrag");
- if (btrfs_test_opt(info, INODE_MAP_CACHE))
- seq_puts(seq, ",inode_cache");
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -1810,6 +1861,8 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
unsigned long old_opts)
{
+ const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
+
/*
* We need to cleanup all defragable inodes if the autodefragment is
* close or the filesystem is read only.
@@ -1826,12 +1879,15 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_cleanup(fs_info);
+
+ /* If we toggled space cache */
+ if (cache_opt != btrfs_free_space_cache_v1_active(fs_info))
+ btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_root *root = fs_info->tree_root;
unsigned old_flags = sb->s_flags;
unsigned long old_opts = fs_info->mount_opt;
unsigned long old_compress_type = fs_info->compress_type;
@@ -1862,6 +1918,22 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
+ btrfs_warn(fs_info,
+ "remount supports changing free space tree only from ro to rw");
+ /* Make sure free space cache options match the state on disk */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ if (btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ }
+
if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
@@ -1880,7 +1952,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
/* avoid complains from lockdep et al. */
up(&fs_info->uuid_tree_rescan_sem);
- sb->s_flags |= SB_RDONLY;
+ btrfs_set_sb_rdonly(sb);
/*
* Setting SB_RDONLY will put the cleaner thread to
@@ -1891,10 +1963,42 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
*/
btrfs_delete_unused_bgs(fs_info);
+ /*
+ * The cleaner task could be already running before we set the
+ * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
+ * We must make sure that after we finish the remount, i.e. after
+ * we call btrfs_commit_super(), the cleaner can no longer start
+ * a transaction - either because it was dropping a dead root,
+ * running delayed iputs or deleting an unused block group (the
+ * cleaner picked a block group from the list of unused block
+ * groups before we were able to in the previous call to
+ * btrfs_delete_unused_bgs()).
+ */
+ wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
+ TASK_UNINTERRUPTIBLE);
+
+ /*
+ * We've set the superblock to RO mode, so we might have made
+ * the cleaner task sleep without running all pending delayed
+ * iputs. Go through all the delayed iputs here, so that if an
+ * unmount happens without remounting RW, we don't end up
+ * finishing close_ctree() with a non-empty list of delayed
+ * iputs.
+ */
+ btrfs_run_delayed_iputs(fs_info);
+
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
btrfs_pause_balance(fs_info);
+ /*
+ * Pause the qgroup rescan worker if it is running. We don't want
+ * it to be still running after we are in RO mode, as after that,
+ * by the time we unmount, it might have left a transaction open,
+ * so we would leak the transaction and/or crash.
+ */
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
ret = btrfs_commit_super(fs_info);
if (ret)
goto restore;
@@ -1923,41 +2027,24 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
ret = -EINVAL;
goto restore;
}
-
- ret = btrfs_cleanup_fs_roots(fs_info);
- if (ret)
- goto restore;
-
- /* recover relocation */
- mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(root);
- mutex_unlock(&fs_info->cleaner_mutex);
- if (ret)
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ btrfs_warn(fs_info,
+ "read-write mount is not yet allowed for sectorsize %u page size %lu",
+ fs_info->sectorsize, PAGE_SIZE);
+ ret = -EINVAL;
goto restore;
+ }
- ret = btrfs_resume_balance_async(fs_info);
+ /*
+ * NOTE: when remounting with a change that does writes, don't
+ * put it anywhere above this point, as it is not safe to write
+ * until we pass the above checks.
+ */
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret)
goto restore;
- ret = btrfs_resume_dev_replace_async(fs_info);
- if (ret) {
- btrfs_warn(fs_info, "failed to resume dev_replace");
- goto restore;
- }
-
- btrfs_qgroup_rescan_resume(fs_info);
-
- if (!fs_info->uuid_root) {
- btrfs_info(fs_info, "creating UUID tree");
- ret = btrfs_create_uuid_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create the UUID tree %d",
- ret);
- goto restore;
- }
- }
- sb->s_flags &= ~SB_RDONLY;
+ btrfs_clear_sb_rdonly(sb);
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
}
@@ -1970,6 +2057,7 @@ out:
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_clear_oneshot_options(fs_info);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
return 0;
@@ -1978,6 +2066,8 @@ restore:
/* We've hit an error - don't reset SB_RDONLY */
if (sb_rdonly(sb))
old_flags |= SB_RDONLY;
+ if (!(old_flags & SB_RDONLY))
+ clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
sb->s_flags = old_flags;
fs_info->mount_opt = old_opts;
fs_info->compress_type = old_compress_type;
@@ -2156,7 +2246,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 total_used = 0;
u64 total_free_data = 0;
u64 total_free_meta = 0;
- int bits = dentry->d_sb->s_blocksize_bits;
+ u32 bits = fs_info->sectorsize_bits;
__be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
@@ -2463,6 +2553,11 @@ static void __init btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
", ref-verify=on"
#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+ ", zoned=yes"
+#else
+ ", zoned=no"
+#endif
;
pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
}
@@ -2523,8 +2618,6 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_end_io_wq;
- btrfs_init_lockdep();
-
btrfs_print_mod_info();
err = btrfs_run_sanity_tests();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 279d9262b676..6eb1c50fa98c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -263,6 +263,10 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
+/* Remove once support for zoned allocation is feature complete */
+#ifdef CONFIG_BTRFS_DEBUG
+BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+#endif
static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -278,6 +282,9 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
NULL
};
@@ -329,10 +336,35 @@ static ssize_t send_stream_version_show(struct kobject *kobj,
}
BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+static const char *rescue_opts[] = {
+ "usebackuproot",
+ "nologreplay",
+ "ignorebadroots",
+ "ignoredatacsums",
+ "all",
+};
+
+static ssize_t supported_rescue_options_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ (i ? " " : ""), rescue_opts[i]);
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_rescue_options,
+ supported_rescue_options_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
BTRFS_ATTR_PTR(static_feature, send_stream_version),
+ BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
NULL
};
@@ -433,7 +465,8 @@ static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
return -EINVAL;
WRITE_ONCE(discard_ctl->iops_limit, iops_limit);
-
+ btrfs_discard_calc_delay(discard_ctl);
+ btrfs_discard_schedule_work(discard_ctl, true);
return len;
}
BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show,
@@ -463,7 +496,7 @@ static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
return -EINVAL;
WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit);
-
+ btrfs_discard_schedule_work(discard_ctl, true);
return len;
}
BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show,
@@ -633,6 +666,7 @@ SPACE_INFO_ATTR(bytes_pinned);
SPACE_INFO_ATTR(bytes_reserved);
SPACE_INFO_ATTR(bytes_may_use);
SPACE_INFO_ATTR(bytes_readonly);
+SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
BTRFS_ATTR(space_info, total_bytes_pinned,
@@ -646,6 +680,7 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_reserved),
BTRFS_ATTR_PTR(space_info, bytes_may_use),
BTRFS_ATTR_PTR(space_info, bytes_readonly),
+ BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
@@ -854,6 +889,82 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
}
BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+static ssize_t btrfs_generation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
+}
+BTRFS_ATTR(, generation, btrfs_generation_show);
+
+/*
+ * Look for an exact string @string in @buffer with possible leading or
+ * trailing whitespace
+ */
+static bool strmatch(const char *buffer, const char *string)
+{
+ const size_t len = strlen(string);
+
+ /* Skip leading whitespace */
+ buffer = skip_spaces(buffer);
+
+ /* Match entire string, check if the rest is whitespace or empty */
+ if (strncmp(string, buffer, len) == 0 &&
+ strlen(skip_spaces(buffer + len)) == 0)
+ return true;
+
+ return false;
+}
+
+static const char * const btrfs_read_policy_name[] = { "pid" };
+
+static ssize_t btrfs_read_policy_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
+ if (fs_devices->read_policy == i)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]",
+ (ret == 0 ? "" : " "),
+ btrfs_read_policy_name[i]);
+ else
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ (ret == 0 ? "" : " "),
+ btrfs_read_policy_name[i]);
+ }
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+
+ return ret;
+}
+
+static ssize_t btrfs_read_policy_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ int i;
+
+ for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
+ if (strmatch(buf, btrfs_read_policy_name[i])) {
+ if (i != fs_devices->read_policy) {
+ fs_devices->read_policy = i;
+ btrfs_info(fs_devices->fs_info,
+ "read policy set to '%s'",
+ btrfs_read_policy_name[i]);
+ }
+ return len;
+ }
+ }
+
+ return -EINVAL;
+}
+BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -863,6 +974,8 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, metadata_uuid),
BTRFS_ATTR_PTR(, checksum),
BTRFS_ATTR_PTR(, exclusive_operation),
+ BTRFS_ATTR_PTR(, generation),
+ BTRFS_ATTR_PTR(, read_policy),
NULL,
};
@@ -1207,7 +1320,7 @@ static const char *alloc_name(u64 flags)
default:
WARN_ON(1);
return "invalid-combination";
- };
+ }
}
/*
@@ -1232,8 +1345,6 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
void btrfs_sysfs_remove_device(struct btrfs_device *device)
{
- struct hd_struct *disk;
- struct kobject *disk_kobj;
struct kobject *devices_kobj;
/*
@@ -1243,11 +1354,8 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device)
devices_kobj = device->fs_info->fs_devices->devices_kobj;
ASSERT(devices_kobj);
- if (device->bdev) {
- disk = device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(devices_kobj, disk_kobj->name);
- }
+ if (device->bdev)
+ sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name);
if (device->devid_kobj.state_initialized) {
kobject_del(&device->devid_kobj);
@@ -1353,11 +1461,7 @@ int btrfs_sysfs_add_device(struct btrfs_device *device)
nofs_flag = memalloc_nofs_save();
if (device->bdev) {
- struct hd_struct *disk;
- struct kobject *disk_kobj;
-
- disk = device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ struct kobject *disk_kobj = bdev_kobj(device->bdev);
ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
if (ret) {
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 999c14e5d0bd..6bd97bd4cb37 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -55,8 +55,14 @@ struct inode *btrfs_new_test_inode(void)
struct inode *inode;
inode = new_inode(test_mnt->mnt_sb);
- if (inode)
- inode_init_owner(inode, NULL, S_IFREG);
+ if (!inode)
+ return NULL;
+
+ inode->i_mode = S_IFREG;
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.offset = 0;
+ inode_init_owner(inode, NULL, S_IFREG);
return inode;
}
@@ -134,6 +140,7 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
@@ -224,7 +231,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
- btrfs_init_free_space_ctl(cache);
+ btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
mutex_init(&cache->free_space_lock);
return cache;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index df7ce874a74b..73e96d505f4f 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -379,54 +379,50 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
- unsigned long len;
unsigned long *bitmap = NULL;
struct extent_buffer *eb = NULL;
int ret;
test_msg("running extent buffer bitmap tests");
- /*
- * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than
- * BTRFS_MAX_METADATA_BLOCKSIZE.
- */
- len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE)
- ? sectorsize * 4 : sectorsize;
-
- fs_info = btrfs_alloc_dummy_fs_info(len, len);
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_std_err(TEST_ALLOC_FS_INFO);
return -ENOMEM;
}
- bitmap = kmalloc(len, GFP_KERNEL);
+ bitmap = kmalloc(nodesize, GFP_KERNEL);
if (!bitmap) {
test_err("couldn't allocate test bitmap");
ret = -ENOMEM;
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, 0, len);
+ eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
goto out;
}
- ret = __test_eb_bitmaps(bitmap, eb, len);
+ ret = __test_eb_bitmaps(bitmap, eb, nodesize);
if (ret)
goto out;
- /* Do it over again with an extent buffer which isn't page-aligned. */
free_extent_buffer(eb);
- eb = __alloc_dummy_extent_buffer(fs_info, nodesize / 2, len);
+
+ /*
+ * Test again for the case where the tree block is sectorsize aligned but
+ * not nodesize aligned.
+ */
+ eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
goto out;
}
- ret = __test_eb_bitmaps(bitmap, eb, len);
+ ret = __test_eb_bitmaps(bitmap, eb, nodesize);
out:
free_extent_buffer(eb);
kfree(bitmap);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 57379e96ccc9..c0aefe6dee0b 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -507,7 +507,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
goto out_free;
}
- ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+ ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1),
&logical, &out_ndaddrs, &out_stripe_len);
if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
test_err("didn't rmap anything but expected %d",
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index aebdf23f0cdd..8f05c1eb833f 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -399,7 +399,6 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
u64 offset;
u64 max_extent_size;
const struct btrfs_free_space_op test_free_space_ops = {
- .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds,
.use_bitmap = test_use_bitmap,
};
const struct btrfs_free_space_op *orig_free_space_ops;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index e6719f7db386..c9874b12d337 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -232,11 +232,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
return ret;
}
- inode->i_mode = S_IFREG;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
- BTRFS_I(inode)->location.offset = 0;
-
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_std_err(TEST_ALLOC_FS_INFO);
@@ -835,10 +830,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
return ret;
}
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
- BTRFS_I(inode)->location.offset = 0;
-
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_std_err(TEST_ALLOC_FS_INFO);
@@ -983,7 +974,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1050,7 +1042,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1082,7 +1075,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1097,7 +1091,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ce1ca8e73c2d..f3137285a9e2 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -36,7 +36,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
if (ret) {
test_err("couldn't insert ref %d", ret);
@@ -86,7 +85,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
test_err("couldn't find extent ref");
@@ -135,7 +133,6 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
test_std_err(TEST_ALLOC_ROOT);
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
if (ret) {
@@ -170,7 +167,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
test_err("couldn't find extent ref");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 52ada47aff50..acff6bb49a97 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -16,12 +16,12 @@
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
-#include "inode-map.h"
#include "volumes.h"
#include "dev-replace.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
+#include "zoned.h"
#define BTRFS_ROOT_TRANS_TAG 0
@@ -108,6 +108,11 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
__TRANS_JOIN |
__TRANS_JOIN_NOLOCK |
__TRANS_JOIN_NOSTART),
+ [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK |
+ __TRANS_JOIN_NOSTART),
[TRANS_STATE_COMPLETED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
@@ -155,6 +160,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
+ struct btrfs_caching_control *caching_ctl, *next;
down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
@@ -162,8 +168,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
- if (is_fstree(root->root_key.objectid))
- btrfs_unpin_free_ino(root);
extent_io_tree_release(&root->dirty_log_pages);
btrfs_qgroup_clean_swapped_blocks(root);
}
@@ -180,6 +184,47 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
spin_lock(&cur_trans->dropped_roots_lock);
}
spin_unlock(&cur_trans->dropped_roots_lock);
+
+ /*
+ * We have to update the last_byte_to_unpin under the commit_root_sem,
+ * at the same time we swap out the commit roots.
+ *
+ * This is because we must have a real view of the last spot the caching
+ * kthreads were at while caching. Consider the following views of the
+ * extent tree for a block group
+ *
+ * commit root
+ * +----+----+----+----+----+----+----+
+ * |\\\\| |\\\\|\\\\| |\\\\|\\\\|
+ * +----+----+----+----+----+----+----+
+ * 0 1 2 3 4 5 6 7
+ *
+ * new commit root
+ * +----+----+----+----+----+----+----+
+ * | | | |\\\\| | |\\\\|
+ * +----+----+----+----+----+----+----+
+ * 0 1 2 3 4 5 6 7
+ *
+ * If the cache_ctl->progress was at 3, then we are only allowed to
+ * unpin [0,1) and [2,3], because the caching thread has already
+ * processed those extents. We are not allowed to unpin [5,6), because
+ * the caching thread will re-start its search from 3, and thus find
+ * the hole from [4,6) to add to the free space cache.
+ */
+ spin_lock(&fs_info->block_group_cache_lock);
+ list_for_each_entry_safe(caching_ctl, next,
+ &fs_info->caching_block_groups, list) {
+ struct btrfs_block_group *cache = caching_ctl->block_group;
+
+ if (btrfs_block_group_done(cache)) {
+ cache->last_byte_to_unpin = (u64)-1;
+ list_del_init(&caching_ctl->list);
+ btrfs_put_caching_control(caching_ctl);
+ } else {
+ cache->last_byte_to_unpin = caching_ctl->progress;
+ }
+ }
+ spin_unlock(&fs_info->block_group_cache_lock);
up_write(&fs_info->commit_root_sem);
}
@@ -336,6 +381,8 @@ loop:
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
spin_lock_init(&cur_trans->dropped_roots_lock);
+ INIT_LIST_HEAD(&cur_trans->releasing_ebs);
+ spin_lock_init(&cur_trans->releasing_ebs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
@@ -787,10 +834,11 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
return trans;
}
-/* wait for a transaction commit to be fully complete */
-static noinline void wait_for_commit(struct btrfs_transaction *commit)
+/* Wait for a transaction commit to reach at least the given state. */
+static noinline void wait_for_commit(struct btrfs_transaction *commit,
+ const enum btrfs_trans_state min_state)
{
- wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
+ wait_event(commit->commit_wait, commit->state >= min_state);
}
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
@@ -845,7 +893,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
goto out; /* nothing committing|committed */
}
- wait_for_commit(cur_trans);
+ wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
btrfs_put_transaction(cur_trans);
out:
return ret;
@@ -856,24 +904,23 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info)
wait_current_trans(fs_info);
}
-static int should_end_transaction(struct btrfs_trans_handle *trans)
+static bool should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
if (btrfs_check_space_for_delayed_refs(fs_info))
- return 1;
+ return true;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
}
-int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
+bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
- smp_mb();
if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
- cur_trans->delayed_refs.flushing)
- return 1;
+ test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
+ return true;
return should_end_transaction(trans);
}
@@ -1191,10 +1238,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- return ret;
-
ret = btrfs_run_dev_stats(trans);
if (ret)
return ret;
@@ -1209,10 +1252,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
if (ret)
return ret;
- /* run_qgroups might have added some more refs */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- return ret;
again:
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
struct btrfs_root *root;
@@ -1227,15 +1266,24 @@ again:
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- return ret;
}
+ /* Now flush any delayed refs generated by updating all of the roots */
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ if (ret)
+ return ret;
+
while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
ret = btrfs_write_dirty_block_groups(trans);
if (ret)
return ret;
+
+ /*
+ * We're writing the dirty block groups, which could generate
+ * delayed refs, which could generate more dirty block groups,
+ * so we want to keep this flushing in this loop to make sure
+ * everything gets run.
+ */
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
@@ -1280,7 +1328,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
struct btrfs_root *gang[8];
int i;
int ret;
- int err = 0;
spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
@@ -1292,6 +1339,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
break;
for (i = 0; i < ret; i++) {
struct btrfs_root *root = gang[i];
+ int ret2;
+
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
@@ -1300,8 +1349,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
- btrfs_save_ino_cache(root, trans);
-
/* see comments in should_cow_block() */
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_mb__after_atomic();
@@ -1313,17 +1360,17 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
root->node);
}
- err = btrfs_update_root(trans, fs_info->tree_root,
+ ret2 = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key,
&root->root_item);
+ if (ret2)
+ return ret2;
spin_lock(&fs_info->fs_roots_radix_lock);
- if (err)
- break;
btrfs_qgroup_free_meta_all_pertrans(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
- return err;
+ return 0;
}
/*
@@ -1396,6 +1443,23 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
record_root_in_trans(trans, src, 1);
/*
+ * btrfs_qgroup_inherit relies on a consistent view of the usage for the
+ * src root, so we must run the delayed refs here.
+ *
+ * However, this isn't particularly foolproof, because there's no
+ * synchronization keeping us from changing the tree after this point
+ * before we do the qgroup_inherit, or even from making changes while
+ * we're doing the qgroup_inherit. But that's a problem for the future;
+ * for now, flush the delayed refs to narrow the race window where the
+ * qgroup counters could end up wrong.
+ */
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ /*
* We are going to commit transaction, see btrfs_commit_transaction()
* comment for reason locking tree_log_mutex
*/
@@ -1488,7 +1552,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ASSERT(pending->root_item);
new_root_item = pending->root_item;
- pending->error = btrfs_find_free_objectid(tree_root, &objectid);
+ pending->error = btrfs_get_free_objectid(tree_root, &objectid);
if (pending->error)
goto no_free_objectid;
@@ -1598,8 +1662,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- btrfs_set_lock_blocking_write(old);
-
ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
/* clean up in any case */
btrfs_tree_unlock(old);
@@ -1650,12 +1712,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
/*
* Do special qgroup accounting for snapshot, as we do some qgroup
* snapshot hack to do fast snapshot.
@@ -1681,7 +1737,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
dentry->d_name.len * 2);
parent_inode->i_mtime = parent_inode->i_ctime =
current_time(parent_inode);
- ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
+ ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1703,12 +1759,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
}
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
fail:
pending->error = ret;
dir_item_existed:
@@ -1761,6 +1811,8 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->root_level = root_item->level;
if (btrfs_test_opt(fs_info, SPACE_CACHE))
super->cache_generation = root_item->generation;
+ else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags))
+ super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
}
@@ -1956,10 +2008,8 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
}
}
-static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
/*
* We use writeback_inodes_sb here because if we used
* btrfs_start_delalloc_roots we would deadlock with fs freeze.
@@ -1969,50 +2019,15 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
* from already being in a transaction and our join_transaction doesn't
* have to re-take the fs freeze lock.
*/
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
- } else {
- struct btrfs_pending_snapshot *pending;
- struct list_head *head = &trans->transaction->pending_snapshots;
-
- /*
- * Flush dellaloc for any root that is going to be snapshotted.
- * This is done to avoid a corrupted version of files, in the
- * snapshots, that had both buffered and direct IO writes (even
- * if they were done sequentially) due to an unordered update of
- * the inode's size on disk.
- */
- list_for_each_entry(pending, head, list) {
- int ret;
-
- ret = btrfs_start_delalloc_snapshot(pending->root);
- if (ret)
- return ret;
- }
- }
return 0;
}
-static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans)
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- } else {
- struct btrfs_pending_snapshot *pending;
- struct list_head *head = &trans->transaction->pending_snapshots;
-
- /*
- * Wait for any dellaloc that we started previously for the roots
- * that are going to be snapshotted. This is to avoid a corrupted
- * version of files in the snapshots that had both buffered and
- * direct IO writes (even if they were done sequentially).
- */
- list_for_each_entry(pending, head, list)
- btrfs_wait_ordered_extents(pending->root,
- U64_MAX, 0, U64_MAX);
- }
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -2042,32 +2057,25 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
- /* make a pass through all the delayed refs we have so far
- * any runnings procs may add more while we are here
- */
- ret = btrfs_run_delayed_refs(trans, 0);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
-
- cur_trans = trans->transaction;
-
/*
- * set the flushing flag so procs in this transaction have to
- * start sending their work down.
+ * We only want one transaction commit doing the flushing so we do not
+ * waste a bunch of time on lock contention on the extent root node.
*/
- cur_trans->delayed_refs.flushing = 1;
- smp_wmb();
+ if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
+ &cur_trans->delayed_refs.flags)) {
+ /*
+ * Make a pass through all the delayed refs we have so far.
+ * Any running threads may add more while we are here.
+ */
+ ret = btrfs_run_delayed_refs(trans, 0);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+ }
btrfs_create_pending_block_groups(trans);
- ret = btrfs_run_delayed_refs(trans, 0);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
-
if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
int run_it = 0;
@@ -2101,11 +2109,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
spin_lock(&fs_info->trans_lock);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
+
spin_unlock(&fs_info->trans_lock);
refcount_inc(&cur_trans->use_count);
- ret = btrfs_end_transaction(trans);
- wait_for_commit(cur_trans);
+ if (trans->in_fsync)
+ want_state = TRANS_STATE_SUPER_COMMITTED;
+ ret = btrfs_end_transaction(trans);
+ wait_for_commit(cur_trans, want_state);
if (TRANS_ABORTED(cur_trans))
ret = cur_trans->aborted;
@@ -2119,13 +2131,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_blocked_wait);
if (cur_trans->list.prev != &fs_info->trans_list) {
+ enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
+
+ if (trans->in_fsync)
+ want_state = TRANS_STATE_SUPER_COMMITTED;
+
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
- if (prev_trans->state != TRANS_STATE_COMPLETED) {
+ if (prev_trans->state < want_state) {
refcount_inc(&prev_trans->use_count);
spin_unlock(&fs_info->trans_lock);
- wait_for_commit(prev_trans);
+ wait_for_commit(prev_trans, want_state);
+
ret = READ_ONCE(prev_trans->aborted);
btrfs_put_transaction(prev_trans);
@@ -2150,7 +2168,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
extwriter_counter_dec(cur_trans, trans->type);
- ret = btrfs_start_delalloc_flush(trans);
+ ret = btrfs_start_delalloc_flush(fs_info);
if (ret)
goto cleanup_transaction;
@@ -2166,7 +2184,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto cleanup_transaction;
- btrfs_wait_delalloc_flush(trans);
+ btrfs_wait_delalloc_flush(fs_info);
/*
* Wait for all ordered extents started by a fast fsync that joined this
@@ -2265,14 +2283,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_free_log_root_tree(trans, fs_info);
/*
- * commit_fs_roots() can call btrfs_save_ino_cache(), which generates
- * new delayed refs. Must handle them or qgroup can be wrong.
- */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- goto unlock_tree_log;
-
- /*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
@@ -2293,8 +2303,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
goto unlock_tree_log;
}
- btrfs_prepare_extent_commit(fs_info);
-
cur_trans = fs_info->running_transaction;
btrfs_set_root_node(&fs_info->tree_root->root_item,
@@ -2345,6 +2353,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
goto scrub_continue;
}
+ /*
+ * At this point, we should have written all the tree blocks allocated
+ * in this transaction. So it's now safe to free the redirtied extent
+ * buffers.
+ */
+ btrfs_free_redirty_list(cur_trans);
+
ret = write_all_supers(fs_info, 0);
/*
* the super is written, we can safely allow the tree-loggers
@@ -2354,6 +2369,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto scrub_continue;
+ /*
+ * We needn't acquire the lock here because there is no other task
+ * which can change it.
+ */
+ cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
+ wake_up(&cur_trans->commit_wait);
+
btrfs_finish_extent_commit(trans);
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
@@ -2435,10 +2457,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
btrfs_kill_all_delayed_nodes(root);
- if (root->ino_cache_inode) {
- iput(root->ino_cache_inode);
- root->ino_cache_inode = NULL;
- }
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
@@ -2459,16 +2477,6 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
if (!prev)
return;
- bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
- if (prev & bit)
- btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
- prev &= ~bit;
-
- bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
- if (prev & bit)
- btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
- prev &= ~bit;
-
bit = 1 << BTRFS_PENDING_COMMIT;
if (prev & bit)
btrfs_debug(fs_info, "pending commit done");
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 858d9153a1cd..6335716e513f 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -16,6 +16,7 @@ enum btrfs_trans_state {
TRANS_STATE_COMMIT_START,
TRANS_STATE_COMMIT_DOING,
TRANS_STATE_UNBLOCKED,
+ TRANS_STATE_SUPER_COMMITTED,
TRANS_STATE_COMPLETED,
TRANS_STATE_MAX,
};
@@ -92,6 +93,9 @@ struct btrfs_transaction {
*/
atomic_t pending_ordered;
wait_queue_head_t pending_wait;
+
+ spinlock_t releasing_ebs_lock;
+ struct list_head releasing_ebs;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -112,7 +116,6 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
-#define BTRFS_DIO_SYNC_STUB ((void *)2)
struct btrfs_trans_handle {
u64 transid;
@@ -134,6 +137,7 @@ struct btrfs_trans_handle {
bool can_flush_pending_bgs;
bool reloc_reserved;
bool dirty;
+ bool in_fsync;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
@@ -219,7 +223,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
int wait_for_unblock);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
-int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
+bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
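
The new releasing_ebs_lock/releasing_ebs pair added to struct btrfs_transaction above backs the btrfs_redirty_list_add()/btrfs_free_redirty_list() calls in transaction.c and tree-log.c: redirtied extent buffers are queued on the transaction and only released once all tree blocks of the transaction have been written. A small userspace sketch of that queue-then-drain pattern, in generic C with placeholder types, not the btrfs implementation.

#include <pthread.h>
#include <stdlib.h>

struct eb {                             /* stand-in for struct extent_buffer */
        struct eb *next;
        int dirty;
};

static pthread_mutex_t releasing_lock = PTHREAD_MUTEX_INITIALIZER;
static struct eb *releasing_list;       /* stand-in for transaction->releasing_ebs */

/* queue a buffer that must outlive the current tree-log writeback */
void redirty_list_add(struct eb *eb)
{
        pthread_mutex_lock(&releasing_lock);
        eb->next = releasing_list;
        releasing_list = eb;
        pthread_mutex_unlock(&releasing_lock);
}

/* called once every tree block of the transaction is on disk */
void free_redirty_list(void)
{
        pthread_mutex_lock(&releasing_lock);
        while (releasing_list) {
                struct eb *eb = releasing_list;

                releasing_list = eb->next;
                eb->dirty = 0;
                free(eb);
        }
        pthread_mutex_unlock(&releasing_lock);
}
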
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index f0ffd5ee77bd..582061c7b547 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -100,7 +100,8 @@ static void file_extent_err(const struct extent_buffer *eb, int slot,
*/
#define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \
({ \
- if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
+ if (unlikely(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), \
+ (alignment)))) \
file_extent_err((leaf), (slot), \
"invalid %s for file extent, have %llu, should be aligned to %u", \
(#name), btrfs_file_extent_##name((leaf), (fi)), \
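
Most of the tree-checker changes that follow are mechanical: every corruption check gets wrapped in unlikely() so the compiler can lay the error-reporting path out of line, keeping the common "item is valid" path hot. A tiny standalone C illustration of what the annotation does; the unlikely() definition shown is the usual GCC/Clang idiom, not copied from btrfs headers.

/* hint that the condition is almost always false */
#define unlikely(x)     __builtin_expect(!!(x), 0)

int check_sector_aligned(unsigned long long value, unsigned int sectorsize)
{
        if (unlikely(value % sectorsize != 0)) {
                /* the cold path: the tree-checker would report the
                 * corruption here and return -EUCLEAN */
                return -1;
        }
        return 0;
}
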
@@ -203,7 +204,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
u32 item_size = btrfs_item_size_nr(leaf, slot);
u64 extent_end;
- if (!IS_ALIGNED(key->offset, sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
file_extent_err(leaf, slot,
"unaligned file_offset for file extent, have %llu should be aligned to %u",
key->offset, sectorsize);
@@ -216,7 +217,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
* But if objectids mismatch, it means we have a missing
* INODE_ITEM.
*/
- if (!check_prev_ino(leaf, key, slot, prev_key))
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
@@ -225,14 +226,15 @@ static int check_extent_data_item(struct extent_buffer *leaf,
* Make sure the item contains at least inline header, so the file
* extent type is not some garbage.
*/
- if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) {
+ if (unlikely(item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START)) {
file_extent_err(leaf, slot,
"invalid item size, have %u expect [%zu, %u)",
item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START,
SZ_4K);
return -EUCLEAN;
}
- if (btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES) {
+ if (unlikely(btrfs_file_extent_type(leaf, fi) >=
+ BTRFS_NR_FILE_EXTENT_TYPES)) {
file_extent_err(leaf, slot,
"invalid type for file extent, have %u expect range [0, %u]",
btrfs_file_extent_type(leaf, fi),
@@ -244,14 +246,15 @@ static int check_extent_data_item(struct extent_buffer *leaf,
* Support for new compression/encryption must introduce incompat flag,
* and must be caught in open_ctree().
*/
- if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) {
+ if (unlikely(btrfs_file_extent_compression(leaf, fi) >=
+ BTRFS_NR_COMPRESS_TYPES)) {
file_extent_err(leaf, slot,
"invalid compression for file extent, have %u expect range [0, %u]",
btrfs_file_extent_compression(leaf, fi),
BTRFS_NR_COMPRESS_TYPES - 1);
return -EUCLEAN;
}
- if (btrfs_file_extent_encryption(leaf, fi)) {
+ if (unlikely(btrfs_file_extent_encryption(leaf, fi))) {
file_extent_err(leaf, slot,
"invalid encryption for file extent, have %u expect 0",
btrfs_file_extent_encryption(leaf, fi));
@@ -259,7 +262,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
/* Inline extent must have 0 as key offset */
- if (key->offset) {
+ if (unlikely(key->offset)) {
file_extent_err(leaf, slot,
"invalid file_offset for inline file extent, have %llu expect 0",
key->offset);
@@ -272,8 +275,8 @@ static int check_extent_data_item(struct extent_buffer *leaf,
return 0;
/* Uncompressed inline extent size must match item size */
- if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
- btrfs_file_extent_ram_bytes(leaf, fi)) {
+ if (unlikely(item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi))) {
file_extent_err(leaf, slot,
"invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
@@ -284,22 +287,22 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
/* Regular or preallocated extent has fixed item size */
- if (item_size != sizeof(*fi)) {
+ if (unlikely(item_size != sizeof(*fi))) {
file_extent_err(leaf, slot,
"invalid item size for reg/prealloc file extent, have %u expect %zu",
item_size, sizeof(*fi));
return -EUCLEAN;
}
- if (CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))
+ if (unlikely(CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)))
return -EUCLEAN;
/* Catch extent end overflow */
- if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
- key->offset, &extent_end)) {
+ if (unlikely(check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
+ key->offset, &extent_end))) {
file_extent_err(leaf, slot,
"extent end overflow, have file offset %llu extent num bytes %llu",
key->offset,
@@ -320,7 +323,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
prev_fi = btrfs_item_ptr(leaf, slot - 1,
struct btrfs_file_extent_item);
prev_end = file_extent_end(leaf, prev_key, prev_fi);
- if (prev_end > key->offset) {
+ if (unlikely(prev_end > key->offset)) {
file_extent_err(leaf, slot - 1,
"file extent end range (%llu) goes beyond start offset (%llu) of the next file extent",
prev_end, key->offset);
@@ -336,21 +339,21 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
u32 sectorsize = fs_info->sectorsize;
- u32 csumsize = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csumsize = fs_info->csum_size;
- if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
+ if (unlikely(key->objectid != BTRFS_EXTENT_CSUM_OBJECTID)) {
generic_err(leaf, slot,
"invalid key objectid for csum item, have %llu expect %llu",
key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
return -EUCLEAN;
}
- if (!IS_ALIGNED(key->offset, sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
generic_err(leaf, slot,
"unaligned key offset for csum item, have %llu should be aligned to %u",
key->offset, sectorsize);
return -EUCLEAN;
}
- if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
+ if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) {
generic_err(leaf, slot,
"unaligned item size for csum item, have %u should be aligned to %u",
btrfs_item_size_nr(leaf, slot), csumsize);
@@ -363,7 +366,7 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
prev_item_size = btrfs_item_size_nr(leaf, slot - 1);
prev_csum_end = (prev_item_size / csumsize) * sectorsize;
prev_csum_end += prev_key->offset;
- if (prev_csum_end > key->offset) {
+ if (unlikely(prev_csum_end > key->offset)) {
generic_err(leaf, slot - 1,
"csum end range (%llu) goes beyond the start range (%llu) of the next csum item",
prev_csum_end, key->offset);
@@ -388,15 +391,16 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
/* For XATTR_ITEM, location key should be all 0 */
if (item_key.type == BTRFS_XATTR_ITEM_KEY) {
- if (key->type != 0 || key->objectid != 0 || key->offset != 0)
+ if (unlikely(key->objectid != 0 || key->type != 0 ||
+ key->offset != 0))
return -EUCLEAN;
return 0;
}
- if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
- key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
- key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
- key->objectid != BTRFS_FREE_INO_OBJECTID) {
+ if (unlikely((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
+ key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
+ key->objectid != BTRFS_FREE_INO_OBJECTID)) {
if (is_inode_item) {
generic_err(leaf, slot,
"invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
@@ -414,7 +418,7 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
}
return -EUCLEAN;
}
- if (key->offset != 0) {
+ if (unlikely(key->offset != 0)) {
if (is_inode_item)
inode_item_err(leaf, slot,
"invalid key offset: has %llu expect 0",
@@ -438,7 +442,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY);
/* No such tree id */
- if (key->objectid == 0) {
+ if (unlikely(key->objectid == 0)) {
if (is_root_item)
generic_err(leaf, slot, "invalid root id 0");
else
@@ -448,7 +452,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
}
/* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
- if (!is_fstree(key->objectid) && !is_root_item) {
+ if (unlikely(!is_fstree(key->objectid) && !is_root_item)) {
dir_item_err(leaf, slot,
"invalid location key objectid, have %llu expect [%llu, %llu]",
key->objectid, BTRFS_FIRST_FREE_OBJECTID,
@@ -464,7 +468,8 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
* So here we only check offset for reloc tree whose key->offset must
* be a valid tree.
*/
- if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) {
+ if (unlikely(key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ key->offset == 0)) {
generic_err(leaf, slot, "invalid root id 0 for reloc tree");
return -EUCLEAN;
}
@@ -480,8 +485,9 @@ static int check_dir_item(struct extent_buffer *leaf,
u32 item_size = btrfs_item_size_nr(leaf, slot);
u32 cur = 0;
- if (!check_prev_ino(leaf, key, slot, prev_key))
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
+
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
while (cur < item_size) {
struct btrfs_key location_key;
@@ -494,7 +500,7 @@ static int check_dir_item(struct extent_buffer *leaf,
int ret;
/* header itself should not cross item boundary */
- if (cur + sizeof(*di) > item_size) {
+ if (unlikely(cur + sizeof(*di) > item_size)) {
dir_item_err(leaf, slot,
"dir item header crosses item boundary, have %zu boundary %u",
cur + sizeof(*di), item_size);
@@ -505,12 +511,12 @@ static int check_dir_item(struct extent_buffer *leaf,
btrfs_dir_item_key_to_cpu(leaf, di, &location_key);
if (location_key.type == BTRFS_ROOT_ITEM_KEY) {
ret = check_root_key(leaf, &location_key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
} else if (location_key.type == BTRFS_INODE_ITEM_KEY ||
location_key.type == 0) {
ret = check_inode_key(leaf, &location_key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
} else {
dir_item_err(leaf, slot,
@@ -522,22 +528,22 @@ static int check_dir_item(struct extent_buffer *leaf,
/* dir type check */
dir_type = btrfs_dir_type(leaf, di);
- if (dir_type >= BTRFS_FT_MAX) {
+ if (unlikely(dir_type >= BTRFS_FT_MAX)) {
dir_item_err(leaf, slot,
"invalid dir item type, have %u expect [0, %u)",
dir_type, BTRFS_FT_MAX);
return -EUCLEAN;
}
- if (key->type == BTRFS_XATTR_ITEM_KEY &&
- dir_type != BTRFS_FT_XATTR) {
+ if (unlikely(key->type == BTRFS_XATTR_ITEM_KEY &&
+ dir_type != BTRFS_FT_XATTR)) {
dir_item_err(leaf, slot,
"invalid dir item type for XATTR key, have %u expect %u",
dir_type, BTRFS_FT_XATTR);
return -EUCLEAN;
}
- if (dir_type == BTRFS_FT_XATTR &&
- key->type != BTRFS_XATTR_ITEM_KEY) {
+ if (unlikely(dir_type == BTRFS_FT_XATTR &&
+ key->type != BTRFS_XATTR_ITEM_KEY)) {
dir_item_err(leaf, slot,
"xattr dir type found for non-XATTR key");
return -EUCLEAN;
@@ -550,13 +556,13 @@ static int check_dir_item(struct extent_buffer *leaf,
/* Name/data length check */
name_len = btrfs_dir_name_len(leaf, di);
data_len = btrfs_dir_data_len(leaf, di);
- if (name_len > max_name_len) {
+ if (unlikely(name_len > max_name_len)) {
dir_item_err(leaf, slot,
"dir item name len too long, have %u max %u",
name_len, max_name_len);
return -EUCLEAN;
}
- if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) {
+ if (unlikely(name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info))) {
dir_item_err(leaf, slot,
"dir item name and data len too long, have %u max %u",
name_len + data_len,
@@ -564,7 +570,7 @@ static int check_dir_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (data_len && dir_type != BTRFS_FT_XATTR) {
+ if (unlikely(data_len && dir_type != BTRFS_FT_XATTR)) {
dir_item_err(leaf, slot,
"dir item with invalid data len, have %u expect 0",
data_len);
@@ -574,7 +580,7 @@ static int check_dir_item(struct extent_buffer *leaf,
total_size = sizeof(*di) + name_len + data_len;
/* header and name/data should not cross item boundary */
- if (cur + total_size > item_size) {
+ if (unlikely(cur + total_size > item_size)) {
dir_item_err(leaf, slot,
"dir item data crosses item boundary, have %u boundary %u",
cur + total_size, item_size);
@@ -592,7 +598,7 @@ static int check_dir_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, namebuf,
(unsigned long)(di + 1), name_len);
name_hash = btrfs_name_hash(namebuf, name_len);
- if (key->offset != name_hash) {
+ if (unlikely(key->offset != name_hash)) {
dir_item_err(leaf, slot,
"name hash mismatch with key, have 0x%016x expect 0x%016llx",
name_hash, key->offset);
@@ -641,13 +647,13 @@ static int check_block_group_item(struct extent_buffer *leaf,
* Here we don't really care about alignment since extent allocator can
* handle it. We care more about the size.
*/
- if (key->offset == 0) {
+ if (unlikely(key->offset == 0)) {
block_group_err(leaf, slot,
"invalid block group size 0");
return -EUCLEAN;
}
- if (item_size != sizeof(bgi)) {
+ if (unlikely(item_size != sizeof(bgi))) {
block_group_err(leaf, slot,
"invalid item size, have %u expect %zu",
item_size, sizeof(bgi));
@@ -656,8 +662,8 @@ static int check_block_group_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
sizeof(bgi));
- if (btrfs_stack_block_group_chunk_objectid(&bgi) !=
- BTRFS_FIRST_CHUNK_TREE_OBJECTID) {
+ if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) !=
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
block_group_err(leaf, slot,
"invalid block group chunk objectid, have %llu expect %llu",
btrfs_stack_block_group_chunk_objectid(&bgi),
@@ -665,7 +671,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (btrfs_stack_block_group_used(&bgi) > key->offset) {
+ if (unlikely(btrfs_stack_block_group_used(&bgi) > key->offset)) {
block_group_err(leaf, slot,
"invalid block group used, have %llu expect [0, %llu)",
btrfs_stack_block_group_used(&bgi), key->offset);
@@ -673,7 +679,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
}
flags = btrfs_stack_block_group_flags(&bgi);
- if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
+ if (unlikely(hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1)) {
block_group_err(leaf, slot,
"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
@@ -682,11 +688,11 @@ static int check_block_group_item(struct extent_buffer *leaf,
}
type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (type != BTRFS_BLOCK_GROUP_DATA &&
- type != BTRFS_BLOCK_GROUP_METADATA &&
- type != BTRFS_BLOCK_GROUP_SYSTEM &&
- type != (BTRFS_BLOCK_GROUP_METADATA |
- BTRFS_BLOCK_GROUP_DATA)) {
+ if (unlikely(type != BTRFS_BLOCK_GROUP_DATA &&
+ type != BTRFS_BLOCK_GROUP_METADATA &&
+ type != BTRFS_BLOCK_GROUP_SYSTEM &&
+ type != (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA))) {
block_group_err(leaf, slot,
"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
type, hweight64(type),
@@ -754,50 +760,75 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
u64 length;
+ u64 chunk_end;
u64 stripe_len;
u16 num_stripes;
u16 sub_stripes;
u64 type;
u64 features;
bool mixed = false;
+ int raid_index;
+ int nparity;
+ int ncopies;
length = btrfs_chunk_length(leaf, chunk);
stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
type = btrfs_chunk_type(leaf, chunk);
+ raid_index = btrfs_bg_flags_to_raid_index(type);
+ ncopies = btrfs_raid_array[raid_index].ncopies;
+ nparity = btrfs_raid_array[raid_index].nparity;
- if (!num_stripes) {
+ if (unlikely(!num_stripes)) {
chunk_err(leaf, chunk, logical,
"invalid chunk num_stripes, have %u", num_stripes);
return -EUCLEAN;
}
- if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+ if (unlikely(num_stripes < ncopies)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes < ncopies, have %u < %d",
+ num_stripes, ncopies);
+ return -EUCLEAN;
+ }
+ if (unlikely(nparity && num_stripes == nparity)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes == nparity, have %u == %d",
+ num_stripes, nparity);
+ return -EUCLEAN;
+ }
+ if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) {
chunk_err(leaf, chunk, logical,
"invalid chunk logical, have %llu should aligned to %u",
logical, fs_info->sectorsize);
return -EUCLEAN;
}
- if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
+ if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) {
chunk_err(leaf, chunk, logical,
"invalid chunk sectorsize, have %u expect %u",
btrfs_chunk_sector_size(leaf, chunk),
fs_info->sectorsize);
return -EUCLEAN;
}
- if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
+ if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) {
chunk_err(leaf, chunk, logical,
"invalid chunk length, have %llu", length);
return -EUCLEAN;
}
- if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
+ if (unlikely(check_add_overflow(logical, length, &chunk_end))) {
+ chunk_err(leaf, chunk, logical,
+"invalid chunk logical start and length, have logical start %llu length %llu",
+ logical, length);
+ return -EUCLEAN;
+ }
+ if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) {
chunk_err(leaf, chunk, logical,
"invalid chunk stripe length: %llu",
stripe_len);
return -EUCLEAN;
}
- if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
- type) {
+ if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+ BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
chunk_err(leaf, chunk, logical,
"unrecognized chunk type: 0x%llx",
~(BTRFS_BLOCK_GROUP_TYPE_MASK |
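
One of the non-mechanical additions in this hunk is the chunk_end computation guarded by check_add_overflow(), rejecting chunks whose logical start plus length wraps around u64. A standalone sketch of the same guard using the compiler builtin that the kernel macro typically maps to; validate_chunk_range() is an invented name for illustration only.

#include <stdint.h>
#include <stdio.h>

int validate_chunk_range(uint64_t logical, uint64_t length)
{
        uint64_t chunk_end;

        /* __builtin_add_overflow() returns true when the sum wraps */
        if (__builtin_add_overflow(logical, length, &chunk_end)) {
                fprintf(stderr,
                        "invalid chunk logical start and length: %llu + %llu overflows\n",
                        (unsigned long long)logical, (unsigned long long)length);
                return -1;              /* the kernel check returns -EUCLEAN */
        }
        return 0;
}
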
@@ -806,22 +837,23 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
- (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) {
+ if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) {
chunk_err(leaf, chunk, logical,
"invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
return -EUCLEAN;
}
- if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) {
chunk_err(leaf, chunk, logical,
"missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
type, BTRFS_BLOCK_GROUP_TYPE_MASK);
return -EUCLEAN;
}
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
- (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ (type & (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA)))) {
chunk_err(leaf, chunk, logical,
"system chunk with data or metadata type: 0x%llx",
type);
@@ -833,20 +865,21 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
mixed = true;
if (!mixed) {
- if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
- (type & BTRFS_BLOCK_GROUP_DATA)) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) &&
+ (type & BTRFS_BLOCK_GROUP_DATA))) {
chunk_err(leaf, chunk, logical,
"mixed chunk type in non-mixed mode: 0x%llx", type);
return -EUCLEAN;
}
}
- if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
- ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+ ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
+ num_stripes != 1))) {
chunk_err(leaf, chunk, logical,
"invalid num_stripes:sub_stripes %u:%u for profile %llu",
num_stripes, sub_stripes,
@@ -869,7 +902,7 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
{
int num_stripes;
- if (btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) {
+ if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) {
chunk_err(leaf, chunk, key->offset,
"invalid chunk item size: have %u expect [%zu, %u)",
btrfs_item_size_nr(leaf, slot),
@@ -883,8 +916,8 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
if (num_stripes == 0)
goto out;
- if (btrfs_chunk_item_size(num_stripes) !=
- btrfs_item_size_nr(leaf, slot)) {
+ if (unlikely(btrfs_chunk_item_size(num_stripes) !=
+ btrfs_item_size_nr(leaf, slot))) {
chunk_err(leaf, chunk, key->offset,
"invalid chunk item size: have %u expect %lu",
btrfs_item_size_nr(leaf, slot),
@@ -923,14 +956,14 @@ static int check_dev_item(struct extent_buffer *leaf,
{
struct btrfs_dev_item *ditem;
- if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
+ if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) {
dev_item_err(leaf, slot,
"invalid objectid: has=%llu expect=%llu",
key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
return -EUCLEAN;
}
ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
- if (btrfs_device_id(leaf, ditem) != key->offset) {
+ if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) {
dev_item_err(leaf, slot,
"devid mismatch: key has=%llu item has=%llu",
key->offset, btrfs_device_id(leaf, ditem));
@@ -942,8 +975,8 @@ static int check_dev_item(struct extent_buffer *leaf,
* it can be 0 for device removal. Device size check can only be done
* by dev extents check.
*/
- if (btrfs_device_bytes_used(leaf, ditem) >
- btrfs_device_total_bytes(leaf, ditem)) {
+ if (unlikely(btrfs_device_bytes_used(leaf, ditem) >
+ btrfs_device_total_bytes(leaf, ditem))) {
dev_item_err(leaf, slot,
"invalid bytes used: have %llu expect [0, %llu]",
btrfs_device_bytes_used(leaf, ditem),
@@ -968,13 +1001,13 @@ static int check_inode_item(struct extent_buffer *leaf,
int ret;
ret = check_inode_key(leaf, key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
/* Here we use super block generation + 1 to handle log tree */
- if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) {
+ if (unlikely(btrfs_inode_generation(leaf, iitem) > super_gen + 1)) {
inode_item_err(leaf, slot,
"invalid inode generation: has %llu expect (0, %llu]",
btrfs_inode_generation(leaf, iitem),
@@ -982,7 +1015,7 @@ static int check_inode_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
- if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
+ if (unlikely(btrfs_inode_transid(leaf, iitem) > super_gen + 1)) {
inode_item_err(leaf, slot,
"invalid inode transid: has %llu expect [0, %llu]",
btrfs_inode_transid(leaf, iitem), super_gen + 1);
@@ -995,7 +1028,7 @@ static int check_inode_item(struct extent_buffer *leaf,
* anything in the fs. So here we skip the check.
*/
mode = btrfs_inode_mode(leaf, iitem);
- if (mode & ~valid_mask) {
+ if (unlikely(mode & ~valid_mask)) {
inode_item_err(leaf, slot,
"unknown mode bit detected: 0x%x",
mode & ~valid_mask);
@@ -1008,20 +1041,20 @@ static int check_inode_item(struct extent_buffer *leaf,
* FIFO/CHR/DIR/REG. Only needs to check BLK, LNK and SOCKS
*/
if (!has_single_bit_set(mode & S_IFMT)) {
- if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) {
+ if (unlikely(!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode))) {
inode_item_err(leaf, slot,
"invalid mode: has 0%o expect valid S_IF* bit(s)",
mode & S_IFMT);
return -EUCLEAN;
}
}
- if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) {
+ if (unlikely(S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1)) {
inode_item_err(leaf, slot,
"invalid nlink: has %u expect no more than 1 for dir",
btrfs_inode_nlink(leaf, iitem));
return -EUCLEAN;
}
- if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) {
+ if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) {
inode_item_err(leaf, slot,
"unknown flags detected: 0x%llx",
btrfs_inode_flags(leaf, iitem) &
@@ -1041,15 +1074,17 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
int ret;
ret = check_root_key(leaf, key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
- if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
- btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
+ if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size_nr(leaf, slot) !=
+ btrfs_legacy_root_item_size())) {
generic_err(leaf, slot,
"invalid root item size, have %u expect %zu or %u",
btrfs_item_size_nr(leaf, slot), sizeof(ri),
btrfs_legacy_root_item_size());
+ return -EUCLEAN;
}
/*
@@ -1061,24 +1096,24 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
btrfs_item_size_nr(leaf, slot));
/* Generation related */
- if (btrfs_root_generation(&ri) >
- btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(btrfs_root_generation(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
generic_err(leaf, slot,
"invalid root generation, have %llu expect (0, %llu]",
btrfs_root_generation(&ri),
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (btrfs_root_generation_v2(&ri) >
- btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(btrfs_root_generation_v2(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
generic_err(leaf, slot,
"invalid root v2 generation, have %llu expect (0, %llu]",
btrfs_root_generation_v2(&ri),
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (btrfs_root_last_snapshot(&ri) >
- btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(btrfs_root_last_snapshot(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
generic_err(leaf, slot,
"invalid root last_snapshot, have %llu expect (0, %llu]",
btrfs_root_last_snapshot(&ri),
@@ -1087,27 +1122,27 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
}
/* Alignment and level check */
- if (!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid root bytenr, have %llu expect to be aligned to %u",
btrfs_root_bytenr(&ri), fs_info->sectorsize);
return -EUCLEAN;
}
- if (btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL) {
+ if (unlikely(btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL)) {
generic_err(leaf, slot,
"invalid root level, have %u expect [0, %u]",
btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1);
return -EUCLEAN;
}
- if (ri.drop_level >= BTRFS_MAX_LEVEL) {
+ if (unlikely(btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL)) {
generic_err(leaf, slot,
"invalid root level, have %u expect [0, %u]",
- ri.drop_level, BTRFS_MAX_LEVEL - 1);
+ btrfs_root_drop_level(&ri), BTRFS_MAX_LEVEL - 1);
return -EUCLEAN;
}
/* Flags check */
- if (btrfs_root_flags(&ri) & ~valid_root_flags) {
+ if (unlikely(btrfs_root_flags(&ri) & ~valid_root_flags)) {
generic_err(leaf, slot,
"invalid root flags, have 0x%llx expect mask 0x%llx",
btrfs_root_flags(&ri), valid_root_flags);
@@ -1161,14 +1196,14 @@ static int check_extent_item(struct extent_buffer *leaf,
u64 total_refs; /* Total refs in btrfs_extent_item */
u64 inline_refs = 0; /* found total inline refs */
- if (key->type == BTRFS_METADATA_ITEM_KEY &&
- !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
+ if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY &&
+ !btrfs_fs_incompat(fs_info, SKINNY_METADATA))) {
generic_err(leaf, slot,
"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled");
return -EUCLEAN;
}
/* key->objectid is the bytenr for both key types */
- if (!IS_ALIGNED(key->objectid, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->objectid, fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid, have %llu expect to be aligned to %u",
key->objectid, fs_info->sectorsize);
@@ -1176,8 +1211,8 @@ static int check_extent_item(struct extent_buffer *leaf,
}
/* key->offset is tree level for METADATA_ITEM_KEY */
- if (key->type == BTRFS_METADATA_ITEM_KEY &&
- key->offset >= BTRFS_MAX_LEVEL) {
+ if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY &&
+ key->offset >= BTRFS_MAX_LEVEL)) {
extent_err(leaf, slot,
"invalid tree level, have %llu expect [0, %u]",
key->offset, BTRFS_MAX_LEVEL - 1);
@@ -1203,7 +1238,7 @@ static int check_extent_item(struct extent_buffer *leaf,
* Either using btrfs_extent_inline_ref::offset, or specific
* data structure.
*/
- if (item_size < sizeof(*ei)) {
+ if (unlikely(item_size < sizeof(*ei))) {
extent_err(leaf, slot,
"invalid item size, have %u expect [%zu, %u)",
item_size, sizeof(*ei),
@@ -1217,15 +1252,16 @@ static int check_extent_item(struct extent_buffer *leaf,
flags = btrfs_extent_flags(leaf, ei);
total_refs = btrfs_extent_refs(leaf, ei);
generation = btrfs_extent_generation(leaf, ei);
- if (generation > btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(generation >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
extent_err(leaf, slot,
"invalid generation, have %llu expect (0, %llu]",
generation,
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
- BTRFS_EXTENT_FLAG_TREE_BLOCK))) {
+ if (unlikely(!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
+ BTRFS_EXTENT_FLAG_TREE_BLOCK)))) {
extent_err(leaf, slot,
"invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx",
flags, BTRFS_EXTENT_FLAG_DATA |
@@ -1234,21 +1270,21 @@ static int check_extent_item(struct extent_buffer *leaf,
}
is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK);
if (is_tree_block) {
- if (key->type == BTRFS_EXTENT_ITEM_KEY &&
- key->offset != fs_info->nodesize) {
+ if (unlikely(key->type == BTRFS_EXTENT_ITEM_KEY &&
+ key->offset != fs_info->nodesize)) {
extent_err(leaf, slot,
"invalid extent length, have %llu expect %u",
key->offset, fs_info->nodesize);
return -EUCLEAN;
}
} else {
- if (key->type != BTRFS_EXTENT_ITEM_KEY) {
+ if (unlikely(key->type != BTRFS_EXTENT_ITEM_KEY)) {
extent_err(leaf, slot,
"invalid key type, have %u expect %u for data backref",
key->type, BTRFS_EXTENT_ITEM_KEY);
return -EUCLEAN;
}
- if (!IS_ALIGNED(key->offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->offset, fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid extent length, have %llu expect aligned to %u",
key->offset, fs_info->sectorsize);
@@ -1262,7 +1298,7 @@ static int check_extent_item(struct extent_buffer *leaf,
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)ptr;
- if (btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL) {
+ if (unlikely(btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL)) {
extent_err(leaf, slot,
"invalid tree block info level, have %u expect [0, %u]",
btrfs_tree_block_level(leaf, info),
@@ -1281,7 +1317,7 @@ static int check_extent_item(struct extent_buffer *leaf,
u64 inline_offset;
u8 inline_type;
- if (ptr + sizeof(*iref) > end) {
+ if (unlikely(ptr + sizeof(*iref) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %zu end %lu",
ptr, sizeof(*iref), end);
@@ -1290,7 +1326,7 @@ static int check_extent_item(struct extent_buffer *leaf,
iref = (struct btrfs_extent_inline_ref *)ptr;
inline_type = btrfs_extent_inline_ref_type(leaf, iref);
inline_offset = btrfs_extent_inline_ref_offset(leaf, iref);
- if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) {
+ if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
ptr, inline_type, end);
@@ -1304,7 +1340,8 @@ static int check_extent_item(struct extent_buffer *leaf,
break;
/* Contains parent bytenr */
case BTRFS_SHARED_BLOCK_REF_KEY:
- if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(inline_offset,
+ fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid tree parent bytenr, have %llu expect aligned to %u",
inline_offset, fs_info->sectorsize);
@@ -1319,7 +1356,8 @@ static int check_extent_item(struct extent_buffer *leaf,
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (!IS_ALIGNED(dref_offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(dref_offset,
+ fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid data ref offset, have %llu expect aligned to %u",
dref_offset, fs_info->sectorsize);
@@ -1330,7 +1368,8 @@ static int check_extent_item(struct extent_buffer *leaf,
/* Contains parent bytenr and ref count */
case BTRFS_SHARED_DATA_REF_KEY:
sref = (struct btrfs_shared_data_ref *)(iref + 1);
- if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(inline_offset,
+ fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid data parent bytenr, have %llu expect aligned to %u",
inline_offset, fs_info->sectorsize);
@@ -1346,14 +1385,14 @@ static int check_extent_item(struct extent_buffer *leaf,
ptr += btrfs_extent_inline_ref_size(inline_type);
}
/* No padding is allowed */
- if (ptr != end) {
+ if (unlikely(ptr != end)) {
extent_err(leaf, slot,
"invalid extent item size, padding bytes found");
return -EUCLEAN;
}
/* Finally, check the inline refs against total refs */
- if (inline_refs > total_refs) {
+ if (unlikely(inline_refs > total_refs)) {
extent_err(leaf, slot,
"invalid extent refs, have %llu expect >= inline %llu",
total_refs, inline_refs);
@@ -1370,21 +1409,21 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf,
if (key->type == BTRFS_SHARED_DATA_REF_KEY)
expect_item_size = sizeof(struct btrfs_shared_data_ref);
- if (btrfs_item_size_nr(leaf, slot) != expect_item_size) {
+ if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) {
generic_err(leaf, slot,
"invalid item size, have %u expect %u for key type %u",
btrfs_item_size_nr(leaf, slot),
expect_item_size, key->type);
return -EUCLEAN;
}
- if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid for shared block ref, have %llu expect aligned to %u",
key->objectid, leaf->fs_info->sectorsize);
return -EUCLEAN;
}
- if (key->type != BTRFS_TREE_BLOCK_REF_KEY &&
- !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize)) {
+ if (unlikely(key->type != BTRFS_TREE_BLOCK_REF_KEY &&
+ !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid tree parent bytenr, have %llu expect aligned to %u",
key->offset, leaf->fs_info->sectorsize);
@@ -1400,13 +1439,14 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot);
- if (btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0) {
+ if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) {
generic_err(leaf, slot,
"invalid item size, have %u expect aligned to %zu for key type %u",
btrfs_item_size_nr(leaf, slot),
sizeof(*dref), key->type);
+ return -EUCLEAN;
}
- if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid for shared block ref, have %llu expect aligned to %u",
key->objectid, leaf->fs_info->sectorsize);
@@ -1423,16 +1463,17 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
owner = btrfs_extent_data_ref_objectid(leaf, dref);
offset = btrfs_extent_data_ref_offset(leaf, dref);
hash = hash_extent_data_ref(root_objectid, owner, offset);
- if (hash != key->offset) {
+ if (unlikely(hash != key->offset)) {
extent_err(leaf, slot,
"invalid extent data ref hash, item has 0x%016llx key has 0x%016llx",
hash, key->offset);
return -EUCLEAN;
}
- if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid extent data backref offset, have %llu expect aligned to %u",
offset, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
}
}
return 0;
@@ -1448,10 +1489,10 @@ static int check_inode_ref(struct extent_buffer *leaf,
unsigned long ptr;
unsigned long end;
- if (!check_prev_ino(leaf, key, slot, prev_key))
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
/* namelen can't be 0, so item_size == sizeof() is also invalid */
- if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) {
+ if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) {
inode_ref_err(leaf, slot,
"invalid item size, have %u expect (%zu, %u)",
btrfs_item_size_nr(leaf, slot),
@@ -1464,7 +1505,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
while (ptr < end) {
u16 namelen;
- if (ptr + sizeof(iref) > end) {
+ if (unlikely(ptr + sizeof(iref) > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
ptr, end, sizeof(iref));
@@ -1473,7 +1514,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
iref = (struct btrfs_inode_ref *)ptr;
namelen = btrfs_inode_ref_name_len(leaf, iref);
- if (ptr + sizeof(*iref) + namelen > end) {
+ if (unlikely(ptr + sizeof(*iref) + namelen > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu namelen %u",
ptr, end, namelen);
@@ -1556,7 +1597,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
u32 nritems = btrfs_header_nritems(leaf);
int slot;
- if (btrfs_header_level(leaf) != 0) {
+ if (unlikely(btrfs_header_level(leaf) != 0)) {
generic_err(leaf, 0,
"invalid level for leaf, have %d expect 0",
btrfs_header_level(leaf));
@@ -1575,19 +1616,19 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
u64 owner = btrfs_header_owner(leaf);
/* These trees must never be empty */
- if (owner == BTRFS_ROOT_TREE_OBJECTID ||
- owner == BTRFS_CHUNK_TREE_OBJECTID ||
- owner == BTRFS_EXTENT_TREE_OBJECTID ||
- owner == BTRFS_DEV_TREE_OBJECTID ||
- owner == BTRFS_FS_TREE_OBJECTID ||
- owner == BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID ||
+ owner == BTRFS_CHUNK_TREE_OBJECTID ||
+ owner == BTRFS_EXTENT_TREE_OBJECTID ||
+ owner == BTRFS_DEV_TREE_OBJECTID ||
+ owner == BTRFS_FS_TREE_OBJECTID ||
+ owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
generic_err(leaf, 0,
"invalid root, root %llu must never be empty",
owner);
return -EUCLEAN;
}
/* Unknown tree */
- if (owner == 0) {
+ if (unlikely(owner == 0)) {
generic_err(leaf, 0,
"invalid owner, root 0 is not defined");
return -EUCLEAN;
@@ -1595,7 +1636,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
return 0;
}
- if (nritems == 0)
+ if (unlikely(nritems == 0))
return 0;
/*
@@ -1616,7 +1657,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
btrfs_item_key_to_cpu(leaf, &key, slot);
/* Make sure the keys are in the right order */
- if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
+ if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) {
generic_err(leaf, slot,
"bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
prev_key.objectid, prev_key.type,
@@ -1635,7 +1676,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
else
item_end_expected = btrfs_item_offset_nr(leaf,
slot - 1);
- if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
+ if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) {
generic_err(leaf, slot,
"unexpected item end, have %u expect %u",
btrfs_item_end_nr(leaf, slot),
@@ -1648,8 +1689,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* just in case all the items are consistent to each other, but
* all point outside of the leaf.
*/
- if (btrfs_item_end_nr(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(fs_info)) {
+ if (unlikely(btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(fs_info))) {
generic_err(leaf, slot,
"slot end outside of leaf, have %u expect range [0, %u]",
btrfs_item_end_nr(leaf, slot),
@@ -1658,8 +1699,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
}
/* Also check if the item pointer overlaps with btrfs item. */
- if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
- btrfs_item_ptr_offset(leaf, slot)) {
+ if (unlikely(btrfs_item_ptr_offset(leaf, slot) <
+ btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) {
generic_err(leaf, slot,
"slot overlaps with its data, item end %lu data start %lu",
btrfs_item_nr_offset(slot) +
@@ -1674,7 +1715,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* criteria
*/
ret = check_leaf_item(leaf, &key, slot, &prev_key);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
}
@@ -1707,13 +1748,13 @@ int btrfs_check_node(struct extent_buffer *node)
u64 bytenr;
int ret = 0;
- if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+ if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) {
generic_err(node, 0,
"invalid level for node, have %d expect [1, %d]",
level, BTRFS_MAX_LEVEL - 1);
return -EUCLEAN;
}
- if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
+ if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) {
btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
btrfs_header_owner(node), node->start,
@@ -1727,13 +1768,13 @@ int btrfs_check_node(struct extent_buffer *node)
btrfs_node_key_to_cpu(node, &key, slot);
btrfs_node_key_to_cpu(node, &next_key, slot + 1);
- if (!bytenr) {
+ if (unlikely(!bytenr)) {
generic_err(node, slot,
"invalid NULL node pointer");
ret = -EUCLEAN;
goto out;
}
- if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) {
generic_err(node, slot,
"unaligned pointer, have %llu should be aligned to %u",
bytenr, fs_info->sectorsize);
@@ -1741,7 +1782,7 @@ int btrfs_check_node(struct extent_buffer *node)
goto out;
}
- if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
+ if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) {
generic_err(node, slot,
"bad key order, current (%llu %u %llu) next (%llu %u %llu)",
key.objectid, key.type, key.offset,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index d3f28b8f4ff9..7c45d960b53c 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,7 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u32 nritems;
root_node = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking_write(root_node);
nritems = btrfs_header_nritems(root_node);
root->defrag_max.objectid = 0;
/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 56cbc1706b6f..d90695c1ab6c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,9 +17,9 @@
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
-#include "inode-map.h"
#include "block-group.h"
#include "space-info.h"
+#include "zoned.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -105,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
u64 dirid, int del_all);
+static void wait_log_commit(struct btrfs_root *root, int transid);
/*
* tree logging is a special write ahead log used to make sure that
@@ -139,16 +140,45 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ const bool zoned = btrfs_is_zoned(fs_info);
int ret = 0;
+ bool created = false;
+
+ /*
+ * First check if the log root tree was already created. If not, create
+ * it before locking the root's log_mutex, just to keep lockdep happy.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
+ mutex_lock(&tree_root->log_mutex);
+ if (!fs_info->log_root_tree) {
+ ret = btrfs_init_log_root_tree(trans, fs_info);
+ if (!ret) {
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
+ created = true;
+ }
+ }
+ mutex_unlock(&tree_root->log_mutex);
+ if (ret)
+ return ret;
+ }
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
if (btrfs_need_log_full_commit(trans)) {
ret = -EAGAIN;
goto out;
}
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
+
if (!root->log_start_pid) {
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
@@ -156,12 +186,16 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
} else {
- mutex_lock(&fs_info->tree_log_mutex);
- if (!fs_info->log_root_tree)
- ret = btrfs_init_log_root_tree(trans, fs_info);
- mutex_unlock(&fs_info->tree_log_mutex);
- if (ret)
+ /*
+ * This means fs_info->log_root_tree was already created
+ * for some other FS trees. Fall back to a full commit so that
+ * we do not mix nodes from multiple log transactions, which
+ * would break sequential writing.
+ */
+ if (zoned && !created) {
+ ret = -EAGAIN;
goto out;
+ }
ret = btrfs_add_log_tree(trans, root);
if (ret)
@@ -172,7 +206,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
root->log_start_pid = current->pid;
}
- atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
if (ctx && !ctx->logging_new_name) {
int index = root->log_transid % 2;
@@ -192,14 +225,22 @@ out:
*/
static int join_running_log_trans(struct btrfs_root *root)
{
+ const bool zoned = btrfs_is_zoned(root->fs_info);
int ret = -ENOENT;
if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
return ret;
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
ret = 0;
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
atomic_inc(&root->log_writers);
}
mutex_unlock(&root->log_mutex);
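
Both start_log_trans() and join_running_log_trans() gain the same zoned-only retry loop: if the previous log transaction is still committing, wait for it and re-check, because the condition may have changed while log_mutex was dropped inside the wait. A pthread-based userspace sketch of that "wait, then re-test under the lock" shape; the struct and field names are placeholders, not btrfs code.

#include <pthread.h>
#include <stdbool.h>

struct log_root {
        pthread_mutex_t log_mutex;
        pthread_cond_t log_commit_wait;
        bool prev_commit_running;       /* stand-in for log_commit[index] */
        int log_writers;
};

void join_running_log(struct log_root *root)
{
        pthread_mutex_lock(&root->log_mutex);
        /* equivalent of the "again:" label: re-test after every wakeup,
         * since the wait released and re-acquired log_mutex */
        while (root->prev_commit_running)
                pthread_cond_wait(&root->log_commit_wait, &root->log_mutex);
        root->log_writers++;            /* now safe to join the current log */
        pthread_mutex_unlock(&root->log_mutex);
}
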
@@ -576,6 +617,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
@@ -653,7 +695,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
/* drop any overlapping extents */
- ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
+ drop_args.start = start;
+ drop_args.end = extent_end;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
if (ret)
goto out;
@@ -828,9 +873,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- inode_add_bytes(inode, nbytes);
update_inode:
- ret = btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
if (inode)
iput(inode);
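
btrfs_drop_extents() now takes a struct btrfs_drop_extents_args instead of positional parameters, and reports bytes_found back to the caller for the nbytes accounting above. A standalone sketch of that argument-struct calling convention; only the start/end/drop_cache/bytes_found fields mirror the patch, everything else is an invented stand-in.

#include <stdbool.h>
#include <stdint.h>

struct drop_extents_args {              /* mirrors the fields used in this patch */
        uint64_t start;
        uint64_t end;
        bool drop_cache;
        uint64_t bytes_found;           /* output: filled in by the callee */
};

/* toy callee standing in for btrfs_drop_extents() */
int drop_extents(struct drop_extents_args *args)
{
        /* ... drop everything in [start, end) ... */
        args->bytes_found = args->end - args->start;    /* fake value for the sketch */
        return 0;
}

int replay_extent(uint64_t start, uint64_t extent_end)
{
        struct drop_extents_args drop_args = { 0 };

        drop_args.start = start;
        drop_args.end = extent_end;
        drop_args.drop_cache = true;
        return drop_extents(&drop_args);
}
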
@@ -1529,7 +1574,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
}
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
@@ -1564,18 +1609,6 @@ out:
return ret;
}
-static int insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 ino)
-{
- int ret;
-
- ret = btrfs_insert_orphan_item(trans, root, ino);
- if (ret == -EEXIST)
- ret = 0;
-
- return ret;
-}
-
static int count_inode_extrefs(struct btrfs_root *root,
struct btrfs_inode *inode, struct btrfs_path *path)
{
@@ -1716,7 +1749,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
}
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1727,7 +1760,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
- ret = insert_orphan_item(trans, root, ino);
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
}
out:
@@ -1820,7 +1855,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
set_nlink(inode, 1);
else
inc_nlink(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
} else if (ret == -EEXIST) {
ret = 0;
} else {
@@ -1973,7 +2008,7 @@ out:
btrfs_release_path(path);
if (!ret && update_size) {
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
- ret = btrfs_update_inode(trans, root, dir);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
}
kfree(name);
iput(dir);
@@ -2586,6 +2621,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
* those prealloc extents just after replaying them.
*/
if (S_ISREG(mode)) {
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct inode *inode;
u64 from;
@@ -2596,12 +2632,18 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
}
from = ALIGN(i_size_read(inode),
root->fs_info->sectorsize);
- ret = btrfs_drop_extents(wc->trans, root, inode,
- from, (u64)-1, 1);
+ drop_args.start = from;
+ drop_args.end = (u64)-1;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(wc->trans, root,
+ BTRFS_I(inode),
+ &drop_args);
if (!ret) {
+ inode_sub_bytes(inode,
+ drop_args.bytes_found);
/* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
- root, inode);
+ root, BTRFS_I(inode));
}
iput(inode);
if (ret)
@@ -2709,7 +2751,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
- next = btrfs_find_create_tree_block(fs_info, bytenr);
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ btrfs_header_owner(cur),
+ *level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
@@ -2732,7 +2776,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2742,6 +2785,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
return ret;
}
+ btrfs_redirty_list_add(
+ trans->transaction, next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
@@ -2801,7 +2846,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2883,7 +2927,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -3023,6 +3066,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
+ u64 log_root_start;
+ u64 log_root_level;
mutex_lock(&root->log_mutex);
log_transid = ctx->log_transid;
@@ -3075,6 +3120,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
blk_start_plug(&plug);
ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
+ /*
+ * -EAGAIN happens when someone, e.g., a concurrent transaction
+ * commit, writes a dirty extent in this tree-log commit. This
+ * concurrent write will create a hole in the extents being
+ * written out, and we cannot proceed on a zoned filesystem,
+ * which requires sequential writing. We could bail out to a
+ * full commit here, but instead we continue and hope that the
+ * concurrent write fills the hole.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
+ ret = 0;
if (ret) {
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, ret);
@@ -3117,6 +3173,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
+ if (btrfs_is_zoned(fs_info)) {
+ mutex_lock(&fs_info->tree_root->log_mutex);
+ if (!log_root_tree->node) {
+ ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+ if (ret) {
+ mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out;
+ }
+ }
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ }
+
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
@@ -3184,7 +3253,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
&log_root_tree->dirty_log_pages,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
- if (ret) {
+ /*
+ * As described above, -EAGAIN indicates a hole in the extents. We
+ * cannot wait for those writes to complete, since waiting would
+ * cause a deadlock. Bail out to a full commit instead.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
+ btrfs_set_log_full_commit(trans);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ } else if (ret) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3200,22 +3279,31 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
- btrfs_set_super_log_root(fs_info->super_for_commit,
- log_root_tree->node->start);
- btrfs_set_super_log_root_level(fs_info->super_for_commit,
- btrfs_header_level(log_root_tree->node));
-
+ log_root_start = log_root_tree->node->start;
+ log_root_level = btrfs_header_level(log_root_tree->node);
log_root_tree->log_transid++;
mutex_unlock(&log_root_tree->log_mutex);
/*
- * Nobody else is going to jump in and write the ctree
- * super here because the log_commit atomic below is protecting
- * us. We must be called with a transaction handle pinning
- * the running transaction open, so a full commit can't hop
- * in and cause problems either.
+ * Here we are guaranteed that nobody is going to write the superblock
+	 * for the current transaction before us and that we do not write
+ * our superblock before the previous transaction finishes its commit
+ * and writes its superblock, because:
+ *
+	 * 1) We are holding a handle on the current transaction, so nobody
+ * can commit it until we release the handle;
+ *
+ * 2) Before writing our superblock we acquire the tree_log_mutex, so
+ * if the previous transaction is still committing, and hasn't yet
+	 * written its superblock, we wait for it to do so, because a
+ * transaction commit acquires the tree_log_mutex when the commit
+ * begins and releases it only after writing its superblock.
*/
+ mutex_lock(&fs_info->tree_log_mutex);
+ btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
+ btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
+ mutex_unlock(&fs_info->tree_log_mutex);
if (ret) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
@@ -3266,17 +3354,22 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
.process_func = process_one_buffer
};
- ret = walk_log_tree(trans, log, &wc);
- if (ret) {
- if (trans)
- btrfs_abort_transaction(trans, ret);
- else
- btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ if (log->node) {
+ ret = walk_log_tree(trans, log, &wc);
+ if (ret) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
}
clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
extent_io_tree_release(&log->log_csum_range);
+
+ if (trans && log->node)
+ btrfs_redirty_list_add(trans->transaction, log->node);
btrfs_put_root(log);
}
@@ -3300,6 +3393,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
if (fs_info->log_root_tree) {
free_log_tree(trans, fs_info->log_root_tree);
fs_info->log_root_tree = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
}
return 0;
}
@@ -3359,7 +3453,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
int ret;
int err = 0;
- int bytes_del = 0;
u64 dir_ino = btrfs_ino(dir);
if (!inode_logged(trans, dir))
@@ -3386,7 +3479,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
}
if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
- bytes_del += name_len;
if (ret) {
err = ret;
goto fail;
@@ -3401,46 +3493,17 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
}
if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
- bytes_del += name_len;
if (ret) {
err = ret;
goto fail;
}
}
- /* update the directory size in the log to reflect the names
- * we have removed
+ /*
+ * We do not need to update the size field of the directory's inode item
+ * because on log replay we update the field to reflect all existing
+ * entries in the directory (see overwrite_item()).
*/
- if (bytes_del) {
- struct btrfs_key key;
-
- key.objectid = dir_ino;
- key.offset = 0;
- key.type = BTRFS_INODE_ITEM_KEY;
- btrfs_release_path(path);
-
- ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
- if (ret < 0) {
- err = ret;
- goto fail;
- }
- if (ret == 0) {
- struct btrfs_inode_item *item;
- u64 i_size;
-
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_inode_item);
- i_size = btrfs_inode_size(path->nodes[0], item);
- if (i_size > bytes_del)
- i_size -= bytes_del;
- else
- i_size = 0;
- btrfs_set_inode_size(path->nodes[0], item, i_size);
- btrfs_mark_buffer_dirty(path->nodes[0]);
- } else
- ret = 0;
- btrfs_release_path(path);
- }
fail:
btrfs_free_path(path);
out_unlock:
@@ -3869,7 +3932,14 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_timespec_nsec(&token, &item->ctime,
inode->i_ctime.tv_nsec);
- btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
+ /*
+	 * We do not need to set the nbytes field; in fact, during a fast fsync
+	 * its value may not even be correct, since a fast fsync does not wait
+	 * for ordered extent completion (which is where we update nbytes) and
+	 * only waits for writeback to complete. During log replay, as we find
+	 * file extent items and replay them, we adjust the nbytes field of the
+	 * inode item in the subvolume tree as needed (see overwrite_item()).
+ */
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
@@ -4196,6 +4266,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
@@ -4204,19 +4275,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
- int extent_inserted = 0;
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
- ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
- em->start + em->len, NULL, 0, 1,
- sizeof(*fi), &extent_inserted);
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(*fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
if (ret)
return ret;
- if (!extent_inserted) {
+ if (!drop_args.extent_inserted) {
key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = em->start;
@@ -4375,8 +4448,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
do {
ret = btrfs_truncate_inode_items(trans,
root->log_root,
- &inode->vfs_inode,
- truncate_offset,
+ inode, truncate_offset,
BTRFS_EXTENT_DATA_KEY);
} while (ret == -EAGAIN);
if (ret)
@@ -4415,14 +4487,12 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
- u64 test_gen;
int ret = 0;
int num = 0;
INIT_LIST_HEAD(&extents);
write_lock(&tree->lock);
- test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
list_del_init(&em->list);
@@ -4438,7 +4508,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
goto process;
}
- if (em->generation <= test_gen)
+ if (em->generation < trans->transid)
continue;
/* We log prealloc extents beyond eof later. */
@@ -4571,6 +4641,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
const u64 ino = btrfs_ino(inode);
int ins_nr = 0;
int start_slot = 0;
+ bool found_xattrs = false;
+
+ if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
+ return 0;
key.objectid = ino;
key.type = BTRFS_XATTR_ITEM_KEY;
@@ -4609,6 +4683,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
start_slot = slot;
ins_nr++;
path->slots[0]++;
+ found_xattrs = true;
cond_resched();
}
if (ins_nr > 0) {
@@ -4618,6 +4693,9 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
return ret;
}
+ if (!found_xattrs)
+ set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
+
return 0;
}
@@ -5262,12 +5340,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+	 * This is for cases where logging a directory could result in losing
+	 * a file after replaying the log. For example, if we move a file from
+	 * directory A to directory B, then fsync directory A, we have no way
+	 * to know the file was moved from A to B, so logging just A would
+ * result in losing the file after a log replay.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) &&
+ inode_only == LOG_INODE_ALL &&
+ inode->last_unlink_trans >= trans->transid) {
+ btrfs_set_log_full_commit(trans);
+ err = 1;
+ goto out_unlock;
+ }
+
+ /*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) {
int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
@@ -5303,7 +5397,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
&inode->runtime_flags);
while(1) {
ret = btrfs_truncate_inode_items(trans,
- log, &inode->vfs_inode, 0, 0);
+ log, inode, 0, 0);
if (ret != -EAGAIN)
break;
}
@@ -5424,98 +5518,31 @@ out_unlock:
}
/*
- * Check if we must fallback to a transaction commit when logging an inode.
- * This must be called after logging the inode and is used only in the context
- * when fsyncing an inode requires the need to log some other inode - in which
- * case we can't lock the i_mutex of each other inode we need to log as that
- * can lead to deadlocks with concurrent fsync against other inodes (as we can
- * log inodes up or down in the hierarchy) or rename operations for example. So
- * we take the log_mutex of the inode after we have logged it and then check for
- * its last_unlink_trans value - this is safe because any task setting
- * last_unlink_trans must take the log_mutex and it must do this before it does
- * the actual unlink operation, so if we do this check before a concurrent task
- * sets last_unlink_trans it means we've logged a consistent version/state of
- * all the inode items, otherwise we are not sure and must do a transaction
- * commit (the concurrent task might have only updated last_unlink_trans before
- * we logged the inode or it might have also done the unlink).
- */
-static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- bool ret = false;
-
- mutex_lock(&inode->log_mutex);
- if (inode->last_unlink_trans > fs_info->last_trans_committed) {
- /*
- * Make sure any commits to the log are forced to be full
- * commits.
- */
- btrfs_set_log_full_commit(trans);
- ret = true;
- }
- mutex_unlock(&inode->log_mutex);
-
- return ret;
-}
-
-/*
- * follow the dentry parent pointers up the chain and see if any
- * of the directories in it require a full commit before they can
- * be logged. Returns zero if nothing special needs to be done or 1 if
- * a full commit is required.
+ * Check if we need to log an inode. This is used in contexts where, while
+ * logging an inode, we need to log another inode (either that it exists or in
+ * full mode). This is used instead of btrfs_inode_in_log() because the latter
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
*/
-static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode,
- struct dentry *parent,
- struct super_block *sb,
- u64 last_committed)
+static bool need_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
{
- int ret = 0;
- struct dentry *old_parent = NULL;
-
/*
- * for regular files, if its inode is already on disk, we don't
- * have to worry about the parents at all. This is because
- * we can use the last_unlink_trans field to record renames
- * and other fun in this file.
+ * If this inode does not have new/updated/deleted xattrs since the last
+ * time it was logged and is flagged as logged in the current transaction,
+ * we can skip logging it. As for new/deleted names, those are updated in
+ * the log by link/unlink/rename operations.
+ * In case the inode was logged and then evicted and reloaded, its
+ * logged_trans will be 0, in which case we have to fully log it since
+ * logged_trans is a transient field, not persisted.
*/
- if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation <= last_committed &&
- inode->last_unlink_trans <= last_committed)
- goto out;
-
- if (!S_ISDIR(inode->vfs_inode.i_mode)) {
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- goto out;
- inode = BTRFS_I(d_inode(parent));
- }
-
- while (1) {
- if (btrfs_must_commit_transaction(trans, inode)) {
- ret = 1;
- break;
- }
-
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- break;
-
- if (IS_ROOT(parent)) {
- inode = BTRFS_I(d_inode(parent));
- if (btrfs_must_commit_transaction(trans, inode))
- ret = 1;
- break;
- }
-
- parent = dget_parent(parent);
- dput(old_parent);
- old_parent = parent;
- inode = BTRFS_I(d_inode(parent));
+ if (inode->logged_trans == trans->transid &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return false;
- }
- dput(old_parent);
-out:
- return ret;
+ return true;
}
struct btrfs_dir_list {
@@ -5645,7 +5672,7 @@ process_leaf:
goto next_dir_inode;
}
- if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
btrfs_add_delayed_iput(di_inode);
break;
}
@@ -5655,9 +5682,6 @@ process_leaf:
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
log_mode, ctx);
- if (!ret &&
- btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
- ret = 1;
btrfs_add_delayed_iput(di_inode);
if (ret)
goto next_dir_inode;
@@ -5795,13 +5819,15 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
goto out;
}
+ if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
+ btrfs_add_delayed_iput(dir_inode);
+ continue;
+ }
+
if (ctx)
ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
- if (!ret &&
- btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
- ret = 1;
if (!ret && ctx && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
@@ -5828,7 +5854,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
while (true) {
struct btrfs_fs_info *fs_info = root->fs_info;
- const u64 last_committed = fs_info->last_trans_committed;
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
struct btrfs_key search_key;
@@ -5847,7 +5872,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation > last_committed)
+ if (BTRFS_I(inode)->generation >= trans->transid &&
+ need_log_inode(trans, BTRFS_I(inode)))
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
@@ -5888,7 +5914,6 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct dentry *old_parent = NULL;
struct super_block *sb = inode->vfs_inode.i_sb;
int ret = 0;
@@ -5902,7 +5927,8 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (root != inode->root)
break;
- if (inode->generation > fs_info->last_trans_committed) {
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode)) {
ret = btrfs_log_inode(trans, root, inode,
LOG_INODE_EXISTS, ctx);
if (ret)
@@ -6017,38 +6043,19 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct super_block *sb;
int ret = 0;
- u64 last_committed = fs_info->last_trans_committed;
bool log_dentries = false;
- sb = inode->vfs_inode.i_sb;
-
if (btrfs_test_opt(fs_info, NOTREELOG)) {
ret = 1;
goto end_no_trans;
}
- /*
- * The prev transaction commit doesn't complete, we need do
- * full commit by ourselves.
- */
- if (fs_info->last_trans_log_full_commit >
- fs_info->last_trans_committed) {
- ret = 1;
- goto end_no_trans;
- }
-
if (btrfs_root_refs(&root->root_item) == 0) {
ret = 1;
goto end_no_trans;
}
- ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
- last_committed);
- if (ret)
- goto end_no_trans;
-
/*
* Skip already logged inodes or inodes corresponding to tmpfiles
* (since logging them is pointless, a link count of 0 means they
@@ -6075,8 +6082,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* and other fun in this file.
*/
if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation <= last_committed &&
- inode->last_unlink_trans <= last_committed) {
+ inode->generation < trans->transid &&
+ inode->last_unlink_trans < trans->transid) {
ret = 0;
goto end_trans;
}
@@ -6125,7 +6132,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* but the file inode does not have a matching BTRFS_INODE_REF_KEY item
* and has a link count of 2.
*/
- if (inode->last_unlink_trans > last_committed) {
+ if (inode->last_unlink_trans >= trans->transid) {
ret = btrfs_log_all_parents(trans, inode, ctx);
if (ret)
goto end_trans;
@@ -6295,8 +6302,7 @@ again:
* root->objectid_mutex is not acquired as log replay
* could only happen during mount.
*/
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(root);
}
wc.replay_dest->log_root = NULL;
@@ -6434,7 +6440,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
struct dentry *parent)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_log_ctx ctx;
/*
@@ -6448,8 +6453,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (inode->logged_trans <= fs_info->last_trans_committed &&
- (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
+ if (inode->logged_trans < trans->transid &&
+ (!old_dir || old_dir->logged_trans < trans->transid))
return;
btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 28525ad7ff8c..74023c8a783f 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -129,8 +129,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
} else {
btrfs_warn(fs_info,
"insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
- ret, (unsigned long long)key.objectid,
- (unsigned long long)key.offset, type);
+ ret, key.objectid, key.offset, type);
goto out;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 58b9c419a2b6..bc3b33efddc5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -31,6 +31,7 @@
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
+#include "zoned.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
@@ -374,6 +375,7 @@ void btrfs_free_device(struct btrfs_device *device)
rcu_string_free(device->name);
extent_io_tree_release(&device->alloc_state);
bio_put(device->flush_bio);
+ btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -419,7 +421,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
* Preallocate a bio that's always going to be used for flushing device
* barriers and matches the device lifespan
*/
- dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
+ dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
if (!dev->flush_bio) {
kfree(dev);
return ERR_PTR(-ENOMEM);
@@ -822,7 +824,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
} else {
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, devid,
- disk_super->dev_item.uuid, NULL, false);
+ disk_super->dev_item.uuid, NULL);
/*
* If this disk has been pulled into an fs devices created by
@@ -929,25 +931,30 @@ static noinline struct btrfs_device *device_list_add(const char *path,
* make sure it's the same device if the device is mounted
*/
if (device->bdev) {
- struct block_device *path_bdev;
+ int error;
+ dev_t path_dev;
- path_bdev = lookup_bdev(path);
- if (IS_ERR(path_bdev)) {
+ error = lookup_bdev(path, &path_dev);
+ if (error) {
mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_CAST(path_bdev);
+ return ERR_PTR(error);
}
- if (device->bdev != path_bdev) {
- bdput(path_bdev);
+ if (device->bdev->bd_dev != path_dev) {
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_warn_in_rcu(device->fs_info,
+ /*
+ * device->fs_info may not be reliable here, so
+ * pass in a NULL instead. This avoids a
+ * possible use-after-free when the fs_info and
+ * fs_info->sb are already torn down.
+ */
+ btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- bdput(path_bdev);
btrfs_info_in_rcu(device->fs_info,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
@@ -1038,7 +1045,7 @@ error:
}
static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
- int step, struct btrfs_device **latest_dev)
+ struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
@@ -1056,22 +1063,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
continue;
}
- if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
- /*
- * In the first step, keep the device which has
- * the correct fsid and the devid that is used
- * for the dev_replace procedure.
- * In the second step, the dev_replace state is
- * read from the device tree and it is known
- * whether the procedure is really active or
- * not, which means whether this device is
- * used or whether it should be removed.
- */
- if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state)) {
- continue;
- }
- }
+ /*
+		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID
+		 * in btrfs_init_dev_replace(), so just continue.
+ */
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ continue;
+
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
@@ -1080,9 +1078,6 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state))
- fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
@@ -1095,16 +1090,16 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
* After we have read the system tree and know devids belonging to this
* filesystem, remove the device which does not belong there.
*/
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *latest_dev = NULL;
struct btrfs_fs_devices *seed_dev;
mutex_lock(&uuid_mutex);
- __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+ __btrfs_free_extra_devids(fs_devices, &latest_dev);
list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
- __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
+ __btrfs_free_extra_devids(seed_dev, &latest_dev);
fs_devices->latest_bdev = latest_dev->bdev;
@@ -1143,6 +1138,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
device->bdev = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ btrfs_destroy_dev_zone_info(device);
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
@@ -1223,6 +1219,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+ fs_devices->read_policy = BTRFS_READ_POLICY_PID;
return 0;
}
@@ -1274,7 +1271,7 @@ void btrfs_release_disk_super(struct btrfs_super_block *super)
}
static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
- u64 bytenr)
+ u64 bytenr, u64 bytenr_orig)
{
struct btrfs_super_block *disk_super;
struct page *page;
@@ -1305,7 +1302,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
/* align our pointer to the offset of the super block */
disk_super = p + offset_in_page(bytenr);
- if (btrfs_super_bytenr(disk_super) != bytenr ||
+ if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
btrfs_release_disk_super(p);
return ERR_PTR(-EINVAL);
@@ -1340,7 +1337,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct block_device *bdev;
- u64 bytenr;
+ u64 bytenr, bytenr_orig;
+ int ret;
lockdep_assert_held(&uuid_mutex);
@@ -1350,14 +1348,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
- bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
- disk_super = btrfs_read_disk_super(bdev, bytenr);
+ bytenr_orig = btrfs_sb_offset(0);
+ ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
@@ -1412,11 +1414,62 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
* make sure to start at an offset of at least 1MB.
*/
return max_t(u64, start, SZ_1M);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ /*
+		 * We don't care about the starting region like the regular
+		 * allocator does, because we reserve the first two zones for
+		 * superblock logging anyway.
+ */
+ return ALIGN(start, device->zone_info->zone_size);
default:
BUG();
}
}
+static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
+ u64 *hole_start, u64 *hole_size,
+ u64 num_bytes)
+{
+ u64 zone_size = device->zone_info->zone_size;
+ u64 pos;
+ int ret;
+ bool changed = false;
+
+ ASSERT(IS_ALIGNED(*hole_start, zone_size));
+
+ while (*hole_size > 0) {
+ pos = btrfs_find_allocatable_zones(device, *hole_start,
+ *hole_start + *hole_size,
+ num_bytes);
+ if (pos != *hole_start) {
+ *hole_size = *hole_start + *hole_size - pos;
+ *hole_start = pos;
+ changed = true;
+ if (*hole_size < num_bytes)
+ break;
+ }
+
+ ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
+
+ /* Range is ensured to be empty */
+ if (!ret)
+ return changed;
+
+ /* Given hole range was invalid (outside of device) */
+ if (ret == -ERANGE) {
+ *hole_start += *hole_size;
+ *hole_size = 0;
+			return true;
+ }
+
+ *hole_start += zone_size;
+ *hole_size -= zone_size;
+ changed = true;
+ }
+
+ return changed;
+}
+
/**
* dev_extent_hole_check - check if specified hole is suitable for allocation
* @device: the device which we have the hole
@@ -1424,7 +1477,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
* @hole_size: the size of the hole
* @num_bytes: the size of the free space that we need
*
- * This function may modify @hole_start and @hole_end to reflect the suitable
+ * This function may modify @hole_start and @hole_size to reflect the suitable
* position for allocation. Returns 1 if hole position is updated, 0 otherwise.
*/
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
@@ -1433,24 +1486,39 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
bool changed = false;
u64 hole_end = *hole_start + *hole_size;
- /*
- * Check before we set max_hole_start, otherwise we could end up
- * sending back this offset anyway.
- */
- if (contains_pending_extent(device, hole_start, *hole_size)) {
- if (hole_end >= *hole_start)
- *hole_size = hole_end - *hole_start;
- else
- *hole_size = 0;
- changed = true;
- }
+ for (;;) {
+ /*
+ * Check before we set max_hole_start, otherwise we could end up
+ * sending back this offset anyway.
+ */
+ if (contains_pending_extent(device, hole_start, *hole_size)) {
+ if (hole_end >= *hole_start)
+ *hole_size = hole_end - *hole_start;
+ else
+ *hole_size = 0;
+ changed = true;
+ }
+
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ /* No extra check */
+ break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ if (dev_extent_hole_check_zoned(device, hole_start,
+ hole_size, num_bytes)) {
+ changed = true;
+			 * The changed hole can contain pending extents.
+ * The changed hole can contain pending extent.
+ * Loop again to check that.
+ */
+ continue;
+ }
+ break;
+ default:
+ BUG();
+ }
- switch (device->fs_devices->chunk_alloc_policy) {
- case BTRFS_CHUNK_ALLOC_REGULAR:
- /* No extra check */
break;
- default:
- BUG();
}
return changed;
@@ -1503,6 +1571,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
search_start = dev_extent_search_start(device, search_start);
+ WARN_ON(device->zone_info &&
+ !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2021,6 +2092,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
if (IS_ERR(disk_super))
continue;
+ if (bdev_is_zoned(bdev)) {
+ btrfs_reset_sb_log_zones(bdev, copy_num);
+ continue;
+ }
+
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
page = virt_to_page(disk_super);
@@ -2099,6 +2175,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
+ if (!ret)
+ btrfs_reada_remove_dev(device);
mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
@@ -2179,6 +2257,7 @@ out:
return ret;
error_undo:
+ btrfs_reada_undo_remove_dev(device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
@@ -2296,10 +2375,10 @@ static struct btrfs_device *btrfs_find_device_by_path(
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->metadata_uuid, true);
+ disk_super->metadata_uuid);
else
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->fsid, true);
+ disk_super->fsid);
btrfs_release_disk_super(disk_super);
if (!device)
@@ -2319,7 +2398,7 @@ struct btrfs_device *btrfs_find_device_by_devspec(
if (devid) {
device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
- NULL, true);
+ NULL);
if (!device)
return ERR_PTR(-ENOENT);
return device;
@@ -2468,7 +2547,7 @@ next_slot:
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid, true);
+ fs_uuid);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -2510,6 +2589,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (IS_ERR(bdev))
return PTR_ERR(bdev);
+ if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = 1;
down_write(&sb->s_umount);
@@ -2543,10 +2627,17 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
rcu_assign_pointer(device->name, name);
+ device->fs_info = fs_info;
+ device->bdev = bdev;
+
+ ret = btrfs_get_dev_zone_info(device);
+ if (ret)
+ goto error_free_device;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto error_free_device;
+ goto error_free_zone;
}
q = bdev_get_queue(bdev);
@@ -2559,8 +2650,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
- device->fs_info = fs_info;
- device->bdev = bdev;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
@@ -2568,7 +2657,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
- sb->s_flags &= ~SB_RDONLY;
+ btrfs_clear_sb_rdonly(sb);
ret = btrfs_prepare_sprout(fs_info);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2704,9 +2793,11 @@ error_sysfs:
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
if (seeding_dev)
- sb->s_flags |= SB_RDONLY;
+ btrfs_set_sb_rdonly(sb);
if (trans)
btrfs_end_transaction(trans);
+error_free_zone:
+ btrfs_destroy_dev_zone_info(device);
error_free_device:
btrfs_free_device(device);
error:
@@ -4291,6 +4382,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
+ btrfs_release_path(path);
+
mutex_lock(&fs_info->balance_mutex);
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
@@ -4640,11 +4733,10 @@ again:
}
ret = btrfs_previous_item(root, path, 0, key.type);
- if (ret)
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- if (ret < 0)
- goto done;
if (ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ if (ret < 0)
+ goto done;
ret = 0;
btrfs_release_path(path);
break;
@@ -4876,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular(
ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
}
+static void init_alloc_chunk_ctl_policy_zoned(
+ struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ u64 zone_size = fs_devices->fs_info->zone_size;
+ u64 limit;
+ int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
+ int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
+ u64 min_chunk_size = min_data_stripes * zone_size;
+ u64 type = ctl->type;
+
+ ctl->max_stripe_size = zone_size;
+ if (type & BTRFS_BLOCK_GROUP_DATA) {
+ ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
+ zone_size);
+ } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+ ctl->max_chunk_size = ctl->max_stripe_size;
+ } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
+ ctl->devs_max = min_t(int, ctl->devs_max,
+ BTRFS_MAX_DEVS_SYS_CHUNK);
+ }
+
+ /* We don't want a chunk larger than 10% of writable space */
+ limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
+ zone_size),
+ min_chunk_size);
+ ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
+ ctl->dev_extent_min = zone_size * ctl->dev_stripes;
+}
+
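To make the zoned DATA-profile limits above concrete, here is a small editorial sketch (not part of the patch) that works the same arithmetic with hypothetical numbers; it assumes BTRFS_MAX_DATA_CHUNK_SIZE is 10 GiB and that div_factor(total_rw_bytes, 1) evaluates to total_rw_bytes / 10, matching the mainline helpers.

/* Editorial sketch only -- zoned chunk size limits with example numbers. */
#include <stdio.h>
#include <stdint.h>

static uint64_t round_down_u64(uint64_t x, uint64_t align)
{
	return x - (x % align);
}

int main(void)
{
	const uint64_t MiB = 1024ULL * 1024;
	/* Hypothetical zoned device: 256 MiB zones, 500 GiB of writable space */
	uint64_t zone_size = 256 * MiB;
	uint64_t total_rw_bytes = 500ULL * 1024 * MiB;
	/* Single-device DATA profile: devs_min = dev_stripes = ncopies = 1 */
	int devs_min = 1, dev_stripes = 1, nparity = 0, ncopies = 1;

	int min_data_stripes = (devs_min * dev_stripes - nparity) / ncopies;
	uint64_t min_chunk_size = min_data_stripes * zone_size;

	/* DATA: BTRFS_MAX_DATA_CHUNK_SIZE (10 GiB) rounded down to the zone size */
	uint64_t max_chunk_size = round_down_u64(10ULL * 1024 * MiB, zone_size);

	/* No chunk larger than 10% of writable space, but at least min_chunk_size */
	uint64_t limit = round_down_u64(total_rw_bytes / 10, zone_size);
	if (limit < min_chunk_size)
		limit = min_chunk_size;
	if (max_chunk_size > limit)
		max_chunk_size = limit;

	/* 10% of 500 GiB is 50 GiB, so the 10 GiB DATA cap is what remains */
	printf("max_chunk_size = %llu MiB, dev_extent_min = %llu MiB\n",
	       (unsigned long long)(max_chunk_size / MiB),
	       (unsigned long long)(zone_size * dev_stripes / MiB));
	return 0;
}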
static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl)
{
@@ -4896,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
case BTRFS_CHUNK_ALLOC_REGULAR:
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
+ break;
default:
BUG();
}
@@ -5022,6 +5148,38 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
return 0;
}
+static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ u64 zone_size = devices_info[0].dev->zone_info->zone_size;
+ /* Number of stripes that count for block group size */
+ int data_stripes;
+
+ /*
+ * It should hold because:
+ * dev_extent_min == dev_extent_want == zone_size * dev_stripes
+ */
+ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+
+ ctl->stripe_size = zone_size;
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+
+	/* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
+ ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
+ ctl->stripe_size) + ctl->nparity,
+ ctl->dev_stripes);
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+ ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
+ }
+
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
+
+ return 0;
+}
+
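Similarly, a small editorial sketch (not part of the patch) of how the device count is reduced when zone-sized stripes would exceed max_chunk_size; all numbers are hypothetical examples.

/* Editorial sketch only -- ndevs reduction for zone-sized stripes. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t MiB = 1024ULL * 1024;
	uint64_t zone_size = 256 * MiB, max_chunk_size = 1024 * MiB;
	/* Striped profile with no parity or extra copies across 8 devices */
	int ndevs = 8, dev_stripes = 1, nparity = 0, ncopies = 1;

	uint64_t stripe_size = zone_size;	/* fixed on zoned filesystems */
	int num_stripes = ndevs * dev_stripes;
	int data_stripes = (num_stripes - nparity) / ncopies;

	if (stripe_size * data_stripes > max_chunk_size) {
		/* Same reduction as in the patch, just with plain integer math */
		ndevs = (int)((max_chunk_size * ncopies / stripe_size + nparity) /
			      dev_stripes);
		num_stripes = ndevs * dev_stripes;
		data_stripes = (num_stripes - nparity) / ncopies;
	}

	/* 8 devices would give a 2 GiB chunk, so ndevs drops to 4 -> 1 GiB */
	printf("ndevs=%d chunk_size=%llu MiB\n", ndevs,
	       (unsigned long long)(stripe_size * data_stripes / MiB));
	return 0;
}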
static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
@@ -5049,6 +5207,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
switch (fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
return decide_stripe_size_regular(ctl, devices_info);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ return decide_stripe_size_zoned(ctl, devices_info);
default:
BUG();
}
@@ -5482,7 +5642,18 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
else
num_stripes = map->num_stripes;
- preferred_mirror = first + current->pid % num_stripes;
+ switch (fs_info->fs_devices->read_policy) {
+ default:
+ /* Shouldn't happen, just warn and use pid instead of failing */
+ btrfs_warn_rl(fs_info,
+ "unknown read_policy type %u, reset to pid",
+ fs_info->fs_devices->read_policy);
+ fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ fallthrough;
+ case BTRFS_READ_POLICY_PID:
+ preferred_mirror = first + (current->pid % num_stripes);
+ break;
+ }
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
@@ -5802,9 +5973,29 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
return ret;
}
+static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
+{
+ struct btrfs_block_group *cache;
+ bool ret;
+
+	/* Non-zoned filesystems do not use the "to_copy" flag */
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+
+ spin_lock(&cache->lock);
+ ret = cache->to_copy;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
struct btrfs_bio **bbio_ret,
struct btrfs_dev_replace *dev_replace,
+ u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
struct btrfs_bio *bbio = *bbio_ret;
@@ -5818,6 +6009,13 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
int index_where_to_add;
/*
+	 * A block group which has "to_copy" set will eventually be
+	 * copied by the dev-replace process. We can avoid cloning the IO here.
+ */
+ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
+ return;
+
+ /*
* duplicate the write operations while the dev replace
* procedure is running. Since the copying of the old disk to
* the new disk takes place at run time while the filesystem is
@@ -5902,23 +6100,24 @@ static bool need_full_stripe(enum btrfs_map_op op)
}
/*
- * btrfs_get_io_geometry - calculates the geomery of a particular (address, len)
- * tuple. This information is used to calculate how big a
- * particular bio can get before it straddles a stripe.
+ * Calculate the geometry of a particular (address, len) tuple. This
+ * information is used to calculate how big a particular bio can get before it
+ * straddles a stripe.
*
- * @fs_info - the filesystem
- * @logical - address that we want to figure out the geometry of
- * @len - the length of IO we are going to perform, starting at @logical
- * @op - type of operation - write or read
- * @io_geom - pointer used to return values
+ * @fs_info: the filesystem
+ * @em: mapping containing the logical extent
+ * @op: type of operation - write or read
+ * @logical: address that we want to figure out the geometry of
+ * @len: the length of IO we are going to perform, starting at @logical
+ * @io_geom: pointer used to return values
*
* Returns < 0 in case a chunk for the given logical address cannot be found,
* usually shouldn't happen unless @logical is corrupted, 0 otherwise.
*/
-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
+ enum btrfs_map_op op, u64 logical, u64 len,
+ struct btrfs_io_geometry *io_geom)
{
- struct extent_map *em;
struct map_lookup *map;
u64 offset;
u64 stripe_offset;
@@ -5926,14 +6125,9 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
- int ret = 0;
ASSERT(op != BTRFS_MAP_DISCARD);
- em = btrfs_get_chunk_map(fs_info, logical, len);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
map = em->map_lookup;
/* Offset of this logical address in the chunk */
offset = logical - em->start;
@@ -5947,8 +6141,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
stripe_offset, offset, em->start, logical, stripe_len);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
/* stripe_offset is the offset of this block in its stripe */
@@ -5995,10 +6188,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
io_geom->stripe_offset = stripe_offset;
io_geom->raid56_stripe_offset = raid56_full_stripe_start;
-out:
- /* once for us */
- free_extent_map(em);
- return ret;
+ return 0;
}
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
@@ -6031,12 +6221,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
ASSERT(bbio_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
- ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
+ ASSERT(!IS_ERR(em));
+
+ ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom);
if (ret < 0)
return ret;
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- ASSERT(!IS_ERR(em));
map = em->map_lookup;
*length = geom.len;
@@ -6212,8 +6403,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
- &max_errors);
+ handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
+ &num_stripes, &max_errors);
}
*bbio_ret = bbio;
@@ -6284,7 +6475,7 @@ static void btrfs_end_bio(struct bio *bio)
struct btrfs_device *dev = btrfs_io_bio(bio)->device;
ASSERT(dev->bdev);
- if (bio_op(bio) == REQ_OP_WRITE)
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else if (!(bio->bi_opf & REQ_RAHEAD))
@@ -6336,9 +6527,23 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
btrfs_io_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
+ /*
+	 * For zone append writing, bi_sector must point to the beginning of
+	 * the zone.
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ if (btrfs_dev_is_sequential(dev, physical)) {
+ u64 zone_start = round_down(physical, fs_info->zone_size);
+
+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
+ } else {
+ bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
+ bio->bi_opf |= REQ_OP_WRITE;
+ }
+ }
btrfs_debug_in_rcu(fs_info,
"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
+ bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
bio_set_dev(bio, dev->bdev);
@@ -6370,7 +6575,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
- u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+ u64 logical = bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
@@ -6397,10 +6602,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
+ ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
- if (bio_op(bio) == REQ_OP_WRITE) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
ret = raid56_parity_write(fs_info, bio, bbio,
map_length);
} else {
@@ -6423,7 +6628,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
&dev->dev_state) ||
- (bio_op(first_bio) == REQ_OP_WRITE &&
+ (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bbio_error(bbio, first_bio, logical);
continue;
@@ -6450,8 +6655,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
* If @seed is true, traverse through the seed devices.
*/
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid,
- bool seed)
+ u64 devid, u8 *uuid, u8 *fsid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *seed_devs;
@@ -6658,7 +6862,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
- devid, uuid, NULL, true);
+ devid, uuid, NULL);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@@ -6797,7 +7001,7 @@ static int read_one_dev(struct extent_buffer *leaf,
}
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid, true);
+ fs_uuid);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -6860,6 +7064,16 @@ static int read_one_dev(struct extent_buffer *leaf,
}
fill_device_from_item(leaf, dev_item, device);
+ if (device->bdev) {
+ u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+
+ if (device->total_bytes > max_total_bytes) {
+ btrfs_err(fs_info,
+ "device total_bytes should be at most %llu but found %llu",
+ max_total_bytes, device->total_bytes);
+ return -EINVAL;
+ }
+ }
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
@@ -6894,11 +7108,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
* overallocate but we can keep it as-is, only the first page is used.
*/
- sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
+ sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
+ root->root_key.objectid, 0);
if (IS_ERR(sb))
return PTR_ERR(sb);
set_extent_buffer_uptodate(sb);
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
* The sb extent buffer is artificial and just used to read the system array.
* set_extent_buffer_uptodate() call does not properly mark all it's
@@ -7062,12 +7276,8 @@ static void readahead_tree_node_children(struct extent_buffer *node)
int i;
const int nr_items = btrfs_header_nritems(node);
- for (i = 0; i < nr_items; i++) {
- u64 start;
-
- start = btrfs_node_blockptr(node, i);
- readahead_tree_block(node->fs_info, start);
- }
+ for (i = 0; i < nr_items; i++)
+ btrfs_readahead_node_child(node, i);
}
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
@@ -7454,8 +7664,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
- true);
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7586,28 +7795,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device bondary */
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
- /* It's possible this device is a dummy for seed device */
- if (dev->disk_total_bytes == 0) {
- struct btrfs_fs_devices *devs;
-
- devs = list_first_entry(&fs_info->fs_devices->seed_list,
- struct btrfs_fs_devices, seed_list);
- dev = btrfs_find_device(devs, devid, NULL, NULL, false);
- if (!dev) {
- btrfs_err(fs_info, "failed to find seed devid %llu",
- devid);
- ret = -EUCLEAN;
- goto out;
- }
- }
-
if (physical_offset + physical_len > dev->disk_total_bytes) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
@@ -7616,6 +7810,20 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
ret = -EUCLEAN;
goto out;
}
+
+ if (dev->zone_info) {
+ u64 zone_size = dev->zone_info->zone_size;
+
+ if (!IS_ALIGNED(physical_offset, zone_size) ||
+ !IS_ALIGNED(physical_len, zone_size)) {
+ btrfs_err(fs_info,
+"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
+ devid, physical_offset, physical_len);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
out:
free_extent_map(em);
return ret;
@@ -7662,6 +7870,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
u64 prev_dev_ext_end = 0;
int ret = 0;
+ /*
+ * We don't have a dev_root because we mounted with ignorebadroots and
+ * failed to load the root, so we want to skip the verification in this
+ * case for sure.
+ *
+ * However if the dev root is fine, but the tree itself is corrupted
+ * we'd still fail to mount. This verification is only to make sure
+ * writes can happen safely, so instead just bypass this check
+ * completely in the case of IGNOREBADROOTS.
+ */
+ if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ return 0;
+
key.objectid = 1;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
@@ -7759,3 +7980,75 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
spin_unlock(&fs_info->swapfile_pins_lock);
return node != NULL;
}
+
+static int relocating_repair_kthread(void *data)
+{
+ struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ u64 target;
+ int ret = 0;
+
+ target = cache->start;
+ btrfs_put_block_group(cache);
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ btrfs_info(fs_info,
+ "zoned: skip relocating block group %llu to repair: EBUSY",
+ target);
+ return -EBUSY;
+ }
+
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
+
+ /* Ensure block group still exists */
+ cache = btrfs_lookup_block_group(fs_info, target);
+ if (!cache)
+ goto out;
+
+ if (!cache->relocating_repair)
+ goto out;
+
+ ret = btrfs_may_alloc_data_chunk(fs_info, target);
+ if (ret < 0)
+ goto out;
+
+ btrfs_info(fs_info,
+ "zoned: relocating block group %llu to repair IO failure",
+ target);
+ ret = btrfs_relocate_chunk(fs_info, target);
+
+out:
+ if (cache)
+ btrfs_put_block_group(cache);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ btrfs_exclop_finish(fs_info);
+
+ return ret;
+}
+
+int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
+{
+ struct btrfs_block_group *cache;
+
+ /* Do not attempt to repair in degraded state */
+ if (btrfs_test_opt(fs_info, DEGRADED))
+ return 0;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+ if (!cache)
+ return 0;
+
+ spin_lock(&cache->lock);
+ if (cache->relocating_repair) {
+ spin_unlock(&cache->lock);
+ btrfs_put_block_group(cache);
+ return 0;
+ }
+ cache->relocating_repair = 1;
+ spin_unlock(&cache->lock);
+
+ kthread_run(relocating_repair_kthread, cache,
+ "btrfs-relocating-repair");
+
+ return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bf27ac07d315..d4c3e0dd32b8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,9 @@ struct btrfs_io_geometry {
#define BTRFS_DEV_STATE_MISSING (2)
#define BTRFS_DEV_STATE_REPLACE_TGT (3)
#define BTRFS_DEV_STATE_FLUSH_SENT (4)
+#define BTRFS_DEV_STATE_NO_READA (5)
+
+struct btrfs_zoned_device_info;
struct btrfs_device {
struct list_head dev_list; /* device_list_mutex */
@@ -64,6 +67,8 @@ struct btrfs_device {
struct block_device *bdev;
+ struct btrfs_zoned_device_info *zone_info;
+
/* the mode sent to blkdev_get */
fmode_t mode;
@@ -209,6 +214,17 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
enum btrfs_chunk_allocation_policy {
BTRFS_CHUNK_ALLOC_REGULAR,
+ BTRFS_CHUNK_ALLOC_ZONED,
+};
+
+/*
+ * Read policies for mirrored block group profiles, read picks the stripe based
+ * on these policies.
+ */
+enum btrfs_read_policy {
+ /* Use process PID to choose the stripe */
+ BTRFS_READ_POLICY_PID,
+ BTRFS_NR_READ_POLICY,
};
struct btrfs_fs_devices {
@@ -264,6 +280,9 @@ struct btrfs_fs_devices {
struct completion kobj_unregister;
enum btrfs_chunk_allocation_policy chunk_alloc_policy;
+
+ /* Policy used to read the mirrored stripes */
+ enum btrfs_read_policy read_policy;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -405,6 +424,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
case REQ_OP_DISCARD:
return BTRFS_MAP_DISCARD;
case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
return BTRFS_MAP_WRITE;
default:
WARN_ON_ONCE(1);
@@ -422,8 +442,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret);
-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
+ enum btrfs_map_op op, u64 logical, u64 len,
+ struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
@@ -436,7 +457,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
int btrfs_forget_devices(const char *path);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
@@ -453,7 +474,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid, bool seed);
+ u64 devid, u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
@@ -578,5 +599,6 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
+int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 95d9aebff2c4..af6246f36a9e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -213,9 +213,11 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
}
out:
btrfs_free_path(path);
- if (!ret)
+ if (!ret) {
set_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags);
+ }
return ret;
}
@@ -239,7 +241,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
BUG_ON(ret);
out:
btrfs_end_transaction(trans);
@@ -390,7 +392,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
if (!ret) {
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
BUG_ON(ret);
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
new file mode 100644
index 000000000000..d0eb0c8d6269
--- /dev/null
+++ b/fs/btrfs/zoned.c
@@ -0,0 +1,1461 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/sched/mm.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "zoned.h"
+#include "rcu-string.h"
+#include "disk-io.h"
+#include "block-group.h"
+#include "transaction.h"
+#include "dev-replace.h"
+#include "space-info.h"
+
+/* Maximum number of zones to report per blkdev_report_zones() call */
+#define BTRFS_REPORT_NR_ZONES 4096
+/* Invalid allocation pointer value for missing devices */
+#define WP_MISSING_DEV ((u64)-1)
+/* Pseudo write pointer value for conventional zone */
+#define WP_CONVENTIONAL ((u64)-2)
+
+/* Number of superblock log zones */
+#define BTRFS_NR_SB_LOG_ZONES 2
+
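+/*
+ * Callback for blkdev_report_zones() that copies each reported zone into the
+ * caller supplied array of struct blk_zone.
+ */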
+static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
+{
+ struct blk_zone *zones = data;
+
+ memcpy(&zones[idx], zone, sizeof(*zone));
+
+ return 0;
+}
+
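+/*
+ * Determine the write pointer (byte offset) of the superblock log from the
+ * conditions of the two log zones. Returns -ENOENT if both zones are empty
+ * (no superblock written yet) and -EUCLEAN on an invalid combination of
+ * zone states.
+ */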
+static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
+ u64 *wp_ret)
+{
+ bool empty[BTRFS_NR_SB_LOG_ZONES];
+ bool full[BTRFS_NR_SB_LOG_ZONES];
+ sector_t sector;
+
+ ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
+ zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
+
+ empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
+ empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
+ full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
+ full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
+
+ /*
+ * Possible states of log buffer zones
+ *
+ * Empty[0] In use[0] Full[0]
+ * Empty[1] * x 0
+ * In use[1] 0 x 0
+ * Full[1] 1 1 C
+ *
+ * Log position:
+ * *: Special case, no superblock is written
+ * 0: Use write pointer of zones[0]
+ * 1: Use write pointer of zones[1]
+ * C: Compare super blocks from zones[0] and zones[1], use the latest
+ * one determined by generation
+ * x: Invalid state
+ */
+
+ if (empty[0] && empty[1]) {
+ /* Special case to distinguish no superblock to read */
+ *wp_ret = zones[0].start << SECTOR_SHIFT;
+ return -ENOENT;
+ } else if (full[0] && full[1]) {
+ /* Compare two super blocks */
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct page *page[BTRFS_NR_SB_LOG_ZONES];
+ struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
+ int i;
+
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ u64 bytenr;
+
+ bytenr = ((zones[i].start + zones[i].len)
+ << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
+
+ page[i] = read_cache_page_gfp(mapping,
+ bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page[i])) {
+ if (i == 1)
+ btrfs_release_disk_super(super[0]);
+ return PTR_ERR(page[i]);
+ }
+ super[i] = page_address(page[i]);
+ }
+
+ if (super[0]->generation > super[1]->generation)
+ sector = zones[1].start;
+ else
+ sector = zones[0].start;
+
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
+ btrfs_release_disk_super(super[i]);
+ } else if (!full[0] && (empty[1] || full[1])) {
+ sector = zones[0].wp;
+ } else if (full[0]) {
+ sector = zones[1].wp;
+ } else {
+ return -EUCLEAN;
+ }
+ *wp_ret = sector << SECTOR_SHIFT;
+ return 0;
+}
+
+/*
+ * The following zones are reserved as the circular buffer on ZONED btrfs.
+ * - The primary superblock: zones 0 and 1
+ * - The first copy: zones 16 and 17
+ * - The second copy: zone 1024 or the zone at 256GB (whichever is smaller),
+ * and the following one
+ */
+static inline u32 sb_zone_number(int shift, int mirror)
+{
+ ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+
+ switch (mirror) {
+ case 0: return 0;
+ case 1: return 16;
+ case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
+ }
+
+ return 0;
+}
+
+/*
+ * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
+ * device into fixed-size chunks and fakes a conventional zone on each of
+ * them.
+ */
+static int emulate_report_zones(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zones, unsigned int nr_zones)
+{
+ const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
+ sector_t bdev_size = bdev_nr_sectors(device->bdev);
+ unsigned int i;
+
+ pos >>= SECTOR_SHIFT;
+ for (i = 0; i < nr_zones; i++) {
+ zones[i].start = i * zone_sectors + pos;
+ zones[i].len = zone_sectors;
+ zones[i].capacity = zone_sectors;
+ zones[i].wp = zones[i].start + zone_sectors;
+ zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zones[i].cond = BLK_ZONE_COND_NOT_WP;
+
+ if (zones[i].wp >= bdev_size) {
+ i++;
+ break;
+ }
+ }
+
+ return i;
+}
+
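+/*
+ * Report up to *nr_zones zones starting at @pos on @device. For a non-zoned
+ * device the zones are emulated. On success *nr_zones is updated to the
+ * number of zones actually reported.
+ */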
+static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zones, unsigned int *nr_zones)
+{
+ int ret;
+
+ if (!*nr_zones)
+ return 0;
+
+ if (!bdev_is_zoned(device->bdev)) {
+ ret = emulate_report_zones(device, pos, zones, *nr_zones);
+ *nr_zones = ret;
+ return 0;
+ }
+
+ ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
+ copy_zone_info_cb, zones);
+ if (ret < 0) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to read zone %llu on %s (devid %llu)",
+ pos, rcu_str_deref(device->name),
+ device->devid);
+ return ret;
+ }
+ *nr_zones = ret;
+ if (!ret)
+ return -EIO;
+
+ return 0;
+}
+
+/* The emulated zone size is determined from the size of the first device extent */
+static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_dev_extent *dext;
+ int ret = 0;
+
+ key.objectid = 1;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ /* No dev extents at all? Not good */
+ if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+ leaf = path->nodes[0];
+ dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+ fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
+ ret = 0;
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
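+/*
+ * Collect the zone information of every device of the filesystem. Missing
+ * devices (no bdev) are skipped.
+ */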
+int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ int ret = 0;
+
+ /* fs_info->zone_size might not be set yet. Use the incompat flag here. */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ /* We can skip reading of zone info for missing devices */
+ if (!device->bdev)
+ continue;
+
+ ret = btrfs_get_dev_zone_info(device);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
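+/*
+ * Build the per-device zone information: zone size, number of zones and the
+ * bitmaps of sequential and empty zones, and validate the superblock log
+ * zones. For a non-zoned device the zones are emulated using the emulated
+ * zone size.
+ */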
+int btrfs_get_dev_zone_info(struct btrfs_device *device)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_zoned_device_info *zone_info = NULL;
+ struct block_device *bdev = device->bdev;
+ struct request_queue *queue = bdev_get_queue(bdev);
+ sector_t nr_sectors;
+ sector_t sector = 0;
+ struct blk_zone *zones = NULL;
+ unsigned int i, nreported = 0, nr_zones;
+ unsigned int zone_sectors;
+ char *model, *emulated;
+ int ret;
+
+ /*
+ * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
+ * yet be set.
+ */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return 0;
+
+ if (device->zone_info)
+ return 0;
+
+ zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
+ if (!zone_info)
+ return -ENOMEM;
+
+ if (!bdev_is_zoned(bdev)) {
+ if (!fs_info->zone_size) {
+ ret = calculate_emulated_zone_size(fs_info);
+ if (ret)
+ goto out;
+ }
+
+ ASSERT(fs_info->zone_size);
+ zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
+ } else {
+ zone_sectors = bdev_zone_sectors(bdev);
+ }
+
+ nr_sectors = bdev_nr_sectors(bdev);
+ /* Check if it's a power of 2 (see is_power_of_2()) */
+ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
+ zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
+ zone_info->zone_size_shift = ilog2(zone_info->zone_size);
+ zone_info->max_zone_append_size =
+ (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
+ zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
+ if (!IS_ALIGNED(nr_sectors, zone_sectors))
+ zone_info->nr_zones++;
+
+ zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->seq_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->empty_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
+ if (!zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Get the zone type and condition of all zones */
+ while (sector < nr_sectors) {
+ nr_zones = BTRFS_REPORT_NR_ZONES;
+ ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
+ &nr_zones);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < nr_zones; i++) {
+ if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
+ __set_bit(nreported, zone_info->seq_zones);
+ if (zones[i].cond == BLK_ZONE_COND_EMPTY)
+ __set_bit(nreported, zone_info->empty_zones);
+ nreported++;
+ }
+ sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
+ }
+
+ if (nreported != zone_info->nr_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "inconsistent number of zones on %s (%u/%u)",
+ rcu_str_deref(device->name), nreported,
+ zone_info->nr_zones);
+ ret = -EIO;
+ goto out;
+ }
+
+ /* Validate superblock log */
+ nr_zones = BTRFS_NR_SB_LOG_ZONES;
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ u32 sb_zone;
+ u64 sb_wp;
+ int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
+
+ sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
+ if (sb_zone + 1 >= zone_info->nr_zones)
+ continue;
+
+ sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
+ ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
+ &zone_info->sb_zones[sb_pos],
+ &nr_zones);
+ if (ret)
+ goto out;
+
+ if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to read super block log zone info at devid %llu zone %u",
+ device->devid, sb_zone);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ /*
+ * If zones[0] is conventional, always use the beginning of the
+ * zone to record the superblock. No need to validate in that case.
+ */
+ if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
+ BLK_ZONE_TYPE_CONVENTIONAL)
+ continue;
+
+ ret = sb_write_pointer(device->bdev,
+ &zone_info->sb_zones[sb_pos], &sb_wp);
+ if (ret != -ENOENT && ret) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: super block log zone corrupted devid %llu zone %u",
+ device->devid, sb_zone);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+
+
+ device->zone_info = zone_info;
+
+ switch (bdev_zoned_model(bdev)) {
+ case BLK_ZONED_HM:
+ model = "host-managed zoned";
+ emulated = "";
+ break;
+ case BLK_ZONED_HA:
+ model = "host-aware zoned";
+ emulated = "";
+ break;
+ case BLK_ZONED_NONE:
+ model = "regular";
+ emulated = "emulated ";
+ break;
+ default:
+ /* Just in case */
+ btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
+ bdev_zoned_model(bdev),
+ rcu_str_deref(device->name));
+ ret = -EOPNOTSUPP;
+ goto out_free_zone_info;
+ }
+
+ btrfs_info_in_rcu(fs_info,
+ "%s block device %s, %u %szones of %llu bytes",
+ model, rcu_str_deref(device->name), zone_info->nr_zones,
+ emulated, zone_info->zone_size);
+
+ return 0;
+
+out:
+ kfree(zones);
+out_free_zone_info:
+ bitmap_free(zone_info->empty_zones);
+ bitmap_free(zone_info->seq_zones);
+ kfree(zone_info);
+ device->zone_info = NULL;
+
+ return ret;
+}
+
+void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return;
+
+ bitmap_free(zone_info->seq_zones);
+ bitmap_free(zone_info->empty_zones);
+ kfree(zone_info);
+ device->zone_info = NULL;
+}
+
+int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+{
+ unsigned int nr_zones = 1;
+ int ret;
+
+ ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
+ if (ret != 0 || !nr_zones)
+ return ret ? ret : -EIO;
+
+ return 0;
+}
+
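+/*
+ * Verify the zoned setup at mount time: every device must be usable in zoned
+ * mode, all devices must share the same zone size, the zone size must be
+ * aligned to BTRFS_STRIPE_LEN, and mixed block groups are not supported.
+ * On success, record the zone size and switch to the zoned chunk allocation
+ * policy.
+ */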
+int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 zoned_devices = 0;
+ u64 nr_devices = 0;
+ u64 zone_size = 0;
+ u64 max_zone_append_size = 0;
+ const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
+ int ret = 0;
+
+ /* Count zoned devices */
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ enum blk_zoned_model model;
+
+ if (!device->bdev)
+ continue;
+
+ model = bdev_zoned_model(device->bdev);
+ /*
+ * A Host-Managed zoned device must be used as a zoned device.
+ * A Host-Aware zoned device and a non-zoned device can be
+ * treated as a zoned device, if the ZONED flag is enabled in the
+ * superblock.
+ */
+ if (model == BLK_ZONED_HM ||
+ (model == BLK_ZONED_HA && incompat_zoned) ||
+ (model == BLK_ZONED_NONE && incompat_zoned)) {
+ struct btrfs_zoned_device_info *zone_info =
+ device->zone_info;
+
+ zoned_devices++;
+ if (!zone_size) {
+ zone_size = zone_info->zone_size;
+ } else if (zone_info->zone_size != zone_size) {
+ btrfs_err(fs_info,
+ "zoned: unequal block device zone sizes: have %llu found %llu",
+ device->zone_info->zone_size,
+ zone_size);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!max_zone_append_size ||
+ (zone_info->max_zone_append_size &&
+ zone_info->max_zone_append_size < max_zone_append_size))
+ max_zone_append_size =
+ zone_info->max_zone_append_size;
+ }
+ nr_devices++;
+ }
+
+ if (!zoned_devices && !incompat_zoned)
+ goto out;
+
+ if (!zoned_devices && incompat_zoned) {
+ /* No zoned block device found on ZONED filesystem */
+ btrfs_err(fs_info,
+ "zoned: no zoned devices found on a zoned filesystem");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (zoned_devices && !incompat_zoned) {
+ btrfs_err(fs_info,
+ "zoned: mode not enabled but zoned device found");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (zoned_devices != nr_devices) {
+ btrfs_err(fs_info,
+ "zoned: cannot mix zoned and regular devices");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * stripe_size is always aligned to BTRFS_STRIPE_LEN in
+ * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
+ * check the alignment here.
+ */
+ if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
+ btrfs_err(fs_info,
+ "zoned: zone size %llu not aligned to stripe %u",
+ zone_size, BTRFS_STRIPE_LEN);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+ btrfs_err(fs_info, "zoned: mixed block groups not supported");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ fs_info->zone_size = zone_size;
+ fs_info->max_zone_append_size = max_zone_append_size;
+ fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
+
+ /*
+ * Check mount options here, because we might change fs_info->zoned
+ * from fs_info->zone_size.
+ */
+ ret = btrfs_check_mountopts_zoned(fs_info);
+ if (ret)
+ goto out;
+
+ btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
+out:
+ return ret;
+}
+
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+{
+ if (!btrfs_is_zoned(info))
+ return 0;
+
+ /*
+ * Space cache writing is not COWed. Disable that to avoid write errors
+ * in sequential zones.
+ */
+ if (btrfs_test_opt(info, SPACE_CACHE)) {
+ btrfs_err(info, "zoned: space cache v1 is not supported");
+ return -EINVAL;
+ }
+
+ if (btrfs_test_opt(info, NODATACOW)) {
+ btrfs_err(info, "zoned: NODATACOW not supported");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
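+/*
+ * Compute the byte offset for reading or writing a superblock in the given
+ * pair of log zones. If zones[0] is conventional, the fixed zone start is
+ * used. For WRITE, a FULL zone that the write pointer points at is reset
+ * first; for READ, the position of the last written superblock is returned.
+ */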
+static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
+ int rw, u64 *bytenr_ret)
+{
+ u64 wp;
+ int ret;
+
+ if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ *bytenr_ret = zones[0].start << SECTOR_SHIFT;
+ return 0;
+ }
+
+ ret = sb_write_pointer(bdev, zones, &wp);
+ if (ret != -ENOENT && ret < 0)
+ return ret;
+
+ if (rw == WRITE) {
+ struct blk_zone *reset = NULL;
+
+ if (wp == zones[0].start << SECTOR_SHIFT)
+ reset = &zones[0];
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ reset = &zones[1];
+
+ if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ ASSERT(reset->cond == BLK_ZONE_COND_FULL);
+
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ reset->start, reset->len,
+ GFP_NOFS);
+ if (ret)
+ return ret;
+
+ reset->cond = BLK_ZONE_COND_EMPTY;
+ reset->wp = reset->start;
+ }
+ } else if (ret != -ENOENT) {
+ /* For READ, we want the previous one */
+ if (wp == zones[0].start << SECTOR_SHIFT)
+ wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
+ wp -= BTRFS_SUPER_INFO_SIZE;
+ }
+
+ *bytenr_ret = wp;
+ return 0;
+
+}
+
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+ u64 *bytenr_ret)
+{
+ struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
+ unsigned int zone_sectors;
+ u32 sb_zone;
+ int ret;
+ u8 zone_sectors_shift;
+ sector_t nr_sectors;
+ u32 nr_zones;
+
+ if (!bdev_is_zoned(bdev)) {
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+ }
+
+ ASSERT(rw == READ || rw == WRITE);
+
+ zone_sectors = bdev_zone_sectors(bdev);
+ if (!is_power_of_2(zone_sectors))
+ return -EINVAL;
+ zone_sectors_shift = ilog2(zone_sectors);
+ nr_sectors = bdev_nr_sectors(bdev);
+ nr_zones = nr_sectors >> zone_sectors_shift;
+
+ sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+ if (sb_zone + 1 >= nr_zones)
+ return -ENOENT;
+
+ ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
+ BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
+ zones);
+ if (ret < 0)
+ return ret;
+ if (ret != BTRFS_NR_SB_LOG_ZONES)
+ return -EIO;
+
+ return sb_log_location(bdev, zones, rw, bytenr_ret);
+}
+
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+ u64 *bytenr_ret)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ u32 zone_num;
+
+ /*
+ * For a zoned filesystem on a non-zoned block device, use the same
+ * super block locations as a regular filesystem. Doing so, the super
+ * block can always be retrieved and the zoned flag of the volume
+ * detected from the super block information.
+ */
+ if (!bdev_is_zoned(device->bdev)) {
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+ }
+
+ zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
+ if (zone_num + 1 >= zinfo->nr_zones)
+ return -ENOENT;
+
+ return sb_log_location(device->bdev,
+ &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
+ rw, bytenr_ret);
+}
+
+static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
+ int mirror)
+{
+ u32 zone_num;
+
+ if (!zinfo)
+ return false;
+
+ zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
+ if (zone_num + 1 >= zinfo->nr_zones)
+ return false;
+
+ if (!test_bit(zone_num, zinfo->seq_zones))
+ return false;
+
+ return true;
+}
+
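+/*
+ * Advance the cached write pointer of the superblock log zones by one
+ * superblock after a successful write, moving to the second zone once the
+ * first one is full.
+ */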
+void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ struct blk_zone *zone;
+
+ if (!is_sb_log_zone(zinfo, mirror))
+ return;
+
+ zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
+ if (zone->cond != BLK_ZONE_COND_FULL) {
+ if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+
+ if (zone->wp == zone->start + zone->len)
+ zone->cond = BLK_ZONE_COND_FULL;
+
+ return;
+ }
+
+ zone++;
+ ASSERT(zone->cond != BLK_ZONE_COND_FULL);
+ if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+
+ if (zone->wp == zone->start + zone->len)
+ zone->cond = BLK_ZONE_COND_FULL;
+}
+
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
+{
+ sector_t zone_sectors;
+ sector_t nr_sectors;
+ u8 zone_sectors_shift;
+ u32 sb_zone;
+ u32 nr_zones;
+
+ zone_sectors = bdev_zone_sectors(bdev);
+ zone_sectors_shift = ilog2(zone_sectors);
+ nr_sectors = bdev_nr_sectors(bdev);
+ nr_zones = nr_sectors >> zone_sectors_shift;
+
+ sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+ if (sb_zone + 1 >= nr_zones)
+ return -ENOENT;
+
+ return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ sb_zone << zone_sectors_shift,
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+}
+
+/**
+ * btrfs_find_allocatable_zones - find allocatable zones within a given region
+ *
+ * @device: the device to allocate a region on
+ * @hole_start: the position of the hole to allocate the region
+ * @hole_end: the end of the hole
+ * @num_bytes: size of the wanted region
+ *
+ * The allocatable region should not contain any superblock locations.
+ *
+ * Return: position of allocatable zones
+ */
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+ u64 hole_end, u64 num_bytes)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ const u8 shift = zinfo->zone_size_shift;
+ u64 nzones = num_bytes >> shift;
+ u64 pos = hole_start;
+ u64 begin, end;
+ bool have_sb;
+ int i;
+
+ ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+ while (pos < hole_end) {
+ begin = pos >> shift;
+ end = begin + nzones;
+
+ if (end > zinfo->nr_zones)
+ return hole_end;
+
+ /* Check if zones in the region are all empty */
+ if (btrfs_dev_is_sequential(device, pos) &&
+ find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
+ pos += zinfo->zone_size;
+ continue;
+ }
+
+ have_sb = false;
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ u32 sb_zone;
+ u64 sb_pos;
+
+ sb_zone = sb_zone_number(shift, i);
+ if (!(end <= sb_zone ||
+ sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
+ have_sb = true;
+ pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
+ break;
+ }
+
+ /* We also need to exclude regular superblock positions */
+ sb_pos = btrfs_sb_offset(i);
+ if (!(pos + num_bytes <= sb_pos ||
+ sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
+ have_sb = true;
+ pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
+ zinfo->zone_size);
+ break;
+ }
+ }
+ if (!have_sb)
+ break;
+ }
+
+ return pos;
+}
+
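+/*
+ * Reset the zones covering [@physical, @physical + @length) and mark them
+ * empty in the device's zone bitmap. The number of reset bytes is returned
+ * in @bytes.
+ */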
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+ u64 length, u64 *bytes)
+{
+ int ret;
+
+ *bytes = 0;
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
+ GFP_NOFS);
+ if (ret)
+ return ret;
+
+ *bytes = length;
+ while (length) {
+ btrfs_dev_set_zone_empty(device, physical);
+ physical += device->zone_info->zone_size;
+ length -= device->zone_info->zone_size;
+ }
+
+ return 0;
+}
+
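+/*
+ * Make sure the zones covering [@start, @start + @size) are usable for a new
+ * allocation: conventional zones are left alone, while sequential zones that
+ * are unexpectedly non-empty are reset (with a warning).
+ */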
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ const u8 shift = zinfo->zone_size_shift;
+ unsigned long begin = start >> shift;
+ unsigned long end = (start + size) >> shift;
+ u64 pos;
+ int ret;
+
+ ASSERT(IS_ALIGNED(start, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+
+ if (end > zinfo->nr_zones)
+ return -ERANGE;
+
+ /* All the zones are conventional */
+ if (find_next_bit(zinfo->seq_zones, end, begin) == end)
+ return 0;
+
+ /* All the zones are sequential and empty */
+ if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
+ find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
+ return 0;
+
+ for (pos = start; pos < start + size; pos += zinfo->zone_size) {
+ u64 reset_bytes;
+
+ if (!btrfs_dev_is_sequential(device, pos) ||
+ btrfs_dev_is_empty_zone(device, pos))
+ continue;
+
+ /* Free regions should be empty */
+ btrfs_warn_in_rcu(
+ device->fs_info,
+ "zoned: resetting device %s (devid %llu) zone %llu for allocation",
+ rcu_str_deref(device->name), device->devid, pos >> shift);
+ WARN_ON_ONCE(1);
+
+ ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
+ &reset_bytes);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Calculate an allocation pointer from the extent allocation information
+ * for a block group consisting of conventional zones. It points to the
+ * end of the highest-addressed extent in the block group as the allocation
+ * offset.
+ */
+static int calculate_alloc_pointer(struct btrfs_block_group *cache,
+ u64 *offset_ret)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int ret;
+ u64 length;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = cache->start + cache->length;
+ key.type = 0;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ /* We should not find the exact match */
+ if (!ret)
+ ret = -EUCLEAN;
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_previous_extent_item(root, path, cache->start);
+ if (ret) {
+ if (ret == 1) {
+ ret = 0;
+ *offset_ret = 0;
+ }
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
+ length = found_key.offset;
+ else
+ length = fs_info->nodesize;
+
+ if (!(found_key.objectid >= cache->start &&
+ found_key.objectid + length <= cache->start + cache->length)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ *offset_ret = found_key.objectid + length - cache->start;
+ ret = 0;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
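+/*
+ * Load the zone write pointers of all stripes of a block group and derive
+ * the block group's allocation offset from them. For block groups on
+ * conventional zones the offset is calculated from the extent tree instead.
+ */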
+int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 logical = cache->start;
+ u64 length = cache->length;
+ u64 physical = 0;
+ int ret;
+ int i;
+ unsigned int nofs_flag;
+ u64 *alloc_offsets = NULL;
+ u64 last_alloc = 0;
+ u32 num_sequential = 0, num_conventional = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ /* Sanity check */
+ if (!IS_ALIGNED(length, fs_info->zone_size)) {
+ btrfs_err(fs_info,
+ "zoned: block group %llu len %llu unaligned to zone size %llu",
+ logical, length, fs_info->zone_size);
+ return -EIO;
+ }
+
+ /* Get the chunk mapping */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, length);
+ read_unlock(&em_tree->lock);
+
+ if (!em)
+ return -EINVAL;
+
+ map = em->map_lookup;
+
+ alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
+ if (!alloc_offsets) {
+ free_extent_map(em);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ bool is_sequential;
+ struct blk_zone zone;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int dev_replace_is_ongoing = 0;
+
+ device = map->stripes[i].dev;
+ physical = map->stripes[i].physical;
+
+ if (device->bdev == NULL) {
+ alloc_offsets[i] = WP_MISSING_DEV;
+ continue;
+ }
+
+ is_sequential = btrfs_dev_is_sequential(device, physical);
+ if (is_sequential)
+ num_sequential++;
+ else
+ num_conventional++;
+
+ if (!is_sequential) {
+ alloc_offsets[i] = WP_CONVENTIONAL;
+ continue;
+ }
+
+ /*
+ * This zone will be used for allocation, so mark this zone
+ * non-empty.
+ */
+ btrfs_dev_clear_zone_empty(device, physical);
+
+ down_read(&dev_replace->rwsem);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
+ btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
+ up_read(&dev_replace->rwsem);
+
+ /*
+ * The group is mapped to a sequential zone. Get the zone write
+ * pointer to determine the allocation offset within the zone.
+ */
+ WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_get_dev_zone(device, physical, &zone);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret == -EIO || ret == -EOPNOTSUPP) {
+ ret = 0;
+ alloc_offsets[i] = WP_MISSING_DEV;
+ continue;
+ } else if (ret) {
+ goto out;
+ }
+
+ switch (zone.cond) {
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ btrfs_err(fs_info,
+ "zoned: offline/readonly zone %llu on device %s (devid %llu)",
+ physical >> device->zone_info->zone_size_shift,
+ rcu_str_deref(device->name), device->devid);
+ alloc_offsets[i] = WP_MISSING_DEV;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ alloc_offsets[i] = 0;
+ break;
+ case BLK_ZONE_COND_FULL:
+ alloc_offsets[i] = fs_info->zone_size;
+ break;
+ default:
+ /* Partially used zone */
+ alloc_offsets[i] =
+ ((zone.wp - zone.start) << SECTOR_SHIFT);
+ break;
+ }
+ }
+
+ if (num_sequential > 0)
+ cache->seq_zone = true;
+
+ if (num_conventional > 0) {
+ /*
+ * Avoid calling calculate_alloc_pointer() for a new BG. It is of
+ * no use for a new BG, whose allocation offset must always be 0.
+ *
+ * Also, we have a lock chain of extent buffer lock ->
+ * chunk mutex. For a new BG, this function is called from
+ * btrfs_make_block_group() which is already taking the
+ * chunk mutex. Thus, we cannot call
+ * calculate_alloc_pointer() which takes extent buffer
+ * locks, to avoid a deadlock.
+ */
+ if (new) {
+ cache->alloc_offset = 0;
+ goto out;
+ }
+ ret = calculate_alloc_pointer(cache, &last_alloc);
+ if (ret || map->num_stripes == num_conventional) {
+ if (!ret)
+ cache->alloc_offset = last_alloc;
+ else
+ btrfs_err(fs_info,
+ "zoned: failed to determine allocation offset of bg %llu",
+ cache->start);
+ goto out;
+ }
+ }
+
+ switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case 0: /* single */
+ cache->alloc_offset = alloc_offsets[0];
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID0:
+ case BTRFS_BLOCK_GROUP_RAID10:
+ case BTRFS_BLOCK_GROUP_RAID5:
+ case BTRFS_BLOCK_GROUP_RAID6:
+ /* non-single profiles are not supported yet */
+ default:
+ btrfs_err(fs_info, "zoned: profile %s not yet supported",
+ btrfs_bg_type_to_raid_name(map->type));
+ ret = -EINVAL;
+ goto out;
+ }
+
+out:
+ /* An extent is allocated after the write pointer */
+ if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
+ btrfs_err(fs_info,
+ "zoned: got wrong write pointer in BG %llu: %llu > %llu",
+ logical, last_alloc, cache->alloc_offset);
+ ret = -EIO;
+ }
+
+ if (!ret)
+ cache->meta_write_pointer = cache->alloc_offset + cache->start;
+
+ kfree(alloc_offsets);
+ free_extent_map(em);
+
+ return ret;
+}
+
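+/*
+ * Account the space between the used bytes and the allocation offset as
+ * zone_unusable, and the space after the allocation offset as the free
+ * space of the block group.
+ */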
+void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
+{
+ u64 unusable, free;
+
+ if (!btrfs_is_zoned(cache->fs_info))
+ return;
+
+ WARN_ON(cache->bytes_super != 0);
+ unusable = cache->alloc_offset - cache->used;
+ free = cache->length - cache->alloc_offset;
+
+ /* We only need ->free_space in ALLOC_SEQ block groups */
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ cache->free_space_ctl->free_space = free;
+ cache->zone_unusable = unusable;
+
+ /* Should not have any excluded extents. Just in case, though */
+ btrfs_free_excluded_extents(cache);
+}
+
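+/*
+ * Re-dirty an extent buffer that was released before being written: the
+ * buffer is zeroed, flagged with EXTENT_BUFFER_NO_CHECK and tracked on the
+ * transaction's releasing_ebs list, so its range is still written out and
+ * the zone write pointer stays in sync.
+ */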
+void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+ struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ if (!btrfs_is_zoned(fs_info) ||
+ btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
+ !list_empty(&eb->release_list))
+ return;
+
+ set_extent_buffer_dirty(eb);
+ set_extent_bits_nowait(&trans->dirty_pages, eb->start,
+ eb->start + eb->len - 1, EXTENT_DIRTY);
+ memzero_extent_buffer(eb, 0, eb->len);
+ set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
+
+ spin_lock(&trans->releasing_ebs_lock);
+ list_add_tail(&eb->release_list, &trans->releasing_ebs);
+ spin_unlock(&trans->releasing_ebs_lock);
+ atomic_inc(&eb->refs);
+}
+
+void btrfs_free_redirty_list(struct btrfs_transaction *trans)
+{
+ spin_lock(&trans->releasing_ebs_lock);
+ while (!list_empty(&trans->releasing_ebs)) {
+ struct extent_buffer *eb;
+
+ eb = list_first_entry(&trans->releasing_ebs,
+ struct extent_buffer, release_list);
+ list_del_init(&eb->release_list);
+ free_extent_buffer(eb);
+ }
+ spin_unlock(&trans->releasing_ebs_lock);
+}
+
+bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_block_group *cache;
+ bool ret = false;
+
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ if (!fs_info->max_zone_append_size)
+ return false;
+
+ if (!is_data_inode(&inode->vfs_inode))
+ return false;
+
+ cache = btrfs_lookup_block_group(fs_info, em->block_start);
+ ASSERT(cache);
+ if (!cache)
+ return false;
+
+ ret = cache->seq_zone;
+ btrfs_put_block_group(cache);
+
+ return ret;
+}
+
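+/*
+ * Record the physical address at which a REQ_OP_ZONE_APPEND bio actually
+ * landed into the matching ordered extent, together with the disk and
+ * partition number.
+ */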
+void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
+ struct bio *bio)
+{
+ struct btrfs_ordered_extent *ordered;
+ const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+ if (bio_op(bio) != REQ_OP_ZONE_APPEND)
+ return;
+
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
+ if (WARN_ON(!ordered))
+ return;
+
+ ordered->physical = physical;
+ ordered->disk = bio->bi_bdev->bd_disk;
+ ordered->partno = bio->bi_bdev->bd_partno;
+
+ btrfs_put_ordered_extent(ordered);
+}
+
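+/*
+ * Rewrite the logical address of an ordered extent (and its extent map and
+ * checksum entries) to match the physical location reported by the zone
+ * append write.
+ */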
+void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_ordered_sum *sum;
+ struct block_device *bdev;
+ u64 orig_logical = ordered->disk_bytenr;
+ u64 *logical = NULL;
+ int nr, stripe_len;
+
+ /* Zoned devices should not have partitions, so we can assume partno is 0 */
+ ASSERT(ordered->partno == 0);
+ bdev = bdgrab(ordered->disk->part0);
+ if (WARN_ON(!bdev))
+ return;
+
+ if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev,
+ ordered->physical, &logical, &nr,
+ &stripe_len)))
+ goto out;
+
+ WARN_ON(nr != 1);
+
+ if (orig_logical == *logical)
+ goto out;
+
+ ordered->disk_bytenr = *logical;
+
+ em_tree = &inode->extent_tree;
+ write_lock(&em_tree->lock);
+ em = search_extent_mapping(em_tree, ordered->file_offset,
+ ordered->num_bytes);
+ em->block_start = *logical;
+ free_extent_map(em);
+ write_unlock(&em_tree->lock);
+
+ list_for_each_entry(sum, &ordered->list, list) {
+ if (*logical < orig_logical)
+ sum->bytenr -= orig_logical - *logical;
+ else
+ sum->bytenr += *logical - orig_logical;
+ }
+
+out:
+ kfree(logical);
+ bdput(bdev);
+}
+
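+/*
+ * Check that @eb starts exactly at the meta write pointer of its block
+ * group. On success the write pointer is advanced past the buffer and the
+ * block group is returned in @cache_ret; on mismatch false is returned.
+ */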
+bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_block_group **cache_ret)
+{
+ struct btrfs_block_group *cache;
+ bool ret = true;
+
+ if (!btrfs_is_zoned(fs_info))
+ return true;
+
+ cache = *cache_ret;
+
+ if (cache && (eb->start < cache->start ||
+ cache->start + cache->length <= eb->start)) {
+ btrfs_put_block_group(cache);
+ cache = NULL;
+ *cache_ret = NULL;
+ }
+
+ if (!cache)
+ cache = btrfs_lookup_block_group(fs_info, eb->start);
+
+ if (cache) {
+ if (cache->meta_write_pointer != eb->start) {
+ btrfs_put_block_group(cache);
+ cache = NULL;
+ ret = false;
+ } else {
+ cache->meta_write_pointer = eb->start + eb->len;
+ }
+
+ *cache_ret = cache;
+ }
+
+ return ret;
+}
+
+void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
+ struct extent_buffer *eb)
+{
+ if (!btrfs_is_zoned(eb->fs_info) || !cache)
+ return;
+
+ ASSERT(cache->meta_write_pointer == eb->start + eb->len);
+ cache->meta_write_pointer = eb->start;
+}
+
+int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
+{
+ if (!btrfs_dev_is_sequential(device, physical))
+ return -EOPNOTSUPP;
+
+ return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
+ length >> SECTOR_SHIFT, GFP_NOFS, 0);
+}
+
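+/*
+ * Read the zone information for the zone containing @logical, using the
+ * first present and working mirror of the chunk. RAID56 profiles are not
+ * supported.
+ */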
+static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
+ struct blk_zone *zone)
+{
+ struct btrfs_bio *bbio = NULL;
+ u64 mapped_length = PAGE_SIZE;
+ unsigned int nofs_flag;
+ int nmirrors;
+ int i, ret;
+
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
+ &mapped_length, &bbio);
+ if (ret || !bbio || mapped_length < PAGE_SIZE) {
+ btrfs_put_bbio(bbio);
+ return -EIO;
+ }
+
+ if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ btrfs_put_bbio(bbio);
+ return -EINVAL;
+ }
+
+ nofs_flag = memalloc_nofs_save();
+ nmirrors = (int)bbio->num_stripes;
+ for (i = 0; i < nmirrors; i++) {
+ u64 physical = bbio->stripes[i].physical;
+ struct btrfs_device *dev = bbio->stripes[i].dev;
+
+ /* Missing device */
+ if (!dev->bdev)
+ continue;
+
+ ret = btrfs_get_dev_zone(dev, physical, zone);
+ /* Failing device */
+ if (ret == -EIO || ret == -EOPNOTSUPP)
+ continue;
+ break;
+ }
+ memalloc_nofs_restore(nofs_flag);
+ btrfs_put_bbio(bbio);
+
+ return ret;
+}
+
+/*
+ * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
+ * filling zeros between @physical_pos to a write pointer of dev-replace
+ * source device.
+ */
+int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+ u64 physical_start, u64 physical_pos)
+{
+ struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
+ struct blk_zone zone;
+ u64 length;
+ u64 wp;
+ int ret;
+
+ if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
+ return 0;
+
+ ret = read_zone_info(fs_info, logical, &zone);
+ if (ret)
+ return ret;
+
+ wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
+
+ if (physical_pos == wp)
+ return 0;
+
+ if (physical_pos > wp)
+ return -EUCLEAN;
+
+ length = wp - physical_pos;
+ return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
new file mode 100644
index 000000000000..61e969652fe1
--- /dev/null
+++ b/fs/btrfs/zoned.h
@@ -0,0 +1,307 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ZONED_H
+#define BTRFS_ZONED_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include "volumes.h"
+#include "disk-io.h"
+#include "block-group.h"
+
+struct btrfs_zoned_device_info {
+ /*
+ * Number of zones, zone size and types of zones if bdev is a
+ * zoned block device.
+ */
+ u64 zone_size;
+ u8 zone_size_shift;
+ u64 max_zone_append_size;
+ u32 nr_zones;
+ unsigned long *seq_zones;
+ unsigned long *empty_zones;
+ struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
+};
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone);
+int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
+int btrfs_get_dev_zone_info(struct btrfs_device *device);
+void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
+int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+ u64 *bytenr_ret);
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+ u64 *bytenr_ret);
+void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+ u64 hole_end, u64 num_bytes);
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+ u64 length, u64 *bytes);
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
+int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
+void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
+void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+ struct extent_buffer *eb);
+void btrfs_free_redirty_list(struct btrfs_transaction *trans);
+bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em);
+void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
+ struct bio *bio);
+void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered);
+bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_block_group **cache_ret);
+void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
+ struct extent_buffer *eb);
+int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length);
+int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+ u64 physical_start, u64 physical_pos);
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+{
+ return 0;
+}
+
+static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
+{
+ return 0;
+}
+
+static inline int btrfs_get_dev_zone_info(struct btrfs_device *device)
+{
+ return 0;
+}
+
+static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { }
+
+static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ btrfs_err(fs_info, "zoned block devices support is not enabled");
+ return -EOPNOTSUPP;
+}
+
+static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+{
+ return 0;
+}
+
+static inline int btrfs_sb_log_location_bdev(struct block_device *bdev,
+ int mirror, int rw, u64 *bytenr_ret)
+{
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+}
+
+static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
+ int rw, u64 *bytenr_ret)
+{
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+}
+
+static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{ }
+
+static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
+{
+ return 0;
+}
+
+static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device,
+ u64 hole_start, u64 hole_end,
+ u64 num_bytes)
+{
+ return hole_start;
+}
+
+static inline int btrfs_reset_device_zone(struct btrfs_device *device,
+ u64 physical, u64 length, u64 *bytes)
+{
+ *bytes = 0;
+ return 0;
+}
+
+static inline int btrfs_ensure_empty_zones(struct btrfs_device *device,
+ u64 start, u64 size)
+{
+ return 0;
+}
+
+static inline int btrfs_load_block_group_zone_info(
+ struct btrfs_block_group *cache, bool new)
+{
+ return 0;
+}
+
+static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }
+
+static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+ struct extent_buffer *eb) { }
+static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { }
+
+static inline bool btrfs_use_zone_append(struct btrfs_inode *inode,
+ struct extent_map *em)
+{
+ return false;
+}
+
+static inline void btrfs_record_physical_zoned(struct inode *inode,
+ u64 file_offset, struct bio *bio)
+{
+}
+
+static inline void btrfs_rewrite_logical_zoned(
+ struct btrfs_ordered_extent *ordered) { }
+
+static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_block_group **cache_ret)
+{
+ return true;
+}
+
+static inline void btrfs_revert_meta_write_pointer(
+ struct btrfs_block_group *cache,
+ struct extent_buffer *eb)
+{
+}
+
+static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device,
+ u64 physical, u64 length)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev,
+ u64 logical, u64 physical_start,
+ u64 physical_pos)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif
+
+static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return false;
+
+ return test_bit(pos >> zone_info->zone_size_shift, zone_info->seq_zones);
+}
+
+static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return true;
+
+ return test_bit(pos >> zone_info->zone_size_shift, zone_info->empty_zones);
+}
+
+static inline void btrfs_dev_set_empty_zone_bit(struct btrfs_device *device,
+ u64 pos, bool set)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno;
+
+ if (!zone_info)
+ return;
+
+ zno = pos >> zone_info->zone_size_shift;
+ if (set)
+ set_bit(zno, zone_info->empty_zones);
+ else
+ clear_bit(zno, zone_info->empty_zones);
+}
+
+static inline void btrfs_dev_set_zone_empty(struct btrfs_device *device, u64 pos)
+{
+ btrfs_dev_set_empty_zone_bit(device, pos, true);
+}
+
+static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 pos)
+{
+ btrfs_dev_set_empty_zone_bit(device, pos, false);
+}
+
+static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info,
+ struct block_device *bdev)
+{
+ if (btrfs_is_zoned(fs_info)) {
+ /*
+ * We can allow a regular device on a zoned filesystem, because
+ * we will emulate the zoned capabilities.
+ */
+ if (!bdev_is_zoned(bdev))
+ return true;
+
+ return fs_info->zone_size ==
+ (bdev_zone_sectors(bdev) << SECTOR_SHIFT);
+ }
+
+ /* Do not allow Host Managed zoned devices */
+ return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+}
+
+static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
+{
+ /*
+ * On a non-zoned device, any address is OK. On a zoned device,
+ * only non-SEQUENTIAL WRITE REQUIRED zones can hold a superblock.
+ */
+ return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos);
+}
+
+static inline bool btrfs_can_zone_reset(struct btrfs_device *device,
+ u64 physical, u64 length)
+{
+ u64 zone_size;
+
+ if (!btrfs_dev_is_sequential(device, physical))
+ return false;
+
+ zone_size = device->zone_info->zone_size;
+ if (!IS_ALIGNED(physical, zone_size) || !IS_ALIGNED(length, zone_size))
+ return false;
+
+ return true;
+}
+
+static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return;
+ mutex_lock(&fs_info->zoned_meta_io_lock);
+}
+
+static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return;
+ mutex_unlock(&fs_info->zoned_meta_io_lock);
+}
+
+static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg == bg->start)
+ fs_info->treelog_bg = 0;
+ spin_unlock(&fs_info->treelog_bg_lock);
+}
+
+#endif