about | summary | refs | log | tree | commit | diff
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- fs/btrfs/extent-tree.c 537
1 files changed, 299 insertions, 238 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3b21fee13e77..78ad31a59e59 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,6 +34,8 @@
#include "block-group.h"
#include "discard.h"
#include "rcu-string.h"
+#include "zoned.h"
+#include "dev-replace.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -82,41 +84,6 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
EXTENT_UPTODATE);
}
-static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
-{
- if (ref->type == BTRFS_REF_METADATA) {
- if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
- return BTRFS_BLOCK_GROUP_SYSTEM;
- else
- return BTRFS_BLOCK_GROUP_METADATA;
- }
- return BTRFS_BLOCK_GROUP_DATA;
-}
-
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *ref)
-{
- struct btrfs_space_info *space_info;
- u64 flags = generic_ref_to_space_flags(ref);
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
-static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *ref)
-{
- struct btrfs_space_info *space_info;
- u64 flags = generic_ref_to_space_flags(ref);
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
@@ -844,6 +811,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
want = extent_ref_type(parent, owner);
if (insert) {
extra_size = btrfs_extent_inline_ref_size(want);
+ path->search_for_extension = 1;
path->keep_locks = 1;
} else
extra_size = -1;
@@ -996,6 +964,7 @@ again:
out:
if (insert) {
path->keep_locks = 0;
+ path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
return err;
@@ -1297,6 +1266,46 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
+/*
+ * Discard, or zone-reset, the device range described by a single stripe.
+ *
+ * @stripe: stripe providing the device, the physical offset and the length
+ * @bytes:  output; always written before returning with the total that the
+ *          called helpers reported through their out parameter (0 when no
+ *          helper was called)
+ *
+ * Returns 0 when the range was handled or when the device supports neither
+ * zone reset nor discard, otherwise the error returned by
+ * btrfs_reset_device_zone() or btrfs_issue_discard().
+ */
+static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
+{
+ struct btrfs_device *dev = stripe->dev;
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ u64 phys = stripe->physical;
+ u64 len = stripe->length;
+ u64 discarded = 0;
+ int ret = 0;
+
+ /* Zone reset on a zoned filesystem */
+ if (btrfs_can_zone_reset(dev, phys, len)) {
+ u64 src_disc;
+
+ ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
+ if (ret)
+ goto out;
+
+ /*
+ * Only a device that is the source of a running device replace
+ * needs the same range reset on the replace target.
+ */
+ if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
+ dev != dev_replace->srcdev)
+ goto out;
+
+ /*
+ * 'discarded' is reused as the out parameter of the second call,
+ * so save the source's count and add it back afterwards.
+ */
+ src_disc = discarded;
+
+ /* Send to replace target as well */
+ ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
+ &discarded);
+ discarded += src_disc;
+ } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
+ ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
+ } else {
+ /*
+ * Neither zone reset nor discard is available: report success
+ * with nothing discarded. Both stores are repeated below with
+ * the same values ('ret' and 'discarded' are still 0 here).
+ */
+ ret = 0;
+ *bytes = 0;
+ }
+
+out:
+ /* Single exit point: every path publishes the accumulated count. */
+ *bytes = discarded;
+ return ret;
+}
+
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes)
{
@@ -1331,20 +1340,13 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
stripe = bbio->stripes;
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
- struct request_queue *req_q;
if (!stripe->dev->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
- req_q = bdev_get_queue(stripe->dev->bdev);
- if (!blk_queue_discard(req_q))
- continue;
- ret = btrfs_issue_discard(stripe->dev->bdev,
- stripe->physical,
- stripe->length,
- &bytes);
+ ret = do_discard_extent(stripe, &bytes);
if (!ret) {
discarded_bytes += bytes;
} else if (ret != -EOPNOTSUPP) {
@@ -1386,7 +1388,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int old_ref_mod, new_ref_mod;
int ret;
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
@@ -1395,17 +1396,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
- ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
- NULL, &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
else
- ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
btrfs_ref_tree_mod(fs_info, generic_ref);
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
- sub_pinned_bytes(fs_info, generic_ref);
-
return ret;
}
@@ -1465,7 +1461,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
parent, root_objectid, owner,
@@ -1489,7 +1484,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->leave_spinning = 1;
/* now insert the actual backref */
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
BUG_ON(refs_to_add != 1);
@@ -1605,7 +1599,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
- path->leave_spinning = 1;
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
if (ret < 0) {
err = ret;
@@ -1796,34 +1789,28 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
{
int nr_items = 1; /* Dropping this ref head update. */
- if (head->total_ref_mod < 0) {
- struct btrfs_space_info *space_info;
- u64 flags;
+ /*
+ * We had csum deletions accounted for in our delayed refs rsv, we need
+ * to drop the csum leaves for this update from our delayed_refs_rsv.
+ */
+ if (head->total_ref_mod < 0 && head->is_data) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= head->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+ }
- if (head->is_data)
- flags = BTRFS_BLOCK_GROUP_DATA;
- else if (head->is_system)
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- else
- flags = BTRFS_BLOCK_GROUP_METADATA;
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -head->num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ /*
+ * We were dropping refs, or had a new ref and dropped it, and thus must
+ * adjust down our total_bytes_pinned, the space may or may not have
+ * been pinned and so is accounted for properly in the pinned space by
+ * now.
+ */
+ if (head->total_ref_mod < 0 ||
+ (head->total_ref_mod == 0 && head->must_insert_reserved)) {
+ u64 flags = btrfs_ref_head_to_space_flags(head);
- /*
- * We had csum deletions accounted for in our delayed refs rsv,
- * we need to drop the csum leaves for this update from our
- * delayed_refs_rsv.
- */
- if (head->is_data) {
- spin_lock(&delayed_refs->lock);
- delayed_refs->pending_csums -= head->num_bytes;
- spin_unlock(&delayed_refs->lock);
- nr_items += btrfs_csum_bytes_to_leaves(fs_info,
- head->num_bytes);
- }
+ btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
}
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
@@ -2133,25 +2120,6 @@ static u64 find_middle(struct rb_root *root)
#endif
/*
- * Takes the number of bytes to be csumm'ed and figures out how many leaves it
- * would require to store the csums for that many bytes.
- */
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
- u64 csum_size;
- u64 num_csums_per_leaf;
- u64 num_csums;
-
- csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
- num_csums_per_leaf = div64_u64(csum_size,
- (u64)btrfs_super_csum_size(fs_info->super_copy));
- num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
- num_csums += num_csums_per_leaf - 1;
- num_csums = div64_u64(num_csums, num_csums_per_leaf);
- return num_csums;
-}
-
-/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
* 0, which means to process everything in the tree at the start
@@ -2180,7 +2148,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
if (count == 0)
- count = atomic_read(&delayed_refs->num_entries) * 2;
+ count = delayed_refs->num_heads_ready;
again:
#ifdef SCRAMBLE_DELAYED_REFS
@@ -2592,8 +2560,7 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
- num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
@@ -2622,8 +2589,6 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
struct btrfs_block_group *cache;
int ret;
- btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
-
cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
if (!cache)
return -EINVAL;
@@ -2635,11 +2600,19 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
* the pinned extents.
*/
btrfs_cache_block_group(cache, 1);
+ /*
+ * Make sure we wait until the cache is completely built in case it is
+ * missing or is invalid and therefore needs to be rebuilt.
+ */
+ ret = btrfs_wait_block_group_cache_done(cache);
+ if (ret)
+ goto out;
pin_down_extent(trans, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+out:
btrfs_put_block_group(cache);
return ret;
}
@@ -2649,45 +2622,22 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
{
int ret;
struct btrfs_block_group *block_group;
- struct btrfs_caching_control *caching_ctl;
block_group = btrfs_lookup_block_group(fs_info, start);
if (!block_group)
return -EINVAL;
- btrfs_cache_block_group(block_group, 0);
- caching_ctl = btrfs_get_caching_control(block_group);
-
- if (!caching_ctl) {
- /* Logic error */
- BUG_ON(!btrfs_block_group_done(block_group));
- ret = btrfs_remove_free_space(block_group, start, num_bytes);
- } else {
- mutex_lock(&caching_ctl->mutex);
-
- if (start >= caching_ctl->progress) {
- ret = btrfs_add_excluded_extent(fs_info, start,
- num_bytes);
- } else if (start + num_bytes <= caching_ctl->progress) {
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- } else {
- num_bytes = caching_ctl->progress - start;
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- if (ret)
- goto out_lock;
+ btrfs_cache_block_group(block_group, 1);
+ /*
+ * Make sure we wait until the cache is completely built in case it is
+ * missing or is invalid and therefore needs to be rebuilt.
+ */
+ ret = btrfs_wait_block_group_cache_done(block_group);
+ if (ret)
+ goto out;
- num_bytes = (start + num_bytes) -
- caching_ctl->progress;
- start = caching_ctl->progress;
- ret = btrfs_add_excluded_extent(fs_info, start,
- num_bytes);
- }
-out_lock:
- mutex_unlock(&caching_ctl->mutex);
- btrfs_put_caching_control(caching_ctl);
- }
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+out:
btrfs_put_block_group(block_group);
return ret;
}
@@ -2730,31 +2680,6 @@ btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
atomic_inc(&bg->reservations);
}
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_caching_control *next;
- struct btrfs_caching_control *caching_ctl;
- struct btrfs_block_group *cache;
-
- down_write(&fs_info->commit_root_sem);
-
- list_for_each_entry_safe(caching_ctl, next,
- &fs_info->caching_block_groups, list) {
- cache = caching_ctl->block_group;
- if (btrfs_block_group_done(cache)) {
- cache->last_byte_to_unpin = (u64)-1;
- list_del_init(&caching_ctl->list);
- btrfs_put_caching_control(caching_ctl);
- } else {
- cache->last_byte_to_unpin = caching_ctl->progress;
- }
- }
-
- up_write(&fs_info->commit_root_sem);
-
- btrfs_update_global_block_rsv(fs_info);
-}
-
/*
* Returns the free cluster for the given space info and sets empty_cluster to
* what it should be based on the mount options.
@@ -2816,11 +2741,13 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
- if (start < cache->last_byte_to_unpin) {
- len = min(len, cache->last_byte_to_unpin - start);
- if (return_free_space)
- btrfs_add_free_space(cache, start, len);
+ down_read(&fs_info->commit_root_sem);
+ if (start < cache->last_byte_to_unpin && return_free_space) {
+ u64 add_len = min(len, cache->last_byte_to_unpin - start);
+
+ btrfs_add_free_space(cache, start, add_len);
}
+ up_read(&fs_info->commit_root_sem);
start += len;
total_unpinned += len;
@@ -2844,11 +2771,14 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
cache->pinned -= len;
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
space_info->max_extent_size = 0;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(space_info, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
+ } else if (btrfs_is_zoned(fs_info)) {
+ /* Need reset before reusing in a zoned block group */
+ space_info->bytes_zone_unusable += len;
+ readonly = true;
}
spin_unlock(&cache->lock);
if (!readonly && return_free_space &&
@@ -2901,9 +2831,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
- if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
- clear_extent_bits(&fs_info->excluded_extents, start,
- end, EXTENT_UPTODATE);
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
@@ -3040,8 +2967,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
if (!is_data && refs_to_drop != 1) {
@@ -3106,7 +3031,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_release_path(path);
- path->leave_spinning = 1;
/* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
@@ -3185,7 +3109,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_tree_block_info *bi;
if (item_size < sizeof(*ei) + sizeof(*bi)) {
btrfs_crit(info,
-"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu",
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
key.objectid, key.type, key.offset,
owner_objectid, item_size,
sizeof(*ei) + sizeof(*bi));
@@ -3384,7 +3308,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ref generic_ref = { 0 };
- int pin = 1;
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
@@ -3393,13 +3316,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
root->root_key.objectid);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- int old_ref_mod, new_ref_mod;
-
btrfs_ref_tree_mod(fs_info, &generic_ref);
- ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
BUG_ON(ret); /* -ENOMEM */
- pin = old_ref_mod >= 0 && new_ref_mod < 0;
}
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
@@ -3407,11 +3326,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
- if (!ret)
+ if (!ret) {
+ btrfs_redirty_list_add(trans->transaction, buf);
goto out;
+ }
}
- pin = 0;
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -3420,6 +3340,13 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
+ if (btrfs_is_zoned(fs_info)) {
+ btrfs_redirty_list_add(trans->transaction, buf);
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
+ btrfs_put_block_group(cache);
+ goto out;
+ }
+
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
@@ -3428,9 +3355,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
}
out:
- if (pin)
- add_pinned_bytes(fs_info, &generic_ref);
-
if (last_ref) {
/*
* Deleting the buffer, clear the corrupt flag since it doesn't
@@ -3444,7 +3368,6 @@ out:
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int old_ref_mod, new_ref_mod;
int ret;
if (btrfs_is_testing(fs_info))
@@ -3460,14 +3383,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
- old_ref_mod = new_ref_mod = 0;
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
- ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
} else {
- ret = btrfs_add_delayed_data_ref(trans, ref, 0,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_data_ref(trans, ref, 0);
}
if (!((ref->type == BTRFS_REF_METADATA &&
@@ -3476,9 +3396,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
- add_pinned_bytes(fs_info, ref);
-
return ret;
}
@@ -3555,6 +3472,7 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
+ BTRFS_EXTENT_ALLOC_ZONED,
};
/*
@@ -3579,6 +3497,9 @@ struct find_free_extent_ctl {
bool have_caching_bg;
bool orig_have_caching_bg;
+ /* Allocation is called for tree-log */
+ bool for_treelog;
+
/* RAID index, converted from flags */
int index;
@@ -3807,6 +3728,118 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
return find_free_extent_unclustered(block_group, ffe_ctl);
}
+/*
+ * Tree-log block group locking
+ * ============================
+ *
+ * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
+ * indicates the starting address of a block group, which is reserved only
+ * for tree-log metadata.
+ *
+ * Lock nesting
+ * ============
+ *
+ * space_info::lock
+ * block_group::lock
+ * fs_info::treelog_bg_lock
+ */
+
+/*
+ * Simple allocator for a sequential-only block group. It only allows
+ * sequential allocation, so there is no need to play with trees. This function
+ * also reserves the bytes as in btrfs_add_reserved_bytes.
+ *
+ * NOTE(review): the body visible here only advances block_group->alloc_offset
+ * and decrements ctl->free_space; confirm how that relates to the accounting
+ * done by btrfs_add_reserved_bytes.
+ *
+ * @block_group: candidate block group to allocate from
+ * @ffe_ctl:     allocation state; num_bytes and for_treelog are read. On
+ *               success found_offset and search_start are set. When the
+ *               remaining space is too small, max_extent_size and
+ *               total_free_space are set to it if it exceeds the current
+ *               max_extent_size.
+ * @bg_ret:      not used by this function
+ *
+ * Returns 0 when the bytes were allocated, 1 when this block group cannot
+ * satisfy the request.
+ */
+static int do_allocation_zoned(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *space_info = block_group->space_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ u64 start = block_group->start;
+ u64 num_bytes = ffe_ctl->num_bytes;
+ u64 avail;
+ u64 bytenr = block_group->start;
+ u64 log_bytenr;
+ int ret = 0;
+ bool skip;
+
+ ASSERT(btrfs_is_zoned(block_group->fs_info));
+
+ /*
+ * Do not allow non-tree-log blocks in the dedicated tree-log block
+ * group, and vice versa.
+ *
+ * This first check takes only treelog_bg_lock; a mismatch returns
+ * early without touching fs_info->treelog_bg.
+ */
+ spin_lock(&fs_info->treelog_bg_lock);
+ log_bytenr = fs_info->treelog_bg;
+ skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+ (!ffe_ctl->for_treelog && bytenr == log_bytenr));
+ spin_unlock(&fs_info->treelog_bg_lock);
+ if (skip)
+ return 1;
+
+ /* Lock order: space_info, then block group, then treelog_bg_lock. */
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ spin_lock(&fs_info->treelog_bg_lock);
+
+ /*
+ * A tree-log allocation must be looking at the dedicated block group,
+ * unless no dedicated block group has been chosen yet.
+ */
+ ASSERT(!ffe_ctl->for_treelog ||
+ block_group->start == fs_info->treelog_bg ||
+ fs_info->treelog_bg == 0);
+
+ if (block_group->ro) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Do not turn a block group that is already in use (it has used or
+ * reserved bytes) into the dedicated tree-log block group.
+ */
+ if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ /* Space left between the allocation offset and the end of the group. */
+ avail = block_group->length - block_group->alloc_offset;
+ if (avail < num_bytes) {
+ if (ffe_ctl->max_extent_size < avail) {
+ /*
+ * With sequential allocator, free space is always
+ * contiguous
+ */
+ ffe_ctl->max_extent_size = avail;
+ ffe_ctl->total_free_space = avail;
+ }
+ ret = 1;
+ goto out;
+ }
+
+ /* No dedicated tree-log block group yet: this one becomes it. */
+ if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
+ fs_info->treelog_bg = block_group->start;
+
+ /* Hand out the bytes at the allocation offset and advance it. */
+ ffe_ctl->found_offset = start + block_group->alloc_offset;
+ block_group->alloc_offset += num_bytes;
+ spin_lock(&ctl->tree_lock);
+ ctl->free_space -= num_bytes;
+ spin_unlock(&ctl->tree_lock);
+
+ /*
+ * We do not check if found_offset is aligned to stripesize. The
+ * address is anyway rewritten when using zone append writing.
+ */
+
+ ffe_ctl->search_start = ffe_ctl->found_offset;
+
+out:
+ /*
+ * A tree-log allocation that fails after the locks were taken clears
+ * the dedicated tree-log block group.
+ */
+ if (ret && ffe_ctl->for_treelog)
+ fs_info->treelog_bg = 0;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
+
static int do_allocation(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **bg_ret)
@@ -3814,6 +3847,8 @@ static int do_allocation(struct btrfs_block_group *block_group,
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
default:
BUG();
}
@@ -3828,6 +3863,9 @@ static void release_block_group(struct btrfs_block_group *block_group,
ffe_ctl->retry_clustered = false;
ffe_ctl->retry_unclustered = false;
break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+ break;
default:
BUG();
}
@@ -3856,6 +3894,9 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
case BTRFS_EXTENT_ALLOC_CLUSTERED:
found_extent_clustered(ffe_ctl, ins);
break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+ break;
default:
BUG();
}
@@ -3871,6 +3912,9 @@ static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
*/
ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
return 0;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Give up here */
+ return -ENOSPC;
default:
BUG();
}
@@ -4039,6 +4083,14 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
case BTRFS_EXTENT_ALLOC_CLUSTERED:
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ }
+ return 0;
default:
BUG();
}
@@ -4081,6 +4133,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool full_search = false;
+ bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
WARN_ON(num_bytes < fs_info->sectorsize);
@@ -4094,6 +4147,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
ffe_ctl.orig_have_caching_bg = false;
ffe_ctl.found_offset = 0;
ffe_ctl.hint_byte = hint_byte_orig;
+ ffe_ctl.for_treelog = for_treelog;
ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
/* For clustered allocation */
@@ -4102,6 +4156,9 @@ static noinline int find_free_extent(struct btrfs_root *root,
ffe_ctl.last_ptr = NULL;
ffe_ctl.use_cluster = true;
+ if (btrfs_is_zoned(fs_info))
+ ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;
+
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -4165,8 +4222,11 @@ search:
struct btrfs_block_group *bg_ret;
/* If the block group is read-only, we can skip it entirely. */
- if (unlikely(block_group->ro))
+ if (unlikely(block_group->ro)) {
+ if (for_treelog)
+ btrfs_clear_treelog_bg(block_group);
continue;
+ }
btrfs_grab_block_group(block_group, delalloc);
ffe_ctl.search_start = block_group->start;
@@ -4244,20 +4304,21 @@ have_block_group:
/* move on to the next group */
if (ffe_ctl.search_start + num_bytes >
block_group->start + block_group->length) {
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- num_bytes);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl.found_offset, num_bytes);
goto loop;
}
if (ffe_ctl.found_offset < ffe_ctl.search_start)
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- ffe_ctl.search_start - ffe_ctl.found_offset);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl.found_offset,
+ ffe_ctl.search_start - ffe_ctl.found_offset);
ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
num_bytes, delalloc);
if (ret == -EAGAIN) {
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- num_bytes);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl.found_offset, num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
@@ -4351,6 +4412,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
+ bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
flags = get_alloc_profile_by_root(root, is_data);
again:
@@ -4374,8 +4436,8 @@ again:
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
- "allocation failed flags %llu, wanted %llu",
- flags, num_bytes);
+ "allocation failed flags %llu, wanted %llu tree-log %d",
+ flags, num_bytes, for_treelog);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
@@ -4448,7 +4510,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
if (ret) {
@@ -4533,7 +4594,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
&extent_key, size);
if (ret) {
@@ -4559,7 +4619,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_SHARED_BLOCK_REF_KEY);
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
@@ -4596,7 +4655,6 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins)
{
struct btrfs_ref generic_ref = { 0 };
- int ret;
BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
@@ -4604,9 +4662,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ins->objectid, ins->offset, 0);
btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
- ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
- ram_bytes, NULL, NULL);
- return ret;
+
+ return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}
/*
@@ -4662,7 +4719,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
if (IS_ERR(buf))
return buf;
@@ -4679,12 +4736,17 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ERR_PTR(-EUCLEAN);
}
+ /*
+ * This needs to stay, because we could allocate a freed block from an
+ * old tree into a new tree, so we need to make sure this new block is
+ * set to the appropriate level and owner.
+ */
btrfs_set_buffer_lockdep_class(owner, buf, level);
__btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
+ clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
- btrfs_set_lock_blocking_write(buf);
set_extent_buffer_uptodate(buf);
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -4794,8 +4856,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
generic_ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&generic_ref, level, root_objectid);
btrfs_ref_tree_mod(fs_info, &generic_ref);
- ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
- extent_op, NULL, NULL);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
goto out_free_delayed;
}
@@ -4905,7 +4966,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- readahead_tree_block(fs_info, bytenr);
+ btrfs_readahead_node_child(eb, slot);
nread++;
}
wc->reada_slot = slot;
@@ -5064,16 +5125,13 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = find_extent_buffer(fs_info, bytenr);
if (!next) {
- next = btrfs_find_create_tree_block(fs_info, bytenr);
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ root->root_key.objectid, level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
-
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
- level - 1);
reada = 1;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
@@ -5124,8 +5182,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, generation, level - 1,
- &first_key);
+ next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
+ generation, level - 1, &first_key);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -5133,7 +5191,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
return -EIO;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
}
level--;
@@ -5145,7 +5202,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
path->nodes[level] = next;
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
wc->level = level;
if (wc->level == 1)
wc->reada_slot = 0;
@@ -5273,8 +5330,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level]) {
BUG_ON(level == 0);
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
@@ -5317,8 +5373,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
}
btrfs_clean_tree_block(eb);
}
@@ -5486,9 +5541,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking_write(path->nodes[level]);
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
memset(&wc->update_progress, 0,
sizeof(wc->update_progress));
} else {
@@ -5496,7 +5550,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
memcpy(&wc->update_progress, &key,
sizeof(wc->update_progress));
- level = root_item->drop_level;
+ level = btrfs_root_drop_level(root_item);
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5516,8 +5570,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
level = btrfs_header_level(root->node);
while (1) {
btrfs_tree_lock(path->nodes[level]);
- btrfs_set_lock_blocking_write(path->nodes[level]);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
@@ -5529,7 +5582,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
}
BUG_ON(wc->refs[level] == 0);
- if (level == root_item->drop_level)
+ if (level == btrfs_root_drop_level(root_item))
break;
btrfs_tree_unlock(path->nodes[level]);
@@ -5574,7 +5627,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
}
btrfs_cpu_key_to_disk(&root_item->drop_progress,
&wc->drop_progress);
- root_item->drop_level = wc->drop_level;
+ btrfs_set_root_drop_level(root_item, wc->drop_level);
BUG_ON(wc->level == 0);
if (btrfs_should_end_transaction(trans) ||
@@ -5596,7 +5649,15 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_free;
}
- trans = btrfs_start_transaction(tree_root, 0);
+ /*
+ * Use join to avoid potential EINTR from transaction
+ * start. See wait_reserve_ticket and the whole
+ * reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
@@ -5704,7 +5765,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
wc->refs[parent_level] = 1;
wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;