aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/transaction.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-10-30 10:42:06 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-10-30 10:42:06 -1000
commitd5acbc60fafbe0fc94c552ce916dd592cd4c6371 (patch)
treec2d70058845399ebcf894e551e6b0c053dd3e836 /fs/btrfs/transaction.c
parent8829687a4ac1d484639425a691da46f6e361aec1 (diff)
parentc6e8f898f56fae2cb5bc4396bec480f23cd8b066 (diff)
Merge tag 'for-6.7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "New features: - raid-stripe-tree New tree for logical file extent mapping where the physical mapping may not match on multiple devices. This is now used in zoned mode to implement RAID0/RAID1* profiles, but can be used in non-zoned mode as well. The support for RAID56 is in development and will eventually fix the problems with the current implementation. This is a backward incompatible feature and has to be enabled at mkfs time. - simple quota accounting (squota) A simplified mode of qgroup that accounts all space on the initial extent owners (a subvolume), the snapshots are then cheap to create and delete. The deletion of snapshots in fully accounting qgroups is a known CPU/IO performance bottleneck. The squota is not suitable for the general use case but works well for containers where the original subvolume exists for the whole time. This is a backward incompatible feature as it needs extending some structures, but can be enabled on an existing filesystem. - temporary filesystem fsid (temp_fsid) The fsid identifies a filesystem and is hard coded in the structures, which disallows mounting the same fsid found on different devices. For a single device filesystem this is not strictly necessary, a new temporary fsid can be generated on mount e.g. after a device is cloned. This will be used by Steam Deck for root partition A/B testing, or can be used for VM root images. Other user visible changes: - filesystems with partially finished metadata_uuid conversion cannot be mounted anymore and the uuid fixup has to be done by btrfs-progs (btrfstune). Performance improvements: - reduce reservations for checksum deletions (with enabled free space tree by factor of 4), on a sample workload on file with many extents the deletion time decreased by 12% - make extent state merges more efficient during insertions, reduce rb-tree iterations (run time of critical functions reduced by 5%) Core changes: - the integrity check functionality has been removed, this was a debugging feature and removal does not affect other integrity checks like checksums or tree-checker - space reservation changes: - more efficient delayed ref reservations, this avoids building up too much work or overusing or exhausting the global block reserve in some situations - move delayed refs reservation to the transaction start time, this prevents some ENOSPC corner cases related to exhaustion of global reserve - improvements in reducing excessive reservations for block group items - adjust overcommit logic in near full situations, account for one more chunk to eventually allocate metadata chunk, this is mostly relevant for small filesystems (<10GiB) - single device filesystems are scanned but not registered (except seed devices), this allows temp_fsid to work - qgroup iterations do not need GFP_ATOMIC allocations anymore - cleanups, refactoring, reduced data structure size, function parameter simplifications, error handling fixes" * tag 'for-6.7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (156 commits) btrfs: open code timespec64 in struct btrfs_inode btrfs: remove redundant log root tree index assignment during log sync btrfs: remove redundant initialization of variable dirty in btrfs_update_time() btrfs: sysfs: show temp_fsid feature btrfs: disable the device add feature for temp-fsid btrfs: disable the seed feature for temp-fsid btrfs: update comment for temp-fsid, fsid, and metadata_uuid btrfs: remove pointless empty log context list check when syncing log btrfs: update comment for struct btrfs_inode::lock btrfs: remove pointless barrier from btrfs_sync_file() btrfs: add and use helpers for reading and writing last_trans_committed btrfs: add and use helpers for reading and writing fs_info->generation btrfs: add and use helpers for reading and writing log_transid btrfs: add and use helpers for reading and writing last_log_commit btrfs: support cloned-device mount capability btrfs: add helper function find_fsid_by_disk btrfs: stop reserving excessive space for block group item insertions btrfs: stop reserving excessive space for block group item updates btrfs: reorder btrfs_inode to fill gaps btrfs: open code btrfs_ordered_inode_tree in btrfs_inode ...
Diffstat (limited to 'fs/btrfs/transaction.c')
-rw-r--r--fs/btrfs/transaction.c229
1 files changed, 140 insertions, 89 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 38a2775c5c7b..6e63816dddcb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -386,7 +386,7 @@ loop:
IO_TREE_TRANS_DIRTY_PAGES);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
IO_TREE_FS_PINNED_EXTENTS);
- fs_info->generation++;
+ btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
cur_trans->aborted = 0;
@@ -561,6 +561,69 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
return true;
}
+static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush,
+ u64 num_bytes,
+ u64 *delayed_refs_bytes)
+{
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
+ u64 extra_delayed_refs_bytes = 0;
+ u64 bytes;
+ int ret;
+
+ /*
+ * If there's a gap between the size of the delayed refs reserve and
+ * its reserved space, than some tasks have added delayed refs or bumped
+ * its size otherwise (due to block group creation or removal, or block
+ * group item update). Also try to allocate that gap in order to prevent
+ * using (and possibly abusing) the global reserve when committing the
+ * transaction.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ !btrfs_block_rsv_full(delayed_refs_rsv)) {
+ spin_lock(&delayed_refs_rsv->lock);
+ if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
+ extra_delayed_refs_bytes = delayed_refs_rsv->size -
+ delayed_refs_rsv->reserved;
+ spin_unlock(&delayed_refs_rsv->lock);
+ }
+
+ bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
+
+ /*
+ * We want to reserve all the bytes we may need all at once, so we only
+ * do 1 enospc flushing cycle per transaction start.
+ */
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ if (ret == 0) {
+ if (extra_delayed_refs_bytes > 0)
+ btrfs_migrate_to_delayed_refs_rsv(fs_info,
+ extra_delayed_refs_bytes);
+ return 0;
+ }
+
+ if (extra_delayed_refs_bytes > 0) {
+ bytes -= extra_delayed_refs_bytes;
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ if (ret == 0)
+ return 0;
+ }
+
+ /*
+ * If we are an emergency flush, which can steal from the global block
+ * reserve, then attempt to not reserve space for the delayed refs, as
+ * we will consume space for them from the global block reserve.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ bytes -= *delayed_refs_bytes;
+ *delayed_refs_bytes = 0;
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ }
+
+ return ret;
+}
+
static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, unsigned int num_items,
unsigned int type, enum btrfs_reserve_flush_enum flush,
@@ -568,10 +631,12 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
u64 num_bytes = 0;
u64 qgroup_reserved = 0;
+ u64 delayed_refs_bytes = 0;
bool reloc_reserved = false;
bool do_chunk_alloc = false;
int ret;
@@ -594,9 +659,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* the appropriate flushing if need be.
*/
if (num_items && root != fs_info->chunk_root) {
- struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
- u64 delayed_refs_bytes = 0;
-
qgroup_reserved = num_items * fs_info->nodesize;
/*
* Use prealloc for now, as there might be a currently running
@@ -608,20 +670,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (ret)
return ERR_PTR(ret);
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
/*
- * We want to reserve all the bytes we may need all at once, so
- * we only do 1 enospc flushing cycle per transaction start. We
- * accomplish this by simply assuming we'll do num_items worth
- * of delayed refs updates in this trans handle, and refill that
- * amount for whatever is missing in the reserve.
+ * If we plan to insert/update/delete "num_items" from a btree,
+ * we will also generate delayed refs for extent buffers in the
+ * respective btree paths, so reserve space for the delayed refs
+ * that will be generated by the caller as it modifies btrees.
+ * Try to reserve them to avoid excessive use of the global
+ * block reserve.
*/
- num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- !btrfs_block_rsv_full(delayed_refs_rsv)) {
- delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
- num_items);
- num_bytes += delayed_refs_bytes;
- }
+ delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items);
/*
* Do the reservation for the relocation root creation
@@ -631,16 +689,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush);
+ ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes,
+ &delayed_refs_bytes);
if (ret)
goto reserve_fail;
- if (delayed_refs_bytes) {
- btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes);
- num_bytes -= delayed_refs_bytes;
- }
- btrfs_block_rsv_add_bytes(rsv, num_bytes, true);
- if (rsv->space_info->force_alloc)
+ btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true);
+
+ if (trans_rsv->space_info->force_alloc)
do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
!btrfs_block_rsv_full(delayed_refs_rsv)) {
@@ -700,6 +756,7 @@ again:
h->type = type;
INIT_LIST_HEAD(&h->new_bgs);
+ btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS);
smp_mb();
if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
@@ -712,8 +769,17 @@ again:
if (num_bytes) {
trace_btrfs_space_reservation(fs_info, "transaction",
h->transid, num_bytes, 1);
- h->block_rsv = &fs_info->trans_block_rsv;
+ h->block_rsv = trans_rsv;
h->bytes_reserved = num_bytes;
+ if (delayed_refs_bytes > 0) {
+ trace_btrfs_space_reservation(fs_info,
+ "local_delayed_refs_rsv",
+ h->transid,
+ delayed_refs_bytes, 1);
+ h->delayed_refs_bytes_reserved = delayed_refs_bytes;
+ btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true);
+ delayed_refs_bytes = 0;
+ }
h->reloc_reserved = reloc_reserved;
}
@@ -769,8 +835,10 @@ join_fail:
kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
if (num_bytes)
- btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
- num_bytes, NULL);
+ btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
+ if (delayed_refs_bytes)
+ btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
+ delayed_refs_bytes);
reserve_fail:
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
return ERR_PTR(ret);
@@ -817,7 +885,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *roo
}
/*
- * btrfs_attach_transaction() - catch the running transaction
+ * Catch the running transaction.
*
* It is used when we want to commit the current the transaction, but
* don't want to start a new one.
@@ -836,7 +904,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
}
/*
- * btrfs_attach_transaction_barrier() - catch the running transaction
+ * Catch the running transaction.
*
* It is similar to the above function, the difference is this one
* will wait for all the inactive transactions until they fully
@@ -912,7 +980,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
int ret = 0;
if (transid) {
- if (transid <= fs_info->last_trans_committed)
+ if (transid <= btrfs_get_last_trans_committed(fs_info))
goto out;
/* find specified transaction */
@@ -936,7 +1004,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
* raced with btrfs_commit_transaction
*/
if (!cur_trans) {
- if (transid > fs_info->last_trans_committed)
+ if (transid > btrfs_get_last_trans_committed(fs_info))
ret = -EINVAL;
goto out;
}
@@ -991,11 +1059,14 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
if (!trans->block_rsv) {
ASSERT(!trans->bytes_reserved);
+ ASSERT(!trans->delayed_refs_bytes_reserved);
return;
}
- if (!trans->bytes_reserved)
+ if (!trans->bytes_reserved) {
+ ASSERT(!trans->delayed_refs_bytes_reserved);
return;
+ }
ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
trace_btrfs_space_reservation(fs_info, "transaction",
@@ -1003,6 +1074,16 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
btrfs_block_rsv_release(fs_info, trans->block_rsv,
trans->bytes_reserved, NULL);
trans->bytes_reserved = 0;
+
+ if (!trans->delayed_refs_bytes_reserved)
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv",
+ trans->transid,
+ trans->delayed_refs_bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, &trans->delayed_rsv,
+ trans->delayed_refs_bytes_reserved, NULL);
+ trans->delayed_refs_bytes_reserved = 0;
}
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -1334,7 +1415,7 @@ again:
}
/* Now flush any delayed refs generated by updating all of the roots */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
return ret;
@@ -1349,7 +1430,7 @@ again:
* so we want to keep this flushing in this loop to make sure
* everything gets run.
*/
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
return ret;
}
@@ -1484,45 +1565,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
}
/*
- * defrag a given btree.
- * Every leaf in the btree is read and defragged.
- */
-int btrfs_defrag_root(struct btrfs_root *root)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_trans_handle *trans;
- int ret;
-
- if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
- return 0;
-
- while (1) {
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
-
- ret = btrfs_defrag_leaves(trans, root);
-
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(info);
- cond_resched();
-
- if (btrfs_fs_closing(info) || ret != -EAGAIN)
- break;
-
- if (btrfs_defrag_cancelled(info)) {
- btrfs_debug(info, "defrag_root cancelled");
- ret = -EAGAIN;
- break;
- }
- }
- clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
- return ret;
-}
-
-/*
* Do all special snapshot related qgroup dirty hack.
*
* Will do all needed qgroup inherit and dirty hack like switch commit
@@ -1539,11 +1581,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
int ret;
/*
- * Save some performance in the case that qgroups are not
- * enabled. If this check races with the ioctl, rescan will
- * kick in anyway.
+ * Save some performance in the case that qgroups are not enabled. If
+ * this check races with the ioctl, rescan will kick in anyway.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/*
@@ -1567,7 +1608,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* for now flush the delayed refs to narrow the race window where the
* qgroup counters could end up wrong.
*/
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -1582,7 +1623,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
/* Now qgroup are all updated, we can inherit it to new qgroups */
ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
- inherit);
+ parent->root_key.objectid, inherit);
if (ret < 0)
goto out;
@@ -1732,6 +1773,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
+ ret = btrfs_create_qgroup(trans, objectid);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
/*
* pull in the delayed directory update
* and the delayed inode item
@@ -1843,8 +1890,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* To co-operate with that hack, we do hack again.
* Or snapshot will be greatly slowed down by a subtree qgroup rescan
*/
- ret = qgroup_account_snapshot(trans, root, parent_root,
- pending->inherit, objectid);
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL)
+ ret = qgroup_account_snapshot(trans, root, parent_root,
+ pending->inherit, objectid);
+ else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid,
+ parent_root->root_key.objectid, pending->inherit);
if (ret < 0)
goto fail;
@@ -1862,7 +1913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
fname.disk_name.len * 2);
inode_set_mtime_to_ts(parent_inode,
inode_set_ctime_current(parent_inode));
- ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
+ ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -2085,7 +2136,7 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_block_group *block_group, *tmp;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
}
}
@@ -2404,7 +2455,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto unlock_reloc;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
goto unlock_reloc;
@@ -2537,7 +2588,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info);
- fs_info->last_trans_committed = cur_trans->transid;
+ btrfs_set_last_trans_committed(fs_info, cur_trans->transid);
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
@@ -2655,18 +2706,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
*/
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno, bool first_hit)
+ unsigned int line, int error, bool first_hit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- WRITE_ONCE(trans->aborted, errno);
- WRITE_ONCE(trans->transaction->aborted, errno);
- if (first_hit && errno == -ENOSPC)
+ WRITE_ONCE(trans->aborted, error);
+ WRITE_ONCE(trans->transaction->aborted, error);
+ if (first_hit && error == -ENOSPC)
btrfs_dump_space_info_for_trans_abort(fs_info);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
+ __btrfs_handle_fs_error(fs_info, function, line, error, NULL);
}
int __init btrfs_transaction_init(void)