diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-07 11:34:19 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-07 11:34:19 -0700 |
commit | 9f2e3a53f7ec9ef55e9d01bc29a6285d291c151e (patch) | |
tree | c25b0eb20dac1a39a6b55c521b2658dcceb7d532 /fs/btrfs/tree-log.c | |
parent | 78438ce18f26dbcaa8993bb45d20ffb0cec3bc3e (diff) | |
parent | b1c16ac978fd40ae636e629bb69a652df7eebdc2 (diff) |
Merge tag 'for-5.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"This time the majority of changes are cleanups, though there's still a
number of changes of user interest.
User visible changes:
- better read time and write checks to catch errors early and before
writing data to disk (to catch potential memory corruption on data
that get checksummed)
- qgroups + metadata relocation: last speed up patch int the series
to address the slowness, there should be no overhead comparing
balance with and without qgroups
- FIEMAP ioctl does not start a transaction unnecessarily, this can
result in a speed up and less blocking due to IO
- LOGICAL_INO (v1, v2) does not start transaction unnecessarily, this
can speed up the mentioned ioctl and scrub as well
- fsync on files with many (but not too many) hardlinks is faster,
finer decision if the links should be fsynced individually or
completely
- send tries harder to find ranges to clone
- trim/discard will skip unallocated chunks that haven't been touched
since the last mount
Fixes:
- send flushes delayed allocation before start, otherwise it could
miss some changes in case of a very recent rw->ro switch of a
subvolume
- fix fallocate with qgroups that could lead to space accounting
underflow, reported as a warning
- trim/discard ioctl honours the requested range
- starting send and dedupe on a subvolume at the same time will let
only one of them succeed, this is to prevent changes that send
could miss due to dedupe; both operations are restartable
Core changes:
- more tree-checker validations, errors reported by fuzzing tools:
- device item
- inode item
- block group profiles
- tracepoints for extent buffer locking
- async cow preallocates memory to avoid errors happening too deep in
the call chain
- metadata reservations for delalloc reworked to better adapt in
many-writers/low-space scenarios
- improved space flushing logic for intense DIO vs buffered workloads
- lots of cleanups
- removed unused struct members
- redundant argument removal
- properties and xattrs
- extent buffer locking
- selftests
- use common file type conversions
- many-argument functions reduction"
* tag 'for-5.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (227 commits)
btrfs: Use kvmalloc for allocating compressed path context
btrfs: Factor out common extent locking code in submit_compressed_extents
btrfs: Set io_tree only once in submit_compressed_extents
btrfs: Replace clear_extent_bit with unlock_extent
btrfs: Make compress_file_range take only struct async_chunk
btrfs: Remove fs_info from struct async_chunk
btrfs: Rename async_cow to async_chunk
btrfs: Preallocate chunks in cow_file_range_async
btrfs: reserve delalloc metadata differently
btrfs: track DIO bytes in flight
btrfs: merge calls of btrfs_setxattr and btrfs_setxattr_trans in btrfs_set_prop
btrfs: delete unused function btrfs_set_prop_trans
btrfs: start transaction in xattr_handler_set_prop
btrfs: drop local copy of inode i_mode
btrfs: drop old_fsflags in btrfs_ioctl_setflags
btrfs: modify local copy of btrfs_inode flags
btrfs: drop useless inode i_flags copy and restore
btrfs: start transaction in btrfs_ioctl_setflags()
btrfs: export btrfs_set_prop
btrfs: refactor btrfs_set_props to validate externally
...
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- | fs/btrfs/tree-log.c | 289 |
1 files changed, 225 insertions, 64 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 561884f60d35..6adcd8a2c5c7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -139,7 +139,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; goto out; } @@ -225,6 +225,17 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } +static int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, + buf->start + buf->len - 1); +} + +static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + filemap_fdatawait_range(buf->pages[0]->mapping, + buf->start, buf->start + buf->len - 1); +} /* * the walk control struct is used to pass state down the chain when @@ -304,7 +315,7 @@ static int process_one_buffer(struct btrfs_root *log, if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { if (wc->pin && btrfs_header_level(eb) == 0) - ret = btrfs_exclude_logged_extents(fs_info, eb); + ret = btrfs_exclude_logged_extents(eb); if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) @@ -333,7 +344,6 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; u32 item_size; u64 saved_i_size = 0; @@ -454,10 +464,9 @@ insert: found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (found_size > item_size) - btrfs_truncate_item(fs_info, path, item_size, 1); + btrfs_truncate_item(path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(fs_info, path, - item_size - found_size); + btrfs_extend_item(path, item_size - found_size); } else if (ret) { return ret; } @@ -694,9 +703,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; if (ins.objectid > 0) { + struct btrfs_ref ref = { 0 }; u64 csum_start; u64 csum_end; LIST_HEAD(ordered_sums); + /* * is this extent already allocated in the extent * allocation tree? If so, just add a reference @@ -704,10 +715,13 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); if (ret == 0) { - ret = btrfs_inc_extent_ref(trans, root, - ins.objectid, ins.offset, - 0, root->root_key.objectid, + btrfs_init_generic_ref(&ref, + BTRFS_ADD_DELAYED_REF, + ins.objectid, ins.offset, 0); + btrfs_init_data_ref(&ref, + root->root_key.objectid, key->objectid, offset); + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out; } else { @@ -2725,7 +2739,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking_write(next); - clean_tree_block(fs_info, next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } else { @@ -2809,7 +2823,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking_write(next); - clean_tree_block(fs_info, next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } else { @@ -2891,7 +2905,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking_write(next); - clean_tree_block(fs_info, next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } else { @@ -3066,7 +3080,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; mutex_unlock(&root->log_mutex); goto out; @@ -3085,7 +3099,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); mutex_unlock(&root->log_mutex); goto out; } @@ -3127,7 +3141,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); @@ -3173,7 +3187,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); @@ -3186,7 +3200,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; @@ -3196,7 +3210,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_wait_tree_log_extents(log_root_tree, EXTENT_NEW | EXTENT_DIRTY); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } @@ -3218,7 +3232,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_all_supers(fs_info, 1); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -3422,7 +3436,7 @@ fail: out_unlock: mutex_unlock(&dir->log_mutex); if (ret == -ENOSPC) { - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 0; } else if (ret < 0) btrfs_abort_transaction(trans, ret); @@ -3438,7 +3452,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *inode, u64 dirid) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log; u64 index; int ret; @@ -3456,7 +3469,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, dirid, &index); mutex_unlock(&inode->log_mutex); if (ret == -ENOSPC) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) btrfs_abort_transaction(trans, ret); @@ -5442,7 +5455,7 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, * Make sure any commits to the log are forced to be full * commits. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = true; } mutex_unlock(&inode->log_mutex); @@ -5819,6 +5832,190 @@ out: return ret; } +static int log_new_ancestors(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_key found_key; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + + while (true) { + struct btrfs_fs_info *fs_info = root->fs_info; + const u64 last_committed = fs_info->last_trans_committed; + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_key search_key; + struct inode *inode; + int ret = 0; + + btrfs_release_path(path); + + search_key.objectid = found_key.offset; + search_key.type = BTRFS_INODE_ITEM_KEY; + search_key.offset = 0; + inode = btrfs_iget(fs_info->sb, &search_key, root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (BTRFS_I(inode)->generation > last_committed) + ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + LOG_INODE_EXISTS, + 0, LLONG_MAX, ctx); + iput(inode); + if (ret) + return ret; + + if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) + break; + + search_key.type = BTRFS_INODE_REF_KEY; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + return ret; + + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + return -ENOENT; + leaf = path->nodes[0]; + slot = path->slots[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != search_key.objectid || + found_key.type != BTRFS_INODE_REF_KEY) + return -ENOENT; + } + return 0; +} + +static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct dentry *old_parent = NULL; + struct super_block *sb = inode->vfs_inode.i_sb; + int ret = 0; + + while (true) { + if (!parent || d_really_is_negative(parent) || + sb != parent->d_sb) + break; + + inode = BTRFS_I(d_inode(parent)); + if (root != inode->root) + break; + + if (inode->generation > fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, + LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); + if (ret) + break; + } + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + } + dput(old_parent); + + return ret; +} + +static int log_all_new_ancestors(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + const u64 ino = btrfs_ino(inode); + struct btrfs_path *path; + struct btrfs_key search_key; + int ret; + + /* + * For a single hard link case, go through a fast path that does not + * need to iterate the fs/subvolume tree. + */ + if (inode->vfs_inode.i_nlink < 2) + return log_new_ancestors_fast(trans, inode, parent, ctx); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) + path->slots[0]++; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_key found_key; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != ino || + found_key.type > BTRFS_INODE_EXTREF_KEY) + break; + + /* + * Don't deal with extended references because they are rare + * cases and too complex to deal with (we would need to keep + * track of which subitem we are processing for each item in + * this loop, etc). So just return some error to fallback to + * a transaction commit. + */ + if (found_key.type == BTRFS_INODE_EXTREF_KEY) { + ret = -EMLINK; + goto out; + } + + /* + * Logging ancestors needs to do more searches on the fs/subvol + * tree, so it releases the path as needed to avoid deadlocks. + * Keep track of the last inode ref key and resume from that key + * after logging all new ancestors for the current hard link. + */ + memcpy(&search_key, &found_key, sizeof(search_key)); + + ret = log_new_ancestors(trans, root, path, ctx); + if (ret) + goto out; + btrfs_release_path(path); + goto again; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -5836,11 +6033,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct super_block *sb; - struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = fs_info->last_trans_committed; bool log_dentries = false; - struct btrfs_inode *orig_inode = inode; sb = inode->vfs_inode.i_sb; @@ -5946,56 +6141,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * and has a link count of 2. */ if (inode->last_unlink_trans > last_committed) { - ret = btrfs_log_all_parents(trans, orig_inode, ctx); + ret = btrfs_log_all_parents(trans, inode, ctx); if (ret) goto end_trans; } - /* - * If a new hard link was added to the inode in the current transaction - * and its link count is now greater than 1, we need to fallback to a - * transaction commit, otherwise we can end up not logging all its new - * parents for all the hard links. Here just from the dentry used to - * fsync, we can not visit the ancestor inodes for all the other hard - * links to figure out if any is new, so we fallback to a transaction - * commit (instead of adding a lot of complexity of scanning a btree, - * since this scenario is not a common use case). - */ - if (inode->vfs_inode.i_nlink > 1 && - inode->last_link_trans > last_committed) { - ret = -EMLINK; + ret = log_all_new_ancestors(trans, inode, parent, ctx); + if (ret) goto end_trans; - } - while (1) { - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - break; - - inode = BTRFS_I(d_inode(parent)); - if (root != inode->root) - break; - - if (inode->generation > last_committed) { - ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); - if (ret) - goto end_trans; - } - if (IS_ROOT(parent)) - break; - - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - } if (log_dentries) - ret = log_new_dir_dentries(trans, root, orig_inode, ctx); + ret = log_new_dir_dentries(trans, root, inode, ctx); else ret = 0; end_trans: - dput(old_parent); if (ret < 0) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 1; } |