diff options
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- | fs/btrfs/tree-log.c | 613 |
1 files changed, 364 insertions, 249 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 434457794c27..3c2ae0e4f25a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> @@ -21,12 +8,12 @@ #include <linux/blkdev.h> #include <linux/list_sort.h> #include <linux/iversion.h> +#include "ctree.h" #include "tree-log.h" #include "disk-io.h" #include "locking.h" #include "print-tree.h" #include "backref.h" -#include "hash.h" #include "compression.h" #include "qgroup.h" #include "inode-map.h" @@ -235,11 +222,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root) void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { - /* - * Implicit memory barrier after atomic_dec_and_test - */ - if (waitqueue_active(&root->log_writer_wait)) - wake_up(&root->log_writer_wait); + /* atomic_dec_and_test implies a barrier */ + cond_wake_up_nomb(&root->log_writer_wait); } } @@ -286,7 +270,7 @@ struct walk_control { * inside it */ int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen); + struct walk_control *wc, u64 gen, int level); }; /* @@ -294,7 +278,7 @@ struct walk_control { */ static int process_one_buffer(struct btrfs_root *log, struct 
extent_buffer *eb, - struct walk_control *wc, u64 gen) + struct walk_control *wc, u64 gen, int level) { struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; @@ -304,7 +288,7 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. */ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_buffer(eb, gen); + ret = btrfs_read_buffer(eb, gen, level, NULL); if (ret) return ret; } @@ -561,12 +545,8 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); - if (IS_ERR(inode)) { - inode = NULL; - } else if (is_bad_inode(inode)) { - iput(inode); + if (IS_ERR(inode)) inode = NULL; - } return inode; } @@ -613,7 +593,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (btrfs_file_extent_disk_bytenr(eb, item) == 0) nbytes = 0; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, slot, item); + size = btrfs_file_extent_ram_bytes(eb, item); nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, fs_info->sectorsize); @@ -701,7 +681,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * as the owner of the file extent changed from log tree * (doesn't affect qgroup) to fs/file tree(affects qgroup) */ - ret = btrfs_qgroup_trace_extent(trans, fs_info, + ret = btrfs_qgroup_trace_extent(trans, btrfs_file_extent_disk_bytenr(eb, item), btrfs_file_extent_disk_num_bytes(eb, item), GFP_NOFS); @@ -731,7 +711,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * allocation tree */ ret = btrfs_alloc_logged_file_extent(trans, - fs_info, root->root_key.objectid, key->objectid, offset, &ins); if (ret) @@ -853,7 +832,6 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_dir_item *di) { - struct 
btrfs_fs_info *fs_info = root->fs_info; struct inode *inode; char *name; int name_len; @@ -887,7 +865,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; else - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); out: kfree(name); iput(inode); @@ -1007,7 +985,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, u64 ref_index, char *name, int namelen, int *search_done) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *victim_name; int victim_name_len; @@ -1065,7 +1042,7 @@ again: kfree(victim_name); if (ret) return ret; - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) return ret; *search_done = 1; @@ -1136,8 +1113,7 @@ again: victim_name_len); if (!ret) ret = btrfs_run_delayed_items( - trans, - fs_info); + trans); } iput(victim_parent); kfree(victim_name); @@ -1310,6 +1286,46 @@ again: return ret; } +static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, + const u8 ref_type, const char *name, + const int namelen) +{ + struct btrfs_key key; + struct btrfs_path *path; + const u64 parent_id = btrfs_ino(BTRFS_I(dir)); + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = ref_type; + if (key.type == BTRFS_INODE_REF_KEY) + key.offset = parent_id; + else + key.offset = btrfs_extref_hash(parent_id, name, namelen); + + ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + if (key.type == BTRFS_INODE_EXTREF_KEY) + ret = btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], parent_id, + name, namelen, NULL); + else + ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, namelen, NULL); + +out: + btrfs_free_path(path); + return ret; +} + /* * replay one inode back reference item found in the log tree. 
* eb, slot and key refer to the buffer and key found in the log tree. @@ -1419,6 +1435,32 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } } + /* + * If a reference item already exists for this inode + * with the same parent and name, but different index, + * drop it and the corresponding directory index entries + * from the parent before adding the new reference item + * and dir index entries, otherwise we would fail with + * -EEXIST returned from btrfs_add_link() below. + */ + ret = btrfs_inode_ref_exists(inode, dir, key->type, + name, namelen); + if (ret > 0) { + ret = btrfs_unlink_inode(trans, root, + BTRFS_I(dir), + BTRFS_I(inode), + name, namelen); + /* + * If we dropped the link count to 0, bump it so + * that later the iput() on the inode will not + * free it. We will fixup the link count later. + */ + if (!ret && inode->i_nlink == 0) + inc_nlink(inode); + } + if (ret < 0) + goto out; + /* insert our name */ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), @@ -2098,7 +2140,6 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct inode *dir, struct btrfs_key *dir_key) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct extent_buffer *eb; int slot; @@ -2140,7 +2181,7 @@ again: dir_key->offset, name, name_len, 0); } - if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { + if (!log_di || log_di == ERR_PTR(-ENOENT)) { btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); @@ -2162,7 +2203,7 @@ again: ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(inode), name, name_len); if (!ret) - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); kfree(name); iput(inode); if (ret) @@ -2356,8 +2397,10 @@ again: nritems = btrfs_header_nritems(path->nodes[0]); if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); - if (ret) + if (ret == 1) break; + else if (ret < 0) + goto out; } 
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -2410,17 +2453,16 @@ out: * back refs). */ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen) + struct walk_control *wc, u64 gen, int level) { int nritems; struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; - int level; int i; int ret; - ret = btrfs_read_buffer(eb, gen); + ret = btrfs_read_buffer(eb, gen, level, NULL); if (ret) return ret; @@ -2461,13 +2503,41 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (ret) break; - /* for regular files, make sure corresponding - * orphan item exist. extents past the new EOF - * will be truncated later by orphan cleanup. + /* + * Before replaying extents, truncate the inode to its + * size. We need to do it now and not after log replay + * because before an fsync we can have prealloc extents + * added beyond the inode's i_size. If we did it after, + * through orphan cleanup for example, we would drop + * those prealloc extents just after replaying them. */ if (S_ISREG(mode)) { - ret = insert_orphan_item(wc->trans, root, - key.objectid); + struct inode *inode; + u64 from; + + inode = read_one_inode(root, key.objectid); + if (!inode) { + ret = -EIO; + break; + } + from = ALIGN(i_size_read(inode), + root->fs_info->sectorsize); + ret = btrfs_drop_extents(wc->trans, root, inode, + from, (u64)-1, 1); + /* + * If the nlink count is zero here, the iput + * will free the inode. We bump it to make + * sure it doesn't get freed until the link + * count fixup is done. + */ + if (!ret) { + if (inode->i_nlink == 0) + inc_nlink(inode); + /* Update link count and nbytes. 
*/ + ret = btrfs_update_inode(wc->trans, + root, inode); + } + iput(inode); if (ret) break; } @@ -2537,6 +2607,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level >= BTRFS_MAX_LEVEL); while (*level > 0) { + struct btrfs_key first_key; + WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); cur = path->nodes[*level]; @@ -2549,6 +2621,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); blocksize = fs_info->nodesize; parent = path->nodes[*level]; @@ -2559,7 +2632,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return PTR_ERR(next); if (*level == 1) { - ret = wc->process_func(root, next, wc, ptr_gen); + ret = wc->process_func(root, next, wc, ptr_gen, + *level - 1); if (ret) { free_extent_buffer(next); return ret; @@ -2567,7 +2641,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen, + *level - 1, &first_key); if (ret) { free_extent_buffer(next); return ret; @@ -2597,7 +2672,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); return ret; @@ -2647,7 +2722,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); ret = wc->process_func(root, path->nodes[*level], wc, - btrfs_header_generation(path->nodes[*level])); + btrfs_header_generation(path->nodes[*level]), + *level); if (ret) return ret; @@ -2729,7 +2805,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, /* was the root 
node processed? if not, catch it here */ if (path->nodes[orig_level]) { ret = wc->process_func(log, path->nodes[orig_level], wc, - btrfs_header_generation(path->nodes[orig_level])); + btrfs_header_generation(path->nodes[orig_level]), + orig_level); if (ret) goto out; if (wc->free) { @@ -2917,7 +2994,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (btrfs_need_log_full_commit(fs_info, trans)) { ret = -EAGAIN; - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } @@ -2935,7 +3011,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); - btrfs_free_logged_extents(log, log_transid); btrfs_set_log_full_commit(fs_info, trans); mutex_unlock(&root->log_mutex); goto out; @@ -2969,11 +3044,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { - /* - * Implicit memory barrier after atomic_dec_and_test - */ - if (waitqueue_active(&log_root_tree->log_writer_wait)) - wake_up(&log_root_tree->log_writer_wait); + /* atomic_dec_and_test implies a barrier */ + cond_wake_up_nomb(&log_root_tree->log_writer_wait); } if (ret) { @@ -2989,7 +3061,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } btrfs_wait_tree_log_extents(log, mark); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out; @@ -3007,7 +3078,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); ret = btrfs_wait_tree_log_extents(log, mark); - btrfs_wait_logged_extents(trans, log, log_transid); wait_log_commit(log_root_tree, root_log_ctx.log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -3032,7 +3102,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (btrfs_need_log_full_commit(fs_info, 
trans)) { blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -3045,7 +3114,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret) { btrfs_set_log_full_commit(fs_info, trans); btrfs_abort_transaction(trans, ret); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } @@ -3055,11 +3123,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_NEW | EXTENT_DIRTY); if (ret) { btrfs_set_log_full_commit(fs_info, trans); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } - btrfs_wait_logged_extents(trans, log, log_transid); btrfs_set_super_log_root(fs_info->super_for_commit, log_root_tree->node->start); @@ -3097,10 +3163,11 @@ out_wake_log_root: mutex_unlock(&log_root_tree->log_mutex); /* - * The barrier before waitqueue_active is implied by mutex_unlock + * The barrier before waitqueue_active (in cond_wake_up) is needed so + * all the updates above are seen by the woken threads. It might not be + * necessary, but proving that seems to be hard. */ - if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) - wake_up(&log_root_tree->log_commit_wait[index2]); + cond_wake_up(&log_root_tree->log_commit_wait[index2]); out: mutex_lock(&root->log_mutex); btrfs_remove_all_log_ctxs(root, index1, ret); @@ -3109,10 +3176,11 @@ out: mutex_unlock(&root->log_mutex); /* - * The barrier before waitqueue_active is implied by mutex_unlock + * The barrier before waitqueue_active (in cond_wake_up) is needed so + * all the updates above are seen by the woken threads. It might not be + * necessary, but proving that seems to be hard. 
*/ - if (waitqueue_active(&root->log_commit_wait[index1])) - wake_up(&root->log_commit_wait[index1]); + cond_wake_up(&root->log_commit_wait[index1]); return ret; } @@ -3144,14 +3212,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); } - /* - * We may have short-circuited the log tree with the full commit logic - * and left ordered extents on our list, so clear these out to keep us - * from leaking inodes and memory. - */ - btrfs_free_logged_extents(log, 0); - btrfs_free_logged_extents(log, 1); - free_extent_buffer(log->node); kfree(log); } @@ -3518,8 +3578,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * from this directory and from this transaction */ ret = btrfs_next_leaf(root, path); - if (ret == 1) { - last_offset = (u64)-1; + if (ret) { + if (ret == 1) + last_offset = (u64)-1; + else + err = ret; goto done; } btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); @@ -3738,7 +3801,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int start_slot, int nr, int inode_only, u64 logged_isize) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = trans->fs_info; unsigned long src_offset; unsigned long dst_offset; struct btrfs_root *log = inode->root->log_root; @@ -3919,9 +3982,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(src, - src_path->slots[0], - extent); + len = btrfs_file_extent_ram_bytes(src, extent); *last_extent = ALIGN(key.offset + len, fs_info->sectorsize); } else { @@ -3972,6 +4033,7 @@ fill_holes: ASSERT(ret == 0); src = src_path->nodes[0]; i = 0; + need_find_last_extent = true; } btrfs_item_key_to_cpu(src, &key, i); @@ -3985,7 +4047,7 @@ fill_holes: extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); if 
(btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(src, i, extent); + len = btrfs_file_extent_ram_bytes(src, extent); extent_end = ALIGN(key.offset + len, fs_info->sectorsize); } else { @@ -4006,6 +4068,36 @@ fill_holes: break; *last_extent = extent_end; } + + /* + * Check if there is a hole between the last extent found in our leaf + * and the first extent in the next leaf. If there is one, we need to + * log an explicit hole so that at replay time we can punch the hole. + */ + if (ret == 0 && + key.objectid == btrfs_ino(inode) && + key.type == BTRFS_EXTENT_DATA_KEY && + i == btrfs_header_nritems(src_path->nodes[0])) { + ret = btrfs_next_leaf(inode->root, src_path); + need_find_last_extent = true; + if (ret > 0) { + ret = 0; + } else if (ret == 0) { + btrfs_item_key_to_cpu(src_path->nodes[0], &key, + src_path->slots[0]); + if (key.objectid == btrfs_ino(inode) && + key.type == BTRFS_EXTENT_DATA_KEY && + *last_extent < key.offset) { + const u64 len = key.offset - *last_extent; + + ret = btrfs_insert_file_extent(trans, log, + btrfs_ino(inode), + *last_extent, 0, + 0, len, 0, len, + 0, 0, 0); + } + } + } /* * Need to let the callers know we dropped the path so they should * re-search. 
@@ -4029,131 +4121,32 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -static int wait_ordered_extents(struct btrfs_trans_handle *trans, - struct inode *inode, - struct btrfs_root *root, - const struct extent_map *em, - const struct list_head *logged_list, - bool *ordered_io_error) +static int log_extent_csums(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_root *log_root, + const struct extent_map *em) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ordered_extent *ordered; - struct btrfs_root *log = root->log_root; - u64 mod_start = em->mod_start; - u64 mod_len = em->mod_len; - const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; u64 csum_offset; u64 csum_len; LIST_HEAD(ordered_sums); int ret = 0; - *ordered_io_error = false; - - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + if (inode->flags & BTRFS_INODE_NODATASUM || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || em->block_start == EXTENT_MAP_HOLE) return 0; - /* - * Wait far any ordered extent that covers our extent map. If it - * finishes without an error, first check and see if our csums are on - * our outstanding ordered extents. 
- */ - list_for_each_entry(ordered, logged_list, log_list) { - struct btrfs_ordered_sum *sum; - - if (!mod_len) - break; - - if (ordered->file_offset + ordered->len <= mod_start || - mod_start + mod_len <= ordered->file_offset) - continue; - - if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && - !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && - !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - const u64 start = ordered->file_offset; - const u64 end = ordered->file_offset + ordered->len - 1; - - WARN_ON(ordered->inode != inode); - filemap_fdatawrite_range(inode->i_mapping, start, end); - } - - wait_event(ordered->wait, - (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || - test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); - - if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { - /* - * Clear the AS_EIO/AS_ENOSPC flags from the inode's - * i_mapping flags, so that the next fsync won't get - * an outdated io error too. - */ - filemap_check_errors(inode->i_mapping); - *ordered_io_error = true; - break; - } - /* - * We are going to copy all the csums on this ordered extent, so - * go ahead and adjust mod_start and mod_len in case this - * ordered extent has already been logged. - */ - if (ordered->file_offset > mod_start) { - if (ordered->file_offset + ordered->len >= - mod_start + mod_len) - mod_len = ordered->file_offset - mod_start; - /* - * If we have this case - * - * |--------- logged extent ---------| - * |----- ordered extent ----| - * - * Just don't mess with mod_start and mod_len, we'll - * just end up logging more csums than we need and it - * will be ok. 
- */ - } else { - if (ordered->file_offset + ordered->len < - mod_start + mod_len) { - mod_len = (mod_start + mod_len) - - (ordered->file_offset + ordered->len); - mod_start = ordered->file_offset + - ordered->len; - } else { - mod_len = 0; - } - } - - if (skip_csum) - continue; - - /* - * To keep us from looping for the above case of an ordered - * extent that falls inside of the logged extent. - */ - if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, - &ordered->flags)) - continue; - - list_for_each_entry(sum, &ordered->list, list) { - ret = btrfs_csum_file_blocks(trans, log, sum); - if (ret) - break; - } - } - - if (*ordered_io_error || !mod_len || ret || skip_csum) - return ret; - + /* If we're compressed we have to save the entire range of csums. */ if (em->compress_type) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { - csum_offset = mod_start - em->start; - csum_len = mod_len; + csum_offset = em->mod_start - em->start; + csum_len = em->mod_len; } /* block start is already adjusted for the file extent offset. 
*/ - ret = btrfs_lookup_csums_range(fs_info->csum_root, + ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, em->block_start + csum_offset, em->block_start + csum_offset + csum_len - 1, &ordered_sums, 0); @@ -4165,7 +4158,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log, sums); + ret = btrfs_csum_file_blocks(trans, log_root, sums); list_del(&sums->list); kfree(sums); } @@ -4177,7 +4170,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *root, const struct extent_map *em, struct btrfs_path *path, - const struct list_head *logged_list, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = root->log_root; @@ -4189,18 +4181,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 block_len; int ret; int extent_inserted = 0; - bool ordered_io_err = false; - ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, - logged_list, &ordered_io_err); + ret = log_extent_csums(trans, inode, log, em); if (ret) return ret; - if (ordered_io_err) { - ctx->io_err = -EIO; - return ctx->io_err; - } - btrfs_init_map_token(&token); ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, @@ -4267,11 +4252,114 @@ static int log_one_extent(struct btrfs_trans_handle *trans, return ret; } +/* + * Log all prealloc extents beyond the inode's i_size to make sure we do not + * lose them after doing a fast fsync and replaying the log. We scan the + * subvolume's root instead of iterating the inode's extent map tree because + * otherwise we can log incorrect extent items based on extent map conversion. + * That can happen due to the fact that extent maps are merged when they + * are not in the extent map tree's list of modified extents. 
+ */ +static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path) +{ + struct btrfs_root *root = inode->root; + struct btrfs_key key; + const u64 i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + struct btrfs_path *dst_path = NULL; + u64 last_extent = (u64)-1; + int ins_nr = 0; + int start_slot; + int ret; + + if (!(inode->flags & BTRFS_INODE_PREALLOC)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = i_size; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + if (ret < 0) + goto out; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid > ino) + break; + if (WARN_ON_ONCE(key.objectid < ino) || + key.type < BTRFS_EXTENT_DATA_KEY || + key.offset < i_size) { + path->slots[0]++; + continue; + } + if (last_extent == (u64)-1) { + last_extent = key.offset; + /* + * Avoid logging extent items logged in past fsync calls + * and leading to duplicate keys in the log tree. 
+ */ + do { + ret = btrfs_truncate_inode_items(trans, + root->log_root, + &inode->vfs_inode, + i_size, + BTRFS_EXTENT_DATA_KEY); + } while (ret == -EAGAIN); + if (ret) + goto out; + } + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + if (!dst_path) { + dst_path = btrfs_alloc_path(); + if (!dst_path) { + ret = -ENOMEM; + goto out; + } + } + } + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + start_slot, ins_nr, 1, 0); + if (ret > 0) + ret = 0; + } +out: + btrfs_release_path(path); + btrfs_free_path(dst_path); + return ret; +} + static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, - struct list_head *logged_list, struct btrfs_log_ctx *ctx, const u64 start, const u64 end) @@ -4309,6 +4397,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; + /* We log prealloc extents beyond eof later. */ + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && + em->start >= i_size_read(&inode->vfs_inode)) + continue; + if (em->start < logged_start) logged_start = em->start; if ((em->start + em->len - 1) > logged_end) @@ -4322,20 +4415,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); - /* - * Some ordered extents started by fsync might have completed - * before we could collect them into the list logged_list, which - * means they're gone, not in our logged_list nor in the inode's - * ordered tree. We want the application/user space to know an - * error happened while attempting to persist file data so that - * it can take proper action. If such error happened, we leave - * without writing to the log tree and the fsync must report the - * file data write error and not commit the current transaction. 
- */ - ret = filemap_check_errors(inode->vfs_inode.i_mapping); - if (ret) - ctx->io_err = ret; process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4354,8 +4433,7 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path, logged_list, - ctx); + ret = log_one_extent(trans, inode, root, em, path, ctx); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -4365,6 +4443,9 @@ process: up_write(&inode->dio_sem); btrfs_release_path(path); + if (!ret) + ret = btrfs_log_prealloc_extents(trans, inode, path); + return ret; } @@ -4551,9 +4632,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, if (btrfs_file_extent_type(leaf, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(leaf, - path->slots[0], - extent); + len = btrfs_file_extent_ram_bytes(leaf, extent); ASSERT(len == i_size || (len == fs_info->sectorsize && btrfs_file_extent_compression(leaf, extent) != @@ -4737,7 +4816,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; - LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; int ret; @@ -4749,6 +4827,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct extent_map_tree *em_tree = &inode->extent_tree; u64 logged_isize = 0; bool need_log_inode_item = true; + bool xattrs_logged = false; path = btrfs_alloc_path(); if (!path) @@ -4932,8 +5011,7 @@ again: * we don't need to do more work nor fallback to * a transaction commit. 
*/ - if (IS_ERR(other_inode) && - PTR_ERR(other_inode) == -ENOENT) { + if (other_inode == ERR_PTR(-ENOENT)) { goto next_key; } else if (IS_ERR(other_inode)) { err = PTR_ERR(other_inode); @@ -5050,6 +5128,7 @@ next_key: err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); if (err) goto out_unlock; + xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); @@ -5062,12 +5141,17 @@ log_extents: btrfs_release_path(dst_path); if (need_log_inode_item) { err = log_inode_item(trans, log, dst_path, inode); + if (!err && !xattrs_logged) { + err = btrfs_log_all_xattrs(trans, root, inode, path, + dst_path); + btrfs_release_path(path); + } if (err) goto out_unlock; } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - &logged_list, ctx, start, end); + ctx, start, end); if (ret) { err = ret; goto out_unlock; @@ -5118,10 +5202,6 @@ log_extents: inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); out_unlock: - if (unlikely(err)) - btrfs_put_logged_extents(&logged_list); - else - btrfs_submit_logged_extents(&logged_list, log); mutex_unlock(&inode->log_mutex); btrfs_free_path(path); @@ -5417,7 +5497,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_path *path; struct btrfs_key key; @@ -5517,7 +5597,6 @@ out: * the last committed transaction */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct dentry *parent, const loff_t start, @@ -5525,6 +5604,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, int inode_only, struct btrfs_log_ctx *ctx) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; 
struct super_block *sb; struct dentry *old_parent = NULL; @@ -5550,7 +5630,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { + if (btrfs_root_refs(&root->root_item) == 0) { ret = 1; goto end_no_trans; } @@ -5682,7 +5762,7 @@ end_no_trans: * data on disk. */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry, + struct dentry *dentry, const loff_t start, const loff_t end, struct btrfs_log_ctx *ctx) @@ -5690,8 +5770,8 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), - parent, start, end, LOG_INODE_ALL, ctx); + ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, + start, end, LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -5945,15 +6025,25 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. * - * It will return zero if all goes well, and it will return 1 if a - * full transaction commit is required. + * @ctx can not be NULL when @sync_log is false, and should be NULL when it's + * true (because it's not used). + * + * Return value depends on whether @sync_log is true or false. + * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be + * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT + * otherwise. + * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need + * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, + * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be + * committed (without attempting to sync the log). 
*/ int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent) + struct dentry *parent, + bool sync_log, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; /* * this will force the logging code to walk the dentry chain @@ -5968,9 +6058,34 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, */ if (inode->logged_trans <= fs_info->last_trans_committed && (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return 0; + return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : + BTRFS_DONT_NEED_LOG_SYNC; + + if (sync_log) { + struct btrfs_log_ctx ctx2; + + btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); + ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, &ctx2); + if (ret == BTRFS_NO_LOG_SYNC) + return BTRFS_DONT_NEED_TRANS_COMMIT; + else if (ret) + return BTRFS_NEED_TRANS_COMMIT; + + ret = btrfs_sync_log(trans, inode->root, &ctx2); + if (ret) + return BTRFS_NEED_TRANS_COMMIT; + return BTRFS_DONT_NEED_TRANS_COMMIT; + } + + ASSERT(ctx); + ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, ctx); + if (ret == BTRFS_NO_LOG_SYNC) + return BTRFS_DONT_NEED_LOG_SYNC; + else if (ret) + return BTRFS_NEED_TRANS_COMMIT; - return btrfs_log_inode_parent(trans, root, inode, parent, 0, - LLONG_MAX, LOG_INODE_EXISTS, NULL); + return BTRFS_NEED_LOG_SYNC; } |