aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c496
1 files changed, 249 insertions, 247 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8f23a94dab77..ac232b3d6d7e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -205,14 +205,11 @@ static int join_running_log_trans(struct btrfs_root *root)
* until you call btrfs_end_log_trans() or it makes any future
* log transactions wait until you call btrfs_end_log_trans()
*/
-int btrfs_pin_log_trans(struct btrfs_root *root)
+void btrfs_pin_log_trans(struct btrfs_root *root)
{
- int ret = -ENOENT;
-
mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
- return ret;
}
/*
@@ -222,11 +219,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
- /*
- * Implicit memory barrier after atomic_dec_and_test
- */
- if (waitqueue_active(&root->log_writer_wait))
- wake_up(&root->log_writer_wait);
+ /* atomic_dec_and_test implies a barrier */
+ cond_wake_up_nomb(&root->log_writer_wait);
}
}
@@ -261,6 +255,13 @@ struct walk_control {
/* what stage of the replay code we're currently in */
int stage;
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
+ * the LOG_WALK_REPLAY_INODES stage.
+ */
+ bool ignore_cur_inode;
+
/* the root we are currently replaying */
struct btrfs_root *replay_dest;
@@ -548,12 +549,8 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
- if (IS_ERR(inode)) {
- inode = NULL;
- } else if (is_bad_inode(inode)) {
- iput(inode);
+ if (IS_ERR(inode))
inode = NULL;
- }
return inode;
}
@@ -600,7 +597,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
nbytes = 0;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_inline_len(eb, slot, item);
+ size = btrfs_file_extent_ram_bytes(eb, item);
nbytes = btrfs_file_extent_ram_bytes(eb, item);
extent_end = ALIGN(start + size,
fs_info->sectorsize);
@@ -688,7 +685,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* as the owner of the file extent changed from log tree
* (doesn't affect qgroup) to fs/file tree(affects qgroup)
*/
- ret = btrfs_qgroup_trace_extent(trans, fs_info,
+ ret = btrfs_qgroup_trace_extent(trans,
btrfs_file_extent_disk_bytenr(eb, item),
btrfs_file_extent_disk_num_bytes(eb, item),
GFP_NOFS);
@@ -718,7 +715,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* allocation tree
*/
ret = btrfs_alloc_logged_file_extent(trans,
- fs_info,
root->root_key.objectid,
key->objectid, offset, &ins);
if (ret)
@@ -1148,7 +1144,7 @@ next:
}
btrfs_release_path(path);
- /* look for a conflicing name */
+ /* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
if (di && !IS_ERR(di)) {
@@ -1294,6 +1290,46 @@ again:
return ret;
}
+static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
+ const u8 ref_type, const char *name,
+ const int namelen)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ const u64 parent_id = btrfs_ino(BTRFS_I(dir));
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.type = ref_type;
+ if (key.type == BTRFS_INODE_REF_KEY)
+ key.offset = parent_id;
+ else
+ key.offset = btrfs_extref_hash(parent_id, name, namelen);
+
+ ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ if (key.type == BTRFS_INODE_EXTREF_KEY)
+ ret = btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0], parent_id,
+ name, namelen, NULL);
+ else
+ ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+ name, namelen, NULL);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1403,6 +1439,32 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
}
+ /*
+ * If a reference item already exists for this inode
+ * with the same parent and name, but different index,
+ * drop it and the corresponding directory index entries
+ * from the parent before adding the new reference item
+ * and dir index entries, otherwise we would fail with
+ * -EEXIST returned from btrfs_add_link() below.
+ */
+ ret = btrfs_inode_ref_exists(inode, dir, key->type,
+ name, namelen);
+ if (ret > 0) {
+ ret = btrfs_unlink_inode(trans, root,
+ BTRFS_I(dir),
+ BTRFS_I(inode),
+ name, namelen);
+ /*
+ * If we dropped the link count to 0, bump it so
+ * that later the iput() on the inode will not
+ * free it. We will fixup the link count later.
+ */
+ if (!ret && inode->i_nlink == 0)
+ inc_nlink(inode);
+ }
+ if (ret < 0)
+ goto out;
+
/* insert our name */
ret = btrfs_add_link(trans, BTRFS_I(dir),
BTRFS_I(inode),
@@ -2123,7 +2185,7 @@ again:
dir_key->offset,
name, name_len, 0);
}
- if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
+ if (!log_di || log_di == ERR_PTR(-ENOENT)) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
@@ -2429,6 +2491,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
+ /*
+ * If we have a tmpfile (O_TMPFILE) that got fsync'ed
+ * and never got linked before the fsync, skip it, as
+ * replaying it is pointless since it would be deleted
+ * later. We skip logging tmpfiles, but it's always
+ * possible we are replaying a log created with a kernel
+ * that used to log tmpfiles.
+ */
+ if (btrfs_inode_nlink(eb, inode_item) == 0) {
+ wc->ignore_cur_inode = true;
+ continue;
+ } else {
+ wc->ignore_cur_inode = false;
+ }
ret = replay_xattr_deletes(wc->trans, root, log,
path, key.objectid);
if (ret)
@@ -2466,16 +2542,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
root->fs_info->sectorsize);
ret = btrfs_drop_extents(wc->trans, root, inode,
from, (u64)-1, 1);
- /*
- * If the nlink count is zero here, the iput
- * will free the inode. We bump it to make
- * sure it doesn't get freed until the link
- * count fixup is done.
- */
if (!ret) {
- if (inode->i_nlink == 0)
- inc_nlink(inode);
- /* Update link count and nbytes. */
+ /* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
root, inode);
}
@@ -2490,6 +2558,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
break;
}
+ if (wc->ignore_cur_inode)
+ continue;
+
if (key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
ret = replay_one_dir_item(wc->trans, root, path,
@@ -2936,7 +3007,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* bail out if we need to do a full commit */
if (btrfs_need_log_full_commit(fs_info, trans)) {
ret = -EAGAIN;
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -2954,7 +3024,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (ret) {
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, ret);
- btrfs_free_logged_extents(log, log_transid);
btrfs_set_log_full_commit(fs_info, trans);
mutex_unlock(&root->log_mutex);
goto out;
@@ -2988,11 +3057,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- /*
- * Implicit memory barrier after atomic_dec_and_test
- */
- if (waitqueue_active(&log_root_tree->log_writer_wait))
- wake_up(&log_root_tree->log_writer_wait);
+ /* atomic_dec_and_test implies a barrier */
+ cond_wake_up_nomb(&log_root_tree->log_writer_wait);
}
if (ret) {
@@ -3008,7 +3074,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_wait_tree_log_extents(log, mark);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out;
@@ -3026,7 +3091,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
ret = btrfs_wait_tree_log_extents(log, mark);
- btrfs_wait_logged_extents(trans, log, log_transid);
wait_log_commit(log_root_tree,
root_log_ctx.log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3051,7 +3115,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (btrfs_need_log_full_commit(fs_info, trans)) {
blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
@@ -3064,7 +3127,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_set_log_full_commit(fs_info, trans);
btrfs_abort_transaction(trans, ret);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
@@ -3074,11 +3136,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
EXTENT_NEW | EXTENT_DIRTY);
if (ret) {
btrfs_set_log_full_commit(fs_info, trans);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
- btrfs_wait_logged_extents(trans, log, log_transid);
btrfs_set_super_log_root(fs_info->super_for_commit,
log_root_tree->node->start);
@@ -3089,7 +3149,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
/*
- * nobody else is going to jump in and write the the ctree
+ * Nobody else is going to jump in and write the ctree
* super here because the log_commit atomic below is protecting
* us. We must be called with a transaction handle pinning
* the running transaction open, so a full commit can't hop
@@ -3116,10 +3176,11 @@ out_wake_log_root:
mutex_unlock(&log_root_tree->log_mutex);
/*
- * The barrier before waitqueue_active is implied by mutex_unlock
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
*/
- if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
- wake_up(&log_root_tree->log_commit_wait[index2]);
+ cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
mutex_lock(&root->log_mutex);
btrfs_remove_all_log_ctxs(root, index1, ret);
@@ -3128,10 +3189,11 @@ out:
mutex_unlock(&root->log_mutex);
/*
- * The barrier before waitqueue_active is implied by mutex_unlock
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
*/
- if (waitqueue_active(&root->log_commit_wait[index1]))
- wake_up(&root->log_commit_wait[index1]);
+ cond_wake_up(&root->log_commit_wait[index1]);
return ret;
}
@@ -3139,38 +3201,21 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *log)
{
int ret;
- u64 start;
- u64 end;
struct walk_control wc = {
.free = 1,
.process_func = process_one_buffer
};
ret = walk_log_tree(trans, log, &wc);
- /* I don't think this can happen but just in case */
- if (ret)
- btrfs_abort_transaction(trans, ret);
-
- while (1) {
- ret = find_first_extent_bit(&log->dirty_log_pages,
- 0, &start, &end,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
- NULL);
- if (ret)
- break;
-
- clear_extent_bits(&log->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
+ if (ret) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
}
- /*
- * We may have short-circuited the log tree with the full commit logic
- * and left ordered extents on our list, so clear these out to keep us
- * from leaking inodes and memory.
- */
- btrfs_free_logged_extents(log, 0);
- btrfs_free_logged_extents(log, 1);
-
+ clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
free_extent_buffer(log->node);
kfree(log);
}
@@ -3760,7 +3805,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+ struct btrfs_fs_info *fs_info = trans->fs_info;
unsigned long src_offset;
unsigned long dst_offset;
struct btrfs_root *log = inode->root->log_root;
@@ -3941,9 +3986,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(src, extent) ==
BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_inline_len(src,
- src_path->slots[0],
- extent);
+ len = btrfs_file_extent_ram_bytes(src, extent);
*last_extent = ALIGN(key.offset + len,
fs_info->sectorsize);
} else {
@@ -4008,7 +4051,7 @@ fill_holes:
extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(src, extent) ==
BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_inline_len(src, i, extent);
+ len = btrfs_file_extent_ram_bytes(src, extent);
extent_end = ALIGN(key.offset + len,
fs_info->sectorsize);
} else {
@@ -4082,131 +4125,32 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}
-static int wait_ordered_extents(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct btrfs_root *root,
- const struct extent_map *em,
- const struct list_head *logged_list,
- bool *ordered_io_error)
+static int log_extent_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_root *log_root,
+ const struct extent_map *em)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_root *log = root->log_root;
- u64 mod_start = em->mod_start;
- u64 mod_len = em->mod_len;
- const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
u64 csum_offset;
u64 csum_len;
LIST_HEAD(ordered_sums);
int ret = 0;
- *ordered_io_error = false;
-
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ if (inode->flags & BTRFS_INODE_NODATASUM ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
em->block_start == EXTENT_MAP_HOLE)
return 0;
- /*
- * Wait far any ordered extent that covers our extent map. If it
- * finishes without an error, first check and see if our csums are on
- * our outstanding ordered extents.
- */
- list_for_each_entry(ordered, logged_list, log_list) {
- struct btrfs_ordered_sum *sum;
-
- if (!mod_len)
- break;
-
- if (ordered->file_offset + ordered->len <= mod_start ||
- mod_start + mod_len <= ordered->file_offset)
- continue;
-
- if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
- const u64 start = ordered->file_offset;
- const u64 end = ordered->file_offset + ordered->len - 1;
-
- WARN_ON(ordered->inode != inode);
- filemap_fdatawrite_range(inode->i_mapping, start, end);
- }
-
- wait_event(ordered->wait,
- (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
- test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
-
- if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
- /*
- * Clear the AS_EIO/AS_ENOSPC flags from the inode's
- * i_mapping flags, so that the next fsync won't get
- * an outdated io error too.
- */
- filemap_check_errors(inode->i_mapping);
- *ordered_io_error = true;
- break;
- }
- /*
- * We are going to copy all the csums on this ordered extent, so
- * go ahead and adjust mod_start and mod_len in case this
- * ordered extent has already been logged.
- */
- if (ordered->file_offset > mod_start) {
- if (ordered->file_offset + ordered->len >=
- mod_start + mod_len)
- mod_len = ordered->file_offset - mod_start;
- /*
- * If we have this case
- *
- * |--------- logged extent ---------|
- * |----- ordered extent ----|
- *
- * Just don't mess with mod_start and mod_len, we'll
- * just end up logging more csums than we need and it
- * will be ok.
- */
- } else {
- if (ordered->file_offset + ordered->len <
- mod_start + mod_len) {
- mod_len = (mod_start + mod_len) -
- (ordered->file_offset + ordered->len);
- mod_start = ordered->file_offset +
- ordered->len;
- } else {
- mod_len = 0;
- }
- }
-
- if (skip_csum)
- continue;
-
- /*
- * To keep us from looping for the above case of an ordered
- * extent that falls inside of the logged extent.
- */
- if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
- &ordered->flags))
- continue;
-
- list_for_each_entry(sum, &ordered->list, list) {
- ret = btrfs_csum_file_blocks(trans, log, sum);
- if (ret)
- break;
- }
- }
-
- if (*ordered_io_error || !mod_len || ret || skip_csum)
- return ret;
-
+ /* If we're compressed we have to save the entire range of csums. */
if (em->compress_type) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
- csum_offset = mod_start - em->start;
- csum_len = mod_len;
+ csum_offset = em->mod_start - em->start;
+ csum_len = em->mod_len;
}
/* block start is already adjusted for the file extent offset. */
- ret = btrfs_lookup_csums_range(fs_info->csum_root,
+ ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
em->block_start + csum_offset,
em->block_start + csum_offset +
csum_len - 1, &ordered_sums, 0);
@@ -4218,7 +4162,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log, sums);
+ ret = btrfs_csum_file_blocks(trans, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4230,7 +4174,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_root *root,
const struct extent_map *em,
struct btrfs_path *path,
- const struct list_head *logged_list,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *log = root->log_root;
@@ -4242,18 +4185,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
u64 block_len;
int ret;
int extent_inserted = 0;
- bool ordered_io_err = false;
- ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em,
- logged_list, &ordered_io_err);
+ ret = log_extent_csums(trans, inode, log, em);
if (ret)
return ret;
- if (ordered_io_err) {
- ctx->io_err = -EIO;
- return ctx->io_err;
- }
-
btrfs_init_map_token(&token);
ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
@@ -4428,7 +4364,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct list_head *logged_list,
struct btrfs_log_ctx *ctx,
const u64 start,
const u64 end)
@@ -4436,20 +4371,33 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
- u64 logged_start, logged_end;
u64 test_gen;
int ret = 0;
int num = 0;
INIT_LIST_HEAD(&extents);
- down_write(&inode->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
- logged_start = start;
- logged_end = end;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+ /*
+ * Skip extents outside our logging range. It's important to do
+ * it for correctness because if we don't ignore them, we may
+ * log them before their ordered extent completes, and therefore
+ * we could log them without logging their respective checksums
+ * (the checksum items are added to the csum tree at the very
+ * end of btrfs_finish_ordered_io()). Also leave such extents
+ * outside of our range in the list, since we may have another
+ * ranged fsync in the near future that needs them. If an extent
+ * outside our range corresponds to a hole, log it to avoid
+ * leaving gaps between extents (fsck will complain when we are
+ * not using the NO_HOLES feature).
+ */
+ if ((em->start > end || em->start + em->len <= start) &&
+ em->block_start != EXTENT_MAP_HOLE)
+ continue;
+
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
@@ -4471,11 +4419,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
em->start >= i_size_read(&inode->vfs_inode))
continue;
- if (em->start < logged_start)
- logged_start = em->start;
- if ((em->start + em->len - 1) > logged_end)
- logged_end = em->start + em->len - 1;
-
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -4484,20 +4427,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
}
list_sort(NULL, &extents, extent_cmp);
- btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
- /*
- * Some ordered extents started by fsync might have completed
- * before we could collect them into the list logged_list, which
- * means they're gone, not in our logged_list nor in the inode's
- * ordered tree. We want the application/user space to know an
- * error happened while attempting to persist file data so that
- * it can take proper action. If such error happened, we leave
- * without writing to the log tree and the fsync must report the
- * file data write error and not commit the current transaction.
- */
- ret = filemap_check_errors(inode->vfs_inode.i_mapping);
- if (ret)
- ctx->io_err = ret;
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
@@ -4516,15 +4445,13 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, logged_list,
- ctx);
+ ret = log_one_extent(trans, inode, root, em, path, ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- up_write(&inode->dio_sem);
btrfs_release_path(path);
if (!ret)
@@ -4716,13 +4643,12 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
if (btrfs_file_extent_type(leaf, extent) ==
BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_inline_len(leaf,
- path->slots[0],
- extent);
+ len = btrfs_file_extent_ram_bytes(leaf, extent);
ASSERT(len == i_size ||
(len == fs_info->sectorsize &&
btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE));
+ BTRFS_COMPRESS_NONE) ||
+ (len < i_size && i_size < fs_info->sectorsize));
return 0;
}
@@ -4902,7 +4828,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
- LIST_HEAD(logged_list);
u64 last_extent = 0;
int err = 0;
int ret;
@@ -5098,8 +5023,7 @@ again:
* we don't need to do more work nor fallback to
* a transaction commit.
*/
- if (IS_ERR(other_inode) &&
- PTR_ERR(other_inode) == -ENOENT) {
+ if (other_inode == ERR_PTR(-ENOENT)) {
goto next_key;
} else if (IS_ERR(other_inode)) {
err = PTR_ERR(other_inode);
@@ -5239,7 +5163,7 @@ log_extents:
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- &logged_list, ctx, start, end);
+ ctx, start, end);
if (ret) {
err = ret;
goto out_unlock;
@@ -5290,10 +5214,6 @@ log_extents:
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
out_unlock:
- if (unlikely(err))
- btrfs_put_logged_extents(&logged_list);
- else
- btrfs_submit_logged_extents(&logged_list, log);
mutex_unlock(&inode->log_mutex);
btrfs_free_path(path);
@@ -5589,7 +5509,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
@@ -5656,9 +5576,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
dir_inode = btrfs_iget(fs_info->sb, &inode_key,
root, NULL);
- /* If parent inode was deleted, skip it. */
- if (IS_ERR(dir_inode))
- continue;
+ /*
+ * If the parent inode was deleted, return an error to
+ * fallback to a transaction commit. This is to prevent
+ * getting an inode that was moved from one parent A to
+ * a parent B, got its former parent A deleted and then
+ * it got fsync'ed, from existing at both parents after
+ * a log replay (and the old parent still existing).
+ * Example:
+ *
+ * mkdir /mnt/A
+ * mkdir /mnt/B
+ * touch /mnt/B/bar
+ * sync
+ * mv /mnt/B/bar /mnt/A/bar
+ * mv -T /mnt/A /mnt/B
+ * fsync /mnt/B/bar
+ * <power fail>
+ *
+ * If we ignore the old parent B which got deleted,
+ * after a log replay we would have file bar linked
+ * at both parents and the old parent B would still
+ * exist.
+ */
+ if (IS_ERR(dir_inode)) {
+ ret = PTR_ERR(dir_inode);
+ goto out;
+ }
if (ctx)
ctx->log_new_dentries = false;
@@ -5732,7 +5676,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- if (btrfs_inode_in_log(inode, trans->transid)) {
+ /*
+ * Skip already logged inodes or inodes corresponding to tmpfiles
+ * (since logging them is pointless, a link count of 0 means they
+ * will never be accessible).
+ */
+ if (btrfs_inode_in_log(inode, trans->transid) ||
+ inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
}
@@ -5808,6 +5758,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
+ /*
+ * If a new hard link was added to the inode in the current transaction
+ * and its link count is now greater than 1, we need to fallback to a
+ * transaction commit, otherwise we can end up not logging all its new
+ * parents for all the hard links. Here just from the dentry used to
+ * fsync, we can not visit the ancestor inodes for all the other hard
+ * links to figure out if any is new, so we fallback to a transaction
+ * commit (instead of adding a lot of complexity of scanning a btree,
+ * since this scenario is not a common use case).
+ */
+ if (inode->vfs_inode.i_nlink > 1 &&
+ inode->last_link_trans > last_committed) {
+ ret = -EMLINK;
+ goto end_trans;
+ }
+
while (1) {
if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
@@ -6117,14 +6083,25 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
*
- * It will return zero if all goes well, and it will return 1 if a
- * full transaction commit is required.
+ * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
+ * true (because it's not used).
+ *
+ * Return value depends on whether @sync_log is true or false.
+ * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
+ * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
+ * otherwise.
+ * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
+ * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
+ * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
+ * committed (without attempting to sync the log).
*/
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent)
+ struct dentry *parent,
+ bool sync_log, struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
/*
* this will force the logging code to walk the dentry chain
@@ -6139,9 +6116,34 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
if (inode->logged_trans <= fs_info->last_trans_committed &&
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return 0;
+ return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
+ BTRFS_DONT_NEED_LOG_SYNC;
+
+ if (sync_log) {
+ struct btrfs_log_ctx ctx2;
+
+ btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
+ ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, &ctx2);
+ if (ret == BTRFS_NO_LOG_SYNC)
+ return BTRFS_DONT_NEED_TRANS_COMMIT;
+ else if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
+
+ ret = btrfs_sync_log(trans, inode->root, &ctx2);
+ if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
+ return BTRFS_DONT_NEED_TRANS_COMMIT;
+ }
+
+ ASSERT(ctx);
+ ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, ctx);
+ if (ret == BTRFS_NO_LOG_SYNC)
+ return BTRFS_DONT_NEED_LOG_SYNC;
+ else if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
- return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, NULL);
+ return BTRFS_NEED_LOG_SYNC;
}