about | summary | refs | log | tree | commit | diff
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- fs/btrfs/tree-log.c 613
1 files changed, 364 insertions, 249 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 434457794c27..3c2ae0e4f25a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
@@ -21,12 +8,12 @@
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
+#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
-#include "hash.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
@@ -235,11 +222,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
- /*
- * Implicit memory barrier after atomic_dec_and_test
- */
- if (waitqueue_active(&root->log_writer_wait))
- wake_up(&root->log_writer_wait);
+ /* atomic_dec_and_test implies a barrier */
+ cond_wake_up_nomb(&root->log_writer_wait);
}
}
@@ -286,7 +270,7 @@ struct walk_control {
* inside it
*/
int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen);
+ struct walk_control *wc, u64 gen, int level);
};
/*
@@ -294,7 +278,7 @@ struct walk_control {
*/
static int process_one_buffer(struct btrfs_root *log,
struct extent_buffer *eb,
- struct walk_control *wc, u64 gen)
+ struct walk_control *wc, u64 gen, int level)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
@@ -304,7 +288,7 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_buffer(eb, gen);
+ ret = btrfs_read_buffer(eb, gen, level, NULL);
if (ret)
return ret;
}
@@ -561,12 +545,8 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
- if (IS_ERR(inode)) {
- inode = NULL;
- } else if (is_bad_inode(inode)) {
- iput(inode);
+ if (IS_ERR(inode))
inode = NULL;
- }
return inode;
}
@@ -613,7 +593,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
nbytes = 0;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_inline_len(eb, slot, item);
+ size = btrfs_file_extent_ram_bytes(eb, item);
nbytes = btrfs_file_extent_ram_bytes(eb, item);
extent_end = ALIGN(start + size,
fs_info->sectorsize);
@@ -701,7 +681,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* as the owner of the file extent changed from log tree
* (doesn't affect qgroup) to fs/file tree(affects qgroup)
*/
- ret = btrfs_qgroup_trace_extent(trans, fs_info,
+ ret = btrfs_qgroup_trace_extent(trans,
btrfs_file_extent_disk_bytenr(eb, item),
btrfs_file_extent_disk_num_bytes(eb, item),
GFP_NOFS);
@@ -731,7 +711,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* allocation tree
*/
ret = btrfs_alloc_logged_file_extent(trans,
- fs_info,
root->root_key.objectid,
key->objectid, offset, &ins);
if (ret)
@@ -853,7 +832,6 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode;
char *name;
int name_len;
@@ -887,7 +865,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
else
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
out:
kfree(name);
iput(inode);
@@ -1007,7 +985,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
u64 ref_index, char *name, int namelen,
int *search_done)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *victim_name;
int victim_name_len;
@@ -1065,7 +1042,7 @@ again:
kfree(victim_name);
if (ret)
return ret;
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret)
return ret;
*search_done = 1;
@@ -1136,8 +1113,7 @@ again:
victim_name_len);
if (!ret)
ret = btrfs_run_delayed_items(
- trans,
- fs_info);
+ trans);
}
iput(victim_parent);
kfree(victim_name);
@@ -1310,6 +1286,46 @@ again:
return ret;
}
+/*
+ * Look up, in the root that @inode belongs to, a back reference item of
+ * type @ref_type for @inode that points at parent directory @dir and
+ * carries the name @name/@namelen.
+ *
+ * The search key is (ino of @inode, @ref_type, offset), where offset is the
+ * parent's inode number for BTRFS_INODE_REF_KEY and the value of
+ * btrfs_extref_hash(parent ino, name, namelen) for any other type.
+ *
+ * Returns -ENOMEM if no path could be allocated, a negative value if the
+ * tree search failed, 0 if the search did not return an exact match, and
+ * otherwise whatever the name lookup helper returned for the matched item
+ * (the caller in add_inode_ref() treats a value > 0 as "reference exists").
+ */
+static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
+ const u8 ref_type, const char *name,
+ const int namelen)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ const u64 parent_id = btrfs_ino(BTRFS_I(dir));
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.type = ref_type;
+ /* Plain refs are keyed by parent ino, extended refs by a name hash. */
+ if (key.type == BTRFS_INODE_REF_KEY)
+ key.offset = parent_id;
+ else
+ key.offset = btrfs_extref_hash(parent_id, name, namelen);
+
+ /* Read-only lookup: no transaction handle, no insert/cow requested. */
+ ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ /* Presumably > 0 means "key not found"; reported as 0 to the caller. */
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ /* The item exists; check whether it actually contains our name. */
+ if (key.type == BTRFS_INODE_EXTREF_KEY)
+ ret = btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0], parent_id,
+ name, namelen, NULL);
+ else
+ ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+ name, namelen, NULL);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1419,6 +1435,32 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
}
+ /*
+ * If a reference item already exists for this inode
+ * with the same parent and name, but different index,
+ * drop it and the corresponding directory index entries
+ * from the parent before adding the new reference item
+ * and dir index entries, otherwise we would fail with
+ * -EEXIST returned from btrfs_add_link() below.
+ */
+ ret = btrfs_inode_ref_exists(inode, dir, key->type,
+ name, namelen);
+ if (ret > 0) {
+ ret = btrfs_unlink_inode(trans, root,
+ BTRFS_I(dir),
+ BTRFS_I(inode),
+ name, namelen);
+ /*
+ * If we dropped the link count to 0, bump it so
+ * that later the iput() on the inode will not
+ * free it. We will fixup the link count later.
+ */
+ if (!ret && inode->i_nlink == 0)
+ inc_nlink(inode);
+ }
+ if (ret < 0)
+ goto out;
+
/* insert our name */
ret = btrfs_add_link(trans, BTRFS_I(dir),
BTRFS_I(inode),
@@ -2098,7 +2140,6 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct inode *dir,
struct btrfs_key *dir_key)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
struct extent_buffer *eb;
int slot;
@@ -2140,7 +2181,7 @@ again:
dir_key->offset,
name, name_len, 0);
}
- if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
+ if (!log_di || log_di == ERR_PTR(-ENOENT)) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
@@ -2162,7 +2203,7 @@ again:
ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
BTRFS_I(inode), name, name_len);
if (!ret)
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
kfree(name);
iput(inode);
if (ret)
@@ -2356,8 +2397,10 @@ again:
nritems = btrfs_header_nritems(path->nodes[0]);
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
- if (ret)
+ if (ret == 1)
break;
+ else if (ret < 0)
+ goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
@@ -2410,17 +2453,16 @@ out:
* back refs).
*/
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen)
+ struct walk_control *wc, u64 gen, int level)
{
int nritems;
struct btrfs_path *path;
struct btrfs_root *root = wc->replay_dest;
struct btrfs_key key;
- int level;
int i;
int ret;
- ret = btrfs_read_buffer(eb, gen);
+ ret = btrfs_read_buffer(eb, gen, level, NULL);
if (ret)
return ret;
@@ -2461,13 +2503,41 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (ret)
break;
- /* for regular files, make sure corresponding
- * orphan item exist. extents past the new EOF
- * will be truncated later by orphan cleanup.
+ /*
+ * Before replaying extents, truncate the inode to its
+ * size. We need to do it now and not after log replay
+ * because before an fsync we can have prealloc extents
+ * added beyond the inode's i_size. If we did it after,
+ * through orphan cleanup for example, we would drop
+ * those prealloc extents just after replaying them.
*/
if (S_ISREG(mode)) {
- ret = insert_orphan_item(wc->trans, root,
- key.objectid);
+ struct inode *inode;
+ u64 from;
+
+ inode = read_one_inode(root, key.objectid);
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
+ from = ALIGN(i_size_read(inode),
+ root->fs_info->sectorsize);
+ ret = btrfs_drop_extents(wc->trans, root, inode,
+ from, (u64)-1, 1);
+ /*
+ * If the nlink count is zero here, the iput
+ * will free the inode. We bump it to make
+ * sure it doesn't get freed until the link
+ * count fixup is done.
+ */
+ if (!ret) {
+ if (inode->i_nlink == 0)
+ inc_nlink(inode);
+ /* Update link count and nbytes. */
+ ret = btrfs_update_inode(wc->trans,
+ root, inode);
+ }
+ iput(inode);
if (ret)
break;
}
@@ -2537,6 +2607,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level >= BTRFS_MAX_LEVEL);
while (*level > 0) {
+ struct btrfs_key first_key;
+
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
cur = path->nodes[*level];
@@ -2549,6 +2621,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+ btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
parent = path->nodes[*level];
@@ -2559,7 +2632,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return PTR_ERR(next);
if (*level == 1) {
- ret = wc->process_func(root, next, wc, ptr_gen);
+ ret = wc->process_func(root, next, wc, ptr_gen,
+ *level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2567,7 +2641,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen,
+ *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2597,7 +2672,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2647,7 +2722,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
root_owner = btrfs_header_owner(parent);
ret = wc->process_func(root, path->nodes[*level], wc,
- btrfs_header_generation(path->nodes[*level]));
+ btrfs_header_generation(path->nodes[*level]),
+ *level);
if (ret)
return ret;
@@ -2729,7 +2805,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
ret = wc->process_func(log, path->nodes[orig_level], wc,
- btrfs_header_generation(path->nodes[orig_level]));
+ btrfs_header_generation(path->nodes[orig_level]),
+ orig_level);
if (ret)
goto out;
if (wc->free) {
@@ -2917,7 +2994,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* bail out if we need to do a full commit */
if (btrfs_need_log_full_commit(fs_info, trans)) {
ret = -EAGAIN;
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -2935,7 +3011,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (ret) {
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, ret);
- btrfs_free_logged_extents(log, log_transid);
btrfs_set_log_full_commit(fs_info, trans);
mutex_unlock(&root->log_mutex);
goto out;
@@ -2969,11 +3044,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- /*
- * Implicit memory barrier after atomic_dec_and_test
- */
- if (waitqueue_active(&log_root_tree->log_writer_wait))
- wake_up(&log_root_tree->log_writer_wait);
+ /* atomic_dec_and_test implies a barrier */
+ cond_wake_up_nomb(&log_root_tree->log_writer_wait);
}
if (ret) {
@@ -2989,7 +3061,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_wait_tree_log_extents(log, mark);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out;
@@ -3007,7 +3078,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
ret = btrfs_wait_tree_log_extents(log, mark);
- btrfs_wait_logged_extents(trans, log, log_transid);
wait_log_commit(log_root_tree,
root_log_ctx.log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3032,7 +3102,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (btrfs_need_log_full_commit(fs_info, trans)) {
blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
@@ -3045,7 +3114,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_set_log_full_commit(fs_info, trans);
btrfs_abort_transaction(trans, ret);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
@@ -3055,11 +3123,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
EXTENT_NEW | EXTENT_DIRTY);
if (ret) {
btrfs_set_log_full_commit(fs_info, trans);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
- btrfs_wait_logged_extents(trans, log, log_transid);
btrfs_set_super_log_root(fs_info->super_for_commit,
log_root_tree->node->start);
@@ -3097,10 +3163,11 @@ out_wake_log_root:
mutex_unlock(&log_root_tree->log_mutex);
/*
- * The barrier before waitqueue_active is implied by mutex_unlock
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
*/
- if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
- wake_up(&log_root_tree->log_commit_wait[index2]);
+ cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
mutex_lock(&root->log_mutex);
btrfs_remove_all_log_ctxs(root, index1, ret);
@@ -3109,10 +3176,11 @@ out:
mutex_unlock(&root->log_mutex);
/*
- * The barrier before waitqueue_active is implied by mutex_unlock
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
*/
- if (waitqueue_active(&root->log_commit_wait[index1]))
- wake_up(&root->log_commit_wait[index1]);
+ cond_wake_up(&root->log_commit_wait[index1]);
return ret;
}
@@ -3144,14 +3212,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
}
- /*
- * We may have short-circuited the log tree with the full commit logic
- * and left ordered extents on our list, so clear these out to keep us
- * from leaking inodes and memory.
- */
- btrfs_free_logged_extents(log, 0);
- btrfs_free_logged_extents(log, 1);
-
free_extent_buffer(log->node);
kfree(log);
}
@@ -3518,8 +3578,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* from this directory and from this transaction
*/
ret = btrfs_next_leaf(root, path);
- if (ret == 1) {
- last_offset = (u64)-1;
+ if (ret) {
+ if (ret == 1)
+ last_offset = (u64)-1;
+ else
+ err = ret;
goto done;
}
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
@@ -3738,7 +3801,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+ struct btrfs_fs_info *fs_info = trans->fs_info;
unsigned long src_offset;
unsigned long dst_offset;
struct btrfs_root *log = inode->root->log_root;
@@ -3919,9 +3982,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(src, extent) ==
BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_inline_len(src,
- src_path->slots[0],
- extent);
+ len = btrfs_file_extent_ram_bytes(src, extent);
*last_extent = ALIGN(key.offset + len,
fs_info->sectorsize);
} else {
@@ -3972,6 +4033,7 @@ fill_holes:
ASSERT(ret == 0);
src = src_path->nodes[0];
i = 0;
+ need_find_last_extent = true;
}
btrfs_item_key_to_cpu(src, &key, i);
@@ -3985,7 +4047,7 @@ fill_holes:
extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(src, extent) ==
BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_inline_len(src, i, extent);
+ len = btrfs_file_extent_ram_bytes(src, extent);
extent_end = ALIGN(key.offset + len,
fs_info->sectorsize);
} else {
@@ -4006,6 +4068,36 @@ fill_holes:
break;
*last_extent = extent_end;
}
+
+ /*
+ * Check if there is a hole between the last extent found in our leaf
+ * and the first extent in the next leaf. If there is one, we need to
+ * log an explicit hole so that at replay time we can punch the hole.
+ */
+ if (ret == 0 &&
+ key.objectid == btrfs_ino(inode) &&
+ key.type == BTRFS_EXTENT_DATA_KEY &&
+ i == btrfs_header_nritems(src_path->nodes[0])) {
+ ret = btrfs_next_leaf(inode->root, src_path);
+ need_find_last_extent = true;
+ if (ret > 0) {
+ ret = 0;
+ } else if (ret == 0) {
+ btrfs_item_key_to_cpu(src_path->nodes[0], &key,
+ src_path->slots[0]);
+ if (key.objectid == btrfs_ino(inode) &&
+ key.type == BTRFS_EXTENT_DATA_KEY &&
+ *last_extent < key.offset) {
+ const u64 len = key.offset - *last_extent;
+
+ ret = btrfs_insert_file_extent(trans, log,
+ btrfs_ino(inode),
+ *last_extent, 0,
+ 0, len, 0, len,
+ 0, 0, 0);
+ }
+ }
+ }
/*
* Need to let the callers know we dropped the path so they should
* re-search.
@@ -4029,131 +4121,32 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
return 0;
}
-static int wait_ordered_extents(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct btrfs_root *root,
- const struct extent_map *em,
- const struct list_head *logged_list,
- bool *ordered_io_error)
+static int log_extent_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_root *log_root,
+ const struct extent_map *em)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_root *log = root->log_root;
- u64 mod_start = em->mod_start;
- u64 mod_len = em->mod_len;
- const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
u64 csum_offset;
u64 csum_len;
LIST_HEAD(ordered_sums);
int ret = 0;
- *ordered_io_error = false;
-
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ if (inode->flags & BTRFS_INODE_NODATASUM ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
em->block_start == EXTENT_MAP_HOLE)
return 0;
- /*
- * Wait far any ordered extent that covers our extent map. If it
- * finishes without an error, first check and see if our csums are on
- * our outstanding ordered extents.
- */
- list_for_each_entry(ordered, logged_list, log_list) {
- struct btrfs_ordered_sum *sum;
-
- if (!mod_len)
- break;
-
- if (ordered->file_offset + ordered->len <= mod_start ||
- mod_start + mod_len <= ordered->file_offset)
- continue;
-
- if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
- const u64 start = ordered->file_offset;
- const u64 end = ordered->file_offset + ordered->len - 1;
-
- WARN_ON(ordered->inode != inode);
- filemap_fdatawrite_range(inode->i_mapping, start, end);
- }
-
- wait_event(ordered->wait,
- (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
- test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
-
- if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
- /*
- * Clear the AS_EIO/AS_ENOSPC flags from the inode's
- * i_mapping flags, so that the next fsync won't get
- * an outdated io error too.
- */
- filemap_check_errors(inode->i_mapping);
- *ordered_io_error = true;
- break;
- }
- /*
- * We are going to copy all the csums on this ordered extent, so
- * go ahead and adjust mod_start and mod_len in case this
- * ordered extent has already been logged.
- */
- if (ordered->file_offset > mod_start) {
- if (ordered->file_offset + ordered->len >=
- mod_start + mod_len)
- mod_len = ordered->file_offset - mod_start;
- /*
- * If we have this case
- *
- * |--------- logged extent ---------|
- * |----- ordered extent ----|
- *
- * Just don't mess with mod_start and mod_len, we'll
- * just end up logging more csums than we need and it
- * will be ok.
- */
- } else {
- if (ordered->file_offset + ordered->len <
- mod_start + mod_len) {
- mod_len = (mod_start + mod_len) -
- (ordered->file_offset + ordered->len);
- mod_start = ordered->file_offset +
- ordered->len;
- } else {
- mod_len = 0;
- }
- }
-
- if (skip_csum)
- continue;
-
- /*
- * To keep us from looping for the above case of an ordered
- * extent that falls inside of the logged extent.
- */
- if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
- &ordered->flags))
- continue;
-
- list_for_each_entry(sum, &ordered->list, list) {
- ret = btrfs_csum_file_blocks(trans, log, sum);
- if (ret)
- break;
- }
- }
-
- if (*ordered_io_error || !mod_len || ret || skip_csum)
- return ret;
-
+ /* If we're compressed we have to save the entire range of csums. */
if (em->compress_type) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
- csum_offset = mod_start - em->start;
- csum_len = mod_len;
+ csum_offset = em->mod_start - em->start;
+ csum_len = em->mod_len;
}
/* block start is already adjusted for the file extent offset. */
- ret = btrfs_lookup_csums_range(fs_info->csum_root,
+ ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
em->block_start + csum_offset,
em->block_start + csum_offset +
csum_len - 1, &ordered_sums, 0);
@@ -4165,7 +4158,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log, sums);
+ ret = btrfs_csum_file_blocks(trans, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4177,7 +4170,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_root *root,
const struct extent_map *em,
struct btrfs_path *path,
- const struct list_head *logged_list,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *log = root->log_root;
@@ -4189,18 +4181,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
u64 block_len;
int ret;
int extent_inserted = 0;
- bool ordered_io_err = false;
- ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em,
- logged_list, &ordered_io_err);
+ ret = log_extent_csums(trans, inode, log, em);
if (ret)
return ret;
- if (ordered_io_err) {
- ctx->io_err = -EIO;
- return ctx->io_err;
- }
-
btrfs_init_map_token(&token);
ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
@@ -4267,11 +4252,114 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * Log all prealloc extents beyond the inode's i_size to make sure we do not
+ * lose them after doing a fast fsync and replaying the log. We scan the
+ * subvolume's root instead of iterating the inode's extent map tree because
+ * otherwise we can log incorrect extent items based on extent map conversion.
+ * That can happen due to the fact that extent maps are merged when they
+ * are not in the extent map tree's list of modified extents.
+ *
+ * @trans: transaction handle passed through to the truncate/copy helpers
+ * @inode: inode whose items at or beyond i_size are scanned
+ * @path:  caller supplied path used for the subvolume tree walk; it is
+ *         always released before returning once the search has started
+ *
+ * Returns 0 right away when the inode does not have BTRFS_INODE_PREALLOC
+ * set. Otherwise returns a negative value on failure (including -ENOMEM
+ * when the destination path cannot be allocated).
+ *
+ * NOTE(review): the positive result of the initial btrfs_search_slot() is
+ * only overwritten by later calls; if the loop exits through the
+ * "key.objectid > ino" break before any of them ran, that positive value
+ * is what gets returned - confirm callers cope with it.
+ */
+static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_key key;
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_path *dst_path = NULL;
+ u64 last_extent = (u64)-1;
+ int ins_nr = 0;
+ /* Only read when ins_nr > 0, and it is set whenever ins_nr leaves 0. */
+ int start_slot;
+ int ret;
+
+ if (!(inode->flags & BTRFS_INODE_PREALLOC))
+ return 0;
+
+ /* Start the scan at the first extent item at or after i_size. */
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = i_size;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /* End of leaf: flush the pending batch before moving on. */
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, start_slot,
+ ins_nr, 1, 0);
+ if (ret < 0)
+ goto out;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ /* A positive return ends the walk and is not an error. */
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid > ino)
+ break;
+ /* Skip anything that sorts before (ino, EXTENT_DATA, i_size). */
+ if (WARN_ON_ONCE(key.objectid < ino) ||
+ key.type < BTRFS_EXTENT_DATA_KEY ||
+ key.offset < i_size) {
+ path->slots[0]++;
+ continue;
+ }
+ /* First qualifying item: runs once, as last_extent is set below. */
+ if (last_extent == (u64)-1) {
+ last_extent = key.offset;
+ /*
+ * Avoid logging extent items logged in past fsync calls
+ * and leading to duplicate keys in the log tree.
+ */
+ do {
+ ret = btrfs_truncate_inode_items(trans,
+ root->log_root,
+ &inode->vfs_inode,
+ i_size,
+ BTRFS_EXTENT_DATA_KEY);
+ } while (ret == -EAGAIN);
+ if (ret)
+ goto out;
+ }
+ /* Extend (or start) the run of consecutive slots to copy. */
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ /* Allocate the destination path lazily, on the first item found. */
+ if (!dst_path) {
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+ /* Flush whatever batch is still pending after the loop ended. */
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ start_slot, ins_nr, 1, 0);
+ /* A positive copy_items() result is not treated as an error. */
+ if (ret > 0)
+ ret = 0;
+ }
+out:
+ btrfs_release_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
+}
+
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct list_head *logged_list,
struct btrfs_log_ctx *ctx,
const u64 start,
const u64 end)
@@ -4309,6 +4397,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
if (em->generation <= test_gen)
continue;
+ /* We log prealloc extents beyond eof later. */
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+ em->start >= i_size_read(&inode->vfs_inode))
+ continue;
+
if (em->start < logged_start)
logged_start = em->start;
if ((em->start + em->len - 1) > logged_end)
@@ -4322,20 +4415,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
}
list_sort(NULL, &extents, extent_cmp);
- btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
- /*
- * Some ordered extents started by fsync might have completed
- * before we could collect them into the list logged_list, which
- * means they're gone, not in our logged_list nor in the inode's
- * ordered tree. We want the application/user space to know an
- * error happened while attempting to persist file data so that
- * it can take proper action. If such error happened, we leave
- * without writing to the log tree and the fsync must report the
- * file data write error and not commit the current transaction.
- */
- ret = filemap_check_errors(inode->vfs_inode.i_mapping);
- if (ret)
- ctx->io_err = ret;
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
@@ -4354,8 +4433,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, logged_list,
- ctx);
+ ret = log_one_extent(trans, inode, root, em, path, ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -4365,6 +4443,9 @@ process:
up_write(&inode->dio_sem);
btrfs_release_path(path);
+ if (!ret)
+ ret = btrfs_log_prealloc_extents(trans, inode, path);
+
return ret;
}
@@ -4551,9 +4632,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
if (btrfs_file_extent_type(leaf, extent) ==
BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_inline_len(leaf,
- path->slots[0],
- extent);
+ len = btrfs_file_extent_ram_bytes(leaf, extent);
ASSERT(len == i_size ||
(len == fs_info->sectorsize &&
btrfs_file_extent_compression(leaf, extent) !=
@@ -4737,7 +4816,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
- LIST_HEAD(logged_list);
u64 last_extent = 0;
int err = 0;
int ret;
@@ -4749,6 +4827,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct extent_map_tree *em_tree = &inode->extent_tree;
u64 logged_isize = 0;
bool need_log_inode_item = true;
+ bool xattrs_logged = false;
path = btrfs_alloc_path();
if (!path)
@@ -4932,8 +5011,7 @@ again:
* we don't need to do more work nor fallback to
* a transaction commit.
*/
- if (IS_ERR(other_inode) &&
- PTR_ERR(other_inode) == -ENOENT) {
+ if (other_inode == ERR_PTR(-ENOENT)) {
goto next_key;
} else if (IS_ERR(other_inode)) {
err = PTR_ERR(other_inode);
@@ -5050,6 +5128,7 @@ next_key:
err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
if (err)
goto out_unlock;
+ xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
@@ -5062,12 +5141,17 @@ log_extents:
btrfs_release_path(dst_path);
if (need_log_inode_item) {
err = log_inode_item(trans, log, dst_path, inode);
+ if (!err && !xattrs_logged) {
+ err = btrfs_log_all_xattrs(trans, root, inode, path,
+ dst_path);
+ btrfs_release_path(path);
+ }
if (err)
goto out_unlock;
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- &logged_list, ctx, start, end);
+ ctx, start, end);
if (ret) {
err = ret;
goto out_unlock;
@@ -5118,10 +5202,6 @@ log_extents:
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
out_unlock:
- if (unlikely(err))
- btrfs_put_logged_extents(&logged_list);
- else
- btrfs_submit_logged_extents(&logged_list, log);
mutex_unlock(&inode->log_mutex);
btrfs_free_path(path);
@@ -5417,7 +5497,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
@@ -5517,7 +5597,6 @@ out:
* the last committed transaction
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct dentry *parent,
const loff_t start,
@@ -5525,6 +5604,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
int inode_only,
struct btrfs_log_ctx *ctx)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct super_block *sb;
struct dentry *old_parent = NULL;
@@ -5550,7 +5630,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) {
+ if (btrfs_root_refs(&root->root_item) == 0) {
ret = 1;
goto end_no_trans;
}
@@ -5682,7 +5762,7 @@ end_no_trans:
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry,
+ struct dentry *dentry,
const loff_t start,
const loff_t end,
struct btrfs_log_ctx *ctx)
@@ -5690,8 +5770,8 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)),
- parent, start, end, LOG_INODE_ALL, ctx);
+ ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
+ start, end, LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -5945,15 +6025,25 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
*
- * It will return zero if all goes well, and it will return 1 if a
- * full transaction commit is required.
+ * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
+ * true (because it's not used).
+ *
+ * Return value depends on whether @sync_log is true or false.
+ * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
+ * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
+ * otherwise.
+ * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need
+ * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
+ * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
+ * committed (without attempting to sync the log).
*/
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent)
+ struct dentry *parent,
+ bool sync_log, struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
/*
* this will force the logging code to walk the dentry chain
@@ -5968,9 +6058,34 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
if (inode->logged_trans <= fs_info->last_trans_committed &&
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return 0;
+ return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
+ BTRFS_DONT_NEED_LOG_SYNC;
+
+ if (sync_log) {
+ struct btrfs_log_ctx ctx2;
+
+ btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
+ ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, &ctx2);
+ if (ret == BTRFS_NO_LOG_SYNC)
+ return BTRFS_DONT_NEED_TRANS_COMMIT;
+ else if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
+
+ ret = btrfs_sync_log(trans, inode->root, &ctx2);
+ if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
+ return BTRFS_DONT_NEED_TRANS_COMMIT;
+ }
+
+ ASSERT(ctx);
+ ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, ctx);
+ if (ret == BTRFS_NO_LOG_SYNC)
+ return BTRFS_DONT_NEED_LOG_SYNC;
+ else if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
- return btrfs_log_inode_parent(trans, root, inode, parent, 0,
- LLONG_MAX, LOG_INODE_EXISTS, NULL);
+ return BTRFS_NEED_LOG_SYNC;
}