aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c140
1 files changed, 118 insertions, 22 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1bbaace73383..24d03c751149 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -26,6 +26,7 @@
#include "print-tree.h"
#include "backref.h"
#include "hash.h"
+#include "compression.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -229,7 +230,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
- smp_mb();
+ /*
+ * Implicit memory barrier after atomic_dec_and_test
+ */
if (waitqueue_active(&root->log_writer_wait))
wake_up(&root->log_writer_wait);
}
@@ -691,7 +694,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root,
ins.objectid, ins.offset,
0, root->root_key.objectid,
- key->objectid, offset, 0);
+ key->objectid, offset);
if (ret)
goto out;
} else {
@@ -1043,7 +1046,7 @@ again:
/*
* NOTE: we have searched root tree and checked the
- * coresponding ref, it does not need to check again.
+ * corresponding ref, it does not need to check again.
*/
*search_done = 1;
}
@@ -2820,7 +2823,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- smp_mb();
+ /*
+ * Implicit memory barrier after atomic_dec_and_test
+ */
if (waitqueue_active(&log_root_tree->log_writer_wait))
wake_up(&log_root_tree->log_writer_wait);
}
@@ -2950,6 +2955,9 @@ out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
mutex_unlock(&log_root_tree->log_mutex);
+ /*
+ * The barrier before waitqueue_active is implied by mutex_unlock
+ */
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
@@ -2961,6 +2969,9 @@ out:
atomic_set(&root->log_commit[index1], 0);
mutex_unlock(&root->log_mutex);
+ /*
+ * The barrier before waitqueue_active is implied by mutex_unlock
+ */
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -4117,7 +4128,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *path,
struct list_head *logged_list,
- struct btrfs_log_ctx *ctx)
+ struct btrfs_log_ctx *ctx,
+ const u64 start,
+ const u64 end)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -4156,7 +4169,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
}
list_sort(NULL, &extents, extent_cmp);
-
+ /*
+ * Collect any new ordered extents within the range. This is to
+ * prevent logging file extent items without waiting for the disk
+ * location they point to being written. We do this only to deal
+ * with races against concurrent lockless direct IO writes.
+ */
+ btrfs_get_logged_extents(inode, logged_list, start, end);
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
@@ -4482,7 +4501,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
- btrfs_get_logged_extents(inode, &logged_list, start, end);
+ /*
+ * Collect ordered extents only if we are logging data. This is to
+ * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
+ * will process the ordered extents if they still exists at the time,
+ * because when we collect them we test and set for the flag
+ * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
+ * same ordered extents. The consequence for the LOG_INODE_ALL log mode
+ * not processing the ordered extents is that we end up logging the
+ * corresponding file extent items, based on the extent maps in the
+ * inode's extent_map_tree's modified_list, without logging the
+ * respective checksums (since the may still be only attached to the
+ * ordered extents and have not been inserted in the csum tree by
+ * btrfs_finish_ordered_io() yet).
+ */
+ if (inode_only == LOG_INODE_ALL)
+ btrfs_get_logged_extents(inode, &logged_list, start, end);
/*
* a brute force approach to making sure we get the most uptodate
@@ -4691,7 +4725,7 @@ log_extents:
goto out_unlock;
}
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- &logged_list, ctx);
+ &logged_list, ctx, start, end);
if (ret) {
err = ret;
goto out_unlock;
@@ -4754,6 +4788,42 @@ out_unlock:
}
/*
+ * Check if we must fallback to a transaction commit when logging an inode.
+ * This must be called after logging the inode and is used only in the context
+ * when fsyncing an inode requires the need to log some other inode - in which
+ * case we can't lock the i_mutex of each other inode we need to log as that
+ * can lead to deadlocks with concurrent fsync against other inodes (as we can
+ * log inodes up or down in the hierarchy) or rename operations for example. So
+ * we take the log_mutex of the inode after we have logged it and then check for
+ * its last_unlink_trans value - this is safe because any task setting
+ * last_unlink_trans must take the log_mutex and it must do this before it does
+ * the actual unlink operation, so if we do this check before a concurrent task
+ * sets last_unlink_trans it means we've logged a consistent version/state of
+ * all the inode items, otherwise we are not sure and must do a transaction
+ * commit (the concurrent task migth have only updated last_unlink_trans before
+ * we logged the inode or it might have also done the unlink).
+ */
+static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ bool ret = false;
+
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+ if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
+ /*
+ * Make sure any commits to the log are forced to be full
+ * commits.
+ */
+ btrfs_set_log_full_commit(fs_info, trans);
+ ret = true;
+ }
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+ return ret;
+}
+
+/*
* follow the dentry parent pointers up the chain and see if any
* of the directories in it require a full commit before they can
* be logged. Returns zero if nothing special needs to be done or 1 if
@@ -4766,7 +4836,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
u64 last_committed)
{
int ret = 0;
- struct btrfs_root *root;
struct dentry *old_parent = NULL;
struct inode *orig_inode = inode;
@@ -4798,14 +4867,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->logged_trans = trans->transid;
smp_mb();
- if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
- root = BTRFS_I(inode)->root;
-
- /*
- * make sure any commits to the log are forced
- * to be full commits
- */
- btrfs_set_log_full_commit(root->fs_info, trans);
+ if (btrfs_must_commit_transaction(trans, inode)) {
ret = 1;
break;
}
@@ -4964,6 +5026,9 @@ process_leaf:
btrfs_release_path(path);
ret = btrfs_log_inode(trans, root, di_inode,
log_mode, 0, LLONG_MAX, ctx);
+ if (!ret &&
+ btrfs_must_commit_transaction(trans, di_inode))
+ ret = 1;
iput(di_inode);
if (ret)
goto next_dir_inode;
@@ -5078,6 +5143,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, root, dir_inode,
LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ if (!ret &&
+ btrfs_must_commit_transaction(trans, dir_inode))
+ ret = 1;
iput(dir_inode);
if (ret)
goto out;
@@ -5314,7 +5382,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_error(fs_info, ret, "Failed to pin buffers while "
+ btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
"recovering log root tree.");
goto error;
}
@@ -5328,7 +5396,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_error(fs_info, ret,
+ btrfs_std_error(fs_info, ret,
"Couldn't find tree log root.");
goto error;
}
@@ -5346,7 +5414,7 @@ again:
log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_error(fs_info, ret,
+ btrfs_std_error(fs_info, ret,
"Couldn't read tree log root.");
goto error;
}
@@ -5361,7 +5429,7 @@ again:
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
- btrfs_error(fs_info, ret, "Couldn't read target root "
+ btrfs_std_error(fs_info, ret, "Couldn't read target root "
"for tree log recovery.");
goto error;
}
@@ -5429,6 +5497,9 @@ error:
* They revolve around files there were unlinked from the directory, and
* this function updates the parent directory so that a full commit is
* properly done if it is fsync'd later after the unlinks are done.
+ *
+ * Must be called before the unlink operations (updates to the subvolume tree,
+ * inodes, etc) are done.
*/
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode,
@@ -5444,8 +5515,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
* into the file. When the file is logged we check it and
* don't log the parents if the file is fully on disk.
*/
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode)) {
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
BTRFS_I(inode)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+ }
/*
* if this directory was already logged any new
@@ -5476,7 +5550,29 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
return;
record:
+ mutex_lock(&BTRFS_I(dir)->log_mutex);
BTRFS_I(dir)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
+}
+
+/*
+ * Make sure that if someone attempts to fsync the parent directory of a deleted
+ * snapshot, it ends up triggering a transaction commit. This is to guarantee
+ * that after replaying the log tree of the parent directory's root we will not
+ * see the snapshot anymore and at log replay time we will not see any log tree
+ * corresponding to the deleted snapshot's root, which could lead to replaying
+ * it after replaying the log tree of the parent directory (which would replay
+ * the snapshot delete operation).
+ *
+ * Must be called before the actual snapshot destroy operation (updates to the
+ * parent root and tree of tree roots trees, etc) are done.
+ */
+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
+ struct inode *dir)
+{
+ mutex_lock(&BTRFS_I(dir)->log_mutex);
+ BTRFS_I(dir)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
}
/*