diff options
author | Luca Coelho <luciano.coelho@intel.com> | 2016-05-10 10:30:56 +0300 |
---|---|---|
committer | Luca Coelho <luciano.coelho@intel.com> | 2016-05-10 10:30:56 +0300 |
commit | bae6692c24236d0203f88a444986d86437a858fa (patch) | |
tree | 0f9bee1250af3046fa46049736b615b81e60f56e /fs/btrfs/tree-log.c | |
parent | 46167a8fd4248533ad15867e6988ff20e76de641 (diff) | |
parent | 57fbcce37be7c1d2622b56587c10ade00e96afa3 (diff) |
Merge tag 'mac80211-next-for-davem-2016-04-13' of git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next into master
To synchronize with Kalle, here's just a big change that affects
all drivers - removing the duplicated enum ieee80211_band and
replacing it by enum nl80211_band. On top of that, just a small
documentation update.
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- | fs/btrfs/tree-log.c | 239 |
1 files changed, 227 insertions, 12 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 978c3a810893..517d0ccb351e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -26,6 +26,7 @@ #include "print-tree.h" #include "backref.h" #include "hash.h" +#include "compression.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -1045,7 +1046,7 @@ again: /* * NOTE: we have searched root tree and checked the - * coresponding ref, it does not need to check again. + * corresponding ref, it does not need to check again. */ *search_done = 1; } @@ -4414,6 +4415,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, return ret; } +/* + * When we are logging a new inode X, check if it doesn't have a reference that + * matches the reference from some other inode Y created in a past transaction + * and that was renamed in the current transaction. If we don't do this, then at + * log replay time we can lose inode Y (and all its files if it's a directory): + * + * mkdir /mnt/x + * echo "hello world" > /mnt/x/foobar + * sync + * mv /mnt/x /mnt/y + * mkdir /mnt/x # or touch /mnt/x + * xfs_io -c fsync /mnt/x + * <power fail> + * mount fs, trigger log replay + * + * After the log replay procedure, we would lose the first directory and all its + * files (file foobar). + * For the case where inode Y is not a directory we simply end up losing it: + * + * echo "123" > /mnt/foo + * sync + * mv /mnt/foo /mnt/bar + * echo "abc" > /mnt/foo + * xfs_io -c fsync /mnt/foo + * <power fail> + * + * We also need this for cases where a snapshot entry is replaced by some other + * entry (file or directory) otherwise we end up with an unreplayable log due to + * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as + * if it were a regular entry: + * + * mkdir /mnt/x + * btrfs subvolume snapshot /mnt /mnt/x/snap + * btrfs subvolume delete /mnt/x/snap + * rmdir /mnt/x + * mkdir /mnt/x + * fsync /mnt/x or fsync some new file inside it + * <power fail> + * + * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in + * the same transaction. + */ +static int btrfs_check_ref_name_override(struct extent_buffer *eb, + const int slot, + const struct btrfs_key *key, + struct inode *inode) +{ + int ret; + struct btrfs_path *search_path; + char *name = NULL; + u32 name_len = 0; + u32 item_size = btrfs_item_size_nr(eb, slot); + u32 cur_offset = 0; + unsigned long ptr = btrfs_item_ptr_offset(eb, slot); + + search_path = btrfs_alloc_path(); + if (!search_path) + return -ENOMEM; + search_path->search_commit_root = 1; + search_path->skip_locking = 1; + + while (cur_offset < item_size) { + u64 parent; + u32 this_name_len; + u32 this_len; + unsigned long name_ptr; + struct btrfs_dir_item *di; + + if (key->type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + + iref = (struct btrfs_inode_ref *)(ptr + cur_offset); + parent = key->offset; + this_name_len = btrfs_inode_ref_name_len(eb, iref); + name_ptr = (unsigned long)(iref + 1); + this_len = sizeof(*iref) + this_name_len; + } else { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + this_name_len = btrfs_inode_extref_name_len(eb, extref); + name_ptr = (unsigned long)&extref->name; + this_len = sizeof(*extref) + this_name_len; + } + + if (this_name_len > name_len) { + char *new_name; + + new_name = krealloc(name, this_name_len, GFP_NOFS); + if (!new_name) { + ret = -ENOMEM; + goto out; + } + name_len = this_name_len; + name = new_name; + } + + read_extent_buffer(eb, name, name_ptr, this_name_len); + di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root, + search_path, parent, + name, this_name_len, 0); + if (di && !IS_ERR(di)) { + ret = 1; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + btrfs_release_path(search_path); + + cur_offset += this_len; + } + ret = 0; +out: + btrfs_free_path(search_path); + kfree(name); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4500,7 +4622,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(inode, &logged_list, start, end); + /* + * Collect ordered extents only if we are logging data. This is to + * ensure a subsequent request to log this inode in LOG_INODE_ALL mode + * will process the ordered extents if they still exists at the time, + * because when we collect them we test and set for the flag + * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the + * same ordered extents. The consequence for the LOG_INODE_ALL log mode + * not processing the ordered extents is that we end up logging the + * corresponding file extent items, based on the extent maps in the + * inode's extent_map_tree's modified_list, without logging the + * respective checksums (since the may still be only attached to the + * ordered extents and have not been inserted in the csum tree by + * btrfs_finish_ordered_io() yet). + */ + if (inode_only == LOG_INODE_ALL) + btrfs_get_logged_extents(inode, &logged_list, start, end); /* * a brute force approach to making sure we get the most uptodate @@ -4586,6 +4723,22 @@ again: if (min_key.type == BTRFS_INODE_ITEM_KEY) need_log_inode_item = false; + if ((min_key.type == BTRFS_INODE_REF_KEY || + min_key.type == BTRFS_INODE_EXTREF_KEY) && + BTRFS_I(inode)->generation == trans->transid) { + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], + &min_key, inode); + if (ret < 0) { + err = ret; + goto out_unlock; + } else if (ret > 0) { + err = 1; + btrfs_set_log_full_commit(root->fs_info, trans); + goto out_unlock; + } + } + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ if (min_key.type == BTRFS_XATTR_ITEM_KEY) { if (ins_nr == 0) @@ -4772,6 +4925,42 @@ out_unlock: } /* + * Check if we must fallback to a transaction commit when logging an inode. + * This must be called after logging the inode and is used only in the context + * when fsyncing an inode requires the need to log some other inode - in which + * case we can't lock the i_mutex of each other inode we need to log as that + * can lead to deadlocks with concurrent fsync against other inodes (as we can + * log inodes up or down in the hierarchy) or rename operations for example. So + * we take the log_mutex of the inode after we have logged it and then check for + * its last_unlink_trans value - this is safe because any task setting + * last_unlink_trans must take the log_mutex and it must do this before it does + * the actual unlink operation, so if we do this check before a concurrent task + * sets last_unlink_trans it means we've logged a consistent version/state of + * all the inode items, otherwise we are not sure and must do a transaction + * commit (the concurrent task migth have only updated last_unlink_trans before + * we logged the inode or it might have also done the unlink). + */ +static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + bool ret = false; + + mutex_lock(&BTRFS_I(inode)->log_mutex); + if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) { + /* + * Make sure any commits to the log are forced to be full + * commits. + */ + btrfs_set_log_full_commit(fs_info, trans); + ret = true; + } + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + return ret; +} + +/* * follow the dentry parent pointers up the chain and see if any * of the directories in it require a full commit before they can * be logged. Returns zero if nothing special needs to be done or 1 if @@ -4784,7 +4973,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, u64 last_committed) { int ret = 0; - struct btrfs_root *root; struct dentry *old_parent = NULL; struct inode *orig_inode = inode; @@ -4816,14 +5004,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, BTRFS_I(inode)->logged_trans = trans->transid; smp_mb(); - if (BTRFS_I(inode)->last_unlink_trans > last_committed) { - root = BTRFS_I(inode)->root; - - /* - * make sure any commits to the log are forced - * to be full commits - */ - btrfs_set_log_full_commit(root->fs_info, trans); + if (btrfs_must_commit_transaction(trans, inode)) { ret = 1; break; } @@ -4982,6 +5163,9 @@ process_leaf: btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, log_mode, 0, LLONG_MAX, ctx); + if (!ret && + btrfs_must_commit_transaction(trans, di_inode)) + ret = 1; iput(di_inode); if (ret) goto next_dir_inode; @@ -5096,6 +5280,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, dir_inode, LOG_INODE_ALL, 0, LLONG_MAX, ctx); + if (!ret && + btrfs_must_commit_transaction(trans, dir_inode)) + ret = 1; iput(dir_inode); if (ret) goto out; @@ -5447,6 +5634,9 @@ error: * They revolve around files there were unlinked from the directory, and * this function updates the parent directory so that a full commit is * properly done if it is fsync'd later after the unlinks are done. + * + * Must be called before the unlink operations (updates to the subvolume tree, + * inodes, etc) are done. */ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, @@ -5462,8 +5652,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * into the file. When the file is logged we check it and * don't log the parents if the file is fully on disk. */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + mutex_lock(&BTRFS_I(inode)->log_mutex); BTRFS_I(inode)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); + } /* * if this directory was already logged any new @@ -5494,7 +5687,29 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, return; record: + mutex_lock(&BTRFS_I(dir)->log_mutex); BTRFS_I(dir)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(dir)->log_mutex); +} + +/* + * Make sure that if someone attempts to fsync the parent directory of a deleted + * snapshot, it ends up triggering a transaction commit. This is to guarantee + * that after replaying the log tree of the parent directory's root we will not + * see the snapshot anymore and at log replay time we will not see any log tree + * corresponding to the deleted snapshot's root, which could lead to replaying + * it after replaying the log tree of the parent directory (which would replay + * the snapshot delete operation). + * + * Must be called before the actual snapshot destroy operation (updates to the + * parent root and tree of tree roots trees, etc) are done. + */ +void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + struct inode *dir) +{ + mutex_lock(&BTRFS_I(dir)->log_mutex); + BTRFS_I(dir)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(dir)->log_mutex); } /* |