diff options
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- | fs/btrfs/tree-log.c | 496 |
1 files changed, 249 insertions, 247 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8f23a94dab77..ac232b3d6d7e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -205,14 +205,11 @@ static int join_running_log_trans(struct btrfs_root *root) * until you call btrfs_end_log_trans() or it makes any future * log transactions wait until you call btrfs_end_log_trans() */ -int btrfs_pin_log_trans(struct btrfs_root *root) +void btrfs_pin_log_trans(struct btrfs_root *root) { - int ret = -ENOENT; - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); - return ret; } /* @@ -222,11 +219,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root) void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { - /* - * Implicit memory barrier after atomic_dec_and_test - */ - if (waitqueue_active(&root->log_writer_wait)) - wake_up(&root->log_writer_wait); + /* atomic_dec_and_test implies a barrier */ + cond_wake_up_nomb(&root->log_writer_wait); } } @@ -261,6 +255,13 @@ struct walk_control { /* what stage of the replay code we're currently in */ int stage; + /* + * Ignore any items from the inode currently being processed. Needs + * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in + * the LOG_WALK_REPLAY_INODES stage. + */ + bool ignore_cur_inode; + /* the root we are currently replaying */ struct btrfs_root *replay_dest; @@ -548,12 +549,8 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); - if (IS_ERR(inode)) { - inode = NULL; - } else if (is_bad_inode(inode)) { - iput(inode); + if (IS_ERR(inode)) inode = NULL; - } return inode; } @@ -600,7 +597,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (btrfs_file_extent_disk_bytenr(eb, item) == 0) nbytes = 0; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, slot, item); + size = btrfs_file_extent_ram_bytes(eb, item); nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, fs_info->sectorsize); @@ -688,7 +685,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * as the owner of the file extent changed from log tree * (doesn't affect qgroup) to fs/file tree(affects qgroup) */ - ret = btrfs_qgroup_trace_extent(trans, fs_info, + ret = btrfs_qgroup_trace_extent(trans, btrfs_file_extent_disk_bytenr(eb, item), btrfs_file_extent_disk_num_bytes(eb, item), GFP_NOFS); @@ -718,7 +715,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * allocation tree */ ret = btrfs_alloc_logged_file_extent(trans, - fs_info, root->root_key.objectid, key->objectid, offset, &ins); if (ret) @@ -1148,7 +1144,7 @@ next: } btrfs_release_path(path); - /* look for a conflicing name */ + /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); if (di && !IS_ERR(di)) { @@ -1294,6 +1290,46 @@ again: return ret; } +static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, + const u8 ref_type, const char *name, + const int namelen) +{ + struct btrfs_key key; + struct btrfs_path *path; + const u64 parent_id = btrfs_ino(BTRFS_I(dir)); + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = ref_type; + if (key.type == BTRFS_INODE_REF_KEY) + key.offset = parent_id; + else + key.offset = btrfs_extref_hash(parent_id, name, namelen); + + ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + if (key.type == BTRFS_INODE_EXTREF_KEY) + ret = btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], parent_id, + name, namelen, NULL); + else + ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, namelen, NULL); + +out: + btrfs_free_path(path); + return ret; +} + /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. @@ -1403,6 +1439,32 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } } + /* + * If a reference item already exists for this inode + * with the same parent and name, but different index, + * drop it and the corresponding directory index entries + * from the parent before adding the new reference item + * and dir index entries, otherwise we would fail with + * -EEXIST returned from btrfs_add_link() below. + */ + ret = btrfs_inode_ref_exists(inode, dir, key->type, + name, namelen); + if (ret > 0) { + ret = btrfs_unlink_inode(trans, root, + BTRFS_I(dir), + BTRFS_I(inode), + name, namelen); + /* + * If we dropped the link count to 0, bump it so + * that later the iput() on the inode will not + * free it. We will fixup the link count later. + */ + if (!ret && inode->i_nlink == 0) + inc_nlink(inode); + } + if (ret < 0) + goto out; + /* insert our name */ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), @@ -2123,7 +2185,7 @@ again: dir_key->offset, name, name_len, 0); } - if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { + if (!log_di || log_di == ERR_PTR(-ENOENT)) { btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); @@ -2429,6 +2491,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + /* + * If we have a tmpfile (O_TMPFILE) that got fsync'ed + * and never got linked before the fsync, skip it, as + * replaying it is pointless since it would be deleted + * later. We skip logging tmpfiles, but it's always + * possible we are replaying a log created with a kernel + * that used to log tmpfiles. + */ + if (btrfs_inode_nlink(eb, inode_item) == 0) { + wc->ignore_cur_inode = true; + continue; + } else { + wc->ignore_cur_inode = false; + } ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); if (ret) @@ -2466,16 +2542,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root->fs_info->sectorsize); ret = btrfs_drop_extents(wc->trans, root, inode, from, (u64)-1, 1); - /* - * If the nlink count is zero here, the iput - * will free the inode. We bump it to make - * sure it doesn't get freed until the link - * count fixup is done. - */ if (!ret) { - if (inode->i_nlink == 0) - inc_nlink(inode); - /* Update link count and nbytes. */ + /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, root, inode); } @@ -2490,6 +2558,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; } + if (wc->ignore_cur_inode) + continue; + if (key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { ret = replay_one_dir_item(wc->trans, root, path, @@ -2936,7 +3007,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (btrfs_need_log_full_commit(fs_info, trans)) { ret = -EAGAIN; - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } @@ -2954,7 +3024,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); - btrfs_free_logged_extents(log, log_transid); btrfs_set_log_full_commit(fs_info, trans); mutex_unlock(&root->log_mutex); goto out; @@ -2988,11 +3057,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { - /* - * Implicit memory barrier after atomic_dec_and_test - */ - if (waitqueue_active(&log_root_tree->log_writer_wait)) - wake_up(&log_root_tree->log_writer_wait); + /* atomic_dec_and_test implies a barrier */ + cond_wake_up_nomb(&log_root_tree->log_writer_wait); } if (ret) { @@ -3008,7 +3074,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } btrfs_wait_tree_log_extents(log, mark); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out; @@ -3026,7 +3091,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); ret = btrfs_wait_tree_log_extents(log, mark); - btrfs_wait_logged_extents(trans, log, log_transid); wait_log_commit(log_root_tree, root_log_ctx.log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -3051,7 +3115,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (btrfs_need_log_full_commit(fs_info, trans)) { blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -3064,7 +3127,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret) { btrfs_set_log_full_commit(fs_info, trans); btrfs_abort_transaction(trans, ret); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } @@ -3074,11 +3136,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_NEW | EXTENT_DIRTY); if (ret) { btrfs_set_log_full_commit(fs_info, trans); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } - btrfs_wait_logged_extents(trans, log, log_transid); btrfs_set_super_log_root(fs_info->super_for_commit, log_root_tree->node->start); @@ -3089,7 +3149,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); /* - * nobody else is going to jump in and write the the ctree + * Nobody else is going to jump in and write the ctree * super here because the log_commit atomic below is protecting * us. We must be called with a transaction handle pinning * the running transaction open, so a full commit can't hop @@ -3116,10 +3176,11 @@ out_wake_log_root: mutex_unlock(&log_root_tree->log_mutex); /* - * The barrier before waitqueue_active is implied by mutex_unlock + * The barrier before waitqueue_active (in cond_wake_up) is needed so + * all the updates above are seen by the woken threads. It might not be + * necessary, but proving that seems to be hard. */ - if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) - wake_up(&log_root_tree->log_commit_wait[index2]); + cond_wake_up(&log_root_tree->log_commit_wait[index2]); out: mutex_lock(&root->log_mutex); btrfs_remove_all_log_ctxs(root, index1, ret); @@ -3128,10 +3189,11 @@ out: mutex_unlock(&root->log_mutex); /* - * The barrier before waitqueue_active is implied by mutex_unlock + * The barrier before waitqueue_active (in cond_wake_up) is needed so + * all the updates above are seen by the woken threads. It might not be + * necessary, but proving that seems to be hard. */ - if (waitqueue_active(&root->log_commit_wait[index1])) - wake_up(&root->log_commit_wait[index1]); + cond_wake_up(&root->log_commit_wait[index1]); return ret; } @@ -3139,38 +3201,21 @@ static void free_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log) { int ret; - u64 start; - u64 end; struct walk_control wc = { .free = 1, .process_func = process_one_buffer }; ret = walk_log_tree(trans, log, &wc); - /* I don't think this can happen but just in case */ - if (ret) - btrfs_abort_transaction(trans, ret); - - while (1) { - ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, - NULL); - if (ret) - break; - - clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); } - /* - * We may have short-circuited the log tree with the full commit logic - * and left ordered extents on our list, so clear these out to keep us - * from leaking inodes and memory. - */ - btrfs_free_logged_extents(log, 0); - btrfs_free_logged_extents(log, 1); - + clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); free_extent_buffer(log->node); kfree(log); } @@ -3760,7 +3805,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int start_slot, int nr, int inode_only, u64 logged_isize) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = trans->fs_info; unsigned long src_offset; unsigned long dst_offset; struct btrfs_root *log = inode->root->log_root; @@ -3941,9 +3986,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(src, - src_path->slots[0], - extent); + len = btrfs_file_extent_ram_bytes(src, extent); *last_extent = ALIGN(key.offset + len, fs_info->sectorsize); } else { @@ -4008,7 +4051,7 @@ fill_holes: extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(src, i, extent); + len = btrfs_file_extent_ram_bytes(src, extent); extent_end = ALIGN(key.offset + len, fs_info->sectorsize); } else { @@ -4082,131 +4125,32 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -static int wait_ordered_extents(struct btrfs_trans_handle *trans, - struct inode *inode, - struct btrfs_root *root, - const struct extent_map *em, - const struct list_head *logged_list, - bool *ordered_io_error) +static int log_extent_csums(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_root *log_root, + const struct extent_map *em) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ordered_extent *ordered; - struct btrfs_root *log = root->log_root; - u64 mod_start = em->mod_start; - u64 mod_len = em->mod_len; - const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; u64 csum_offset; u64 csum_len; LIST_HEAD(ordered_sums); int ret = 0; - *ordered_io_error = false; - - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + if (inode->flags & BTRFS_INODE_NODATASUM || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || em->block_start == EXTENT_MAP_HOLE) return 0; - /* - * Wait far any ordered extent that covers our extent map. If it - * finishes without an error, first check and see if our csums are on - * our outstanding ordered extents. - */ - list_for_each_entry(ordered, logged_list, log_list) { - struct btrfs_ordered_sum *sum; - - if (!mod_len) - break; - - if (ordered->file_offset + ordered->len <= mod_start || - mod_start + mod_len <= ordered->file_offset) - continue; - - if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && - !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && - !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - const u64 start = ordered->file_offset; - const u64 end = ordered->file_offset + ordered->len - 1; - - WARN_ON(ordered->inode != inode); - filemap_fdatawrite_range(inode->i_mapping, start, end); - } - - wait_event(ordered->wait, - (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || - test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); - - if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { - /* - * Clear the AS_EIO/AS_ENOSPC flags from the inode's - * i_mapping flags, so that the next fsync won't get - * an outdated io error too. - */ - filemap_check_errors(inode->i_mapping); - *ordered_io_error = true; - break; - } - /* - * We are going to copy all the csums on this ordered extent, so - * go ahead and adjust mod_start and mod_len in case this - * ordered extent has already been logged. - */ - if (ordered->file_offset > mod_start) { - if (ordered->file_offset + ordered->len >= - mod_start + mod_len) - mod_len = ordered->file_offset - mod_start; - /* - * If we have this case - * - * |--------- logged extent ---------| - * |----- ordered extent ----| - * - * Just don't mess with mod_start and mod_len, we'll - * just end up logging more csums than we need and it - * will be ok. - */ - } else { - if (ordered->file_offset + ordered->len < - mod_start + mod_len) { - mod_len = (mod_start + mod_len) - - (ordered->file_offset + ordered->len); - mod_start = ordered->file_offset + - ordered->len; - } else { - mod_len = 0; - } - } - - if (skip_csum) - continue; - - /* - * To keep us from looping for the above case of an ordered - * extent that falls inside of the logged extent. - */ - if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, - &ordered->flags)) - continue; - - list_for_each_entry(sum, &ordered->list, list) { - ret = btrfs_csum_file_blocks(trans, log, sum); - if (ret) - break; - } - } - - if (*ordered_io_error || !mod_len || ret || skip_csum) - return ret; - + /* If we're compressed we have to save the entire range of csums. */ if (em->compress_type) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { - csum_offset = mod_start - em->start; - csum_len = mod_len; + csum_offset = em->mod_start - em->start; + csum_len = em->mod_len; } /* block start is already adjusted for the file extent offset. */ - ret = btrfs_lookup_csums_range(fs_info->csum_root, + ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, em->block_start + csum_offset, em->block_start + csum_offset + csum_len - 1, &ordered_sums, 0); @@ -4218,7 +4162,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log, sums); + ret = btrfs_csum_file_blocks(trans, log_root, sums); list_del(&sums->list); kfree(sums); } @@ -4230,7 +4174,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *root, const struct extent_map *em, struct btrfs_path *path, - const struct list_head *logged_list, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = root->log_root; @@ -4242,18 +4185,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 block_len; int ret; int extent_inserted = 0; - bool ordered_io_err = false; - ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, - logged_list, &ordered_io_err); + ret = log_extent_csums(trans, inode, log, em); if (ret) return ret; - if (ordered_io_err) { - ctx->io_err = -EIO; - return ctx->io_err; - } - btrfs_init_map_token(&token); ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, @@ -4428,7 +4364,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, - struct list_head *logged_list, struct btrfs_log_ctx *ctx, const u64 start, const u64 end) @@ -4436,20 +4371,33 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; - u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; INIT_LIST_HEAD(&extents); - down_write(&inode->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; - logged_start = start; - logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { + /* + * Skip extents outside our logging range. It's important to do + * it for correctness because if we don't ignore them, we may + * log them before their ordered extent completes, and therefore + * we could log them without logging their respective checksums + * (the checksum items are added to the csum tree at the very + * end of btrfs_finish_ordered_io()). Also leave such extents + * outside of our range in the list, since we may have another + * ranged fsync in the near future that needs them. If an extent + * outside our range corresponds to a hole, log it to avoid + * leaving gaps between extents (fsck will complain when we are + * not using the NO_HOLES feature). + */ + if ((em->start > end || em->start + em->len <= start) && + em->block_start != EXTENT_MAP_HOLE) + continue; + list_del_init(&em->list); /* * Just an arbitrary number, this can be really CPU intensive @@ -4471,11 +4419,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em->start >= i_size_read(&inode->vfs_inode)) continue; - if (em->start < logged_start) - logged_start = em->start; - if ((em->start + em->len - 1) > logged_end) - logged_end = em->start + em->len - 1; - /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -4484,20 +4427,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); - /* - * Some ordered extents started by fsync might have completed - * before we could collect them into the list logged_list, which - * means they're gone, not in our logged_list nor in the inode's - * ordered tree. We want the application/user space to know an - * error happened while attempting to persist file data so that - * it can take proper action. If such error happened, we leave - * without writing to the log tree and the fsync must report the - * file data write error and not commit the current transaction. - */ - ret = filemap_check_errors(inode->vfs_inode.i_mapping); - if (ret) - ctx->io_err = ret; process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4516,15 +4445,13 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path, logged_list, - ctx); + ret = log_one_extent(trans, inode, root, em, path, ctx); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - up_write(&inode->dio_sem); btrfs_release_path(path); if (!ret) @@ -4716,13 +4643,12 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, if (btrfs_file_extent_type(leaf, extent) == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_inline_len(leaf, - path->slots[0], - extent); + len = btrfs_file_extent_ram_bytes(leaf, extent); ASSERT(len == i_size || (len == fs_info->sectorsize && btrfs_file_extent_compression(leaf, extent) != - BTRFS_COMPRESS_NONE)); + BTRFS_COMPRESS_NONE) || + (len < i_size && i_size < fs_info->sectorsize)); return 0; } @@ -4902,7 +4828,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; - LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; int ret; @@ -5098,8 +5023,7 @@ again: * we don't need to do more work nor fallback to * a transaction commit. */ - if (IS_ERR(other_inode) && - PTR_ERR(other_inode) == -ENOENT) { + if (other_inode == ERR_PTR(-ENOENT)) { goto next_key; } else if (IS_ERR(other_inode)) { err = PTR_ERR(other_inode); @@ -5239,7 +5163,7 @@ log_extents: } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - &logged_list, ctx, start, end); + ctx, start, end); if (ret) { err = ret; goto out_unlock; @@ -5290,10 +5214,6 @@ log_extents: inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); out_unlock: - if (unlikely(err)) - btrfs_put_logged_extents(&logged_list); - else - btrfs_submit_logged_extents(&logged_list, log); mutex_unlock(&inode->log_mutex); btrfs_free_path(path); @@ -5589,7 +5509,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_path *path; struct btrfs_key key; @@ -5656,9 +5576,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, dir_inode = btrfs_iget(fs_info->sb, &inode_key, root, NULL); - /* If parent inode was deleted, skip it. */ - if (IS_ERR(dir_inode)) - continue; + /* + * If the parent inode was deleted, return an error to + * fallback to a transaction commit. This is to prevent + * getting an inode that was moved from one parent A to + * a parent B, got its former parent A deleted and then + * it got fsync'ed, from existing at both parents after + * a log replay (and the old parent still existing). + * Example: + * + * mkdir /mnt/A + * mkdir /mnt/B + * touch /mnt/B/bar + * sync + * mv /mnt/B/bar /mnt/A/bar + * mv -T /mnt/A /mnt/B + * fsync /mnt/B/bar + * <power fail> + * + * If we ignore the old parent B which got deleted, + * after a log replay we would have file bar linked + * at both parents and the old parent B would still + * exist. + */ + if (IS_ERR(dir_inode)) { + ret = PTR_ERR(dir_inode); + goto out; + } if (ctx) ctx->log_new_dentries = false; @@ -5732,7 +5676,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - if (btrfs_inode_in_log(inode, trans->transid)) { + /* + * Skip already logged inodes or inodes corresponding to tmpfiles + * (since logging them is pointless, a link count of 0 means they + * will never be accessible). + */ + if (btrfs_inode_in_log(inode, trans->transid) || + inode->vfs_inode.i_nlink == 0) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; } @@ -5808,6 +5758,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + /* + * If a new hard link was added to the inode in the current transaction + * and its link count is now greater than 1, we need to fallback to a + * transaction commit, otherwise we can end up not logging all its new + * parents for all the hard links. Here just from the dentry used to + * fsync, we can not visit the ancestor inodes for all the other hard + * links to figure out if any is new, so we fallback to a transaction + * commit (instead of adding a lot of complexity of scanning a btree, + * since this scenario is not a common use case). + */ + if (inode->vfs_inode.i_nlink > 1 && + inode->last_link_trans > last_committed) { + ret = -EMLINK; + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; @@ -6117,14 +6083,25 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. * - * It will return zero if all goes well, and it will return 1 if a - * full transaction commit is required. + * @ctx can not be NULL when @sync_log is false, and should be NULL when it's + * true (because it's not used). + * + * Return value depends on whether @sync_log is true or false. + * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be + * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT + * otherwise. + * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to + * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, + * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be + * committed (without attempting to sync the log). */ int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent) + struct dentry *parent, + bool sync_log, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; /* * this will force the logging code to walk the dentry chain @@ -6139,9 +6116,34 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, */ if (inode->logged_trans <= fs_info->last_trans_committed && (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return 0; + return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : + BTRFS_DONT_NEED_LOG_SYNC; + + if (sync_log) { + struct btrfs_log_ctx ctx2; + + btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); + ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, &ctx2); + if (ret == BTRFS_NO_LOG_SYNC) + return BTRFS_DONT_NEED_TRANS_COMMIT; + else if (ret) + return BTRFS_NEED_TRANS_COMMIT; + + ret = btrfs_sync_log(trans, inode->root, &ctx2); + if (ret) + return BTRFS_NEED_TRANS_COMMIT; + return BTRFS_DONT_NEED_TRANS_COMMIT; + } + + ASSERT(ctx); + ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, ctx); + if (ret == BTRFS_NO_LOG_SYNC) + return BTRFS_DONT_NEED_LOG_SYNC; + else if (ret) + return BTRFS_NEED_TRANS_COMMIT; - return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, NULL); + return BTRFS_NEED_LOG_SYNC; } |