diff options
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- | fs/btrfs/disk-io.c | 283 |
1 files changed, 150 insertions, 133 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 62e0cafd6e25..7cda51995c1e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,10 @@ #include "check-integrity.h" #include "rcu-string.h" +#ifdef CONFIG_X86 +#include <asm/cpufeature.h> +#endif + static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); @@ -217,26 +221,16 @@ static struct extent_map *btree_get_extent(struct inode *inode, write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); if (ret == -EEXIST) { - u64 failed_start = em->start; - u64 failed_len = em->len; - free_extent_map(em); em = lookup_extent_mapping(em_tree, start, len); - if (em) { - ret = 0; - } else { - em = lookup_extent_mapping(em_tree, failed_start, - failed_len); - ret = -EIO; - } + if (!em) + em = ERR_PTR(-EIO); } else if (ret) { free_extent_map(em); - em = NULL; + em = ERR_PTR(ret); } write_unlock(&em_tree->lock); - if (ret) - em = ERR_PTR(ret); out: return em; } @@ -377,9 +371,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && !verify_parent_transid(io_tree, eb, + if (!ret) { + if (!verify_parent_transid(io_tree, eb, parent_transid, 0)) - break; + break; + else + ret = -EIO; + } /* * This buffer's crc is fine, but its contents are corrupted, so @@ -435,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(1); return 0; } - if (eb->pages[0] != page) { - WARN_ON(1); - return 0; - } if (!PageUptodate(page)) { WARN_ON(1); return 0; @@ -754,9 +748,7 @@ static void run_one_async_done(struct btrfs_work *work) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; - atomic_dec(&fs_info->nr_async_submits); - - if (atomic_read(&fs_info->nr_async_submits) < limit && + if (atomic_dec_return(&fs_info->nr_async_submits) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -867,10 +859,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); } +static int check_async_write(struct inode *inode, unsigned long bio_flags) +{ + if (bio_flags & EXTENT_BIO_TREE_LOG) + return 0; +#ifdef CONFIG_X86 + if (cpu_has_xmm4_2) + return 0; +#endif + return 1; +} + static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + int async = check_async_write(inode, bio_flags); int ret; if (!(rw & REQ_WRITE)) { @@ -885,6 +889,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return ret; return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); + } else if (!async) { + ret = btree_csum_one_bio(bio); + if (ret) + return ret; + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); } /* @@ -1166,8 +1176,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); + atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); - root->log_batch = 0; root->log_transid = 0; root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, @@ -1665,9 +1675,10 @@ static int transaction_kthread(void *arg) spin_unlock(&root->fs_info->trans_lock); /* If the file system is aborted, this will always fail. */ - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { - cannot_commit = true; + if (PTR_ERR(trans) != -ENOENT) + cannot_commit = true; goto sleep; } if (transid == trans->transid) { @@ -1992,13 +2003,11 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); - INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); @@ -2012,12 +2021,15 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); btrfs_mapping_init(&fs_info->mapping_tree); - btrfs_init_block_rsv(&fs_info->global_block_rsv); - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); - btrfs_init_block_rsv(&fs_info->trans_block_rsv); - btrfs_init_block_rsv(&fs_info->chunk_block_rsv); - btrfs_init_block_rsv(&fs_info->empty_block_rsv); - btrfs_init_block_rsv(&fs_info->delayed_block_rsv); + btrfs_init_block_rsv(&fs_info->global_block_rsv, + BTRFS_BLOCK_RSV_GLOBAL); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); + btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv, + BTRFS_BLOCK_RSV_DELOPS); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); @@ -2032,8 +2044,6 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; - init_waitqueue_head(&fs_info->tree_mod_seq_wait); - /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); @@ -2491,6 +2501,8 @@ retry_root_backup: printk(KERN_ERR "Failed to read block groups: %d\n", ret); goto fail_block_groups; } + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); @@ -2528,8 +2540,7 @@ retry_root_backup: goto fail_trans_kthread; /* do not make disk changes in broken FS */ - if (btrfs_super_log_root(disk_super) != 0 && - !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { + if (btrfs_super_log_root(disk_super) != 0) { u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { @@ -2875,12 +2886,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait) printk_in_rcu("btrfs: disabling barriers on dev %s\n", rcu_str_deref(device->name)); device->nobarriers = 1; - } - if (!bio_flagged(bio, BIO_UPTODATE)) { + } else if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; - if (!bio_flagged(bio, BIO_EOPNOTSUPP)) - btrfs_dev_stat_inc_and_print(device, - BTRFS_DEV_STAT_FLUSH_ERRS); + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_FLUSH_ERRS); } /* drop the reference from the wait == 0 run */ @@ -2919,14 +2928,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info) { struct list_head *head; struct btrfs_device *dev; - int errors = 0; + int errors_send = 0; + int errors_wait = 0; int ret; /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { - errors++; + errors_send++; continue; } if (!dev->in_fs_metadata || !dev->writeable) @@ -2934,13 +2944,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ret = write_dev_flush(dev, 0); if (ret) - errors++; + errors_send++; } /* wait for all the barriers */ list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { - errors++; + errors_wait++; continue; } if (!dev->in_fs_metadata || !dev->writeable) @@ -2948,13 +2958,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ret = write_dev_flush(dev, 1); if (ret) - errors++; + errors_wait++; } - if (errors) + if (errors_send > info->num_tolerated_disk_barrier_failures || + errors_wait > info->num_tolerated_disk_barrier_failures) return -EIO; return 0; } +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ioctl_space_info space; + struct btrfs_space_info *sinfo; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; + int i; + int c; + int num_tolerated_disk_barrier_failures = + (int)fs_info->fs_devices->num_devices; + + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + sinfo = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { + if (tmp->flags == types[i]) { + sinfo = tmp; + break; + } + } + rcu_read_unlock(); + + if (!sinfo) + continue; + + down_read(&sinfo->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&sinfo->block_groups[c])) { + u64 flags; + + btrfs_get_block_group_info( + &sinfo->block_groups[c], &space); + if (space.total_bytes == 0 || + space.used_bytes == 0) + continue; + flags = space.flags; + /* + * return + * 0: if dup, single or RAID0 is configured for + * any of metadata, system or data, else + * 1: if RAID5 is configured, or if RAID1 or + * RAID10 is configured and only two mirrors + * are used, else + * 2: if RAID6 is configured, else + * num_mirrors - 1: if RAID1 or RAID10 is + * configured and more than + * 2 mirrors are used. + */ + if (num_tolerated_disk_barrier_failures > 0 && + ((flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID0)) || + ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) + == 0))) + num_tolerated_disk_barrier_failures = 0; + else if (num_tolerated_disk_barrier_failures > 1 + && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) + num_tolerated_disk_barrier_failures = 1; + } + } + up_read(&sinfo->groups_sem); + } + + return num_tolerated_disk_barrier_failures; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2977,8 +3061,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; - if (do_barriers) - barrier_all_devices(root->fs_info); + if (do_barriers) { + ret = barrier_all_devices(root->fs_info); + if (ret) { + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); + btrfs_error(root->fs_info, ret, + "errors while submitting device barriers."); + return ret; + } + } list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { @@ -3189,30 +3281,14 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_run_defrag_inodes(fs_info); - /* - * Here come 2 situations when btrfs is broken to flip readonly: - * - * 1. when btrfs flips readonly somewhere else before - * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, - * and btrfs will skip to write sb directly to keep - * ERROR state on disk. - * - * 2. when btrfs flips readonly just in btrfs_commit_super, - * and in such case, btrfs cannot write sb via btrfs_commit_super, - * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, - * btrfs will cleanup all FS resources first and write sb then. - */ if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - ret = btrfs_error_commit_super(root); - if (ret) - printk(KERN_ERR "btrfs: commit super ret %d\n", ret); - } + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + btrfs_error_commit_super(root); btrfs_put_block_group_cache(fs_info); @@ -3228,10 +3304,6 @@ int close_ctree(struct btrfs_root *root) printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", (unsigned long long)fs_info->delalloc_bytes); } - if (fs_info->total_ref_cache_size) { - printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", - (unsigned long long)fs_info->total_ref_cache_size); - } free_extent_buffer(fs_info->extent_root->node); free_extent_buffer(fs_info->extent_root->commit_root); @@ -3377,52 +3449,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); } -int btree_lock_page_hook(struct page *page, void *data, - void (*flush_fn)(void *)) -{ - struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *eb; - - /* - * We culled this eb but the page is still hanging out on the mapping, - * carry on. - */ - if (!PagePrivate(page)) - goto out; - - eb = (struct extent_buffer *)page->private; - if (!eb) { - WARN_ON(1); - goto out; - } - if (page != eb->pages[0]) - goto out; - - if (!btrfs_try_tree_write_lock(eb)) { - flush_fn(data); - btrfs_tree_lock(eb); - } - btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= eb->len) - root->fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); - } - - btrfs_tree_unlock(eb); -out: - if (!trylock_page(page)) { - flush_fn(data); - lock_page(page); - } - return 0; -} - static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { @@ -3434,18 +3460,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, if (read_only) return 0; - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - printk(KERN_WARNING "warning: mount fs with errors, " - "running btrfsck is recommended\n"); - } - return 0; } -int btrfs_error_commit_super(struct btrfs_root *root) +void btrfs_error_commit_super(struct btrfs_root *root) { - int ret; - mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -3455,10 +3474,6 @@ int btrfs_error_commit_super(struct btrfs_root *root) /* cleanup FS via transaction */ btrfs_cleanup_transaction(root); - - ret = write_ctree_super(NULL, root, 0); - - return ret; } static void btrfs_destroy_ordered_operations(struct btrfs_root *root) @@ -3636,7 +3651,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); + mark, NULL); if (ret) break; @@ -3691,7 +3706,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, again: while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) break; @@ -3782,14 +3797,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) /* FIXME: cleanup wait for commit */ t->in_commit = 1; t->blocked = 1; + smp_mb(); if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); t->blocked = 0; + smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) wake_up(&root->fs_info->transaction_wait); t->commit_done = 1; + smp_mb(); if (waitqueue_active(&t->commit_wait)) wake_up(&t->commit_wait); @@ -3825,7 +3843,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) } static struct extent_io_ops btree_extent_io_ops = { - .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, |