diff options
Diffstat (limited to 'fs/btrfs')
60 files changed, 5520 insertions, 4209 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 247b8dfaf6e5..8d8370ddb6b2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -78,12 +78,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (ret) - return ret; - } - ret = 0; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) @@ -119,6 +113,13 @@ out: int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + int ret; + + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) + return ret; + } return __btrfs_set_acl(NULL, inode, acl, type); } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8299601a3549..f723c11bb763 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,7 +16,7 @@ * Boston, MA 021110-1307, USA. */ -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/rbtree.h> #include "ctree.h" #include "disk-io.h" @@ -26,6 +26,11 @@ #include "delayed-ref.h" #include "locking.h" +enum merge_mode { + MERGE_IDENTICAL_KEYS = 1, + MERGE_IDENTICAL_PARENTS, +}; + /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -533,7 +538,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * slot==nritems. In that case, go to the next leaf before we continue. */ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -577,7 +582,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_next_item(root, path); else ret = btrfs_next_old_item(root, path, time_seq); @@ -629,7 +634,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == (u64)-1) + else if (time_seq == SEQ_LAST) root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -640,7 +645,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, 0, 0); else @@ -809,14 +814,12 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, /* * merge backrefs and adjust counts accordingly * - * mode = 1: merge identical keys, if key is set - * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. - * additionally, we could even add a key range for the blocks we - * looked into to merge even more (-> replace unresolved refs by those - * having a parent). - * mode = 2: merge identical parents + * FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref + * then we can merge more here. Additionally, we could even add a key + * range for the blocks we looked into to merge even more (-> replace + * unresolved refs by those having a parent). */ -static void __merge_refs(struct list_head *head, int mode) +static void __merge_refs(struct list_head *head, enum merge_mode mode) { struct __prelim_ref *pos1; @@ -829,7 +832,7 @@ static void __merge_refs(struct list_head *head, int mode) if (!ref_for_same_block(ref1, ref2)) continue; - if (mode == 1) { + if (mode == MERGE_IDENTICAL_KEYS) { if (!ref1->parent && ref2->parent) swap(ref1, ref2); } else { @@ -956,8 +959,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, /* * add all inline backrefs for bytenr to the list */ -static int __add_inline_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, +static int __add_inline_refs(struct btrfs_path *path, u64 bytenr, int *info_level, struct list_head *prefs, struct ref_root *ref_tree, u64 *total_refs, u64 inum) @@ -1197,7 +1199,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * * NOTE: This can return values > 0 * - * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave * much like trans == NULL case, the difference only lies in it will not * commit root. * The special case is for qgroup to search roots in commit_transaction(). @@ -1244,7 +1246,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) path->skip_locking = 1; /* @@ -1274,9 +1276,9 @@ again: #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != (u64)-1) { + time_seq != SEQ_LAST) { #else - if (trans && time_seq != (u64)-1) { + if (trans && time_seq != SEQ_LAST) { #endif /* * look if there are updates for this ref queued and lock the @@ -1284,10 +1286,10 @@ again: */ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -1354,7 +1356,7 @@ again: if (key.objectid == bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = __add_inline_refs(fs_info, path, bytenr, + ret = __add_inline_refs(path, bytenr, &info_level, &prefs, ref_tree, &total_refs, inum); @@ -1375,7 +1377,7 @@ again: if (ret) goto out; - __merge_refs(&prefs, 1); + __merge_refs(&prefs, MERGE_IDENTICAL_KEYS); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, extent_item_pos, total_refs, @@ -1383,7 +1385,7 @@ again: if (ret) goto out; - __merge_refs(&prefs, 2); + __merge_refs(&prefs, MERGE_IDENTICAL_PARENTS); while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); @@ -2303,7 +2305,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) size_t alloc_bytes; alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); - data = vmalloc(alloc_bytes); + data = kvmalloc(alloc_bytes, GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); @@ -2337,9 +2339,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, if (IS_ERR(fspath)) return (void *)fspath; - ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + ifp = kmalloc(sizeof(*ifp), GFP_KERNEL); if (!ifp) { - vfree(fspath); + kvfree(fspath); return ERR_PTR(-ENOMEM); } @@ -2354,6 +2356,6 @@ void free_ipath(struct inode_fs_paths *ipath) { if (!ipath) return; - vfree(ipath->fspath); + kvfree(ipath->fspath); kfree(ipath); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1a8fa46ff87e..d87ac27a5f2b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -125,6 +125,13 @@ struct btrfs_inode { u64 delalloc_bytes; /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes. + */ + u64 new_delalloc_bytes; + + /* * total number of bytes pending defrag, used by stat to check whether * it needs COW. */ @@ -224,47 +231,45 @@ static inline void btrfs_insert_inode_hash(struct inode *inode) __insert_inode_hash(inode, h); } -static inline u64 btrfs_ino(struct inode *inode) +static inline u64 btrfs_ino(struct btrfs_inode *inode) { - u64 ino = BTRFS_I(inode)->location.objectid; + u64 ino = inode->location.objectid; /* * !ino: btree_inode * type == BTRFS_ROOT_ITEM_KEY: subvol dir */ - if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY) - ino = inode->i_ino; + if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY) + ino = inode->vfs_inode.i_ino; return ino; } -static inline void btrfs_i_size_write(struct inode *inode, u64 size) +static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { - i_size_write(inode, size); - BTRFS_I(inode)->disk_i_size = size; + i_size_write(&inode->vfs_inode, size); + inode->disk_i_size = size; } -static inline bool btrfs_is_free_space_inode(struct inode *inode) +static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; if (root == root->fs_info->tree_root && btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) return true; - if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID) return true; return false; } -static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) +static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { int ret = 0; - spin_lock(&BTRFS_I(inode)->lock); - if (BTRFS_I(inode)->logged_trans == generation && - BTRFS_I(inode)->last_sub_trans <= - BTRFS_I(inode)->last_log_commit && - BTRFS_I(inode)->last_sub_trans <= - BTRFS_I(inode)->root->last_log_commit) { + spin_lock(&inode->lock); + if (inode->logged_trans == generation && + inode->last_sub_trans <= inode->last_log_commit && + inode->last_sub_trans <= inode->root->last_log_commit) { /* * After a ranged fsync we might have left some extent maps * (that fall outside the fsync's range). So return false @@ -272,10 +277,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) * will be called and process those extent maps. */ smp_mb(); - if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) + if (list_empty(&inode->extent_tree.modified_extents)) ret = 1; } - spin_unlock(&BTRFS_I(inode)->lock); + spin_unlock(&inode->lock); return ret; } @@ -305,7 +310,8 @@ struct btrfs_dio_private { * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ - int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); + blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *, + blk_status_t); }; /* @@ -313,17 +319,34 @@ struct btrfs_dio_private { * to grab i_mutex. It is used to avoid the endless truncate due to * nonlocked dio read. */ -static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) { - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); smp_mb(); } -static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) { smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); +} + +static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, + u64 logical_start, u32 csum, u32 csum_expected, int mirror_num) +{ + struct btrfs_root *root = inode->root; + + /* Output minus objectid, which is more meaningful */ + if (root->objectid >= BTRFS_LAST_FREE_OBJECTID) + btrfs_warn_rl(root->fs_info, + "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", + root->objectid, btrfs_ino(inode), + logical_start, csum, csum_expected, mirror_num); + else + btrfs_warn_rl(root->fs_info, + "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", + root->objectid, btrfs_ino(inode), + logical_start, csum, csum_expected, mirror_num); } bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ab14c2e635ca..11d37c94ce05 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -94,7 +94,7 @@ #include <linux/mutex.h> #include <linux/genhd.h> #include <linux/blkdev.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/string.h> #include "ctree.h" #include "disk-io.h" @@ -1638,12 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, struct bio *bio; unsigned int j; - bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); - if (!bio) { - pr_info("btrfsic: bio_alloc() for %u pages failed!\n", - num_pages - i); - return -1; - } + bio = btrfs_io_bio_alloc(num_pages - i); bio->bi_bdev = block_ctx->dev->bdev; bio->bi_iter.bi_sector = dev_bytenr >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ -1668,14 +1663,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, dev_bytenr += (j - i) * PAGE_SIZE; i = j; } - for (i = 0; i < num_pages; i++) { + for (i = 0; i < num_pages; i++) block_ctx->datav[i] = kmap(block_ctx->pagev[i]); - if (!block_ctx->datav[i]) { - pr_info("btrfsic: kmap() failed (dev %s)!\n", - block_ctx->dev->name); - return -1; - } - } return block_ctx->len; } @@ -2129,7 +2118,7 @@ static void btrfsic_bio_end_io(struct bio *bp) /* mutex is not held! This is not save if IO is not yet completed * on umount */ iodone_w_error = 0; - if (bp->bi_error) + if (bp->bi_status) iodone_w_error = 1; BUG_ON(NULL == block); @@ -2143,7 +2132,7 @@ static void btrfsic_bio_end_io(struct bio *bp) if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bp->bi_error, + bp->bi_status, btrfsic_get_block_type(dev_state->state, block), block->logical_bytenr, dev_state->name, block->dev_bytenr, block->mirror_num); @@ -2822,44 +2811,47 @@ static void __btrfsic_submit_bio(struct bio *bio) dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - unsigned int i; + unsigned int i = 0; u64 dev_bytenr; u64 cur_bytenr; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; int bio_is_patched; char **mapped_datav; + unsigned int segs = bio_segments(bio); dev_bytenr = 512 * bio->bi_iter.bi_sector; bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_vcnt, + bio_op(bio), bio->bi_opf, segs, (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - mapped_datav = kmalloc_array(bio->bi_vcnt, + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; - bio_for_each_segment_all(bvec, bio, i) { - BUG_ON(bvec->bv_len != PAGE_SIZE); - mapped_datav[i] = kmap(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_len != PAGE_SIZE); + mapped_datav[i] = kmap(bvec.bv_page); + i++; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec->bv_len, bvec->bv_offset); - cur_bytenr += bvec->bv_len; + i, cur_bytenr, bvec.bv_len, bvec.bv_offset); + cur_bytenr += bvec.bv_len; } btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_datav, bio->bi_vcnt, + mapped_datav, segs, bio, &bio_is_patched, NULL, bio->bi_opf); - bio_for_each_segment_all(bvec, bio, i) - kunmap(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) + kunmap(bvec.bv_page); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & @@ -2923,13 +2915,10 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, fs_info->sectorsize, PAGE_SIZE); return -1; } - state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + state = kvzalloc(sizeof(*state), GFP_KERNEL); if (!state) { - state = vzalloc(sizeof(*state)); - if (!state) { - pr_info("btrfs check-integrity: vzalloc() failed!\n"); - return -1; - } + pr_info("btrfs check-integrity: allocation failed!\n"); + return -1; } if (!btrfsic_is_initialized) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7f390849343b..d2ef9ac2a630 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -32,6 +32,7 @@ #include <linux/writeback.h> #include <linux/bit_spinlock.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -42,48 +43,7 @@ #include "extent_io.h" #include "extent_map.h" -struct compressed_bio { - /* number of bios pending for this compressed extent */ - atomic_t pending_bios; - - /* the pages with the compressed data on them */ - struct page **compressed_pages; - - /* inode that owns this data */ - struct inode *inode; - - /* starting offset in the inode for our pages */ - u64 start; - - /* number of bytes in the inode we're working on */ - unsigned long len; - - /* number of bytes on disk */ - unsigned long compressed_len; - - /* the compression algorithm for this bio */ - int compress_type; - - /* number of compressed pages in the array */ - unsigned long nr_pages; - - /* IO errors */ - int errors; - int mirror_num; - - /* for reads, this is the bio we are copying the data into */ - struct bio *orig_bio; - - /* - * the start of a variable length array of checksums only - * used by reads - */ - u32 sums; -}; - -static int btrfs_decompress_bio(int type, struct page **pages_in, - u64 disk_start, struct bio *orig_bio, - size_t srclen); +static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, unsigned long disk_size) @@ -94,13 +54,7 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; } -static struct bio *compressed_bio_alloc(struct block_device *bdev, - u64 first_byte, gfp_t gfp_flags) -{ - return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags); -} - -static int check_compressed_csum(struct inode *inode, +static int check_compressed_csum(struct btrfs_inode *inode, struct compressed_bio *cb, u64 disk_start) { @@ -111,7 +65,7 @@ static int check_compressed_csum(struct inode *inode, u32 csum; u32 *cb_sum = &cb->sums; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) return 0; for (i = 0; i < cb->nr_pages; i++) { @@ -124,10 +78,8 @@ static int check_compressed_csum(struct inode *inode, kunmap_atomic(kaddr); if (csum != *cb_sum) { - btrfs_info(BTRFS_I(inode)->root->fs_info, - "csum failed ino %llu extent %llu csum %u wanted %u mirror %d", - btrfs_ino(inode), disk_start, csum, *cb_sum, - cb->mirror_num); + btrfs_print_data_csum_error(inode, disk_start, csum, + *cb_sum, cb->mirror_num); ret = -EIO; goto fail; } @@ -157,17 +109,17 @@ static void end_compressed_bio_read(struct bio *bio) unsigned long index; int ret; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed * extent, just exit */ - if (!atomic_dec_and_test(&cb->pending_bios)) + if (!refcount_dec_and_test(&cb->pending_bios)) goto out; inode = cb->inode; - ret = check_compressed_csum(inode, cb, + ret = check_compressed_csum(BTRFS_I(inode), cb, (u64)bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; @@ -175,11 +127,8 @@ static void end_compressed_bio_read(struct bio *bio) /* ok, we're the last bio for this extent, lets start * the decompression. */ - ret = btrfs_decompress_bio(cb->compress_type, - cb->compressed_pages, - cb->start, - cb->orig_bio, - cb->compressed_len); + ret = btrfs_decompress_bio(cb); + csum_failed: if (ret) cb->errors = 1; @@ -203,6 +152,7 @@ csum_failed: * we have verified the checksum already, set page * checked so the end_io handlers know about it */ + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, cb->orig_bio, i) SetPageChecked(bvec->bv_page); @@ -270,13 +220,13 @@ static void end_compressed_bio_write(struct bio *bio) struct page *page; unsigned long index; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed * extent, just exit */ - if (!atomic_dec_and_test(&cb->pending_bios)) + if (!refcount_dec_and_test(&cb->pending_bios)) goto out; /* ok, we're the last bio for this extent, step one is to @@ -289,7 +239,7 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_error ? 0 : 1); + bio->bi_status ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -322,7 +272,7 @@ out: * This also checksums the file bytes and gets things ready for * the end io hooks. */ -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, @@ -337,14 +287,14 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, struct page *page; u64 first_byte = disk_start; struct block_device *bdev; - int ret; + blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_SIZE - 1)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) - return -ENOMEM; - atomic_set(&cb->pending_bios, 0); + return BLK_STS_RESOURCE; + refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; cb->start = start; @@ -357,30 +307,26 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; - bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - if (!bio) { - kfree(cb); - return -ENOMEM; - } + bio = btrfs_bio_alloc(bdev, first_byte); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; - atomic_inc(&cb->pending_bios); + refcount_set(&cb->pending_bios, 1); /* create and submit bios for the compressed pages */ bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + int submit = 0; + page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + submit = io_tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(bio); @@ -390,7 +336,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, * we inc the count. Otherwise, the cb might get * freed before we're done setting it up */ - atomic_inc(&cb->pending_bios); + refcount_inc(&cb->pending_bios); ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -402,14 +348,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } bio_put(bio); - bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - BUG_ON(!bio); + bio = btrfs_bio_alloc(bdev, first_byte); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -436,7 +381,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } @@ -571,7 +516,7 @@ next: * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -588,7 +533,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret = -ENOMEM; + blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; u32 *sums; @@ -602,14 +547,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) - return -EIO; + return BLK_STS_IOERR; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) goto out; - atomic_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; @@ -640,7 +585,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, __GFP_HIGHMEM); if (!cb->compressed_pages[pg_index]) { faili = pg_index - 1; - ret = -ENOMEM; + ret = BLK_STS_RESOURCE; goto fail2; } } @@ -652,28 +597,26 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); - if (!comp_bio) - goto fail2; + comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); bio_set_op_attrs (comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; - atomic_inc(&cb->pending_bios); + refcount_set(&cb->pending_bios, 1); for (pg_index = 0; pg_index < nr_pages; pg_index++) { + int submit = 0; + page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + submit = tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, comp_bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(comp_bio); @@ -687,7 +630,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, * we inc the count. Otherwise, the cb might get * freed before we're done setting it up */ - atomic_inc(&cb->pending_bios); + refcount_inc(&cb->pending_bios); if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, @@ -699,15 +642,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } bio_put(comp_bio); - comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, - GFP_NOFS); - BUG_ON(!comp_bio); + comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); bio_set_op_attrs(comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -728,7 +669,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } @@ -803,6 +744,7 @@ static struct list_head *find_workspace(int type) struct list_head *workspace; int cpus = num_online_cpus(); int idx = type - 1; + unsigned nofs_flag; struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; @@ -832,7 +774,15 @@ again: atomic_inc(total_ws); spin_unlock(ws_lock); + /* + * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have + * to turn it off here because we might get called from the restricted + * context of btrfs_compress_bio/btrfs_compress_pages + */ + nofs_flag = memalloc_nofs_save(); workspace = btrfs_compress_op[idx]->alloc_workspace(); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(workspace)) { atomic_dec(total_ws); wake_up(ws_wait); @@ -913,32 +863,28 @@ static void free_workspaces(void) } /* - * given an address space and start/len, compress the bytes. + * Given an address space and start and length, compress the bytes into @pages + * that are allocated on demand. * - * pages are allocated to hold the compressed result and stored - * in 'pages' + * @out_pages is an in/out parameter, holds maximum number of pages to allocate + * and returns number of actually allocated pages * - * out_pages is used to return the number of pages allocated. There - * may be pages allocated even if we return an error - * - * total_in is used to return the number of bytes actually read. It - * may be smaller then len if we had to exit early because we + * @total_in is used to return the number of bytes actually read. It + * may be smaller than the input length if we had to exit early because we * ran out of room in the pages array or because we cross the * max_out threshold. * - * total_out is used to return the total number of compressed bytes + * @total_out is an in/out parameter, must be set to the input length and will + * be also used to return the total number of compressed bytes * - * max_out tells us the max number of bytes that we're allowed to + * @max_out tells us the max number of bytes that we're allowed to * stuff into pages */ int btrfs_compress_pages(int type, struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, + u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) + unsigned long *total_out) { struct list_head *workspace; int ret; @@ -946,10 +892,9 @@ int btrfs_compress_pages(int type, struct address_space *mapping, workspace = find_workspace(type); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, - start, len, pages, - nr_dest_pages, out_pages, - total_in, total_out, - max_out); + start, pages, + out_pages, + total_in, total_out); free_workspace(type, workspace); return ret; } @@ -968,19 +913,16 @@ int btrfs_compress_pages(int type, struct address_space *mapping, * be contiguous. They all correspond to the range of bytes covered by * the compressed extent. */ -static int btrfs_decompress_bio(int type, struct page **pages_in, - u64 disk_start, struct bio *orig_bio, - size_t srclen) +static int btrfs_decompress_bio(struct compressed_bio *cb) { struct list_head *workspace; int ret; + int type = cb->compress_type; workspace = find_workspace(type); - - ret = btrfs_compress_op[type-1]->decompress_bio(workspace, pages_in, - disk_start, orig_bio, - srclen); + ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb); free_workspace(type, workspace); + return ret; } @@ -1017,13 +959,14 @@ void btrfs_exit_compress(void) * * total_out is the last byte of the buffer */ -int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, +int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio) { unsigned long buf_offset; unsigned long current_buf_start; unsigned long start_byte; + unsigned long prev_start_byte; unsigned long working_bytes = total_out - buf_start; unsigned long bytes; char *kaddr; @@ -1071,26 +1014,34 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, if (!bio->bi_iter.bi_size) return 0; bvec = bio_iter_iovec(bio, bio->bi_iter); - + prev_start_byte = start_byte; start_byte = page_offset(bvec.bv_page) - disk_start; /* - * make sure our new page is covered by this - * working buffer + * We need to make sure we're only adjusting + * our offset into compression working buffer when + * we're switching pages. Otherwise we can incorrectly + * keep copying when we were actually done. */ - if (total_out <= start_byte) - return 1; + if (start_byte != prev_start_byte) { + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; - /* - * the next page in the biovec might not be adjacent - * to the last page, but it might still be found - * inside this working buffer. bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + buf_offset; + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } } } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 09879579fbc8..87f6d3332163 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,29 +19,80 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ +/* + * We want to make sure that amount of RAM required to uncompress an extent is + * reasonable, so we limit the total size in ram of a compressed extent to + * 128k. This is a crucial number because it also controls how easily we can + * spread reads across cpus for decompression. + * + * We also want to make sure the amount of IO required to do a random read is + * reasonably small, so we limit the size of a compressed extent to 128k. + */ + +/* Maximum length of compressed data stored on disk */ +#define BTRFS_MAX_COMPRESSED (SZ_128K) +/* Maximum size of data before compression */ +#define BTRFS_MAX_UNCOMPRESSED (SZ_128K) + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + refcount_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* the compression algorithm for this bio */ + int compress_type; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + void btrfs_init_compress(void); void btrfs_exit_compress(void); int btrfs_compress_pages(int type, struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, + u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); + unsigned long *total_out); int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); -int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, +int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio); -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages); -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); enum btrfs_compression_type { @@ -59,19 +110,14 @@ struct btrfs_compress_op { int (*compress_pages)(struct list_head *workspace, struct address_space *mapping, - u64 start, unsigned long len, + u64 start, struct page **pages, - unsigned long nr_dest_pages, unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); + unsigned long *total_out); int (*decompress_bio)(struct list_head *workspace, - struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen); + struct compressed_bio *cb); int (*decompress)(struct list_head *workspace, unsigned char *data_in, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a426dc822d4d..3f4daa9d6e2c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,7 +19,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/rbtree.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -28,9 +28,9 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); -static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *ins_key, - struct btrfs_path *path, int data_size, int extend); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *ins_key, struct btrfs_path *path, + int data_size, int extend); static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct extent_buffer *dst, @@ -426,7 +426,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); - tm = container_of(node, struct tree_mod_elem, node); + tm = rb_entry(node, struct tree_mod_elem, node); if (tm->seq > min_seq) continue; rb_erase(node, tm_root); @@ -453,14 +453,12 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node *parent = NULL; struct tree_mod_elem *cur; - BUG_ON(!tm); - tm->seq = btrfs_inc_tree_mod_seq(fs_info); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; while (*new) { - cur = container_of(*new, struct tree_mod_elem, node); + cur = rb_entry(*new, struct tree_mod_elem, node); parent = *new; if (cur->logical < tm->logical) new = &((*new)->rb_left); @@ -569,7 +567,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, static noinline int tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, - int nr_items, gfp_t flags) + int nr_items) { struct tree_mod_elem *tm = NULL; struct tree_mod_elem **tm_list = NULL; @@ -580,11 +578,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (!tree_mod_need_log(fs_info, eb)) return 0; - tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; goto free_tms; @@ -598,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -665,7 +663,7 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, static noinline int tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *old_root, - struct extent_buffer *new_root, gfp_t flags, + struct extent_buffer *new_root, int log_removal) { struct tree_mod_elem *tm = NULL; @@ -680,14 +678,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal && btrfs_header_level(old_root) > 0) { nritems = btrfs_header_nritems(old_root); tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), - flags); + GFP_NOFS); if (!tm_list) { ret = -ENOMEM; goto free_tms; } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -695,7 +693,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, } } - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; goto free_tms; @@ -746,7 +744,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, tm_root = &fs_info->tree_mod_log; node = tm_root->rb_node; while (node) { - cur = container_of(node, struct tree_mod_elem, node); + cur = rb_entry(node, struct tree_mod_elem, node); if (cur->logical < start) { node = node->rb_left; } else if (cur->logical > start) { @@ -875,7 +873,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, { int ret; ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, - nr_items, GFP_NOFS); + nr_items); BUG_ON(ret < 0); } @@ -945,7 +943,7 @@ tree_mod_log_set_root_pointer(struct btrfs_root *root, { int ret; ret = tree_mod_log_insert_root(root->fs_info, root->node, - new_root_node, GFP_NOFS, log_removal); + new_root_node, log_removal); BUG_ON(ret < 0); } @@ -1074,7 +1072,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ } - clean_tree_block(trans, fs_info, buf); + clean_tree_block(fs_info, buf); *last_ref = 1; } return 0; @@ -1326,7 +1324,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, next = rb_next(&tm->node); if (!next) break; - tm = container_of(next, struct tree_mod_elem, node); + tm = rb_entry(next, struct tree_mod_elem, node); if (tm->logical != first_tm->logical) break; } @@ -1580,7 +1578,8 @@ static int close_blocks(u64 blocknr, u64 other, u32 blocksize) /* * compare two keys in a memcmp fashion */ -static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) +static int comp_keys(const struct btrfs_disk_key *disk, + const struct btrfs_key *k2) { struct btrfs_key k1; @@ -1592,7 +1591,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) /* * same as comp_keys only with two btrfs_key's */ -int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) { if (k1->objectid > k2->objectid) return 1; @@ -1732,8 +1731,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * slot may point to max if the key is bigger than all of the keys */ static noinline int generic_bin_search(struct extent_buffer *eb, - unsigned long p, - int item_size, struct btrfs_key *key, + unsigned long p, int item_size, + const struct btrfs_key *key, int max, int *slot) { int low = 0; @@ -1802,7 +1801,7 @@ static noinline int generic_bin_search(struct extent_buffer *eb, * simple bin_search frontend that does the right thing for * leaves vs nodes */ -static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, +static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int level, int *slot) { if (level == 0) @@ -1819,7 +1818,7 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, slot); } -int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, +int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int level, int *slot) { return bin_search(eb, key, level, slot); @@ -1937,7 +1936,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - clean_tree_block(trans, fs_info, mid); + clean_tree_block(fs_info, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1998,7 +1997,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - clean_tree_block(trans, fs_info, right); + clean_tree_block(fs_info, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -2042,7 +2041,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(trans, fs_info, mid); + clean_tree_block(fs_info, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -2437,10 +2436,9 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) * reada. -EAGAIN is returned and the search must be repeated. */ static int -read_block_for_search(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer **eb_ret, int level, int slot, - struct btrfs_key *key, u64 time_seq) +read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; u64 blocknr; @@ -2587,7 +2585,7 @@ done: } static void key_search_validate(struct extent_buffer *b, - struct btrfs_key *key, + const struct btrfs_key *key, int level) { #ifdef CONFIG_BTRFS_ASSERT @@ -2606,7 +2604,7 @@ static void key_search_validate(struct extent_buffer *b, #endif } -static int key_search(struct extent_buffer *b, struct btrfs_key *key, +static int key_search(struct extent_buffer *b, const struct btrfs_key *key, int level, int *prev_cmp, int *slot) { if (*prev_cmp != 0) { @@ -2668,9 +2666,9 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if * possible) */ -int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_path *p, int - ins_len, int cow) +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, struct btrfs_path *p, + int ins_len, int cow) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; @@ -2870,8 +2868,8 @@ cow_done: goto done; } - err = read_block_for_search(trans, root, p, - &b, level, slot, key, 0); + err = read_block_for_search(root, p, &b, level, + slot, key); if (err == -EAGAIN) goto again; if (err) { @@ -2953,7 +2951,7 @@ done: * The resulting path and return value will be set up as if we called * btrfs_search_slot at that point in time with ins_len and cow both set to 0. */ -int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, +int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, u64 time_seq) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3014,8 +3012,8 @@ again: goto done; } - err = read_block_for_search(NULL, root, p, &b, level, - slot, key, time_seq); + err = read_block_for_search(root, p, &b, level, + slot, key); if (err == -EAGAIN) goto again; if (err) { @@ -3067,8 +3065,9 @@ done: * < 0 on error */ int btrfs_search_slot_for_read(struct btrfs_root *root, - struct btrfs_key *key, struct btrfs_path *p, - int find_higher, int return_any) + const struct btrfs_key *key, + struct btrfs_path *p, int find_higher, + int return_any) { int ret; struct extent_buffer *leaf; @@ -3166,7 +3165,7 @@ static void fixup_low_keys(struct btrfs_fs_info *fs_info, */ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_key *new_key) + const struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; struct extent_buffer *eb; @@ -3594,8 +3593,7 @@ noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, * min slot controls the lowest index we're willing to push to the * right. We'll push up to and including min_slot, but no lower */ -static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, +static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, @@ -3669,14 +3667,14 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, /* make room in the right data area */ data_end = leaf_data_end(fs_info, right); memmove_extent_buffer(right, - btrfs_leaf_data(right) + data_end - push_space, - btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_OFFSET + data_end - push_space, + BTRFS_LEAF_DATA_OFFSET + data_end, BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ - copy_extent_buffer(right, left, btrfs_leaf_data(right) + + copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - btrfs_leaf_data(left) + leaf_data_end(fs_info, left), + BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), @@ -3704,7 +3702,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (left_nritems) btrfs_mark_buffer_dirty(left); else - clean_tree_block(trans, fs_info, left); + clean_tree_block(fs_info, left); btrfs_mark_buffer_dirty(right); @@ -3716,7 +3714,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, fs_info, path->nodes[0]); + clean_tree_block(fs_info, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3809,7 +3807,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } - return __push_leaf_right(trans, fs_info, path, min_data_size, empty, + return __push_leaf_right(fs_info, path, min_data_size, empty, right, free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); @@ -3825,8 +3823,7 @@ out_unlock: * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the * items */ -static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, +static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, int free_space, u32 right_nritems, @@ -3891,9 +3888,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_offset_nr(right, push_items - 1); - copy_extent_buffer(left, right, btrfs_leaf_data(left) + + copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left) - push_space, - btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset_nr(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); @@ -3920,9 +3917,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - leaf_data_end(fs_info, right); - memmove_extent_buffer(right, btrfs_leaf_data(right) + + memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, right), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(0), @@ -3945,7 +3942,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (right_nritems) btrfs_mark_buffer_dirty(right); else - clean_tree_block(trans, fs_info, right); + clean_tree_block(fs_info, right); btrfs_item_key(right, &disk_key, 0); fixup_low_keys(fs_info, path, &disk_key, 1); @@ -4035,7 +4032,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - return __push_leaf_left(trans, fs_info, path, min_data_size, + return __push_leaf_left(fs_info, path, min_data_size, empty, left, free_space, right_nritems, max_slot); out: @@ -4072,8 +4069,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, nritems * sizeof(struct btrfs_item)); copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(fs_info) - - data_copy_size, btrfs_leaf_data(l) + + BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - + data_copy_size, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); @@ -4160,6 +4157,9 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, /* try to push all the items before our slot into the next leaf */ slot = path->slots[0]; + space_needed = data_size; + if (slot > 0) + space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -4180,7 +4180,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, */ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_key *ins_key, + const struct btrfs_key *ins_key, struct btrfs_path *path, int data_size, int extend) { @@ -4215,6 +4215,10 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, if (wret < 0) return wret; if (wret) { + space_needed = data_size; + if (slot > 0) + space_needed -= btrfs_leaf_free_space(fs_info, + l); wret = push_leaf_left(trans, root, path, space_needed, space_needed, 0, (u32)-1); if (wret < 0) @@ -4412,10 +4416,9 @@ err: return ret; } -static noinline int split_item(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, +static noinline int split_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_key *new_key, + const struct btrfs_key *new_key, unsigned long split_offset) { struct extent_buffer *leaf; @@ -4501,7 +4504,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans, int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key, + const struct btrfs_key *new_key, unsigned long split_offset) { int ret; @@ -4510,7 +4513,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = split_item(trans, root->fs_info, path, new_key, split_offset); + ret = split_item(root->fs_info, path, new_key, split_offset); return ret; } @@ -4525,7 +4528,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key) + const struct btrfs_key *new_key) { struct extent_buffer *leaf; int ret; @@ -4604,8 +4607,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, /* shift the data */ if (from_end) { - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + data_end, old_data_start + new_size - data_end); } else { struct btrfs_disk_key disk_key; @@ -4631,8 +4634,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, } } - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + data_end, old_data_start - data_end); offset = btrfs_disk_key_offset(&disk_key); @@ -4704,8 +4707,8 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, } /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - data_size, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end - data_size, BTRFS_LEAF_DATA_OFFSET + data_end, old_data - data_end); data_end = old_data; @@ -4726,7 +4729,7 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, * that doesn't call btrfs_search_slot */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, + const struct btrfs_key *cpu_key, u32 *data_size, u32 total_data, u32 total_size, int nr) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4787,8 +4790,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + + data_end - total_data, BTRFS_LEAF_DATA_OFFSET + data_end, old_data - data_end); data_end = old_data; } @@ -4820,7 +4823,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, + const struct btrfs_key *cpu_key, u32 *data_size, int nr) { int ret = 0; @@ -4851,9 +4854,9 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, * Given a key and some data, insert an item into the tree. * This does all the path init required, making room in the tree if needed. */ -int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *cpu_key, void *data, u32 - data_size) +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *cpu_key, void *data, + u32 data_size) { int ret = 0; struct btrfs_path *path; @@ -4980,9 +4983,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (slot + nr != nritems) { int data_end = leaf_data_end(fs_info, leaf); - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, - btrfs_leaf_data(leaf) + data_end, + BTRFS_LEAF_DATA_OFFSET + data_end, last_off - data_end); for (i = slot + nr; i < nritems; i++) { @@ -5008,7 +5011,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_set_path_blocking(path); - clean_tree_block(trans, fs_info, leaf); + clean_tree_block(fs_info, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { @@ -5243,7 +5246,7 @@ out: static int tree_move_down(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - int *level, int root_level) + int *level) { struct extent_buffer *eb; @@ -5258,8 +5261,7 @@ static int tree_move_down(struct btrfs_fs_info *fs_info, return 0; } -static int tree_move_next_or_upnext(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, +static int tree_move_next_or_upnext(struct btrfs_path *path, int *level, int root_level) { int ret = 0; @@ -5298,10 +5300,9 @@ static int tree_advance(struct btrfs_fs_info *fs_info, int ret; if (*level == 0 || !allow_down) { - ret = tree_move_next_or_upnext(fs_info, path, level, - root_level); + ret = tree_move_next_or_upnext(path, level, root_level); } else { - ret = tree_move_down(fs_info, path, level, root_level); + ret = tree_move_down(fs_info, path, level); } if (ret >= 0) { if (*level == 0) @@ -5391,13 +5392,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); + tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); if (!tmp_buf) { - tmp_buf = vmalloc(fs_info->nodesize); - if (!tmp_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } left_path->search_commit_root = 1; @@ -5784,8 +5782,8 @@ again: next = c; next_rw_lock = path->locks[level]; - ret = read_block_for_search(NULL, root, path, &next, level, - slot, &key, 0); + ret = read_block_for_search(root, path, &next, level, + slot, &key); if (ret == -EAGAIN) goto again; @@ -5834,8 +5832,8 @@ again: if (!level) break; - ret = read_block_for_search(NULL, root, path, &next, level, - 0, &key, 0); + ret = read_block_for_search(root, path, &next, level, + 0, &key); if (ret == -EAGAIN) goto again; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6a823719b6c5..3f3eb7b17cac 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -20,6 +20,7 @@ #define __BTRFS_CTREE__ #include <linux/mm.h> +#include <linux/sched/signal.h> #include <linux/highmem.h> #include <linux/fs.h> #include <linux/rwsem.h> @@ -38,6 +39,7 @@ #include <linux/security.h> #include <linux/sizes.h> #include <linux/dynamic_debug.h> +#include <linux/refcount.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -46,7 +48,6 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; extern struct kmem_cache *btrfs_trans_handle_cachep; -extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; @@ -97,6 +98,14 @@ static const int btrfs_csum_sizes[] = { 4 }; #define BTRFS_MAX_EXTENT_SIZE SZ_128M +/* + * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size + */ +static inline u32 count_max_extents(u64 size) +{ + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +} + struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; @@ -509,7 +518,7 @@ struct btrfs_caching_control { struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; - atomic_t count; + refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ @@ -529,6 +538,14 @@ struct btrfs_io_ctl { unsigned check_crcs:1; }; +/* + * Tree to record all locked full stripes of a RAID5/6 block group + */ +struct btrfs_full_stripe_locks_tree { + struct rb_root root; + struct mutex lock; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -639,6 +656,9 @@ struct btrfs_block_group_cache { * Protected by free_space_lock. */ int needs_free_space; + + /* Record locked full stripes for RAID5/6 block group */ + struct btrfs_full_stripe_locks_tree full_stripe_locks_root; }; /* delayed seq elem */ @@ -649,6 +669,8 @@ struct seq_list { #define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } +#define SEQ_LAST ((u64)-1) + enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -693,6 +715,15 @@ struct btrfs_delayed_root; #define BTRFS_FS_BTREE_ERR 11 #define BTRFS_FS_LOG1_ERR 12 #define BTRFS_FS_LOG2_ERR 13 +#define BTRFS_FS_QUOTA_OVERRIDE 14 +/* Used to record internally whether fs has been frozen */ +#define BTRFS_FS_FROZEN 15 + +/* + * Indicate that a whole-filesystem exclusive operation is running + * (device replace, resize, device add/delete, balance) + */ +#define BTRFS_FS_EXCL_OP 14 struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -720,8 +751,7 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; /* keep track of unallocated space */ - spinlock_t free_chunk_lock; - u64 free_chunk_space; + atomic64_t free_chunk_space; struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; @@ -769,17 +799,7 @@ struct btrfs_fs_info { * so it is also safe. */ u64 max_inline; - /* - * Protected by ->chunk_mutex and sb->s_umount. - * - * The reason that we use two lock to protect it is because only - * remount and mount operations can change it and these two operations - * are under sb->s_umount, but the read side (chunk allocation) can not - * acquire sb->s_umount or the deadlock would happen. So we use two - * locks to protect it. On the write side, we must acquire two locks, - * and on the read side, we just need acquire one of them. - */ - u64 alloc_start; + struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; @@ -801,7 +821,6 @@ struct btrfs_fs_info { struct btrfs_super_block *super_for_commit; struct super_block *sb; struct inode *btree_inode; - struct backing_dev_info bdi; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; @@ -1058,8 +1077,6 @@ struct btrfs_fs_info { /* device replace state */ struct btrfs_dev_replace dev_replace; - atomic_t mutually_exclusive_operation_running; - struct percpu_counter bio_counter; wait_queue_head_t replace_wait; @@ -1082,9 +1099,6 @@ struct btrfs_fs_info { */ struct list_head pinned_chunks; - /* Used to record internally whether fs has been frozen */ - int fs_frozen; - /* Cached block sizes */ u32 nodesize; u32 sectorsize; @@ -1212,7 +1226,7 @@ struct btrfs_root { dev_t anon_dev; spinlock_t root_item_lock; - atomic_t refs; + refcount_t refs; struct mutex delalloc_mutex; spinlock_t delalloc_lock; @@ -1250,23 +1264,22 @@ struct btrfs_root { atomic_t will_be_snapshoted; /* For qgroup metadata space reserve */ - atomic_t qgroup_meta_rsv; + atomic64_t qgroup_meta_rsv; }; + static inline u32 btrfs_inode_sectorsize(const struct inode *inode) { return btrfs_sb(inode->i_sb)->sectorsize; } -static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize) -{ - return blocksize - sizeof(struct btrfs_header); -} - static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { - return __BTRFS_LEAF_DATA_SIZE(info->nodesize); + + return info->nodesize - sizeof(struct btrfs_header); } +#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) + static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); @@ -1528,8 +1541,27 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ s->member = cpu_to_le##bits(val); \ } + +static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb, + struct btrfs_dev_item *s) +{ + BUILD_BUG_ON(sizeof(u64) != + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); +} +static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) +{ + BUILD_BUG_ON(sizeof(u64) != + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); +} + + BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); @@ -1953,7 +1985,7 @@ BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, - struct btrfs_disk_key *disk) + const struct btrfs_disk_key *disk) { cpu->offset = le64_to_cpu(disk->offset); cpu->type = disk->type; @@ -1961,7 +1993,7 @@ static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, } static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, - struct btrfs_key *cpu) + const struct btrfs_key *cpu) { disk->offset = cpu_to_le64(cpu->offset); disk->type = cpu->type; @@ -1993,8 +2025,7 @@ static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, btrfs_disk_key_to_cpu(key, &disk_key); } - -static inline u8 btrfs_key_type(struct btrfs_key *key) +static inline u8 btrfs_key_type(const struct btrfs_key *key) { return key->type; } @@ -2300,10 +2331,6 @@ static inline int btrfs_super_csum_size(struct btrfs_super_block *s) return btrfs_csum_sizes[t]; } -static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) -{ - return offsetof(struct btrfs_leaf, items); -} /* * The leaf data grows from end-to-front in the node. @@ -2514,11 +2541,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(btrfs_leaf_data(leaf) + \ + ((type *)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(btrfs_leaf_data(leaf) + \ + ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) @@ -2539,7 +2566,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { - return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* @@ -2549,7 +2576,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { - return fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, @@ -2577,8 +2604,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, @@ -2587,10 +2613,11 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int get_block_group_index(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size); + struct btrfs_root *root, + u64 parent, u64 root_objectid, + const struct btrfs_disk_key *key, + int level, u64 hint, + u64 empty_size); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2623,8 +2650,7 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); -void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -2657,7 +2683,9 @@ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); +u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); +u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); +u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { @@ -2680,28 +2708,31 @@ enum btrfs_flush_state { COMMIT_TRANS = 6, }; -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); -int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct inode *inode); -void btrfs_orphan_release_metadata(struct inode *inode); + struct btrfs_inode *inode); +void btrfs_orphan_release_metadata(struct btrfs_inode *inode); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, u64 *qgroup_reserved, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv, - u64 qgroup_reserved); -int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); -void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); + struct btrfs_block_rsv *rsv); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); @@ -2724,7 +2755,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_inc_block_group_ro(struct btrfs_root *root, +int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); @@ -2750,9 +2781,9 @@ u64 add_new_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *info, u64 start, u64 end); /* ctree.c */ -int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, +int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int level, int *slot); -int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); +int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); @@ -2760,7 +2791,7 @@ int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_key *new_key); + const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -2802,22 +2833,23 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key, + const struct btrfs_key *new_key, unsigned long split_offset); int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key); + const struct btrfs_key *new_key); int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); -int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_path *p, int - ins_len, int cow); -int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, struct btrfs_path *p, + int ins_len, int cow); +int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, u64 time_seq); int btrfs_search_slot_for_read(struct btrfs_root *root, - struct btrfs_key *key, struct btrfs_path *p, - int find_higher, int return_any); + const struct btrfs_key *key, + struct btrfs_path *p, int find_higher, + int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, u64 *last_ret, @@ -2840,19 +2872,20 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, } void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, + const struct btrfs_key *cpu_key, u32 *data_size, u32 total_data, u32 total_size, int nr); -int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, void *data, u32 data_size); +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, int nr); + const struct btrfs_key *cpu_key, u32 *data_size, + int nr); static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, + const struct btrfs_key *key, u32 data_size) { return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); @@ -2941,15 +2974,15 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, const char *name, int name_len); int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_key *key); -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item); + const struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_root_item *item); int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); -int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, +int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, struct btrfs_path *path, struct btrfs_root_item *root_item, struct btrfs_key *root_key); int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); @@ -2975,7 +3008,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const char *name, int name_len); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, - int name_len, struct inode *dir, + int name_len, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3007,12 +3040,14 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, const char *name, u16 name_len, int mod); int verify_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, + struct extent_buffer *leaf, int slot, struct btrfs_dir_item *dir_item); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len); +bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, + unsigned long start, u16 name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -3054,8 +3089,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3070,11 +3105,11 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); -void btrfs_extent_item_to_extent_map(struct inode *inode, +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, const bool new_inline, @@ -3093,9 +3128,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, int delay_iput); void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); -struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 len, - int create); +struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, u64 start, + u64 len, int create); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes); @@ -3116,13 +3151,13 @@ static inline void btrfs_force_ra(struct address_space *mapping, } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); -int btrfs_set_inode_index(struct inode *dir, u64 *index); +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *dir, struct inode *inode, + struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len); int btrfs_add_link(struct btrfs_trans_handle *trans, - struct inode *parent_inode, struct inode *inode, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index); int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3147,7 +3182,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +void btrfs_set_range_writeback(void *private_data, u64 start, u64 end); +int btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); @@ -3159,15 +3195,16 @@ void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); -struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 end, - int create); +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, + u64 start, u64 end, int create); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); -int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3208,11 +3245,11 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, int btrfs_auto_defrag_init(void); void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct inode *inode); + struct btrfs_inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, +void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned); extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, @@ -3226,7 +3263,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct inode *inode, u64 start, u64 end); + struct btrfs_inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, @@ -3447,7 +3484,8 @@ do { \ "BTRFS: Transaction aborted (error %d)\n", \ (errno)); \ } else { \ - pr_debug("BTRFS: Transaction aborted (error %d)\n", \ + btrfs_debug((trans)->fs_info, \ + "Transaction aborted (error %d)", \ (errno)); \ } \ } \ @@ -3637,6 +3675,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); +static inline void btrfs_init_full_stripe_locks_tree( + struct btrfs_full_stripe_locks_tree *locks_root) +{ + locks_root->root = RB_ROOT; + mutex_init(&locks_root->lock); +} /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); @@ -3661,8 +3705,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start, struct btrfs_key *end); int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int err); +int btree_readahead_hook(struct extent_buffer *eb, int err); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 80982a83c9fd..8ae409b5a61d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -52,7 +52,7 @@ static inline void btrfs_init_delayed_node( { delayed_node->root = root; delayed_node->inode_id = inode_id; - atomic_set(&delayed_node->refs, 0); + refcount_set(&delayed_node->refs, 0); delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); @@ -72,16 +72,16 @@ static inline int btrfs_is_continuous_delayed_item( return 0; } -static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) +static struct btrfs_delayed_node *btrfs_get_delayed_node( + struct btrfs_inode *btrfs_inode) { - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; - u64 ino = btrfs_ino(inode); + u64 ino = btrfs_ino(btrfs_inode); struct btrfs_delayed_node *node; - node = ACCESS_ONCE(btrfs_inode->delayed_node); + node = READ_ONCE(btrfs_inode->delayed_node); if (node) { - atomic_inc(&node->refs); + refcount_inc(&node->refs); return node; } @@ -89,14 +89,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { - atomic_inc(&node->refs); /* can be accessed */ + refcount_inc(&node->refs); /* can be accessed */ BUG_ON(btrfs_inode->delayed_node != node); spin_unlock(&root->inode_lock); return node; } btrfs_inode->delayed_node = node; /* can be accessed and cached in the inode */ - atomic_add(2, &node->refs); + refcount_add(2, &node->refs); spin_unlock(&root->inode_lock); return node; } @@ -107,16 +107,15 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) /* Will return either the node or PTR_ERR(-ENOMEM) */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( - struct inode *inode) + struct btrfs_inode *btrfs_inode) { struct btrfs_delayed_node *node; - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; - u64 ino = btrfs_ino(inode); + u64 ino = btrfs_ino(btrfs_inode); int ret; again: - node = btrfs_get_delayed_node(inode); + node = btrfs_get_delayed_node(btrfs_inode); if (node) return node; @@ -126,7 +125,7 @@ again: btrfs_init_delayed_node(node, root, ino); /* cached in the btrfs inode and can be accessed */ - atomic_add(2, &node->refs); + refcount_set(&node->refs, 2); ret = radix_tree_preload(GFP_NOFS); if (ret) { @@ -167,7 +166,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, } else { list_add_tail(&node->n_list, &root->node_list); list_add_tail(&node->p_list, &root->prepare_list); - atomic_inc(&node->refs); /* inserted into list */ + refcount_inc(&node->refs); /* inserted into list */ root->nodes++; set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } @@ -181,7 +180,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_lock(&root->lock); if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; - atomic_dec(&node->refs); /* not in the list */ + refcount_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) list_del_init(&node->p_list); @@ -202,7 +201,7 @@ static struct btrfs_delayed_node *btrfs_first_delayed_node( p = delayed_root->node_list.next; node = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&node->refs); + refcount_inc(&node->refs); out: spin_unlock(&delayed_root->lock); @@ -229,7 +228,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( p = node->n_list.next; next = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&next->refs); + refcount_inc(&next->refs); out: spin_unlock(&delayed_root->lock); @@ -254,11 +253,11 @@ static void __btrfs_release_delayed_node( btrfs_dequeue_delayed_node(delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); - if (atomic_dec_and_test(&delayed_node->refs)) { + if (refcount_dec_and_test(&delayed_node->refs)) { bool free = false; struct btrfs_root *root = delayed_node->root; spin_lock(&root->inode_lock); - if (atomic_read(&delayed_node->refs) == 0) { + if (refcount_read(&delayed_node->refs) == 0) { radix_tree_delete(&root->delayed_nodes_tree, delayed_node->inode_id); free = true; @@ -287,7 +286,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( p = delayed_root->prepare_list.next; list_del_init(p); node = list_entry(p, struct btrfs_delayed_node, p_list); - atomic_inc(&node->refs); + refcount_inc(&node->refs); out: spin_unlock(&delayed_root->lock); @@ -309,7 +308,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) item->ins_or_del = 0; item->bytes_reserved = 0; item->delayed_node = NULL; - atomic_set(&item->refs, 1); + refcount_set(&item->refs, 1); } return item; } @@ -484,7 +483,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) { if (item) { __btrfs_remove_delayed_item(item); - if (atomic_dec_and_test(&item->refs)) + if (refcount_dec_and_test(&item->refs)) kfree(item); } } @@ -574,7 +573,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_fs_info *fs_info, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *inode, + struct btrfs_inode *inode, struct btrfs_delayed_node *node) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -603,13 +602,13 @@ static int btrfs_delayed_inode_reserve_metadata( * worth which is less likely to hurt us. */ if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { - spin_lock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) release = true; else src_rsv = NULL; - spin_unlock(&BTRFS_I(inode)->lock); + spin_unlock(&inode->lock); } /* @@ -1196,7 +1195,7 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, } int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); struct btrfs_path *path; @@ -1233,9 +1232,9 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, return ret; } -int btrfs_commit_inode_delayed_inode(struct inode *inode) +int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); struct btrfs_trans_handle *trans; struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); struct btrfs_path *path; @@ -1288,15 +1287,15 @@ out: return ret; } -void btrfs_remove_delayed_node(struct inode *inode) +void btrfs_remove_delayed_node(struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node; - delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node); + delayed_node = READ_ONCE(inode->delayed_node); if (!delayed_node) return; - BTRFS_I(inode)->delayed_node = NULL; + inode->delayed_node = NULL; btrfs_release_delayed_node(delayed_node); } @@ -1434,7 +1433,7 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, const char *name, int name_len, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_disk_key *disk_key, u8 type, u64 index) { @@ -1510,7 +1509,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct inode *dir, u64 index) + struct btrfs_inode *dir, u64 index) { struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; @@ -1558,7 +1557,7 @@ end: return ret; } -int btrfs_inode_delayed_dir_index_count(struct inode *inode) +int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); @@ -1575,7 +1574,7 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode) return -EINVAL; } - BTRFS_I(inode)->index_cnt = delayed_node->index_cnt; + inode->index_cnt = delayed_node->index_cnt; btrfs_release_delayed_node(delayed_node); return 0; } @@ -1587,7 +1586,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); if (!delayed_node) return false; @@ -1601,14 +1600,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); while (item) { - atomic_inc(&item->refs); + refcount_inc(&item->refs); list_add_tail(&item->readdir_list, ins_list); item = __btrfs_next_delayed_item(item); } item = __btrfs_first_delayed_deletion_item(delayed_node); while (item) { - atomic_inc(&item->refs); + refcount_inc(&item->refs); list_add_tail(&item->readdir_list, del_list); item = __btrfs_next_delayed_item(item); } @@ -1622,7 +1621,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * insert/delete delayed items in this period. So we also needn't * requeue or dequeue this delayed node. */ - atomic_dec(&delayed_node->refs); + refcount_dec(&delayed_node->refs); return true; } @@ -1635,13 +1634,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); } list_for_each_entry_safe(curr, next, del_list, readdir_list) { list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); } @@ -1668,7 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, list_del(&curr->readdir_list); ret = (curr->key.offset == index); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (ret) @@ -1706,7 +1705,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, list_del(&curr->readdir_list); if (curr->key.offset < ctx->pos) { - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); continue; } @@ -1723,7 +1722,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, over = !dir_emit(ctx, name, name_len, location.objectid, d_type); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (over) @@ -1776,7 +1775,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); if (!delayed_node) return -ENOENT; @@ -1791,7 +1790,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); - btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); @@ -1831,7 +1830,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *delayed_node; int ret = 0; - delayed_node = btrfs_get_or_create_delayed_node(inode); + delayed_node = btrfs_get_or_create_delayed_node(BTRFS_I(inode)); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1841,7 +1840,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + ret = btrfs_delayed_inode_reserve_metadata(trans, root, BTRFS_I(inode), delayed_node); if (ret) goto release_node; @@ -1856,9 +1855,9 @@ release_node: return ret; } -int btrfs_delayed_delete_inode_ref(struct inode *inode) +int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); struct btrfs_delayed_node *delayed_node; /* @@ -1933,7 +1932,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) mutex_unlock(&delayed_node->mutex); } -void btrfs_kill_delayed_inode_items(struct inode *inode) +void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node; @@ -1964,7 +1963,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) inode_id = delayed_nodes[n - 1]->inode_id + 1; for (i = 0; i < n; i++) - atomic_inc(&delayed_nodes[i]->refs); + refcount_inc(&delayed_nodes[i]->refs); spin_unlock(&root->inode_lock); for (i = 0; i < n; i++) { diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 8a2bf5e3e4cf..c4189d495934 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -26,7 +26,7 @@ #include <linux/list.h> #include <linux/wait.h> #include <linux/atomic.h> - +#include <linux/refcount.h> #include "ctree.h" /* types of the delayed item */ @@ -67,7 +67,7 @@ struct btrfs_delayed_node { struct rb_root del_root; struct mutex mutex; struct btrfs_inode_item inode_item; - atomic_t refs; + refcount_t refs; u64 index_cnt; unsigned long flags; int count; @@ -80,7 +80,7 @@ struct btrfs_delayed_item { struct list_head readdir_list; /* used for readdir items */ u64 bytes_reserved; struct btrfs_delayed_node *delayed_node; - atomic_t refs; + refcount_t refs; int ins_or_del; u32 data_len; char data[0]; @@ -101,15 +101,15 @@ static inline void btrfs_init_delayed_root( int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, const char *name, int name_len, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_disk_key *disk_key, u8 type, u64 index); int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct inode *dir, u64 index); + struct btrfs_inode *dir, u64 index); -int btrfs_inode_delayed_dir_index_count(struct inode *inode); +int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode); int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); @@ -119,17 +119,17 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info); int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, - struct inode *inode); + struct btrfs_inode *inode); /* Used for evicting the inode. */ -void btrfs_remove_delayed_node(struct inode *inode); -void btrfs_kill_delayed_inode_items(struct inode *inode); -int btrfs_commit_inode_delayed_inode(struct inode *inode); +void btrfs_remove_delayed_node(struct btrfs_inode *inode); +void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode); +int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); -int btrfs_delayed_delete_inode_ref(struct inode *inode); +int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ef724a5fc30e..93ffa898df6d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -164,7 +164,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, if (mutex_trylock(&head->mutex)) return 0; - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); @@ -470,7 +470,8 @@ add_tail: static noinline void update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) + struct btrfs_delayed_ref_node *update, + int *old_ref_mod_ret) { struct btrfs_delayed_ref_head *existing_ref; struct btrfs_delayed_ref_head *ref; @@ -523,6 +524,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing_ref->total_ref_mod; + if (old_ref_mod_ret) + *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; existing_ref->total_ref_mod += update->ref_mod; @@ -550,13 +553,15 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, - int action, int is_data) + int action, int is_data, int *qrecord_inserted_ret, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; int must_insert_reserved = 0; + int qrecord_inserted = 0; /* If reserved is provided, it must be a data extent. */ BUG_ON(!is_data && reserved); @@ -589,7 +594,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = count_mod; @@ -623,6 +628,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, if(btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord)) kfree(qrecord); + else + qrecord_inserted = 1; } spin_lock_init(&head_ref->lock); @@ -635,7 +642,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, if (existing) { WARN_ON(ref_root && reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, &existing->node, ref); + update_existing_head_ref(delayed_refs, &existing->node, ref, + old_ref_mod); /* * we've updated the existing ref, free the newly * allocated ref @@ -643,6 +651,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + if (old_ref_mod) + *old_ref_mod = 0; if (is_data && count_mod < 0) delayed_refs->pending_csums += num_bytes; delayed_refs->num_heads++; @@ -650,6 +660,10 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, atomic_inc(&delayed_refs->num_entries); trans->delayed_ref_updates++; } + if (qrecord_inserted_ret) + *qrecord_inserted_ret = qrecord_inserted; + if (new_ref_mod) + *new_ref_mod = head_ref->total_ref_mod; return head_ref; } @@ -677,7 +691,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = 1; @@ -734,7 +748,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, seq = atomic64_read(&fs_info->tree_mod_seq); /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = 1; @@ -773,12 +787,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; + int qrecord_inserted; BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); @@ -806,12 +822,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, 0, 0, action, 0); + bytenr, num_bytes, 0, 0, action, 0, + &qrecord_inserted, old_ref_mod, + new_ref_mod); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action); spin_unlock(&delayed_refs->lock); + if (qrecord_inserted) + return btrfs_qgroup_trace_extent_post(fs_info, record); return 0; free_head_ref: @@ -830,14 +850,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, u64 reserved, int action, - struct btrfs_delayed_extent_op *extent_op) + int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; + int qrecord_inserted; - BUG_ON(extent_op && !extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; @@ -859,7 +879,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, } } - head_ref->extent_op = extent_op; + head_ref->extent_op = NULL; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@ -870,13 +890,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, ref_root, reserved, - action, 1); + action, 1, &qrecord_inserted, + old_ref_mod, new_ref_mod); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action); spin_unlock(&delayed_refs->lock); + if (qrecord_inserted) + return btrfs_qgroup_trace_extent_post(fs_info, record); return 0; } @@ -899,7 +922,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data); + extent_op->is_data, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); return 0; @@ -911,11 +934,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, * the head node if any where found, or NULL if not. */ struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) +btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; return find_ref_head(&delayed_refs->href_root, bytenr, 0); } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 50947b5a9152..ce88e4ac5276 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -18,6 +18,8 @@ #ifndef __DELAYED_REF__ #define __DELAYED_REF__ +#include <linux/refcount.h> + /* these are the possible values of struct btrfs_delayed_ref_node->action */ #define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ #define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ @@ -53,7 +55,7 @@ struct btrfs_delayed_ref_node { u64 seq; /* ref count on this data structure */ - atomic_t refs; + refcount_t refs; /* * how many refs is this entry adding or deleting. For @@ -220,8 +222,8 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { - WARN_ON(atomic_read(&ref->refs) == 0); - if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(refcount_read(&ref->refs) == 0); + if (refcount_dec_and_test(&ref->refs)) { WARN_ON(ref->in_tree); switch (ref->type) { case BTRFS_TREE_BLOCK_REF_KEY: @@ -245,13 +247,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op); + struct btrfs_delayed_extent_op *extent_op, + int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, u64 reserved, int action, - struct btrfs_delayed_extent_op *extent_op); + int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -262,7 +265,8 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); +btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + u64 bytenr); int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5de280b9ad73..bee3edeea7a3 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -304,8 +304,9 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, char *tgtdev_name, - u64 srcdevid, char *srcdev_name, int read_src) +int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, + const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, + int read_src) { struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; @@ -387,7 +388,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, char *tgtdev_name, if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -506,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -545,8 +546,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_rm_dev_replace_unblocked(fs_info); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return scrub_ret; @@ -664,7 +667,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: srcdev = dev_replace->srcdev; - args->status.progress_1000 = div_u64(dev_replace->cursor_left, + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } @@ -783,8 +786,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) } btrfs_dev_replace_unlock(dev_replace, 1); - WARN_ON(atomic_xchg( - &fs_info->mutually_exclusive_operation_running, 1)); + WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); return PTR_ERR_OR_ZERO(task); } @@ -813,7 +815,7 @@ static int btrfs_dev_replace_kthread(void *data) (unsigned int)progress); } btrfs_dev_replace_continue_on_mount(fs_info); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; } diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 54ea12bda15b..f94a76844ae7 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -27,8 +27,9 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, char *tgtdev_name, - u64 srcdevid, char *srcdev_name, int read_src); +int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, + const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, + int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index b039fe0c751a..41cb9196eaa8 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -80,7 +80,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 data_size; - BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)); + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) + return -ENOSPC; key.objectid = objectid; key.type = BTRFS_XATTR_ITEM_KEY; @@ -120,7 +121,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, - struct inode *dir, struct btrfs_key *location, + struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; @@ -174,8 +175,7 @@ second_insert: btrfs_release_path(path); ret2 = btrfs_insert_delayed_dir_index(trans, root->fs_info, name, - name_len, dir, &disk_key, type, - index); + name_len, dir, &disk_key, type, index); out_free: btrfs_free_path(path); if (ret) @@ -395,8 +395,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, dir_item)) - return NULL; total_len = btrfs_item_size_nr(leaf, path->slots[0]); while (cur < total_len) { @@ -405,6 +403,8 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); + if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item)) + return NULL; if (btrfs_dir_name_len(leaf, dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; @@ -453,9 +453,11 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, int verify_dir_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, + int slot, struct btrfs_dir_item *dir_item) { u16 namelen = BTRFS_NAME_LEN; + int ret; u8 type = btrfs_dir_type(leaf, dir_item); if (type >= BTRFS_FT_MAX) { @@ -468,10 +470,16 @@ int verify_dir_item(struct btrfs_fs_info *fs_info, if (btrfs_dir_name_len(leaf, dir_item) > namelen) { btrfs_crit(fs_info, "invalid dir item name len: %u", - (unsigned)btrfs_dir_data_len(leaf, dir_item)); + (unsigned)btrfs_dir_name_len(leaf, dir_item)); return 1; } + namelen = btrfs_dir_name_len(leaf, dir_item); + ret = btrfs_is_name_len_valid(leaf, slot, + (unsigned long)(dir_item + 1), namelen); + if (!ret) + return 1; + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ if ((btrfs_dir_data_len(leaf, dir_item) + btrfs_dir_name_len(leaf, dir_item)) > @@ -484,3 +492,67 @@ int verify_dir_item(struct btrfs_fs_info *fs_info, return 0; } + +bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, + unsigned long start, u16 name_len) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_key key; + u32 read_start; + u32 read_end; + u32 item_start; + u32 item_end; + u32 size; + bool ret = true; + + ASSERT(start > BTRFS_LEAF_DATA_OFFSET); + + read_start = start - BTRFS_LEAF_DATA_OFFSET; + read_end = read_start + name_len; + item_start = btrfs_item_offset_nr(leaf, slot); + item_end = btrfs_item_end_nr(leaf, slot); + + btrfs_item_key_to_cpu(leaf, &key, slot); + + switch (key.type) { + case BTRFS_DIR_ITEM_KEY: + case BTRFS_XATTR_ITEM_KEY: + case BTRFS_DIR_INDEX_KEY: + size = sizeof(struct btrfs_dir_item); + break; + case BTRFS_INODE_REF_KEY: + size = sizeof(struct btrfs_inode_ref); + break; + case BTRFS_INODE_EXTREF_KEY: + size = sizeof(struct btrfs_inode_extref); + break; + case BTRFS_ROOT_REF_KEY: + case BTRFS_ROOT_BACKREF_KEY: + size = sizeof(struct btrfs_root_ref); + break; + default: + ret = false; + goto out; + } + + if (read_start < item_start) { + ret = false; + goto out; + } + if (read_end > item_end) { + ret = false; + goto out; + } + + /* there shall be item(s) before name */ + if (read_start - item_start < size) { + ret = false; + goto out; + } + +out: + if (!ret) + btrfs_crit(fs_info, "invalid dir item name len: %u", + (unsigned int)name_len); + return ret; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 18004169552c..080e2ebb8aa0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -64,8 +64,7 @@ static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, - int read_only); +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); @@ -88,9 +87,8 @@ struct btrfs_end_io_wq { bio_end_io_t *end_io; void *private; struct btrfs_fs_info *info; - int error; + blk_status_t status; enum btrfs_wq_endio_type metadata; - struct list_head list; struct btrfs_work work; }; @@ -119,9 +117,9 @@ void btrfs_end_io_wq_exit(void) * just before they are sent down the IO stack. */ struct async_submit_bio { - struct inode *inode; + void *private_data; + struct btrfs_fs_info *fs_info; struct bio *bio; - struct list_head list; extent_submit_bio_hook_t *submit_bio_start; extent_submit_bio_hook_t *submit_bio_done; int mirror_num; @@ -132,7 +130,7 @@ struct async_submit_bio { */ u64 bio_offset; struct btrfs_work work; - int error; + blk_status_t status; }; /* @@ -220,12 +218,12 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, * extents on the btree inode are pretty simple, there's one extent * that covers the entire device */ -static struct extent_map *btree_get_extent(struct inode *inode, +static struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; @@ -266,7 +264,7 @@ out: return em; } -u32 btrfs_csum_data(char *data, u32 seed, size_t len) +u32 btrfs_csum_data(const char *data, u32 seed, size_t len) { return btrfs_crc32c(seed, data, len); } @@ -763,7 +761,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(fs_info, eb, ret); + btree_readahead_hook(eb, ret); if (ret) { /* @@ -788,7 +786,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb->fs_info, eb, -EIO); + btree_readahead_hook(eb, -EIO); return -EIO; /* we fixed nothing */ } @@ -800,7 +798,7 @@ static void end_workqueue_bio(struct bio *bio) btrfs_work_func_t func; fs_info = end_io_wq->info; - end_io_wq->error = bio->bi_error; + end_io_wq->status = bio->bi_status; if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { @@ -837,19 +835,19 @@ static void end_workqueue_bio(struct bio *bio) btrfs_queue_work(wq, &end_io_wq->work); } -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata) { struct btrfs_end_io_wq *end_io_wq; end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) - return -ENOMEM; + return BLK_STS_RESOURCE; end_io_wq->private = bio->bi_private; end_io_wq->end_io = bio->bi_end_io; end_io_wq->info = info; - end_io_wq->error = 0; + end_io_wq->status = 0; end_io_wq->bio = bio; end_io_wq->metadata = metadata; @@ -869,14 +867,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; - int ret; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->bio, + ret = async->submit_bio_start(async->private_data, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) - async->error = ret; + async->status = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -886,7 +884,7 @@ static void run_one_async_done(struct btrfs_work *work) int limit; async = container_of(work, struct async_submit_bio, work); - fs_info = BTRFS_I(async->inode)->root->fs_info; + fs_info = async->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; @@ -899,13 +897,13 @@ static void run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ - if (async->error) { - async->bio->bi_error = async->error; + if (async->status) { + async->bio->bi_status = async->status; bio_endio(async->bio); return; } - async->submit_bio_done(async->inode, async->bio, async->mirror_num, + async->submit_bio_done(async->private_data, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); } @@ -917,20 +915,20 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, - u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset, void *private_data, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) { struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return -ENOMEM; + return BLK_STS_RESOURCE; - async->inode = inode; + async->private_data = private_data; + async->fs_info = fs_info; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -942,7 +940,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; async->bio_offset = bio_offset; - async->error = 0; + async->status = 0; atomic_inc(&fs_info->nr_async_submits); @@ -960,12 +958,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return 0; } -static int btree_csum_one_bio(struct bio *bio) +static blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; int i, ret = 0; + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); @@ -973,12 +972,12 @@ static int btree_csum_one_bio(struct bio *bio) break; } - return ret; + return errno_to_blk_status(ret); } -static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -987,11 +986,12 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { - int ret; + struct inode *inode = private_data; + blk_status_t ret; /* * when we're called for a write, we're already in the async @@ -999,13 +999,13 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, */ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; } -static int check_async_write(struct inode *inode, unsigned long bio_flags) +static int check_async_write(unsigned long bio_flags) { if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; @@ -1016,13 +1016,14 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(inode, bio_flags); - int ret; + int async = check_async_write(bio_flags); + blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { /* @@ -1044,8 +1045,8 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ - ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 0, - bio_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, + bio_offset, private_data, __btree_submit_bio_start, __btree_submit_bio_done); } @@ -1055,7 +1056,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, return 0; out_w_error: - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); return ret; } @@ -1223,10 +1224,10 @@ int btrfs_write_tree_block(struct extent_buffer *buf) buf->start + buf->len - 1); } -int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); + filemap_fdatawait_range(buf->pages[0]->mapping, + buf->start, buf->start + buf->len - 1); } struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -1248,8 +1249,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, } -void clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, +void clean_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *buf) { if (btrfs_header_generation(buf) == @@ -1257,9 +1257,9 @@ void clean_tree_block(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - __percpu_counter_add(&fs_info->dirty_metadata_bytes, - -buf->len, - fs_info->dirty_metadata_batch); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(buf); @@ -1342,15 +1342,14 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); - atomic_set(&root->refs, 1); + refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); - atomic_set(&root->qgroup_meta_rsv, 0); + atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; if (!dummy) - extent_io_tree_init(&root->dirty_log_pages, - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&root->dirty_log_pages, NULL); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -1800,7 +1799,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; - bdi = blk_get_backing_dev_info(device->bdev); + bdi = device->bdev->bd_bdi; if (bdi_congested(bdi, bdi_bits)) { ret = 1; break; @@ -1810,21 +1809,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) return ret; } -static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) -{ - int err; - - err = bdi_setup_and_register(bdi, "btrfs"); - if (err) - return err; - - bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; - bdi->congested_fn = btrfs_congested_fn; - bdi->congested_data = info; - bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - return 0; -} - /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens @@ -1837,7 +1821,7 @@ static void end_workqueue_fn(struct btrfs_work *work) end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; - bio->bi_error = end_io_wq->error; + bio->bi_status = end_io_wq->status; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); @@ -2207,11 +2191,9 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->workers); btrfs_destroy_workqueue(fs_info->endio_workers); - btrfs_destroy_workqueue(fs_info->endio_meta_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); btrfs_destroy_workqueue(fs_info->endio_repair_workers); btrfs_destroy_workqueue(fs_info->rmw_workers); - btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->submit_workers); @@ -2221,6 +2203,13 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); btrfs_destroy_workqueue(fs_info->extent_workers); + /* + * Now that all other work queues are destroyed, we can safely destroy + * the queues used for metadata I/O, since tasks from those other work + * queues can do metadata I/O operations. + */ + btrfs_destroy_workqueue(fs_info->endio_meta_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2321,7 +2310,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) inode->i_mapping->a_ops = &btree_aops; RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode); BTRFS_I(inode)->io_tree.track_uptodate = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); @@ -2598,16 +2587,10 @@ int open_ctree(struct super_block *sb, goto fail; } - ret = setup_bdi(fs_info, &fs_info->bdi); - if (ret) { - err = ret; - goto fail_srcu; - } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; - goto fail_bdi; + goto fail_srcu; } fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2644,7 +2627,6 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); @@ -2680,12 +2662,11 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); - fs_info->fs_frozen = 0; fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; - fs_info->free_chunk_space = 0; + atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ @@ -2715,7 +2696,6 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); - sb->s_bdi = &fs_info->bdi; btrfs_init_btree_inode(fs_info); @@ -2723,10 +2703,8 @@ int open_ctree(struct super_block *sb, fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; - extent_io_tree_init(&fs_info->freed_extents[0], - fs_info->btree_inode->i_mapping); - extent_io_tree_init(&fs_info->freed_extents[1], - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&fs_info->freed_extents[0], NULL); + extent_io_tree_init(&fs_info->freed_extents[1], NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_BARRIER, &fs_info->flags); @@ -2802,7 +2780,7 @@ int open_ctree(struct super_block *sb, memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + ret = btrfs_check_super_valid(fs_info); if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; @@ -2912,9 +2890,12 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); - fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - SZ_4M / PAGE_SIZE); + sb->s_bdi->congested_fn = btrfs_congested_fn; + sb->s_bdi->congested_data = fs_info; + sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); + sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); @@ -3263,7 +3244,6 @@ fail_fsdev_sysfs: fail_block_groups: btrfs_put_block_group_cache(fs_info); - btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); @@ -3271,6 +3251,7 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_all_workers(fs_info); + btrfs_free_block_groups(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -3282,8 +3263,6 @@ fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: percpu_counter_destroy(&fs_info->dirty_metadata_bytes); -fail_bdi: - bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: @@ -3411,7 +3390,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, - int do_barriers, int wait, int max_mirrors) + int wait, int max_mirrors) { struct buffer_head *bh; int i; @@ -3450,7 +3429,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr); crc = ~(u32)0; - crc = btrfs_csum_data((char *)sb + + crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); @@ -3485,10 +3464,12 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - if (i == 0) - ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh); - else + if (i == 0) { + ret = btrfsic_submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_FUA, bh); + } else { ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + } if (ret) errors++; } @@ -3501,64 +3482,61 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio) { - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); + complete(bio->bi_private); } /* - * trigger flushes for one the devices. If you pass wait == 0, the flushes are - * sent down. With wait == 1, it waits for the previous flush. - * - * any device where the flush fails with eopnotsupp are flagged as not-barrier - * capable + * Submit a flush request to the device if it supports it. Error handling is + * done in the waiting counterpart. */ -static int write_dev_flush(struct btrfs_device *device, int wait) +static void write_dev_flush(struct btrfs_device *device) { - struct bio *bio; - int ret = 0; + struct request_queue *q = bdev_get_queue(device->bdev); + struct bio *bio = device->flush_bio; - if (device->nobarriers) - return 0; + if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + return; - if (wait) { - bio = device->flush_bio; - if (!bio) - return 0; + bio_reset(bio); + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; - wait_for_completion(&device->flush_wait); + submit_bio(bio); + device->flush_bio_sent = 1; +} - if (bio->bi_error) { - ret = bio->bi_error; - btrfs_dev_stat_inc_and_print(device, - BTRFS_DEV_STAT_FLUSH_ERRS); - } +/* + * If the flush bio has been submitted by write_dev_flush, wait for it. + */ +static blk_status_t wait_dev_flush(struct btrfs_device *device) +{ + struct bio *bio = device->flush_bio; - /* drop the reference from the wait == 0 run */ - bio_put(bio); - device->flush_bio = NULL; + if (!device->flush_bio_sent) + return 0; - return ret; - } + device->flush_bio_sent = 0; + wait_for_completion_io(&device->flush_wait); - /* - * one reference for us, and we leave it for the - * caller - */ - device->flush_bio = NULL; - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - return -ENOMEM; + return bio->bi_status; +} - bio->bi_end_io = btrfs_end_empty_barrier; - bio->bi_bdev = device->bdev; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - init_completion(&device->flush_wait); - bio->bi_private = &device->flush_wait; - device->flush_bio = bio; +static int check_barrier_error(struct btrfs_fs_devices *fsdevs) +{ + int dev_flush_error = 0; + struct btrfs_device *dev; - bio_get(bio); - btrfsic_submit_bio(bio); + list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) { + if (!dev->bdev || dev->last_flush_error) + dev_flush_error++; + } + + if (dev_flush_error > + fsdevs->fs_info->num_tolerated_disk_barrier_failures) + return -EIO; return 0; } @@ -3571,25 +3549,21 @@ static int barrier_all_devices(struct btrfs_fs_info *info) { struct list_head *head; struct btrfs_device *dev; - int errors_send = 0; int errors_wait = 0; - int ret; + blk_status_t ret; /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { if (dev->missing) continue; - if (!dev->bdev) { - errors_send++; + if (!dev->bdev) continue; - } if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_flush(dev, 0); - if (ret) - errors_send++; + write_dev_flush(dev); + dev->last_flush_error = 0; } /* wait for all the barriers */ @@ -3603,13 +3577,23 @@ static int barrier_all_devices(struct btrfs_fs_info *info) if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_flush(dev, 1); - if (ret) + ret = wait_dev_flush(dev); + if (ret) { + dev->last_flush_error = ret; + btrfs_dev_stat_inc_and_print(dev, + BTRFS_DEV_STAT_FLUSH_ERRS); errors_wait++; + } + } + + if (errors_wait) { + /* + * At some point we need the status of all disks + * to arrive at the volume status. So error checking + * is being pushed to a separate loop. + */ + return check_barrier_error(info->fs_devices); } - if (errors_send > info->num_tolerated_disk_barrier_failures || - errors_wait > info->num_tolerated_disk_barrier_failures) - return -EIO; return 0; } @@ -3696,7 +3680,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( return num_tolerated_disk_barrier_failures; } -static int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) +int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) { struct list_head *head; struct btrfs_device *dev; @@ -3753,7 +3737,7 @@ static int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); - ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); + ret = write_dev_supers(dev, sb, 0, max_mirrors); if (ret) total_errors++; } @@ -3776,7 +3760,7 @@ static int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); + ret = write_dev_supers(dev, sb, 1, max_mirrors); if (ret) total_errors++; } @@ -3790,12 +3774,6 @@ static int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) return 0; } -int write_ctree_super(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, int max_mirrors) -{ - return write_all_supers(fs_info, max_mirrors); -} - /* Drop a fs root from the radix tree and free it. */ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) @@ -3985,8 +3963,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfs_put_block_group_cache(fs_info); - btrfs_free_block_groups(fs_info); - /* * we must make sure there is not any read request to * submit after we stopping all workers. @@ -3994,6 +3970,8 @@ void close_ctree(struct btrfs_fs_info *fs_info) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); + btrfs_free_block_groups(fs_info); + clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, 1); @@ -4010,7 +3988,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->bio_counter); - bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); @@ -4071,9 +4048,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->start, transid, fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) - __percpu_counter_add(&fs_info->dirty_metadata_bytes, - buf->len, - fs_info->dirty_metadata_batch); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + buf->len, + fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { btrfs_print_leaf(fs_info, buf); @@ -4122,8 +4099,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return btree_read_extent_buffer_pages(fs_info, buf, parent_transid); } -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, - int read_only) +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *sb = fs_info->super_copy; u64 nodesize = btrfs_super_nodesize(sb); @@ -4347,7 +4323,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); @@ -4601,11 +4577,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); - - /* - memset(cur_trans, 0, sizeof(*cur_trans)); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); - */ } static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) @@ -4619,7 +4590,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); if (t->state >= TRANS_STATE_COMMIT_START) { - atomic_inc(&t->use_count); + refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); btrfs_put_transaction(t); @@ -4661,10 +4632,21 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } +static struct btrfs_fs_info *btree_fs_info(void *private_data) +{ + struct inode *inode = private_data; + return btrfs_sb(inode->i_sb); +} + static const struct extent_io_ops btree_extent_io_ops = { - .readpage_end_io_hook = btree_readpage_end_io_hook, - .readpage_io_failed_hook = btree_io_failed_hook, + /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_io_failed_hook = btree_io_failed_hook, + .set_range_writeback = btrfs_set_range_writeback, + .tree_fs_info = btree_fs_info, + + /* optional callbacks */ }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 44dcd9af6b7c..0a634d3ffc16 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,14 +52,12 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr); -void clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *buf); +void clean_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *buf); int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); void close_ctree(struct btrfs_fs_info *fs_info); -int write_ctree_super(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, int max_mirrors); +int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, struct buffer_head **bh_ret); @@ -103,14 +101,14 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); */ static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) { - if (atomic_inc_not_zero(&root->refs)) + if (refcount_inc_not_zero(&root->refs)) return root; return NULL; } static inline void btrfs_put_fs_root(struct btrfs_root *root) { - if (atomic_dec_and_test(&root->refs)) + if (refcount_dec_and_test(&root->refs)) kfree(root); } @@ -118,18 +116,18 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); -u32 btrfs_csum_data(char *data, u32 seed, size_t len); +u32 btrfs_csum_data(const char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, u8 *result); -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset, +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset, void *private_data, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); -int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +void btrfs_wait_tree_block_writeback(struct extent_buffer *buf); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 340d90751263..fa66980726c9 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -30,7 +30,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, len = BTRFS_FID_SIZE_NON_CONNECTABLE; type = FILEID_BTRFS_WITHOUT_PARENT; - fid->objectid = btrfs_ino(inode); + fid->objectid = btrfs_ino(BTRFS_I(inode)); fid->root_objectid = BTRFS_I(inode)->root->objectid; fid->gen = inode->i_generation; @@ -166,13 +166,13 @@ static struct dentry *btrfs_get_parent(struct dentry *child) if (!path) return ERR_PTR(-ENOMEM); - if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = root->root_key.objectid; key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { - key.objectid = btrfs_ino(dir); + key.objectid = btrfs_ino(BTRFS_I(dir)); key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; } @@ -235,13 +235,10 @@ static int btrfs_get_name(struct dentry *parent, char *name, int ret; u64 ino; - if (!dir || !inode) - return -EINVAL; - if (!S_ISDIR(dir->i_mode)) return -EINVAL; - ino = btrfs_ino(inode); + ino = btrfs_ino(BTRFS_I(inode)); path = btrfs_alloc_path(); if (!path) @@ -255,7 +252,7 @@ static int btrfs_get_name(struct dentry *parent, char *name, root = fs_info->tree_root; } else { key.objectid = ino; - key.offset = btrfs_ino(dir); + key.offset = btrfs_ino(BTRFS_I(dir)); key.type = BTRFS_INODE_REF_KEY; } @@ -285,6 +282,11 @@ static int btrfs_get_name(struct dentry *parent, char *name, name_len = btrfs_inode_ref_name_len(leaf, iref); } + ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len); + if (!ret) { + btrfs_free_path(path); + return -EIO; + } read_extent_buffer(leaf, name, name_ptr, name_len); btrfs_free_path(path); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dcd2e798767e..375f8c728d91 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> @@ -96,10 +97,11 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -static int __reserve_metadata_bytes(struct btrfs_root *root, +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush); + enum btrfs_reserve_flush_enum flush, + bool system_chunk); static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); @@ -130,6 +132,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); + + /* + * If not empty, someone is still holding mutex of + * full_stripe_lock, which can only be released by caller. + * And it will definitely cause use-after-free when caller + * tries to release full stripe lock. + * + * No better way to resolve, but only to warn. + */ + WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl); kfree(cache); } @@ -315,14 +327,14 @@ get_caching_control(struct btrfs_block_group_cache *cache) } ctl = cache->caching_ctl; - atomic_inc(&ctl->count); + refcount_inc(&ctl->count); spin_unlock(&cache->lock); return ctl; } static void put_caching_control(struct btrfs_caching_control *ctl) { - if (atomic_dec_and_test(&ctl->count)) + if (refcount_dec_and_test(&ctl->count)) kfree(ctl); } @@ -598,7 +610,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; - atomic_set(&caching_ctl->count, 1); + refcount_set(&caching_ctl->count, 1); btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, caching_thread, NULL, NULL); @@ -619,7 +631,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_caching_control *ctl; ctl = cache->caching_ctl; - atomic_inc(&ctl->count); + refcount_inc(&ctl->count); prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&cache->lock); @@ -706,7 +718,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } down_write(&fs_info->commit_root_sem); - atomic_inc(&caching_ctl->count); + refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->commit_root_sem); @@ -755,6 +767,26 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, return NULL; } +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + /* * after adding space to the filesystem, we need to clear the full flags * on all the space infos. @@ -888,10 +920,10 @@ search_again: delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -1035,10 +1067,11 @@ out_free: #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 static int convert_extent_item_v0(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 owner, u32 extra_size) { + struct btrfs_root *root = fs_info->extent_root; struct btrfs_extent_item *item; struct btrfs_extent_item_v0 *ei0; struct btrfs_extent_ref_v0 *ref0; @@ -1092,7 +1125,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, return ret; BUG_ON(ret); /* Corruption */ - btrfs_extend_item(root->fs_info, path, new_size); + btrfs_extend_item(fs_info, path, new_size); leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1151,12 +1184,13 @@ static int match_extent_data_ref(struct extent_buffer *leaf, } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + struct btrfs_root *root = fs_info->extent_root; struct btrfs_key key; struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; @@ -1238,12 +1272,13 @@ fail: } static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add) { + struct btrfs_root *root = fs_info->extent_root; struct btrfs_key key; struct extent_buffer *leaf; u32 size; @@ -1317,7 +1352,7 @@ fail: } static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, int refs_to_drop, int *last_ref) { @@ -1354,7 +1389,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, num_refs -= refs_to_drop; if (num_refs == 0) { - ret = btrfs_del_item(trans, root, path); + ret = btrfs_del_item(trans, fs_info->extent_root, path); *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) @@ -1416,11 +1451,12 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, } static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid) { + struct btrfs_root *root = fs_info->extent_root; struct btrfs_key key; int ret; @@ -1449,7 +1485,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, } static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid) @@ -1466,7 +1502,8 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, key.offset = root_objectid; } - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, + path, &key, 0); btrfs_release_path(path); return ret; } @@ -1524,14 +1561,14 @@ static int find_next_key(struct btrfs_path *path, int level, */ static noinline_for_stack int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_extent_inline_ref **ref_ret, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int insert) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *root = fs_info->extent_root; struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1614,7 +1651,7 @@ again: err = -ENOENT; goto out; } - ret = convert_extent_item_v0(trans, root, path, owner, + ret = convert_extent_item_v0(trans, fs_info, path, owner, extra_size); if (ret < 0) { err = ret; @@ -1716,7 +1753,7 @@ out: * helper to add new inline back ref */ static noinline_for_stack -void setup_inline_extent_backref(struct btrfs_root *root, +void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, u64 parent, u64 root_objectid, @@ -1739,7 +1776,7 @@ void setup_inline_extent_backref(struct btrfs_root *root, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - btrfs_extend_item(root->fs_info, path, size); + btrfs_extend_item(fs_info, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1777,7 +1814,7 @@ void setup_inline_extent_backref(struct btrfs_root *root, } static int lookup_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_extent_inline_ref **ref_ret, u64 bytenr, u64 num_bytes, u64 parent, @@ -1785,7 +1822,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, { int ret; - ret = lookup_inline_extent_backref(trans, root, path, ref_ret, + ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret, bytenr, num_bytes, parent, root_objectid, owner, offset, 0); if (ret != -ENOENT) @@ -1795,11 +1832,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, *ref_ret = NULL; if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, - root_objectid); + ret = lookup_tree_block_ref(trans, fs_info, path, bytenr, + parent, root_objectid); } else { - ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, - root_objectid, owner, offset); + ret = lookup_extent_data_ref(trans, fs_info, path, bytenr, + parent, root_objectid, owner, + offset); } return ret; } @@ -1808,7 +1846,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, * helper to update/remove inline back ref */ static noinline_for_stack -void update_inline_extent_backref(struct btrfs_root *root, +void update_inline_extent_backref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, @@ -1866,14 +1904,14 @@ void update_inline_extent_backref(struct btrfs_root *root, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - btrfs_truncate_item(root->fs_info, path, item_size, 1); + btrfs_truncate_item(fs_info, path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); } static noinline_for_stack int insert_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, @@ -1883,15 +1921,15 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; int ret; - ret = lookup_inline_extent_backref(trans, root, path, &iref, + ret = lookup_inline_extent_backref(trans, fs_info, path, &iref, bytenr, num_bytes, parent, root_objectid, owner, offset, 1); if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - update_inline_extent_backref(root, path, iref, + update_inline_extent_backref(fs_info, path, iref, refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { - setup_inline_extent_backref(root, path, iref, parent, + setup_inline_extent_backref(fs_info, path, iref, parent, root_objectid, owner, offset, refs_to_add, extent_op); ret = 0; @@ -1900,7 +1938,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, } static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add) @@ -1908,10 +1946,10 @@ static int insert_extent_backref(struct btrfs_trans_handle *trans, int ret; if (owner < BTRFS_FIRST_FREE_OBJECTID) { BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, root, path, bytenr, + ret = insert_tree_block_ref(trans, fs_info, path, bytenr, parent, root_objectid); } else { - ret = insert_extent_data_ref(trans, root, path, bytenr, + ret = insert_extent_data_ref(trans, fs_info, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); } @@ -1919,7 +1957,7 @@ static int insert_extent_backref(struct btrfs_trans_handle *trans, } static int remove_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data, int *last_ref) @@ -1928,14 +1966,14 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (iref) { - update_inline_extent_backref(root, path, iref, + update_inline_extent_backref(fs_info, path, iref, -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop, + ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop, last_ref); } else { *last_ref = 1; - ret = btrfs_del_item(trans, root, path); + ret = btrfs_del_item(trans, fs_info->extent_root, path); } return ret; } @@ -2075,6 +2113,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + int old_ref_mod, new_ref_mod; int ret; BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && @@ -2082,15 +2121,21 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, parent, root_objectid, - owner, offset, 0, - BTRFS_ADD_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, owner, offset, + 0, BTRFS_ADD_DELAYED_REF, + &old_ref_mod, &new_ref_mod); } + + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) + add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid); + return ret; } @@ -2117,9 +2162,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, - bytenr, num_bytes, parent, - root_objectid, owner, offset, + ret = insert_inline_extent_backref(trans, fs_info, path, bytenr, + num_bytes, parent, root_objectid, + owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; @@ -2143,9 +2188,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ - ret = insert_extent_backref(trans, fs_info->extent_root, - path, bytenr, parent, root_objectid, - owner, offset, refs_to_add); + ret = insert_extent_backref(trans, fs_info, path, bytenr, parent, + root_objectid, owner, offset, refs_to_add); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -2290,8 +2334,7 @@ again: item_size = btrfs_item_size_nr(leaf, path->slots[0]); #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (item_size < sizeof(*ei)) { - ret = convert_extent_item_v0(trans, fs_info->extent_root, - path, (u64)-1, 0); + ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0); if (ret < 0) { err = ret; goto out; @@ -2396,6 +2439,16 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, head = btrfs_delayed_node_to_head(node); trace_run_delayed_ref_head(fs_info, node, head, node->action); + if (head->total_ref_mod < 0) { + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(fs_info, node->bytenr); + ASSERT(cache); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + -node->num_bytes); + btrfs_put_block_group(cache); + } + if (insert_reserved) { btrfs_pin_extent(fs_info, node->bytenr, node->num_bytes, 1); @@ -2975,7 +3028,7 @@ again: struct btrfs_delayed_ref_node *ref; ref = &head->node; - atomic_inc(&ref->refs); + refcount_inc(&ref->refs); spin_unlock(&delayed_refs->lock); /* @@ -2998,7 +3051,6 @@ again: goto again; } out: - assert_qgroups_uptodate(trans); trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -3028,8 +3080,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return ret; } -static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static noinline int check_delayed_ref(struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 offset, u64 bytenr) { @@ -3037,18 +3088,23 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_transaction *cur_trans; int ret = 0; - delayed_refs = &trans->transaction->delayed_refs; + cur_trans = root->fs_info->running_transaction; + if (!cur_trans) + return 0; + + delayed_refs = &cur_trans->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); return 0; } if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -3090,8 +3146,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline int check_committed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 offset, u64 bytenr) { @@ -3162,9 +3217,8 @@ out: return ret; } -int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr) +int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, + u64 bytenr) { struct btrfs_path *path; int ret; @@ -3175,12 +3229,12 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, return -ENOENT; do { - ret = check_committed_ref(trans, root, path, objectid, + ret = check_committed_ref(root, path, objectid, offset, bytenr); if (ret && ret != -ENOENT) goto out; - ret2 = check_delayed_ref(trans, root, path, objectid, + ret2 = check_delayed_ref(root, path, objectid, offset, bytenr); } while (ret2 == -EAGAIN); @@ -3348,6 +3402,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; u64 num_pages = 0; @@ -3368,7 +3423,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, if (trans->aborted) return 0; again: - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(fs_info, block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); btrfs_release_path(path); @@ -3382,7 +3437,8 @@ again: if (block_group->ro) goto out_free; - ret = create_free_space_inode(root, trans, block_group, path); + ret = create_free_space_inode(fs_info, trans, block_group, + path); if (ret) goto out_free; goto again; @@ -3424,7 +3480,7 @@ again: if (ret) goto out_put; - ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); + ret = btrfs_truncate_free_space_cache(trans, NULL, inode); if (ret) goto out_put; } @@ -3435,7 +3491,8 @@ again: /* * don't bother trying to write stuff out _if_ * a) we're not cached, - * b) we're with nospace_cache mount option. + * b) we're with nospace_cache mount option, + * c) we're with v2 space_cache (FREE_SPACE_TREE). */ dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); @@ -3465,7 +3522,7 @@ again: num_pages *= 16; num_pages *= PAGE_SIZE; - ret = btrfs_check_data_free_space(inode, 0, num_pages); + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); if (ret) goto out_put; @@ -3496,6 +3553,7 @@ out: block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); + extent_changeset_free(data_reserved); return ret; } @@ -3906,87 +3964,83 @@ static const char *alloc_name(u64 flags) }; } -static int update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, - struct btrfs_space_info **space_info) +static int create_space_info(struct btrfs_fs_info *info, u64 flags, + struct btrfs_space_info **new) { - struct btrfs_space_info *found; + + struct btrfs_space_info *space_info; int i; - int factor; int ret; - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - - found = __find_space_info(info, flags); - if (found) { - spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - if (total_bytes > 0) - found->full = 0; - space_info_add_new_bytes(info, found, total_bytes - - bytes_used - bytes_readonly); - spin_unlock(&found->lock); - *space_info = found; - return 0; - } - found = kzalloc(sizeof(*found), GFP_NOFS); - if (!found) + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); + ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, + GFP_KERNEL); if (ret) { - kfree(found); + kfree(space_info); return ret; } for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - INIT_LIST_HEAD(&found->block_groups[i]); - init_rwsem(&found->groups_sem); - spin_lock_init(&found->lock); - found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - found->total_bytes = total_bytes; - found->disk_total = total_bytes * factor; - found->bytes_used = bytes_used; - found->disk_used = bytes_used * factor; - found->bytes_pinned = 0; - found->bytes_reserved = 0; - found->bytes_readonly = bytes_readonly; - found->bytes_may_use = 0; - found->full = 0; - found->max_extent_size = 0; - found->force_alloc = CHUNK_ALLOC_NO_FORCE; - found->chunk_alloc = 0; - found->flush = 0; - init_waitqueue_head(&found->wait); - INIT_LIST_HEAD(&found->ro_bgs); - INIT_LIST_HEAD(&found->tickets); - INIT_LIST_HEAD(&found->priority_tickets); - - ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + INIT_LIST_HEAD(&space_info->block_groups[i]); + init_rwsem(&space_info->groups_sem); + spin_lock_init(&space_info->lock); + space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + init_waitqueue_head(&space_info->wait); + INIT_LIST_HEAD(&space_info->ro_bgs); + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, info->space_info_kobj, "%s", - alloc_name(found->flags)); + alloc_name(space_info->flags)); if (ret) { - kfree(found); + percpu_counter_destroy(&space_info->total_bytes_pinned); + kfree(space_info); return ret; } - *space_info = found; - list_add_rcu(&found->list, &info->space_info); + *new = space_info; + list_add_rcu(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) - info->data_sinfo = found; + info->data_sinfo = space_info; return ret; } +static void update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int factor; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + + found = __find_space_info(info, flags); + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; + if (total_bytes > 0) + found->full = 0; + space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); + spin_unlock(&found->lock); + *space_info = found; +} + static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = chunk_to_extended(flags) & @@ -4102,7 +4156,7 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) return btrfs_reduce_alloc_profile(fs_info, flags); } -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; u64 flags; @@ -4119,10 +4173,34 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } -int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) +u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); +} + +u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); +} + +u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +} + +static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, + bool may_use_included) +{ + ASSERT(s_info); + return s_info->bytes_used + s_info->bytes_reserved + + s_info->bytes_pinned + s_info->bytes_readonly + + (may_use_included ? s_info->bytes_may_use : 0); +} + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 used; int ret = 0; @@ -4144,9 +4222,7 @@ int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) again: /* make sure we have enough space to handle the data first */ spin_lock(&data_sinfo->lock); - used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + - data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + - data_sinfo->bytes_may_use; + used = btrfs_space_info_used(data_sinfo, true); if (used + bytes > data_sinfo->total_bytes) { struct btrfs_trans_handle *trans; @@ -4161,7 +4237,7 @@ again: data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; spin_unlock(&data_sinfo->lock); alloc: - alloc_target = btrfs_get_alloc_profile(root, 1); + alloc_target = btrfs_data_alloc_profile(fs_info); /* * It is ugly that we don't call nolock join * transaction for the free space inode case here. @@ -4212,7 +4288,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1, 0, + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } @@ -4252,12 +4328,8 @@ commit_trans: return ret; } -/* - * New check_data_free_space() with ability for precious data reservation - * Will replace old btrfs_check_data_free_space(), but for patch split, - * add a new function first and then replace it. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; @@ -4267,14 +4339,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - ret = btrfs_alloc_data_chunk_ondemand(inode, len); + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); if (ret < 0) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, start, len); - if (ret) + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; return ret; } @@ -4315,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4325,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) start = round_down(start, root->fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); } static void force_metadata_allocation(struct btrfs_fs_info *info) @@ -4421,9 +4496,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); - left = info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly - - info->bytes_may_use; + left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); num_devs = get_profile_num_devs(fs_info, type); @@ -4439,9 +4512,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, } if (left < thresh) { - u64 flags; + u64 flags = btrfs_system_alloc_profile(fs_info); - flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0); /* * Ignore failure to create system chunk. We might end up not * needing it, as we might not need to COW all nodes/leafs from @@ -4482,10 +4554,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info = __find_space_info(fs_info, flags); if (!space_info) { - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); - BUG_ON(ret); /* -ENOMEM */ + ret = create_space_info(fs_info, flags, &space_info); + if (ret) + return ret; } - BUG_ON(!space_info); /* Logic error */ again: spin_lock(&space_info->lock); @@ -4590,11 +4662,11 @@ out: return ret; } -static int can_overcommit(struct btrfs_root *root, +static int can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 profile; u64 space_size; @@ -4605,9 +4677,12 @@ static int can_overcommit(struct btrfs_root *root, if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) return 0; - profile = btrfs_get_alloc_profile(root, 0); - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly; + if (system_chunk) + profile = btrfs_system_alloc_profile(fs_info); + else + profile = btrfs_metadata_alloc_profile(fs_info); + + used = btrfs_space_info_used(space_info, false); /* * We only want to allow over committing if we have lots of actual space @@ -4623,9 +4698,7 @@ static int can_overcommit(struct btrfs_root *root, used += space_info->bytes_may_use; - spin_lock(&fs_info->free_chunk_lock); - avail = fs_info->free_chunk_space; - spin_unlock(&fs_info->free_chunk_lock); + avail = atomic64_read(&fs_info->free_chunk_space); /* * If we have dup, raid1 or raid10 then only half of the free @@ -4675,14 +4748,14 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, } } -static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, +static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { u64 bytes; - int nr; + u64 nr; bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - nr = (int)div64_u64(to_reclaim, bytes); + nr = div64_u64(to_reclaim, bytes); if (!nr) nr = 1; return nr; @@ -4693,24 +4766,23 @@ static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, + u64 orig, bool wait_ordered) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 max_reclaim; + u64 items; long time_left; unsigned long nr_pages; int loops; - int items; enum btrfs_reserve_flush_enum flush; /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &fs_info->delalloc_block_rsv; @@ -4753,7 +4825,7 @@ skip_async: else flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(root, space_info, orig, flush)) { + if (can_overcommit(fs_info, space_info, orig, flush, false)) { spin_unlock(&space_info->lock); break; } @@ -4787,11 +4859,10 @@ skip_async: * get us somewhere and then commit the transaction if it does. Otherwise it * will return -ENOSPC. */ -static int may_commit_transaction(struct btrfs_root *root, +static int may_commit_transaction(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, int force) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; struct btrfs_trans_handle *trans; @@ -4816,14 +4887,14 @@ static int may_commit_transaction(struct btrfs_root *root, spin_lock(&delayed_rsv->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes - delayed_rsv->size) >= 0) { + bytes - delayed_rsv->size) < 0) { spin_unlock(&delayed_rsv->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); commit: - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return -ENOSPC; @@ -4837,11 +4908,11 @@ struct reserve_ticket { wait_queue_head_t wait; }; -static int flush_space(struct btrfs_root *root, +static int flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, u64 orig_bytes, int state) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; int nr; int ret = 0; @@ -4864,7 +4935,7 @@ static int flush_space(struct btrfs_root *root, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes * 2, orig_bytes, + shrink_delalloc(fs_info, num_bytes * 2, orig_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -4874,14 +4945,15 @@ static int flush_space(struct btrfs_root *root, break; } ret = do_chunk_alloc(trans, fs_info, - btrfs_get_alloc_profile(root, 0), + btrfs_metadata_alloc_profile(fs_info), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans); if (ret > 0 || ret == -ENOSPC) ret = 0; break; case COMMIT_TRANS: - ret = may_commit_transaction(root, space_info, orig_bytes, 0); + ret = may_commit_transaction(fs_info, space_info, + orig_bytes, 0); break; default: ret = -ENOSPC; @@ -4894,8 +4966,9 @@ static int flush_space(struct btrfs_root *root, } static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, - struct btrfs_space_info *space_info) +btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool system_chunk) { struct reserve_ticket *ticket; u64 used; @@ -4910,14 +4983,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) return 0; - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; - if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) + used = btrfs_space_info_used(space_info, true); + + if (can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -4931,17 +5004,18 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, return to_reclaim; } -static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_root *root, u64 used) +static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used, bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 thresh = div_factor_fine(space_info->total_bytes, 98); /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - if (!btrfs_calc_reclaim_metadata_size(root, space_info)) + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, + system_chunk)) return 0; return (used >= thresh && !btrfs_fs_closing(fs_info) && @@ -4978,8 +5052,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); if (!to_reclaim) { space_info->flush = 0; spin_unlock(&space_info->lock); @@ -4993,16 +5067,17 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) struct reserve_ticket *ticket; int ret; - ret = flush_space(fs_info->fs_root, space_info, to_reclaim, - to_reclaim, flush_state); + ret = flush_space(fs_info, space_info, to_reclaim, to_reclaim, + flush_state); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; spin_unlock(&space_info->lock); return; } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, + space_info, + false); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); if (last_tickets_id == space_info->tickets_id) { @@ -5040,8 +5115,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int flush_state = FLUSH_DELAYED_ITEMS_NR; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); if (!to_reclaim) { spin_unlock(&space_info->lock); return; @@ -5049,8 +5124,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); do { - flush_space(fs_info->fs_root, space_info, to_reclaim, - to_reclaim, flush_state); + flush_space(fs_info, space_info, to_reclaim, to_reclaim, + flush_state); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { @@ -5120,12 +5195,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already. */ -static int __reserve_metadata_bytes(struct btrfs_root *root, +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { - struct btrfs_fs_info *fs_info = root->fs_info; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -5135,9 +5210,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, spin_lock(&space_info->lock); ret = -ENOSPC; - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = btrfs_space_info_used(space_info, true); /* * If we have enough space then hooray, make our reservation and carry @@ -5149,7 +5222,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; - } else if (can_overcommit(root, space_info, orig_bytes, flush)) { + } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); @@ -5176,7 +5250,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, orig_bytes, flush, "enospc"); queue_work(system_unbound_wq, - &root->fs_info->async_reclaim_work); + &fs_info->async_reclaim_work); } } else { list_add_tail(&ticket.list, @@ -5190,7 +5264,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(space_info, root, used) && + need_do_async_reclaim(fs_info, space_info, + used, system_chunk) && !work_busy(&fs_info->async_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); @@ -5248,9 +5323,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; + bool system_chunk = (root == fs_info->chunk_root); - ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, - flush); + ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, + orig_bytes, flush, system_chunk); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ -5359,9 +5435,7 @@ static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, * overcommit, and if we can't then we just need to free up our space * and not satisfy any requests. */ - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = btrfs_space_info_used(space_info, true); if (used - num_bytes >= space_info->total_bytes) check_overcommit = true; again: @@ -5373,8 +5447,7 @@ again: * adding the ticket space would be a double count. */ if (check_overcommit && - !can_overcommit(fs_info->extent_root, space_info, 0, - flush)) + !can_overcommit(fs_info, space_info, 0, flush, false)) break; if (num_bytes >= ticket->bytes) { list_del_init(&ticket->list); @@ -5630,9 +5703,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->size = min_t(u64, num_bytes, SZ_512M); if (block_rsv->reserved < block_rsv->size) { - num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + - sinfo->bytes_reserved + sinfo->bytes_readonly + - sinfo->bytes_may_use; + num_bytes = btrfs_space_info_used(sinfo, true); if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; num_bytes = min(num_bytes, @@ -5735,10 +5806,10 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) /* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; /* * We always use trans->block_rsv here as we will have reserved space * for our orphan when starting the transaction, using get_block_rsv() @@ -5755,19 +5826,19 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, */ u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - trace_btrfs_space_reservation(fs_info, "orphan", - btrfs_ino(inode), num_bytes, 1); + trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), + num_bytes, 1); return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); } -void btrfs_orphan_release_metadata(struct inode *inode) +void btrfs_orphan_release_metadata(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - trace_btrfs_space_reservation(fs_info, "orphan", - btrfs_ino(inode), num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), + num_bytes, 0); btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes); } @@ -5799,7 +5870,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { /* One for parent inode, two for dir entries */ num_bytes = 3 * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta(root, num_bytes); + ret = btrfs_qgroup_reserve_meta(root, num_bytes, true); if (ret) return ret; } else { @@ -5824,8 +5895,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, } void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv, - u64 qgroup_reserved) + struct btrfs_block_rsv *rsv) { btrfs_block_rsv_release(fs_info, rsv, (u64)-1); } @@ -5840,35 +5910,32 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, * reserved extents that need to be freed. This must be called with * BTRFS_I(inode)->lock held. */ -static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) +static unsigned drop_outstanding_extent(struct btrfs_inode *inode, + u64 num_bytes) { unsigned drop_inode_space = 0; unsigned dropped_extents = 0; - unsigned num_extents = 0; + unsigned num_extents; - num_extents = (unsigned)div64_u64(num_bytes + - BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE); + num_extents = count_max_extents(num_bytes); ASSERT(num_extents); - ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); - BTRFS_I(inode)->outstanding_extents -= num_extents; + ASSERT(inode->outstanding_extents >= num_extents); + inode->outstanding_extents -= num_extents; - if (BTRFS_I(inode)->outstanding_extents == 0 && + if (inode->outstanding_extents == 0 && test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) drop_inode_space = 1; /* * If we have more or the same amount of outstanding extents than we have * reserved then we need to leave the reserved extents count alone. */ - if (BTRFS_I(inode)->outstanding_extents >= - BTRFS_I(inode)->reserved_extents) + if (inode->outstanding_extents >= inode->reserved_extents) return drop_inode_space; - dropped_extents = BTRFS_I(inode)->reserved_extents - - BTRFS_I(inode)->outstanding_extents; - BTRFS_I(inode)->reserved_extents -= dropped_extents; + dropped_extents = inode->reserved_extents - inode->outstanding_extents; + inode->reserved_extents -= dropped_extents; return dropped_extents + drop_inode_space; } @@ -5890,24 +5957,21 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) * * This must be called with BTRFS_I(inode)->lock held. */ -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, +static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes, int reserve) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 old_csums, num_csums; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && - BTRFS_I(inode)->csum_bytes == 0) + if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) return 0; - old_csums = btrfs_csum_bytes_to_leaves(fs_info, - BTRFS_I(inode)->csum_bytes); + old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); if (reserve) - BTRFS_I(inode)->csum_bytes += num_bytes; + inode->csum_bytes += num_bytes; else - BTRFS_I(inode)->csum_bytes -= num_bytes; - num_csums = btrfs_csum_bytes_to_leaves(fs_info, - BTRFS_I(inode)->csum_bytes); + inode->csum_bytes -= num_bytes; + num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); /* No change, no need to reserve more */ if (old_csums == num_csums) @@ -5920,14 +5984,14 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); } -int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; u64 to_reserve = 0; u64 csum_bytes; - unsigned nr_extents = 0; + unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; @@ -5955,31 +6019,28 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) schedule_timeout(1); if (delalloc_lock) - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + mutex_lock(&inode->delalloc_mutex); num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - spin_lock(&BTRFS_I(inode)->lock); - nr_extents = (unsigned)div64_u64(num_bytes + - BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE); - BTRFS_I(inode)->outstanding_extents += nr_extents; + spin_lock(&inode->lock); + nr_extents = count_max_extents(num_bytes); + inode->outstanding_extents += nr_extents; nr_extents = 0; - if (BTRFS_I(inode)->outstanding_extents > - BTRFS_I(inode)->reserved_extents) - nr_extents += BTRFS_I(inode)->outstanding_extents - - BTRFS_I(inode)->reserved_extents; + if (inode->outstanding_extents > inode->reserved_extents) + nr_extents += inode->outstanding_extents - + inode->reserved_extents; /* We always want to reserve a slot for updating the inode. */ to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); - csum_bytes = BTRFS_I(inode)->csum_bytes; - spin_unlock(&BTRFS_I(inode)->lock); + csum_bytes = inode->csum_bytes; + spin_unlock(&inode->lock); if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { ret = btrfs_qgroup_reserve_meta(root, - nr_extents * fs_info->nodesize); + nr_extents * fs_info->nodesize, true); if (ret) goto out_fail; } @@ -5991,17 +6052,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) goto out_fail; } - spin_lock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) { + &inode->runtime_flags)) { to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); release_extra = true; } - BTRFS_I(inode)->reserved_extents += nr_extents; - spin_unlock(&BTRFS_I(inode)->lock); + inode->reserved_extents += nr_extents; + spin_unlock(&inode->lock); if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + mutex_unlock(&inode->delalloc_mutex); if (to_reserve) trace_btrfs_space_reservation(fs_info, "delalloc", @@ -6012,17 +6073,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) return 0; out_fail: - spin_lock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); dropped = drop_outstanding_extent(inode, num_bytes); /* * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers * so we can just reduce our inodes csum bytes and carry on. */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) { + if (inode->csum_bytes == csum_bytes) { calc_csum_metadata_size(inode, num_bytes, 0); } else { - u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 orig_csum_bytes = inode->csum_bytes; u64 bytes; /* @@ -6033,8 +6094,8 @@ out_fail: * number of bytes that were freed while we were trying our * reservation. */ - bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; - BTRFS_I(inode)->csum_bytes = csum_bytes; + bytes = csum_bytes - inode->csum_bytes; + inode->csum_bytes = csum_bytes; to_free = calc_csum_metadata_size(inode, bytes, 0); @@ -6043,7 +6104,7 @@ out_fail: * been making this reservation and our ->csum_bytes were not * artificially inflated. */ - BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + inode->csum_bytes = csum_bytes - num_bytes; bytes = csum_bytes - orig_csum_bytes; bytes = calc_csum_metadata_size(inode, bytes, 0); @@ -6055,13 +6116,13 @@ out_fail: * need to do anything, the other free-ers did the correct * thing. */ - BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + inode->csum_bytes = orig_csum_bytes - num_bytes; if (bytes > to_free) to_free = bytes - to_free; else to_free = 0; } - spin_unlock(&BTRFS_I(inode)->lock); + spin_unlock(&inode->lock); if (dropped) to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); @@ -6071,7 +6132,7 @@ out_fail: btrfs_ino(inode), to_free, 0); } if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + mutex_unlock(&inode->delalloc_mutex); return ret; } @@ -6084,27 +6145,27 @@ out_fail: * once we complete IO for a given set of bytes to release their metadata * reservations. */ -void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 to_free = 0; unsigned dropped; num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - spin_lock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); dropped = drop_outstanding_extent(inode, num_bytes); if (num_bytes) to_free = calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); + spin_unlock(&inode->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); if (btrfs_is_testing(fs_info)) return; - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), to_free, 0); + trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), + to_free, 0); btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); } @@ -6115,6 +6176,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) * @inode: inode we're writing to * @start: start range we are writing to * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. * * This will do the following things * @@ -6132,16 +6195,17 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) * Return 0 for success * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(inode, len); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); if (ret < 0) - btrfs_free_reserved_data_space(inode, start, len); + btrfs_free_reserved_data_space(inode, *reserved, start, len); return ret; } @@ -6160,10 +6224,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) * list if there are no delalloc bytes left. * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { - btrfs_delalloc_release_metadata(inode, len); - btrfs_free_reserved_data_space(inode, start, len); + btrfs_delalloc_release_metadata(BTRFS_I(inode), len); + btrfs_free_reserved_data_space(inode, reserved, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, @@ -6239,6 +6304,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(info, "pinned", cache->space_info->flags, num_bytes, 1); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + num_bytes); set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -6315,6 +6382,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "pinned", cache->space_info->flags, num_bytes, 1); + percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes); set_extent_dirty(fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -6561,8 +6629,7 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, spin_unlock(&space_info->lock); return ret; } -void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) { struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; @@ -6786,27 +6853,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, return 0; } -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, - u64 owner, u64 root_objectid) -{ - struct btrfs_space_info *space_info; - u64 flags; - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - } else { - flags = BTRFS_BLOCK_GROUP_DATA; - } - - space_info = __find_space_info(fs_info, flags); - BUG_ON(!space_info); /* Logic bug */ - percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); -} - - static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info, struct btrfs_delayed_ref_node *node, u64 parent, @@ -6845,7 +6891,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) skinny_metadata = 0; - ret = lookup_extent_backref(trans, extent_root, path, &iref, + ret = lookup_extent_backref(trans, info, path, &iref, bytenr, num_bytes, parent, root_objectid, owner_objectid, owner_offset); @@ -6877,8 +6923,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, #endif if (!found_extent) { BUG_ON(iref); - ret = remove_extent_backref(trans, extent_root, path, - NULL, refs_to_drop, + ret = remove_extent_backref(trans, info, path, NULL, + refs_to_drop, is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -6953,8 +6999,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (item_size < sizeof(*ei)) { BUG_ON(found_extent || extent_slot != path->slots[0]); - ret = convert_extent_item_v0(trans, extent_root, path, - owner_objectid, 0); + ret = convert_extent_item_v0(trans, info, path, owner_objectid, + 0); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -7021,7 +7067,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); } if (found_extent) { - ret = remove_extent_backref(trans, extent_root, path, + ret = remove_extent_backref(trans, info, path, iref, refs_to_drop, is_data, &last_ref); if (ret) { @@ -7029,8 +7075,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } - add_pinned_bytes(info, -num_bytes, owner_objectid, - root_objectid); } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != @@ -7095,7 +7139,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (!head) goto out_delayed_unlock; @@ -7162,19 +7206,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, - buf->start, buf->len, - parent, + int old_ref_mod, new_ref_mod; + + ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, + buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL); + BTRFS_DROP_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); BUG_ON(ret); /* -ENOMEM */ + pin = old_ref_mod >= 0 && new_ref_mod < 0; } - if (!last_ref) - return; - - if (btrfs_header_generation(buf) == trans->transid) { + if (last_ref && btrfs_header_generation(buf) == trans->transid) { struct btrfs_block_group_cache *cache; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -7183,6 +7227,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } + pin = 0; cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -7198,18 +7243,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); - pin = 0; } out: if (pin) add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), root->root_key.objectid); - /* - * Deleting the buffer, clear the corrupt flag since it doesn't matter - * anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + if (last_ref) { + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + } } /* Can return -ENOMEM */ @@ -7218,12 +7264,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; - add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); /* * tree log blocks never actually go into the extent allocation @@ -7233,19 +7279,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); + old_ref_mod = new_ref_mod = 0; ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL, + &old_ref_mod, &new_ref_mod); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, - offset, 0, - BTRFS_DROP_DELAYED_REF, NULL); + num_bytes, parent, + root_objectid, owner, offset, + 0, BTRFS_DROP_DELAYED_REF, + &old_ref_mod, &new_ref_mod); } + + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) + add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); + return ret; } @@ -7419,12 +7471,11 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, * If there is no suitable free space, we will record the max size of * the free space extent currently. */ -static noinline int find_free_extent(struct btrfs_root *orig_root, +static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 ram_bytes, u64 num_bytes, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, u64 flags, int delalloc) { - struct btrfs_fs_info *fs_info = orig_root->fs_info; int ret = 0; struct btrfs_root *root = fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; @@ -7716,18 +7767,20 @@ unclustered_alloc: last_ptr->fragmented = 1; spin_unlock(&last_ptr->lock); } - spin_lock(&block_group->free_space_ctl->tree_lock); - if (cached && - block_group->free_space_ctl->free_space < - num_bytes + empty_cluster + empty_size) { - if (block_group->free_space_ctl->free_space > - max_extent_size) - max_extent_size = - block_group->free_space_ctl->free_space; - spin_unlock(&block_group->free_space_ctl->tree_lock); - goto loop; + if (cached) { + struct btrfs_free_space_ctl *ctl = + block_group->free_space_ctl; + + spin_lock(&ctl->tree_lock); + if (ctl->free_space < + num_bytes + empty_cluster + empty_size) { + if (ctl->free_space > max_extent_size) + max_extent_size = ctl->free_space; + spin_unlock(&ctl->tree_lock); + goto loop; + } + spin_unlock(&ctl->tree_lock); } - spin_unlock(&block_group->free_space_ctl->tree_lock); offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size, @@ -7908,9 +7961,8 @@ static void dump_space_info(struct btrfs_fs_info *fs_info, spin_lock(&info->lock); btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", info->flags, - info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly - - info->bytes_may_use, (info->full) ? "" : "not "); + info->total_bytes - btrfs_space_info_used(info, true), + info->full ? "" : "not "); btrfs_info(fs_info, "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", info->total_bytes, info->bytes_used, info->bytes_pinned, @@ -7948,10 +8000,10 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 flags; int ret; - flags = btrfs_get_alloc_profile(root, is_data); + flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, + ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); @@ -8192,10 +8244,9 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, - ins->offset, 0, - root_objectid, owner, offset, - ram_bytes, BTRFS_ADD_DELAYED_EXTENT, - NULL); + ins->offset, 0, root_objectid, owner, + offset, ram_bytes, + BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); return ret; } @@ -8256,7 +8307,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); - clean_tree_block(trans, fs_info, buf); + clean_tree_block(fs_info, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); @@ -8351,10 +8402,11 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size) + struct btrfs_root *root, + u64 parent, u64 root_objectid, + const struct btrfs_disk_key *key, + int level, u64 hint, + u64 empty_size) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; @@ -8414,11 +8466,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; - ret = btrfs_add_delayed_tree_ref(fs_info, trans, - ins.objectid, ins.offset, - parent, root_objectid, level, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, + ins.offset, parent, + root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + extent_op, NULL, NULL); if (ret) goto out_free_delayed; } @@ -8876,7 +8928,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - clean_tree_block(trans, fs_info, eb); + clean_tree_block(fs_info, eb); } if (eb == root->node) { @@ -9346,8 +9398,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) num_bytes = cache->key.offset - cache->reserved - cache->pinned - cache->bytes_super - btrfs_block_group_used(&cache->item); - if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + if (btrfs_space_info_used(sinfo, true) + num_bytes + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; cache->ro++; @@ -9360,17 +9411,16 @@ out: return ret; } -int btrfs_inc_block_group_ro(struct btrfs_root *root, +int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 alloc_flags; int ret; again: - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -9557,9 +9607,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) * all of the extents from this block group. If we can, we're good */ if ((space_info->total_bytes != block_group->key.offset) && - (space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - min_free < space_info->total_bytes)) { + (btrfs_space_info_used(space_info, false) + min_free < + space_info->total_bytes)) { spin_unlock(&space_info->lock); goto out; } @@ -9742,6 +9791,11 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) } } +/* + * Must be called only after stopping all workers, since we could have block + * group caching kthreads running, and therefore they could race with us if we + * freed the block groups before stopping them. + */ int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; @@ -9781,9 +9835,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) list_del(&block_group->list); up_write(&block_group->space_info->groups_sem); - if (block_group->cached == BTRFS_CACHE_STARTED) - wait_block_group_cache_done(block_group); - /* * We haven't cached this block group, which means we could * possibly have excluded extents on this block group. @@ -9793,6 +9844,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) free_excluded_extents(info, block_group); btrfs_remove_free_space_cache(block_group); + ASSERT(block_group->cached != BTRFS_CACHE_STARTED); ASSERT(list_empty(&block_group->dirty_list)); ASSERT(list_empty(&block_group->io_list)); ASSERT(list_empty(&block_group->bg_list)); @@ -9920,6 +9972,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); mutex_init(&cache->free_space_lock); + btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); return cache; } @@ -10050,19 +10103,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } trace_btrfs_add_block_group(info, cache, 0); - ret = update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - spin_lock(&info->block_group_cache_lock); - rb_erase(&cache->cache_node, - &info->block_group_cache_tree); - RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&info->block_group_cache_lock); - btrfs_put_block_group(cache); - goto error; - } + update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + cache->bytes_super, &space_info); cache->space_info = space_info; @@ -10194,16 +10237,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, } #endif /* - * Call to ensure the corresponding space_info object is created and - * assigned to our block group, but don't update its counters just yet. - * We want our bg to be added to the rbtree with its ->space_info set. + * Ensure the corresponding space_info object is created and + * assigned to our block group. We want our bg to be added to the rbtree + * with its ->space_info set. */ - ret = update_space_info(fs_info, cache->flags, 0, 0, 0, - &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; + cache->space_info = __find_space_info(fs_info, cache->flags); + if (!cache->space_info) { + ret = create_space_info(fs_info, cache->flags, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } } ret = btrfs_add_block_group_cache(fs_info, cache); @@ -10218,18 +10264,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * the rbtree, update the space info's counters. */ trace_btrfs_add_block_group(fs_info, cache, 1); - ret = update_space_info(fs_info, cache->flags, size, bytes_used, + update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&cache->cache_node, - &fs_info->block_group_cache_tree); - RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&fs_info->block_group_cache_lock); - btrfs_put_block_group(cache); - return ret; - } update_global_block_rsv(fs_info); __link_block_group(cache->space_info, cache); @@ -10317,7 +10353,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * get the inode first so any iput calls done for the io_list * aren't the final iput (no unlinks allowed now) */ - inode = lookup_free_space_inode(tree_root, block_group, path); + inode = lookup_free_space_inode(fs_info, block_group, path); mutex_lock(&trans->transaction->cache_write_mutex); /* @@ -10344,7 +10380,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_unlock(&trans->transaction->cache_write_mutex); if (!IS_ERR(inode)) { - ret = btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(inode); goto out; @@ -10419,7 +10455,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, &fs_info->caching_block_groups, list) if (ctl->block_group == block_group) { caching_ctl = ctl; - atomic_inc(&caching_ctl->count); + refcount_inc(&caching_ctl->count); break; } } @@ -10777,21 +10813,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); + ret = create_space_info(fs_info, flags, &space_info); } out: return ret; @@ -10853,7 +10889,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; if (trans) - atomic_inc(&trans->use_count); + refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); ret = find_free_dev_extent_start(trans, device, minlen, start, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4ac383a3a649..0aff9b278c19 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void) pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), - atomic_read(&state->refs)); + refcount_read(&state->refs)); list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); } @@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void) static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct inode *inode; - u64 isize; - - if (!tree->mapping) - return; - - inode = tree->mapping->host; - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(inode), isize, start, end); - } + if (tree->ops && tree->ops->check_extent_io_range) + tree->ops->check_extent_io_range(tree->private_data, caller, + start, end); } #else #define btrfs_leak_debug_add(new, head) do {} while (0) @@ -144,7 +134,7 @@ static void add_extent_changeset(struct extent_state *state, unsigned bits, if (!set && (state->state & bits) == 0) return; changeset->bytes_changed += state->end - state->start + 1; - ret = ulist_add(changeset->range_changed, state->start, state->end, + ret = ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC); /* ENOMEM */ BUG_ON(ret < 0); @@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { - if (!tree->mapping) - return NULL; - return btrfs_sb(tree->mapping->host->i_sb); + if (tree->ops) + return tree->ops->tree_fs_info(tree->private_data); + return NULL; } int __init extent_io_init(void) @@ -174,7 +164,8 @@ int __init extent_io_init(void) goto free_state_cache; btrfs_bioset = bioset_create(BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio)); + offsetof(struct btrfs_io_bio, bio), + BIOSET_NEED_BVECS); if (!btrfs_bioset) goto free_buffer_cache; @@ -213,19 +204,24 @@ void extent_io_exit(void) } void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping) + void *private_data) { tree->state = RB_ROOT; tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); - tree->mapping = mapping; + tree->private_data = private_data; } static struct extent_state *alloc_extent_state(gfp_t mask) { struct extent_state *state; + /* + * The given mask might be not appropriate for the slab allocator, + * drop the unsupported bits + */ + mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); state = kmem_cache_alloc(extent_state_cache, mask); if (!state) return state; @@ -233,7 +229,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->failrec = NULL; RB_CLEAR_NODE(&state->rb_node); btrfs_leak_debug_add(&state->leak_list, &states); - atomic_set(&state->refs, 1); + refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); return state; @@ -243,7 +239,7 @@ void free_extent_state(struct extent_state *state) { if (!state) return; - if (atomic_dec_and_test(&state->refs)) { + if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del(&state->leak_list); trace_free_extent_state(state, _RET_IP_); @@ -364,8 +360,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, struct extent_state *other) { if (tree->ops && tree->ops->merge_extent_hook) - tree->ops->merge_extent_hook(tree->mapping->host, new, - other); + tree->ops->merge_extent_hook(tree->private_data, new, other); } /* @@ -416,14 +411,14 @@ static void set_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->set_bit_hook) - tree->ops->set_bit_hook(tree->mapping->host, state, bits); + tree->ops->set_bit_hook(tree->private_data, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(tree->mapping->host, state, bits); + tree->ops->clear_bit_hook(tree->private_data, state, bits); } static void set_state_bits(struct extent_io_tree *tree, @@ -472,7 +467,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, u64 split) { if (tree->ops && tree->ops->split_extent_hook) - tree->ops->split_extent_hook(tree->mapping->host, orig, split); + tree->ops->split_extent_hook(tree->private_data, orig, split); } /* @@ -635,7 +630,7 @@ again: if (cached && extent_state_in_tree(cached) && cached->start <= start && cached->end > start) { if (clear) - atomic_dec(&cached->refs); + refcount_dec(&cached->refs); state = cached; goto hit_next; } @@ -787,7 +782,7 @@ process_node: if (state->state & bits) { start = state->start; - atomic_inc(&state->refs); + refcount_inc(&state->refs); wait_on_state(tree, state); free_extent_state(state); goto again; @@ -828,7 +823,7 @@ static void cache_state_if_flags(struct extent_state *state, if (cached_ptr && !(*cached_ptr)) { if (!flags || (state->state & flags)) { *cached_ptr = state; - atomic_inc(&state->refs); + refcount_inc(&state->refs); } } } @@ -1396,17 +1391,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) */ static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); - put_page(page); - index++; - } + tree->ops->set_range_writeback(tree->private_data, start, end); } /* find the first state struct with 'bits' set after 'start', and @@ -1532,7 +1517,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, if (!found) { *start = state->start; *cached_state = state; - atomic_inc(&state->refs); + refcount_inc(&state->refs); } found++; *end = state->end; @@ -1549,33 +1534,24 @@ out: return found; } +static int __process_pages_contig(struct address_space *mapping, + struct page *locked_page, + pgoff_t start_index, pgoff_t end_index, + unsigned long page_ops, pgoff_t *index_ret); + static noinline void __unlock_for_delalloc(struct inode *inode, struct page *locked_page, u64 start, u64 end) { - int ret; - struct page *pages[16]; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - unsigned long nr_pages = end_index - index + 1; - int i; + ASSERT(locked_page); if (index == locked_page->index && end_index == index) return; - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, nr_pages, - ARRAY_SIZE(pages)), pages); - for (i = 0; i < ret; i++) { - if (pages[i] != locked_page) - unlock_page(pages[i]); - put_page(pages[i]); - } - nr_pages -= ret; - index += ret; - cond_resched(); - } + __process_pages_contig(inode->i_mapping, locked_page, index, end_index, + PAGE_UNLOCK, NULL); } static noinline int lock_delalloc_pages(struct inode *inode, @@ -1584,59 +1560,19 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 delalloc_end) { unsigned long index = delalloc_start >> PAGE_SHIFT; - unsigned long start_index = index; + unsigned long index_ret = index; unsigned long end_index = delalloc_end >> PAGE_SHIFT; - unsigned long pages_locked = 0; - struct page *pages[16]; - unsigned long nrpages; int ret; - int i; - /* the caller is responsible for locking the start index */ + ASSERT(locked_page); if (index == locked_page->index && index == end_index) return 0; - /* skip the page at the start index */ - nrpages = end_index - index + 1; - while (nrpages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, - nrpages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - ret = -EAGAIN; - goto done; - } - /* now we have an array of pages, lock them all */ - for (i = 0; i < ret; i++) { - /* - * the caller is taking responsibility for - * locked_page - */ - if (pages[i] != locked_page) { - lock_page(pages[i]); - if (!PageDirty(pages[i]) || - pages[i]->mapping != inode->i_mapping) { - ret = -EAGAIN; - unlock_page(pages[i]); - put_page(pages[i]); - goto done; - } - } - put_page(pages[i]); - pages_locked++; - } - nrpages -= ret; - index += ret; - cond_resched(); - } - ret = 0; -done: - if (ret && pages_locked) { - __unlock_for_delalloc(inode, locked_page, - delalloc_start, - ((u64)(start_index + pages_locked - 1)) << - PAGE_SHIFT); - } + ret = __process_pages_contig(inode->i_mapping, locked_page, index, + end_index, PAGE_LOCK, &index_ret); + if (ret == -EAGAIN) + __unlock_for_delalloc(inode, locked_page, delalloc_start, + (u64)index_ret << PAGE_SHIFT); return ret; } @@ -1726,37 +1662,48 @@ out_failed: return found; } -void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, - u64 delalloc_end, struct page *locked_page, - unsigned clear_bits, - unsigned long page_ops) +static int __process_pages_contig(struct address_space *mapping, + struct page *locked_page, + pgoff_t start_index, pgoff_t end_index, + unsigned long page_ops, pgoff_t *index_ret) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - int ret; + unsigned long nr_pages = end_index - start_index + 1; + unsigned long pages_locked = 0; + pgoff_t index = start_index; struct page *pages[16]; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - unsigned long nr_pages = end_index - index + 1; + unsigned ret; + int err = 0; int i; - clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (page_ops == 0) - return; + if (page_ops & PAGE_LOCK) { + ASSERT(page_ops == PAGE_LOCK); + ASSERT(index_ret && *index_ret == start_index); + } if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) - mapping_set_error(inode->i_mapping, -EIO); + mapping_set_error(mapping, -EIO); while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, + ret = find_get_pages_contig(mapping, index, min_t(unsigned long, nr_pages, ARRAY_SIZE(pages)), pages); - for (i = 0; i < ret; i++) { + if (ret == 0) { + /* + * Only if we're going to lock these pages, + * can we find nothing at @index. + */ + ASSERT(page_ops & PAGE_LOCK); + err = -EAGAIN; + goto out; + } + for (i = 0; i < ret; i++) { if (page_ops & PAGE_SET_PRIVATE2) SetPagePrivate2(pages[i]); if (pages[i] == locked_page) { put_page(pages[i]); + pages_locked++; continue; } if (page_ops & PAGE_CLEAR_DIRTY) @@ -1769,12 +1716,40 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, end_page_writeback(pages[i]); if (page_ops & PAGE_UNLOCK) unlock_page(pages[i]); + if (page_ops & PAGE_LOCK) { + lock_page(pages[i]); + if (!PageDirty(pages[i]) || + pages[i]->mapping != mapping) { + unlock_page(pages[i]); + put_page(pages[i]); + err = -EAGAIN; + goto out; + } + } put_page(pages[i]); + pages_locked++; } nr_pages -= ret; index += ret; cond_resched(); } +out: + if (err && index_ret) + *index_ret = start_index + pages_locked - 1; + return err; +} + +void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, + u64 delalloc_end, struct page *locked_page, + unsigned clear_bits, + unsigned long page_ops) +{ + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, + NULL, GFP_NOFS); + + __process_pages_contig(inode->i_mapping, locked_page, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + page_ops, NULL); } /* @@ -1965,11 +1940,12 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) SetPageUptodate(page); } -int free_io_failure(struct inode *inode, struct io_failure_record *rec) +int free_io_failure(struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, + struct io_failure_record *rec) { int ret; int err = 0; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, @@ -1978,7 +1954,7 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec) if (ret) err = ret; - ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + ret = clear_extent_bits(io_tree, rec->start, rec->start + rec->len - 1, EXTENT_DAMAGED); if (ret && !err) @@ -1998,28 +1974,21 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec) * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, - struct page *page, unsigned int pg_offset, int mirror_num) +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); - /* we can't repair anything in raid56 yet */ - if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) - return 0; - - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; @@ -2029,17 +1998,35 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, * read repair operation. */ btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bbio, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) { + /* + * Note that we don't use BTRFS_MAP_WRITE because it's supposed + * to update all raid stripes, but here we just want to correct + * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad + * stripe's dev and sector. + */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &map_length, &bbio, 0); + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio_put(bio); + return -EIO; + } + ASSERT(bbio->mirror_num == 1); + } else { + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); } - BUG_ON(mirror_num != bbio->mirror_num); - sector = bbio->stripes[mirror_num-1].physical >> 9; + + sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; bio->bi_iter.bi_sector = sector; - dev = bbio->stripes[mirror_num-1].dev; + dev = bbio->stripes[bbio->mirror_num - 1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { btrfs_bio_counter_dec(fs_info); @@ -2060,7 +2047,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - btrfs_ino(inode), start, + ino, start, rcu_str_deref(dev->name), sector); btrfs_bio_counter_dec(fs_info); bio_put(bio); @@ -2080,8 +2067,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; - ret = repair_io_failure(fs_info->btree_inode, start, - PAGE_SIZE, start, p, + ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, start - page_offset(p), mirror_num); if (ret) break; @@ -2095,24 +2081,24 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info, * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -int clean_io_failure(struct inode *inode, u64 start, struct page *page, - unsigned int pg_offset) +int clean_io_failure(struct btrfs_fs_info *fs_info, + struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, u64 start, + struct page *page, u64 ino, unsigned int pg_offset) { u64 private; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct extent_state *state; int num_copies; int ret; private = 0; - ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY, 0); + ret = count_range_bits(failure_tree, &private, (u64)-1, 1, + EXTENT_DIRTY, 0); if (!ret) return 0; - ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start, - &failrec); + ret = get_state_failrec(failure_tree, start, &failrec); if (ret) return 0; @@ -2128,25 +2114,25 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page, if (fs_info->sb->s_flags & MS_RDONLY) goto out; - spin_lock(&BTRFS_I(inode)->io_tree.lock); - state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + spin_lock(&io_tree->lock); + state = find_first_extent_bit_state(io_tree, failrec->start, EXTENT_LOCKED); - spin_unlock(&BTRFS_I(inode)->io_tree.lock); + spin_unlock(&io_tree->lock); if (state && state->start <= failrec->start && state->end >= failrec->start + failrec->len - 1) { num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); if (num_copies > 1) { - repair_io_failure(inode, start, failrec->len, - failrec->logical, page, - pg_offset, failrec->failed_mirror); + repair_io_failure(fs_info, ino, start, failrec->len, + failrec->logical, page, pg_offset, + failrec->failed_mirror); } } out: - free_io_failure(inode, failrec); + free_io_failure(failure_tree, io_tree, failrec); return 0; } @@ -2157,9 +2143,9 @@ out: * - under ordered extent * - the inode is freeing */ -void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) +void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) { - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *failure_tree = &inode->io_failure_tree; struct io_failure_record *failrec; struct extent_state *state, *next; @@ -2272,7 +2258,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, return 0; } -int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, +bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, int failed_mirror) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2288,7 +2274,7 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, btrfs_debug(fs_info, "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", num_copies, failrec->this_mirror, failed_mirror); - return 0; + return false; } /* @@ -2329,10 +2315,10 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, btrfs_debug(fs_info, "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", num_copies, failrec->this_mirror, failed_mirror); - return 0; + return false; } - return 1; + return true; } @@ -2346,10 +2332,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct btrfs_io_bio *btrfs_failed_bio; struct btrfs_io_bio *btrfs_bio; - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return NULL; - + bio = btrfs_io_bio_alloc(1); bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = fs_info->fs_devices->latest_bdev; @@ -2387,8 +2370,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct io_failure_record *failrec; struct inode *inode = page->mapping->host; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *bio; int read_mode = 0; + blk_status_t status; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2397,9 +2382,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, if (ret) return ret; - ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); - if (!ret) { - free_io_failure(inode, failrec); + if (!btrfs_check_repairable(inode, failed_bio, failrec, + failed_mirror)) { + free_io_failure(failure_tree, tree, failrec); return -EIO; } @@ -2411,21 +2396,18 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, start - page_offset(page), (int)phy_offset, failed_bio->bi_end_io, NULL); - if (!bio) { - free_io_failure(inode, failrec); - return -EIO; - } bio_set_op_attrs(bio, REQ_OP_READ, read_mode); btrfs_debug(btrfs_sb(inode->i_sb), "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, + status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, failrec->bio_flags, 0); - if (ret) { - free_io_failure(inode, failrec); + if (status) { + free_io_failure(failure_tree, tree, failrec); bio_put(bio); + ret = blk_status_to_errno(status); } return ret; @@ -2441,17 +2423,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) tree = &BTRFS_I(page->mapping->host)->io_tree; - if (tree->ops && tree->ops->writepage_end_io_hook) { - ret = tree->ops->writepage_end_io_hook(page, start, - end, NULL, uptodate); - if (ret) - uptodate = 0; - } + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, end, NULL, + uptodate); if (!uptodate) { ClearPageUptodate(page); SetPageError(page); - ret = ret < 0 ? ret : -EIO; + ret = err < 0 ? err : -EIO; mapping_set_error(page->mapping, ret); } } @@ -2467,11 +2446,13 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio) { + int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; u64 end; int i; + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; @@ -2496,7 +2477,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - end_extent_writepage(page, bio->bi_error, start, end); + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -2529,9 +2510,9 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct extent_io_tree *tree; + struct extent_io_tree *tree, *failure_tree; u64 offset = 0; u64 start; u64 end; @@ -2542,6 +2523,7 @@ static void end_bio_extent_readpage(struct bio *bio) int ret; int i; + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; @@ -2549,9 +2531,10 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - (u64)bio->bi_iter.bi_sector, bio->bi_error, + (u64)bio->bi_iter.bi_sector, bio->bi_status, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; /* We always issue full-page reads, but if some block * in a page fails to read, blk_update_request() will @@ -2574,42 +2557,54 @@ static void end_bio_extent_readpage(struct bio *bio) len = bvec->bv_len; mirror = io_bio->mirror_num; - if (likely(uptodate && tree->ops && - tree->ops->readpage_end_io_hook)) { + if (likely(uptodate && tree->ops)) { ret = tree->ops->readpage_end_io_hook(io_bio, offset, page, start, end, mirror); if (ret) uptodate = 0; else - clean_io_failure(inode, start, page, 0); + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, tree, start, + page, + btrfs_ino(BTRFS_I(inode)), 0); } if (likely(uptodate)) goto readpage_ok; - if (tree->ops && tree->ops->readpage_io_failed_hook) { + if (tree->ops) { ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (!ret && !bio->bi_error) - uptodate = 1; - } else { + if (ret == -EAGAIN) { + /* + * Data inode's readpage_io_failed_hook() always + * returns -EAGAIN. + * + * The generic bio_readpage_error handles errors + * the following way: If possible, new read + * requests are created and submitted and will + * end up in end_bio_extent_readpage as well (if + * we're lucky, not in the !uptodate case). In + * that case it returns 0 and we just go on with + * the next page in our bio. If it can't handle + * the error it will return -EIO and we remain + * responsible for that page. + */ + ret = bio_readpage_error(bio, offset, page, + start, end, mirror); + if (ret == 0) { + uptodate = !bio->bi_status; + offset += len; + continue; + } + } + /* - * The generic bio_readpage_error handles errors the - * following way: If possible, new read requests are - * created and submitted and will end up in - * end_bio_extent_readpage as well (if we're lucky, not - * in the !uptodate case). In that case it returns 0 and - * we just go on with the next page in our bio. If it - * can't handle the error it will return -EIO and we - * remain responsible for that page. + * metadata's readpage_io_failed_hook() always returns + * -EIO and fixes nothing. -EIO is also returned if + * data inode error could not be fixed. */ - ret = bio_readpage_error(bio, offset, page, start, end, - mirror); - if (ret == 0) { - uptodate = !bio->bi_error; - offset += len; - continue; - } + ASSERT(ret == -EIO); } readpage_ok: if (likely(uptodate)) { @@ -2656,77 +2651,80 @@ readpage_ok: endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); if (io_bio->end_io) - io_bio->end_io(io_bio, bio->bi_error); + io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); bio_put(bio); } /* - * this allocates from the btrfs_bioset. We're returning a bio right now - * but you can call btrfs_io_bio for the appropriate container_of magic + * Initialize the members up to but not including 'bio'. Use after allocating a + * new bio by bio_alloc_bioset as it does not initialize the bytes outside of + * 'bio' because use of __GFP_ZERO is not supported. */ -struct bio * -btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) +static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) { - struct btrfs_io_bio *btrfs_bio; - struct bio *bio; - - bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); + memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); +} - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) { - bio = bio_alloc_bioset(gfp_flags, - nr_vecs, btrfs_bioset); - } - } +/* + * The following helpers allocate a bio. As it's backed by a bioset, it'll + * never fail. We're returning a bio right now but you can call btrfs_io_bio + * for the appropriate container_of magic + */ +struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) +{ + struct bio *bio; - if (bio) { - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = first_sector; - btrfs_bio = btrfs_io_bio(bio); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = first_byte >> 9; + btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } -struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) +struct bio *btrfs_bio_clone(struct bio *bio) { struct btrfs_io_bio *btrfs_bio; struct bio *new; - new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); - if (new) { - btrfs_bio = btrfs_io_bio(new); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + /* Bio allocation backed by a bioset does not fail */ + new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset); + btrfs_bio = btrfs_io_bio(new); + btrfs_io_bio_init(btrfs_bio); + btrfs_bio->iter = bio->bi_iter; return new; } -/* this also allocates from the btrfs_bioset */ -struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) { - struct btrfs_io_bio *btrfs_bio; struct bio *bio; - bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); - if (bio) { - btrfs_bio = btrfs_io_bio(bio); - btrfs_bio->csum = NULL; - btrfs_bio->csum_allocated = NULL; - btrfs_bio->end_io = NULL; - } + /* Bio allocation backed by a bioset does not fail */ + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset); + btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } +struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) +{ + struct bio *bio; + struct btrfs_io_bio *btrfs_bio; + + /* this will never fail when it's backed by a bioset */ + bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset); + ASSERT(bio); + + btrfs_bio = btrfs_io_bio(bio); + btrfs_io_bio_init(btrfs_bio); + + bio_trim(bio, offset >> 9, size >> 9); + btrfs_bio->iter = bio->bi_iter; + return bio; +} static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { - int ret = 0; + blk_status_t ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; @@ -2737,14 +2735,14 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio->bi_private = NULL; bio_get(bio); - if (tree->ops && tree->ops->submit_bio_hook) - ret = tree->ops->submit_bio_hook(page->mapping->host, bio, + if (tree->ops) + ret = tree->ops->submit_bio_hook(tree->private_data, bio, mirror_num, bio_flags, start); else btrfsic_submit_bio(bio); bio_put(bio); - return ret; + return blk_status_to_errno(ret); } static int merge_bio(struct extent_io_tree *tree, struct page *page, @@ -2752,7 +2750,7 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, unsigned long bio_flags) { int ret = 0; - if (tree->ops && tree->ops->merge_bio_hook) + if (tree->ops) ret = tree->ops->merge_bio_hook(page, offset, size, bio, bio_flags); return ret; @@ -2765,7 +2763,6 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, size_t size, unsigned long offset, struct block_device *bdev, struct bio **bio_ret, - unsigned long max_pages, bio_end_io_t end_io_func, int mirror_num, unsigned long prev_bio_flags, @@ -2802,14 +2799,11 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, - GFP_NOFS | __GFP_HIGH); - if (!bio) - return -ENOMEM; - + bio = btrfs_bio_alloc(bdev, sector << 9); bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + bio->bi_write_hint = page->mapping->host->i_write_hint; bio_set_op_attrs(bio, op, op_flags); if (wbc) { wbc_init_bio(wbc, bio); @@ -2856,7 +2850,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, em = *em_cached; if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { - atomic_inc(&em->refs); + refcount_inc(&em->refs); return em; } @@ -2864,10 +2858,10 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(inode, page, pg_offset, start, len, 0); + em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); - atomic_inc(&em->refs); + refcount_inc(&em->refs); *em_cached = em; } return em; @@ -2931,7 +2925,6 @@ static int __do_readpage(struct extent_io_tree *tree, } } while (cur <= end) { - unsigned long pnr = (last_byte >> PAGE_SHIFT) + 1; bool force_bio_submit = false; if (cur >= last_byte) { @@ -3066,10 +3059,9 @@ static int __do_readpage(struct extent_io_tree *tree, continue; } - pnr -= page->index; ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, page, sector, disk_io_size, pg_offset, - bdev, bio, pnr, + bdev, bio, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag, @@ -3110,7 +3102,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, inode = pages[0]->mapping->host; while (1) { lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(inode, start, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, end - start + 1); if (!ordered) break; @@ -3182,7 +3174,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, while (1) { lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(inode, start, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, PAGE_SIZE); if (!ordered) break; @@ -3210,7 +3202,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } -static void update_nr_written(struct page *page, struct writeback_control *wbc, +static void update_nr_written(struct writeback_control *wbc, unsigned long nr_written) { wbc->nr_to_write -= nr_written; @@ -3330,7 +3322,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 block_start; u64 iosize; sector_t sector; - struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; size_t pg_offset = 0; @@ -3349,10 +3340,9 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, else redirty_page_for_writepage(wbc, page); - update_nr_written(page, wbc, nr_written); + update_nr_written(wbc, nr_written); unlock_page(page); - ret = 1; - goto done_unlocked; + return 1; } } @@ -3360,7 +3350,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - update_nr_written(page, wbc, nr_written + 1); + update_nr_written(wbc, nr_written + 1); end = page_end; if (i_size <= start) { @@ -3374,7 +3364,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, while (cur <= end) { u64 em_end; - unsigned long max_nr; if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) @@ -3382,7 +3371,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, NULL, 1); break; } - em = epd->get_extent(inode, page, pg_offset, cur, + em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur, end - cur + 1, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); @@ -3431,8 +3420,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, continue; } - max_nr = (i_size >> PAGE_SHIFT) + 1; - set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { btrfs_err(BTRFS_I(inode)->root->fs_info, @@ -3442,11 +3429,14 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, page, sector, iosize, pg_offset, - bdev, &epd->bio, max_nr, + bdev, &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); - if (ret) + if (ret) { SetPageError(page); + if (PageWriteback(page)) + end_page_writeback(page); + } cur = cur + iosize; pg_offset += iosize; @@ -3454,11 +3444,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, } done: *nr_ret = nr; - -done_unlocked: - - /* drop our reference on any cached states */ - free_extent_state(cached_state); return ret; } @@ -3590,9 +3575,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - __percpu_counter_add(&fs_info->dirty_metadata_bytes, - -eb->len, - fs_info->dirty_metadata_batch); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + -eb->len, + fs_info->dirty_metadata_batch); ret = 1; } else { spin_unlock(&eb->refs_lock); @@ -3693,6 +3678,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) struct extent_buffer *eb; int i, done; + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; @@ -3700,7 +3686,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (bio->bi_error || + if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); set_btree_ioerr(page); @@ -3750,7 +3736,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb); memzero_extent_buffer(eb, start, end - start); } @@ -3761,20 +3747,21 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, p, offset >> 9, PAGE_SIZE, 0, bdev, - &epd->bio, -1, + &epd->bio, end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags, false); epd->bio_flags = bio_flags; if (ret) { set_btree_ioerr(p); - end_page_writeback(p); + if (PageWriteback(p)) + end_page_writeback(p); if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) end_extent_buffer_writeback(eb); ret = -EIO; break; } offset += PAGE_SIZE; - update_nr_written(p, wbc, 1); + update_nr_written(wbc, 1); unlock_page(p); } @@ -3926,8 +3913,7 @@ retry: * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. */ -static int extent_write_cache_pages(struct extent_io_tree *tree, - struct address_space *mapping, +static int extent_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data, void (*flush_fn)(void *)) @@ -4168,8 +4154,7 @@ int extent_writepages(struct extent_io_tree *tree, .bio_flags = 0, }; - ret = extent_write_cache_pages(tree, mapping, wbc, - __extent_writepage, &epd, + ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, flush_write_bio); flush_epd_write_bio(&epd); return ret; @@ -4264,8 +4249,6 @@ static int try_release_extent_state(struct extent_map_tree *map, EXTENT_IOBITS, 0, NULL)) ret = 0; else { - if ((mask & GFP_NOFS) == GFP_NOFS) - mask = GFP_NOFS; /* * at this point we can safely clear everything except the * locked bit and the nodatasum bit @@ -4354,7 +4337,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = get_extent(inode, NULL, 0, offset, len, 0); + em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0); if (IS_ERR_OR_NULL(em)) return em; @@ -4373,6 +4356,119 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, return NULL; } +/* + * To cache previous fiemap extent + * + * Will be used for merging fiemap extent + */ +struct fiemap_cache { + u64 offset; + u64 phys; + u64 len; + u32 flags; + bool cached; +}; + +/* + * Helper to submit fiemap extent. + * + * Will try to merge current fiemap extent specified by @offset, @phys, + * @len and @flags with cached one. + * And only when we fails to merge, cached one will be submitted as + * fiemap extent. + * + * Return value is the same as fiemap_fill_next_extent(). + */ +static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + u64 offset, u64 phys, u64 len, u32 flags) +{ + int ret = 0; + + if (!cache->cached) + goto assign; + + /* + * Sanity check, extent_fiemap() should have ensured that new + * fiemap extent won't overlap with cahced one. + * Not recoverable. + * + * NOTE: Physical address can overlap, due to compression + */ + if (cache->offset + cache->len > offset) { + WARN_ON(1); + return -EINVAL; + } + + /* + * Only merges fiemap extents if + * 1) Their logical addresses are continuous + * + * 2) Their physical addresses are continuous + * So truly compressed (physical size smaller than logical size) + * extents won't get merged with each other + * + * 3) Share same flags except FIEMAP_EXTENT_LAST + * So regular extent won't get merged with prealloc extent + */ + if (cache->offset + cache->len == offset && + cache->phys + cache->len == phys && + (cache->flags & ~FIEMAP_EXTENT_LAST) == + (flags & ~FIEMAP_EXTENT_LAST)) { + cache->len += len; + cache->flags |= flags; + goto try_submit_last; + } + + /* Not mergeable, need to submit cached one */ + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret) + return ret; +assign: + cache->cached = true; + cache->offset = offset; + cache->phys = phys; + cache->len = len; + cache->flags = flags; +try_submit_last: + if (cache->flags & FIEMAP_EXTENT_LAST) { + ret = fiemap_fill_next_extent(fieinfo, cache->offset, + cache->phys, cache->len, cache->flags); + cache->cached = false; + } + return ret; +} + +/* + * Emit last fiemap cache + * + * The last fiemap cache may still be cached in the following case: + * 0 4k 8k + * |<- Fiemap range ->| + * |<------------ First extent ----------->| + * + * In this case, the first extent range will be cached but not emitted. + * So we must emit it before ending extent_fiemap(). + */ +static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + int ret; + + if (!cache->cached) + return 0; + + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret > 0) + ret = 0; + return ret; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -4390,6 +4486,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; + struct fiemap_cache cache = { 0 }; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4410,8 +4507,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size */ - ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, - 0); + ret = btrfs_lookup_file_extent(NULL, root, path, + btrfs_ino(BTRFS_I(inode)), -1, 0); if (ret < 0) { btrfs_free_path(path); return ret; @@ -4426,7 +4523,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, found_type = found_key.type; /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(inode) || + if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || found_type != BTRFS_EXTENT_DATA_KEY) { /* have to trust i_size as the end */ last = (u64)-1; @@ -4535,8 +4632,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * lookup stuff. */ ret = btrfs_check_shared(trans, root->fs_info, - root->objectid, - btrfs_ino(inode), bytenr); + root->objectid, + btrfs_ino(BTRFS_I(inode)), bytenr); if (trans) btrfs_end_transaction(trans); if (ret < 0) @@ -4569,8 +4666,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_LAST; end = 1; } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); + ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, + em_len, flags); if (ret) { if (ret == 1) ret = 0; @@ -4578,6 +4675,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } } out_free: + if (!ret) + ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache); free_extent_map(em); out: btrfs_free_path(path); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 17f9ce479ed7..4f030912f3ef 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -2,6 +2,7 @@ #define __EXTENTIO__ #include <linux/rbtree.h> +#include <linux/refcount.h> #include "ulist.h" /* bits for the extent state */ @@ -14,14 +15,17 @@ #define EXTENT_DEFRAG (1U << 6) #define EXTENT_BOUNDARY (1U << 9) #define EXTENT_NODATASUM (1U << 10) -#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_CLEAR_META_RESV (1U << 11) #define EXTENT_FIRST_DELALLOC (1U << 12) #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_CLEAR_DATA_RESV (1U << 17) +#define EXTENT_DELALLOC_NEW (1U << 18) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ + EXTENT_CLEAR_DATA_RESV) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* @@ -45,13 +49,14 @@ #define EXTENT_BUFFER_IN_TREE 10 #define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */ -/* these are flags for extent_clear_unlock_delalloc */ +/* these are flags for __process_pages_contig */ #define PAGE_UNLOCK (1 << 0) #define PAGE_CLEAR_DIRTY (1 << 1) #define PAGE_SET_WRITEBACK (1 << 2) #define PAGE_END_WRITEBACK (1 << 3) #define PAGE_SET_PRIVATE2 (1 << 4) #define PAGE_SET_ERROR (1 << 5) +#define PAGE_LOCK (1 << 6) /* * page->private values. Every page that is controlled by the extent @@ -83,41 +88,56 @@ extern void le_bitmap_clear(u8 *map, unsigned int start, int len); struct extent_state; struct btrfs_root; +struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, +typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset); struct extent_io_ops { - int (*fill_delalloc)(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written); - int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + /* + * The following callbacks must be allways defined, the function + * pointer will be called unconditionally. + */ extent_submit_bio_hook_t *submit_bio_hook; + int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); int (*merge_bio_hook)(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror); - int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, + struct btrfs_fs_info *(*tree_fs_info)(void *private_data); + void (*set_range_writeback)(void *private_data, u64 start, u64 end); + + /* + * Optional hooks, called if the pointer is not NULL + */ + int (*fill_delalloc)(void *private_data, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); + + int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - void (*set_bit_hook)(struct inode *inode, struct extent_state *state, + void (*set_bit_hook)(void *private_data, struct extent_state *state, unsigned *bits); - void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned *bits); - void (*merge_extent_hook)(struct inode *inode, + void (*clear_bit_hook)(void *private_data, + struct extent_state *state, + unsigned *bits); + void (*merge_extent_hook)(void *private_data, struct extent_state *new, struct extent_state *other); - void (*split_extent_hook)(struct inode *inode, + void (*split_extent_hook)(void *private_data, struct extent_state *orig, u64 split); + void (*check_extent_io_range)(void *private_data, const char *caller, + u64 start, u64 end); }; struct extent_io_tree { struct rb_root state; - struct address_space *mapping; + void *private_data; u64 dirty_bytes; int track_uptodate; spinlock_t lock; @@ -131,7 +151,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; - atomic_t refs; + refcount_t refs; unsigned state; struct io_failure_record *failrec; @@ -189,12 +209,46 @@ struct extent_buffer { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - u64 bytes_changed; + unsigned int bytes_changed; /* Changed ranges */ - struct ulist *range_changed; + struct ulist range_changed; }; +static inline void extent_changeset_init(struct extent_changeset *changeset) +{ + changeset->bytes_changed = 0; + ulist_init(&changeset->range_changed); +} + +static inline struct extent_changeset *extent_changeset_alloc(void) +{ + struct extent_changeset *ret; + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + extent_changeset_init(ret); + return ret; +} + +static inline void extent_changeset_release(struct extent_changeset *changeset) +{ + if (!changeset) + return; + changeset->bytes_changed = 0; + ulist_release(&changeset->range_changed); +} + +static inline void extent_changeset_free(struct extent_changeset *changeset) +{ + if (!changeset) + return; + extent_changeset_release(changeset); + kfree(changeset); +} + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { @@ -208,14 +262,13 @@ static inline int extent_compress_type(unsigned long bio_flags) struct extent_map_tree; -typedef struct extent_map *(get_extent_t)(struct inode *inode, +typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); -void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping); +void extent_io_tree_init(struct extent_io_tree *tree, void *private_data); int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); @@ -443,19 +496,21 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, u64 delalloc_end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); -struct bio * -btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags); -struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); +struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); +struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); +struct bio *btrfs_bio_clone(struct bio *bio); +struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); struct btrfs_fs_info; - -int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, - struct page *page, unsigned int pg_offset, - int mirror_num); -int clean_io_failure(struct inode *inode, u64 start, struct page *page, - unsigned int pg_offset); +struct btrfs_inode; + +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); +int clean_io_failure(struct btrfs_fs_info *fs_info, + struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, u64 start, + struct page *page, u64 ino, unsigned int pg_offset); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int mirror_num); @@ -479,16 +534,20 @@ struct io_failure_record { int in_validation; }; -void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end); + +void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, + u64 end); int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, struct io_failure_record **failrec_ret); -int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, - struct io_failure_record *failrec, int fail_mirror); +bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, int fail_mirror); struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, struct page *page, int pg_offset, int icsum, bio_end_io_t *endio_func, void *data); -int free_io_failure(struct inode *inode, struct io_failure_record *rec); +int free_io_failure(struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, + struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 26f9ac719d20..69850155870c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -55,7 +55,7 @@ struct extent_map *alloc_extent_map(void) em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; em->generation = 0; - atomic_set(&em->refs, 1); + refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; } @@ -71,8 +71,8 @@ void free_extent_map(struct extent_map *em) { if (!em) return; - WARN_ON(atomic_read(&em->refs) == 0); - if (atomic_dec_and_test(&em->refs)) { + WARN_ON(refcount_read(&em->refs) == 0); + if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) @@ -322,7 +322,7 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified) { - atomic_inc(&em->refs); + refcount_inc(&em->refs); em->mod_start = em->start; em->mod_len = em->len; @@ -381,7 +381,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, if (strict && !(end > em->start && start < extent_map_end(em))) return NULL; - atomic_inc(&em->refs); + refcount_inc(&em->refs); return em; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index eb8b8fae036b..a67b2def5413 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -2,6 +2,7 @@ #define __EXTENTMAP__ #include <linux/rbtree.h> +#include <linux/refcount.h> #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) @@ -41,7 +42,7 @@ struct extent_map { */ struct map_lookup *map_lookup; }; - atomic_t refs; + refcount_t refs; unsigned int compress_type; struct list_head list; }; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index e97e322c28f0..fdcb41002623 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -160,11 +160,12 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) kfree(bio->csum_allocated); } -static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -177,12 +178,12 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 page_bytes_left; u32 diff; int nblocks; - int count = 0, i; + int count = 0; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + return BLK_STS_RESOURCE; nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { @@ -191,7 +192,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); - return -ENOMEM; + return BLK_STS_RESOURCE; } btrfs_bio->csum = btrfs_bio->csum_allocated; btrfs_bio->end_io = btrfs_io_bio_endio_readpage; @@ -206,15 +207,13 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (bio->bi_iter.bi_size > PAGE_SIZE * 8) path->reada = READA_FORWARD; - WARN_ON(bio->bi_vcnt <= 0); - /* * the free space stuff is only read when it hasn't been * updated in the current transaction. So, we can safely * read from the commit root and sidestep a nasty deadlock * between reading the free space cache and updating the csum tree. */ - if (btrfs_is_free_space_inode(inode)) { + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { path->search_commit_root = 1; path->skip_locking = 1; } @@ -223,13 +222,13 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (dio) offset = logical_offset; - bio_for_each_segment_all(bvec, bio, i) { - page_bytes_left = bvec->bv_len; + bio_for_each_segment(bvec, bio, iter) { + page_bytes_left = bvec.bv_len; if (count) goto next; if (!dio) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, (u32 *)csum, nblocks); if (count) @@ -255,7 +254,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, } else { btrfs_info_rl(fs_info, "no csum found for inode %llu start %llu", - btrfs_ino(inode), offset); + btrfs_ino(BTRFS_I(inode)), offset); } item = NULL; btrfs_release_path(path); @@ -303,12 +302,12 @@ next: return 0; } -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) { return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); } @@ -433,26 +432,26 @@ fail: return ret; } -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; char *data; - struct bio_vec *bvec; + struct bvec_iter iter; + struct bio_vec bvec; int index; int nr_sectors; - int i, j; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; + int i; u64 offset; - WARN_ON(bio->bi_vcnt <= 0); sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), GFP_NOFS); if (!sums) - return -ENOMEM; + return BLK_STS_RESOURCE; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); @@ -465,19 +464,19 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; - bio_for_each_segment_all(bvec, bio, j) { + bio_for_each_segment(bvec, bio, iter) { if (!contig) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = page_offset(bvec.bv_page) + bvec.bv_offset; if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ } - data = kmap_atomic(bvec->bv_page); + data = kmap_atomic(bvec.bv_page); nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, - bvec->bv_len + fs_info->sectorsize + bvec.bv_len + fs_info->sectorsize - 1); for (i = 0; i < nr_sectors; i++) { @@ -504,12 +503,12 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, + total_bytes; index = 0; - data = kmap_atomic(bvec->bv_page); + data = kmap_atomic(bvec.bv_page); } sums->sums[index] = ~(u32)0; sums->sums[index] - = btrfs_csum_data(data + bvec->bv_offset + = btrfs_csum_data(data + bvec.bv_offset + (i * fs_info->sectorsize), sums->sums[index], fs_info->sectorsize); @@ -643,7 +642,33 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, /* delete the entire item, it is inside our range */ if (key.offset >= bytenr && csum_end <= end_byte) { - ret = btrfs_del_item(trans, root, path); + int del_nr = 1; + + /* + * Check how many csum items preceding this one in this + * leaf correspond to our range and then delete them all + * at once. + */ + if (key.offset > bytenr && path->slots[0] > 0) { + int slot = path->slots[0] - 1; + + while (slot >= 0) { + struct btrfs_key pk; + + btrfs_item_key_to_cpu(leaf, &pk, slot); + if (pk.offset < bytenr || + pk.type != BTRFS_EXTENT_CSUM_KEY || + pk.objectid != + BTRFS_EXTENT_CSUM_OBJECTID) + break; + path->slots[0] = slot; + del_nr++; + key.offset = pk.offset; + slot--; + } + } + ret = btrfs_del_items(trans, root, path, + path->slots[0], del_nr); if (ret) goto out; if (key.offset == bytenr) @@ -856,8 +881,8 @@ insert: tmp = min(tmp, (next_offset - file_key.offset) >> fs_info->sb->s_blocksize_bits); - tmp = max((u64)1, tmp); - tmp = min(tmp, (u64)MAX_CSUM_ITEMS(fs_info, csum_size)); + tmp = max_t(u64, 1, tmp); + tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size)); ins_size = csum_size * tmp; } else { ins_size = csum_size; @@ -904,14 +929,14 @@ fail_unlock: goto out; } -void btrfs_extent_item_to_extent_map(struct inode *inode, +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, const bool new_inline, struct extent_map *em) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct extent_buffer *leaf = path->nodes[0]; const int slot = path->slots[0]; struct btrfs_key key; @@ -976,8 +1001,8 @@ void btrfs_extent_item_to_extent_map(struct inode *inode, } } else { btrfs_err(fs_info, - "unknown file extent item type %d, inode %llu, offset %llu, root %llu", - type, btrfs_ino(inode), extent_start, + "unknown file extent item type %d, inode %llu, offset %llu, " + "root %llu", type, btrfs_ino(inode), extent_start, root->root_key.objectid); } } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b5c5da215d05..9e75d8a39aac 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -92,10 +92,10 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, * If an existing record is found the defrag item you * pass in is freed */ -static int __btrfs_add_inode_defrag(struct inode *inode, +static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); struct inode_defrag *entry; struct rb_node **p; struct rb_node *parent = NULL; @@ -123,7 +123,7 @@ static int __btrfs_add_inode_defrag(struct inode *inode, return -EEXIST; } } - set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); return 0; @@ -145,10 +145,10 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) * enabled */ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct inode_defrag *defrag; u64 transid; int ret; @@ -156,13 +156,13 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!__need_auto_defrag(fs_info)) return 0; - if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) + if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) return 0; if (trans) transid = trans->transid; else - transid = BTRFS_I(inode)->root->last_trans; + transid = inode->root->last_trans; defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) @@ -173,7 +173,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->root = root->root_key.objectid; spin_lock(&fs_info->defrag_inodes_lock); - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { /* * If we set IN_DEFRAG flag and evict the inode from memory, * and then re-read this inode, this new inode doesn't have @@ -194,10 +194,10 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, * the same inode in the tree, we will merge them together (by * __btrfs_add_inode_defrag()) and free the one that we want to requeue. */ -static void btrfs_requeue_inode_defrag(struct inode *inode, +static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); int ret; if (!__need_auto_defrag(fs_info)) @@ -334,7 +334,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, */ if (num_defrag == BTRFS_DEFRAG_BATCH) { defrag->last_offset = range.start; - btrfs_requeue_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); } else if (defrag->last_offset && !defrag->cycled) { /* * we didn't fill our defrag batch, but @@ -343,7 +343,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, */ defrag->last_offset = 0; defrag->cycled = 1; - btrfs_requeue_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } @@ -529,13 +529,13 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, * this drops all the extents in the cache that intersect the range * [start, end]. Existing extents are split as required. */ -void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, +void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned) { struct extent_map *em; struct extent_map *split = NULL; struct extent_map *split2 = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; u64 len = end - start + 1; u64 gen; int ret; @@ -702,7 +702,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; struct btrfs_key key; struct btrfs_key new_key; - u64 ino = btrfs_ino(inode); + u64 ino = btrfs_ino(BTRFS_I(inode)); u64 search_start = start; u64 disk_bytenr = 0; u64 num_bytes = 0; @@ -720,7 +720,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int leafs_visited = 0; if (drop_cache) - btrfs_drop_extent_cache(inode, start, end - 1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0); if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; @@ -1082,10 +1082,10 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, * two or three. */ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct inode *inode, u64 start, u64 end) + struct btrfs_inode *inode, u64 start, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_file_extent_item *fi; @@ -1404,6 +1404,47 @@ fail: } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, + search_len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + /* * This function locks the extent and properly waits for data=ordered extents * to finish before allowing the pages to be modified if need. @@ -1415,13 +1456,13 @@ fail: * the other < 0 number - Something wrong happens */ static noinline int -lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, +lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, struct extent_state **cached_state) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 start_pos; u64 last_pos; int i; @@ -1432,33 +1473,44 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->i_size) { + if (start_pos < inode->vfs_inode.i_size || + (inode->flags & BTRFS_INODE_PREALLOC)) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos, cached_state); + unsigned int clear_bits; + + lock_extent_bits(&inode->io_tree, start_pos, last_pos, + cached_state); ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->len > start_pos && ordered->file_offset <= last_pos) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos, - cached_state, GFP_NOFS); + unlock_extent_cached(&inode->io_tree, start_pos, + last_pos, cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(&inode->vfs_inode, + ordered, 1); btrfs_put_ordered_extent(ordered); return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); - - clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, - last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state, GFP_NOFS); + ret = btrfs_find_new_delalloc_bytes(inode, start_pos, + last_pos - start_pos + 1, + cached_state); + clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG; + if (ret) + clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED; + clear_extent_bit(&inode->io_tree, start_pos, + last_pos, clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, + 0, cached_state, GFP_NOFS); + if (ret) + return ret; *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -1474,11 +1526,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, return ret; } -static noinline int check_can_nocow(struct inode *inode, loff_t pos, +static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct btrfs_ordered_extent *ordered; u64 lockstart, lockend; u64 num_bytes; @@ -1493,19 +1545,20 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, fs_info->sectorsize) - 1; while (1) { - lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + lock_extent(&inode->io_tree, lockstart, lockend); ordered = btrfs_lookup_ordered_range(inode, lockstart, lockend - lockstart + 1); if (!ordered) { break; } - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); - btrfs_start_ordered_extent(inode, ordered, 1); + unlock_extent(&inode->io_tree, lockstart, lockend); + btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); btrfs_put_ordered_extent(ordered); } num_bytes = lockend - lockstart + 1; - ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); + ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, + NULL, NULL, NULL); if (ret <= 0) { ret = 0; btrfs_end_write_no_snapshoting(root); @@ -1514,7 +1567,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, num_bytes - pos + lockstart); } - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + unlock_extent(&inode->io_tree, lockstart, lockend); return ret; } @@ -1528,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; @@ -1575,11 +1629,14 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); - ret = btrfs_check_data_free_space(inode, pos, write_bytes); + extent_changeset_release(data_reserved); + ret = btrfs_check_data_free_space(inode, &data_reserved, pos, + write_bytes); if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && - check_can_nocow(inode, pos, &write_bytes) > 0) { + check_can_nocow(BTRFS_I(inode), pos, + &write_bytes) > 0) { /* * For nodata cow case, no need to reserve * data space. @@ -1599,11 +1656,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } } - ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), + reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, pos, - write_bytes); + btrfs_free_reserved_data_space(inode, + data_reserved, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1623,9 +1682,9 @@ again: if (ret) break; - ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, - pos, write_bytes, &lockstart, - &lockend, &cached_state); + ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages, + num_pages, pos, write_bytes, &lockstart, + &lockend, &cached_state); if (ret < 0) { if (ret == -EAGAIN) goto again; @@ -1677,7 +1736,7 @@ again: spin_unlock(&BTRFS_I(inode)->lock); } if (only_release_metadata) { - btrfs_delalloc_release_metadata(inode, + btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); } else { u64 __pos; @@ -1685,8 +1744,9 @@ again: __pos = round_down(pos, fs_info->sectorsize) + (dirty_pages << PAGE_SHIFT); - btrfs_delalloc_release_space(inode, __pos, - release_bytes); + btrfs_delalloc_release_space(inode, + data_reserved, __pos, + release_bytes); } } @@ -1738,14 +1798,16 @@ again: if (release_bytes) { if (only_release_metadata) { btrfs_end_write_no_snapshoting(root); - btrfs_delalloc_release_metadata(inode, release_bytes); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + release_bytes); } else { - btrfs_delalloc_release_space(inode, - round_down(pos, fs_info->sectorsize), - release_bytes); + btrfs_delalloc_release_space(inode, data_reserved, + round_down(pos, fs_info->sectorsize), + release_bytes); } } + extent_changeset_free(data_reserved); return num_written ? num_written : ret; } @@ -1820,17 +1882,36 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); ssize_t err; loff_t pos; - size_t count; + size_t count = iov_iter_count(from); loff_t oldsize; int clean_page = 0; - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + err = generic_write_checks(iocb, from); if (err <= 0) { inode_unlock(inode); return err; } + pos = iocb->ki_pos; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* + * We will allocate space in case nodatacow is not set, + * so bail + */ + if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) || + check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { + inode_unlock(inode); + return -EAGAIN; + } + } + current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) { @@ -1858,8 +1939,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ update_time_for_write(inode); - pos = iocb->ki_pos; - count = iov_iter_count(from); start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); if (start_pos > oldsize) { @@ -1955,7 +2034,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; - int ret = 0; + int ret = 0, err; bool full_sync = 0; u64 len; @@ -1974,7 +2053,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) - return ret; + goto out; inode_lock(inode); atomic_inc(&root->log_batch); @@ -2062,7 +2141,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * commit does not start nor waits for ordered extents to complete. */ smp_mb(); - if (btrfs_inode_in_log(inode, fs_info->generation) || + if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || (full_sync && BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) || (!btrfs_have_ordered_extents_in_range(inode, start, len) && @@ -2079,10 +2158,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * An ordered extent might have started before and completed * already with io errors, in which case the inode was not * updated and we end up here. So check the inode's mapping - * flags for any errors that might have happened while doing - * writeback of file data. + * for any errors that might have happened since we last + * checked called fsync. */ - ret = filemap_check_errors(inode->i_mapping); + ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); inode_unlock(inode); goto out; } @@ -2171,6 +2250,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_end_transaction(trans); } out: + err = file_check_and_advance_wb_err(file); + if (!ret) + ret = err; return ret > 0 ? -EIO : ret; } @@ -2193,7 +2275,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } -static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, +static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, int slot, u64 start, u64 end) { struct btrfs_file_extent_item *fi; @@ -2222,15 +2304,16 @@ static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, return 0; } -static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, - struct btrfs_path *path, u64 offset, u64 end) +static int fill_holes(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, u64 offset, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct extent_map *hole_em; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct btrfs_key key; int ret; @@ -2253,7 +2336,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, } leaf = path->nodes[0]; - if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { + if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) { u64 num_bytes; path->slots[0]--; @@ -2285,9 +2368,8 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, } btrfs_release_path(path); - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, - 0, 0, end - offset, 0, end - offset, - 0, 0, 0); + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), + offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0); if (ret) return ret; @@ -2297,8 +2379,7 @@ out: hole_em = alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_cache(inode, offset, end - 1, 0); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); } else { hole_em->start = offset; hole_em->len = end - offset; @@ -2321,7 +2402,7 @@ out: free_extent_map(hole_em); if (ret) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); } return 0; @@ -2335,17 +2416,15 @@ out: */ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); - return ret; - } + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + round_down(*start, fs_info->sectorsize), + round_up(*len, fs_info->sectorsize), 0); + if (IS_ERR(em)) + return PTR_ERR(em); /* Hole or vacuum extent(only exists in no-hole mode) */ if (em->block_start == EXTENT_MAP_HOLE) { @@ -2551,8 +2630,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &fs_info->trans_block_rsv; if (cur_offset < drop_end && cur_offset < ino_size) { - ret = fill_holes(trans, inode, path, cur_offset, - drop_end); + ret = fill_holes(trans, BTRFS_I(inode), path, + cur_offset, drop_end); if (ret) { /* * If we failed then we didn't insert our hole @@ -2623,7 +2702,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * cur_offset == drop_end). */ if (cur_offset < ino_size && cur_offset < drop_end) { - ret = fill_holes(trans, inode, path, cur_offset, drop_end); + ret = fill_holes(trans, BTRFS_I(inode), path, + cur_offset, drop_end); if (ret) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); @@ -2718,6 +2798,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; struct list_head reserve_list; @@ -2748,7 +2829,8 @@ static long btrfs_fallocate(struct file *file, int mode, * * For qgroup space, it will be checked later. */ - ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start); + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + alloc_end - alloc_start); if (ret < 0) return ret; @@ -2828,13 +2910,10 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); + if (IS_ERR(em)) { + ret = PTR_ERR(em); break; } last_byte = min(extent_map_end(em), alloc_end); @@ -2849,18 +2928,20 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } - ret = btrfs_qgroup_reserve_data(inode, cur_offset, - last_byte - cur_offset); - if (ret < 0) + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + cur_offset, last_byte - cur_offset); + if (ret < 0) { + free_extent_map(em); break; + } } else { /* * Do not need to reserve unwritten extent for this * range, free reserved data space first, otherwise * it'll result in false ENOSPC error. */ - btrfs_free_reserved_data_space(inode, cur_offset, - last_byte - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2876,11 +2957,12 @@ static long btrfs_fallocate(struct file *file, int mode, if (!ret) ret = btrfs_prealloc_file_range(inode, mode, range->start, - range->len, 1 << inode->i_blkbits, + range->len, i_blocksize(inode), offset + len, &alloc_hint); else - btrfs_free_reserved_data_space(inode, range->start, - range->len); + btrfs_free_reserved_data_space(inode, + data_reserved, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2918,8 +3000,9 @@ out: inode_unlock(inode); /* Let go of our reservation. */ if (ret != 0) - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, alloc_end - cur_offset); + extent_changeset_free(data_reserved); return ret; } @@ -2955,7 +3038,8 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) &cached_state); while (start < inode->i_size) { - em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); + em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, + start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); em = NULL; @@ -3019,13 +3103,19 @@ out: return offset; } +static int btrfs_file_open(struct inode *inode, struct file *filp) +{ + filp->f_mode |= FMODE_AIO_NOWAIT; + return generic_file_open(inode, filp); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, - .open = generic_file_open, + .open = btrfs_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 7015892c9ee8..c5e6180cdb8c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,7 @@ #include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/math64.h> #include <linux/ratelimit.h> @@ -94,12 +95,11 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, return inode; } -struct inode *lookup_free_space_inode(struct btrfs_root *root, +struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { struct inode *inode = NULL; - struct btrfs_fs_info *fs_info = root->fs_info; u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; spin_lock(&block_group->lock); @@ -109,7 +109,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, if (inode) return inode; - inode = __lookup_free_space_inode(root, path, + inode = __lookup_free_space_inode(fs_info->tree_root, path, block_group->key.objectid); if (IS_ERR(inode)) return inode; @@ -192,7 +192,7 @@ static int __create_free_space_inode(struct btrfs_root *root, return 0; } -int create_free_space_inode(struct btrfs_root *root, +int create_free_space_inode(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) @@ -200,11 +200,11 @@ int create_free_space_inode(struct btrfs_root *root, int ret; u64 ino; - ret = btrfs_find_free_objectid(root, &ino); + ret = btrfs_find_free_objectid(fs_info->tree_root, &ino); if (ret < 0) return ret; - return __create_free_space_inode(root, trans, path, ino, + return __create_free_space_inode(fs_info->tree_root, trans, path, ino, block_group->key.objectid); } @@ -227,21 +227,21 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, return ret; } -int btrfs_truncate_free_space_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, +int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct inode *inode) { + struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; - struct btrfs_path *path = btrfs_alloc_path(); bool locked = false; - if (!path) { - ret = -ENOMEM; - goto fail; - } - if (block_group) { + struct btrfs_path *path = btrfs_alloc_path(); + + if (!path) { + ret = -ENOMEM; + goto fail; + } locked = true; mutex_lock(&trans->transaction->cache_write_mutex); if (!list_empty(&block_group->io_list)) { @@ -258,10 +258,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_CLEAR; spin_unlock(&block_group->lock); + btrfs_free_path(path); } - btrfs_free_path(path); - btrfs_i_size_write(inode, 0); + btrfs_i_size_write(BTRFS_I(inode), 0); truncate_pagecache(inode, 0); /* @@ -286,14 +286,14 @@ fail: return ret; } -static int readahead_cache(struct inode *inode) +static void readahead_cache(struct inode *inode) { struct file_ra_state *ra; unsigned long last_index; ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) - return -ENOMEM; + return; file_ra_state_init(ra, inode->i_mapping); last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; @@ -301,8 +301,6 @@ static int readahead_cache(struct inode *inode) page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); kfree(ra); - - return 0; } static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, @@ -313,7 +311,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID) check_crcs = 1; /* Make sure we can fit our crcs into the first page */ @@ -357,7 +355,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_SIZE; if (clear) - memset(io_ctl->cur, 0, PAGE_SIZE); + clear_page(io_ctl->cur); } static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) @@ -730,9 +728,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (ret) return ret; - ret = readahead_cache(inode); - if (ret) - goto out; + readahead_cache(inode); ret = io_ctl_prepare_pages(&io_ctl, inode, 1); if (ret) @@ -828,7 +824,6 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_root *root = fs_info->tree_root; struct inode *inode; struct btrfs_path *path; int ret = 0; @@ -852,7 +847,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, path->search_commit_root = 1; path->skip_locking = 1; - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(fs_info, block_group, path); if (IS_ERR(inode)) { btrfs_free_path(path); return 0; @@ -1128,8 +1123,7 @@ cleanup_bitmap_list(struct list_head *bitmap_list) static void noinline_for_stack cleanup_write_cache_enospc(struct inode *inode, struct btrfs_io_ctl *io_ctl, - struct extent_state **cached_state, - struct list_head *bitmap_list) + struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, @@ -1225,8 +1219,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, * @ctl - the free space cache we are going to write out * @block_group - the block_group for this cache if it belongs to a block_group * @trans - the trans handle - * @path - the path to use - * @offset - the offset for the key we'll insert * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successful in writing the cache out, @@ -1236,8 +1228,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, struct btrfs_io_ctl *io_ctl, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 offset) + struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_state *cached_state = NULL; @@ -1365,7 +1356,7 @@ out_nospc_locked: mutex_unlock(&ctl->cache_writeout_mutex); out_nospc: - cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list); + cleanup_write_cache_enospc(inode, io_ctl, &cached_state); if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); @@ -1378,7 +1369,6 @@ int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { - struct btrfs_root *root = fs_info->tree_root; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; @@ -1390,13 +1380,12 @@ int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, } spin_unlock(&block_group->lock); - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(fs_info, block_group, path); if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, block_group, - &block_group->io_ctl, trans, - path, block_group->key.objectid); + ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, + block_group, &block_group->io_ctl, trans); if (ret) { #ifdef DEBUG btrfs_err(fs_info, @@ -3543,8 +3532,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, return 0; memset(&io_ctl, 0, sizeof(io_ctl)); - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, - trans, path, 0); + ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans); if (!ret) { /* * At this point writepages() didn't error out, so our metadata @@ -3558,7 +3546,8 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, if (ret) { if (release_metadata) - btrfs_delalloc_release_metadata(inode, inode->i_size); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + inode->i_size); #ifdef DEBUG btrfs_err(fs_info, "failed to write free ino cache for root %llu", diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 6f3c025a2c6c..79eca4cabb1c 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -51,18 +51,17 @@ struct btrfs_free_space_op { struct btrfs_io_ctl; -struct inode *lookup_free_space_inode(struct btrfs_root *root, +struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); -int create_free_space_inode(struct btrfs_root *root, +int create_free_space_inode(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -int btrfs_truncate_free_space_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, +int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct inode *inode); int load_free_space_cache(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index ff0c55337c2e..a5e34de06c2f 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -17,7 +17,7 @@ */ #include <linux/kernel.h> -#include <linux/vmalloc.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "locking.h" @@ -153,22 +153,21 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) static u8 *alloc_bitmap(u32 bitmap_size) { - void *mem; + u8 *ret; + unsigned int nofs_flag; /* - * The allocation size varies, observed numbers were < 4K up to 16K. - * Using vmalloc unconditionally would be too heavy, we'll try - * contiguous allocations first. + * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse + * into the filesystem as the free space bitmap can be modified in the + * critical section of a transaction commit. + * + * TODO: push the memalloc_nofs_{save,restore}() to the caller where we + * know that recursion is unsafe. */ - if (bitmap_size <= PAGE_SIZE) - return kzalloc(bitmap_size, GFP_NOFS); - - mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN); - if (mem) - return mem; - - return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + nofs_flag = memalloc_nofs_save(); + ret = kvzalloc(bitmap_size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + return ret; } int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, @@ -1189,11 +1188,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - return 0; + return btrfs_commit_transaction(trans); abort: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); @@ -1269,7 +1264,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); - clean_tree_block(trans, fs_info, free_space_root->node); + clean_tree_block(fs_info, free_space_root->node); btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, free_space_root, free_space_root->node, 0, 1); @@ -1278,11 +1273,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) free_extent_buffer(free_space_root->commit_root); kfree(free_space_root); - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - return 0; + return btrfs_commit_transaction(trans); abort: btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index a97fdc156a03..baacc1866861 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -38,6 +38,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) { SHASH_DESC_ON_STACK(shash, tfm); u32 *ctx = (u32 *)shash_desc_ctx(shash); + u32 retval; int err; shash->tfm = tfm; @@ -47,5 +48,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *ctx; + retval = *ctx; + barrier_data(ctx); + return retval; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 144b119ff43f..d02019747d00 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_path *path; struct inode *inode; struct btrfs_block_rsv *rsv; + struct extent_changeset *data_reserved = NULL; u64 num_bytes; u64 alloc_hint = 0; int ret; @@ -467,7 +468,7 @@ again: } if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); + ret = btrfs_truncate_free_space_cache(trans, NULL, inode); if (ret) { if (ret != -ENOSPC) btrfs_abort_transaction(trans, ret); @@ -492,14 +493,14 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_metadata(inode, prealloc); + btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc); goto out_put; } @@ -516,6 +517,7 @@ out: trans->bytes_reserved = num_bytes; btrfs_free_path(path); + extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1e861a063721..95c212037095 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,6 +71,7 @@ struct btrfs_dio_data { u64 reserve; u64 unsubmitted_oe_range_start; u64 unsubmitted_oe_range_end; + int overwrite; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -85,7 +86,6 @@ static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; -struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; @@ -108,11 +108,36 @@ static noinline int cow_file_range(struct inode *inode, u64 start, u64 end, u64 delalloc_end, int *page_started, unsigned long *nr_written, int unlock, struct btrfs_dedupe_hash *hash); -static struct extent_map *create_pinned_em(struct inode *inode, u64 start, - u64 len, u64 orig_start, - u64 block_start, u64 block_len, - u64 orig_block_len, u64 ram_bytes, - int type); +static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, + u64 orig_start, u64 block_start, + u64 block_len, u64 orig_block_len, + u64 ram_bytes, int compress_type, + int type); + +static void __endio_write_update_ordered(struct inode *inode, + const u64 offset, const u64 bytes, + const bool uptodate); + +/* + * Cleanup all submitted ordered extents in specified range to handle errors + * from the fill_dellaloc() callback. + * + * NOTE: caller must ensure that when an error happens, it can not call + * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING + * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata + * to be released, which we want to happen only when finishing the ordered + * extent (btrfs_finish_ordered_io()). Also note that the caller of the + * fill_delalloc() callback already does proper cleanup for the first page of + * the range, that is, it invokes the callback writepage_end_io_hook() for the + * range of the first page. + */ +static inline void btrfs_cleanup_ordered_extents(struct inode *inode, + const u64 offset, + const u64 bytes) +{ + return __endio_write_update_ordered(inode, offset + PAGE_SIZE, + bytes - PAGE_SIZE, false); +} static int btrfs_dirty_inode(struct inode *inode); @@ -152,7 +177,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; - int err = 0; int ret; size_t cur_size = size; unsigned long offset; @@ -166,7 +190,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; size_t datasize; - key.objectid = btrfs_ino(inode); + key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = start; key.type = BTRFS_EXTENT_DATA_KEY; @@ -174,10 +198,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - if (ret) { - err = ret; + if (ret) goto fail; - } } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -232,9 +254,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, BTRFS_I(inode)->disk_i_size = inode->i_size; ret = btrfs_update_inode(trans, root, inode); - return ret; fail: - return err; + return ret; } @@ -315,8 +336,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, } set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - btrfs_delalloc_release_metadata(inode, end + 1 - start); - btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); + btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start); + btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); out: /* * Don't forget to free the reserved space, as for inlined extent @@ -324,7 +345,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -388,6 +409,15 @@ static inline int inode_need_compress(struct inode *inode) return 0; } +static inline void inode_should_defrag(struct btrfs_inode *inode, + u64 start, u64 end, u64 num_bytes, u64 small_write) +{ + /* If this is a small write inside eof, kick off a defrag */ + if (num_bytes < small_write && + (start > 0 || end + 1 < inode->disk_i_size)) + btrfs_add_inode_defrag(NULL, inode); +} + /* * we create compressed extents in two phases. The first * phase compresses a range of pages that have already been @@ -420,26 +450,23 @@ static noinline void compress_file_range(struct inode *inode, int ret = 0; struct page **pages = NULL; unsigned long nr_pages; - unsigned long nr_pages_ret = 0; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned long max_compressed = SZ_128K; - unsigned long max_uncompressed = SZ_128K; int i; int will_compress; int compress_type = fs_info->compress_type; int redirty = 0; - /* if this is a small write inside eof, kick off a defrag */ - if ((end - start + 1) < SZ_16K && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode); + inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, + SZ_16K); actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE); + BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0); + nr_pages = min_t(unsigned long, nr_pages, + BTRFS_MAX_COMPRESSED / PAGE_SIZE); /* * we don't want to send crud past the end of i_size through @@ -464,17 +491,8 @@ again: (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* we want to make sure that amount of ram required to uncompress - * an extent is reasonable, so we limit the total size in ram - * of a compressed extent to 128k. This is a crucial number - * because it also controls how easily we can spread reads across - * cpus for decompression. - * - * We also want to make sure the amount of IO required to do - * a random read is reasonably small, so we limit the size of - * a compressed extent to 128k. - */ - total_compressed = min(total_compressed, max_uncompressed); + total_compressed = min_t(unsigned long, total_compressed, + BTRFS_MAX_UNCOMPRESSED); num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); total_in = 0; @@ -509,16 +527,15 @@ again: redirty = 1; ret = btrfs_compress_pages(compress_type, inode->i_mapping, start, - total_compressed, pages, - nr_pages, &nr_pages_ret, + pages, + &nr_pages, &total_in, - &total_compressed, - max_compressed); + &total_compressed); if (!ret) { unsigned long offset = total_compressed & (PAGE_SIZE - 1); - struct page *page = pages[nr_pages_ret - 1]; + struct page *page = pages[nr_pages - 1]; char *kaddr; /* zero the tail end of the last page, we might be @@ -541,7 +558,7 @@ cont: * to make an uncompressed inline extent. */ ret = cow_file_range_inline(root, inode, start, end, - 0, 0, NULL); + 0, BTRFS_COMPRESS_NONE, NULL); } else { /* try making a compressed inline extent */ ret = cow_file_range_inline(root, inode, start, end, @@ -550,7 +567,7 @@ cont: } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DEFRAG; + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; unsigned long page_error_op; clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; @@ -568,8 +585,10 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - btrfs_free_reserved_data_space_noquota(inode, start, - end - start + 1); + if (ret == 0) + btrfs_free_reserved_data_space_noquota(inode, + start, + end - start + 1); goto free_pages_out; } } @@ -584,12 +603,11 @@ cont: /* * one last check to make sure the compression is really a - * win, compare the page count read with the blocks on disk + * win, compare the page count read with the blocks on disk, + * compression must free at least one sector size */ total_in = ALIGN(total_in, PAGE_SIZE); - if (total_compressed >= total_in) { - will_compress = 0; - } else { + if (total_compressed + blocksize <= total_in) { num_bytes = total_in; *num_added += 1; @@ -599,7 +617,7 @@ cont: * will submit them to the elevator. */ add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret, + total_compressed, pages, nr_pages, compress_type); if (start + num_bytes < end) { @@ -616,14 +634,14 @@ cont: * the compression code ran but failed to make things smaller, * free any pages it allocated and our page pointer array */ - for (i = 0; i < nr_pages_ret; i++) { + for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } kfree(pages); pages = NULL; total_compressed = 0; - nr_pages_ret = 0; + nr_pages = 0; /* flag the file so we don't compress in the future */ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && @@ -652,7 +670,7 @@ cleanup_and_bail_uncompressed: return; free_pages_out: - for (i = 0; i < nr_pages_ret; i++) { + for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } @@ -690,7 +708,6 @@ static noinline void submit_compressed_extents(struct inode *inode, struct btrfs_key ins; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree; int ret = 0; @@ -778,46 +795,19 @@ retry: * here we're doing allocation and writeback of the * compressed pages */ - btrfs_drop_extent_cache(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, 0); - - em = alloc_extent_map(); - if (!em) { - ret = -ENOMEM; - goto out_free_reserve; - } - em->start = async_extent->start; - em->len = async_extent->ram_size; - em->orig_start = em->start; - em->mod_start = em->start; - em->mod_len = em->len; - - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->orig_block_len = ins.offset; - em->ram_bytes = async_extent->ram_size; - em->bdev = fs_info->fs_devices->latest_bdev; - em->compress_type = async_extent->compress_type; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->generation = -1; - - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, 0); - } - - if (ret) + em = create_io_em(inode, async_extent->start, + async_extent->ram_size, /* len */ + async_extent->start, /* orig_start */ + ins.objectid, /* block_start */ + ins.offset, /* block_len */ + ins.offset, /* orig_block_len */ + async_extent->ram_size, /* ram_bytes */ + async_extent->compress_type, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) + /* ret value is not necessary due to void function */ goto out_free_reserve; + free_extent_map(em); ret = btrfs_add_ordered_extent_compress(inode, async_extent->start, @@ -827,7 +817,8 @@ retry: BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { - btrfs_drop_extent_cache(inode, async_extent->start, + btrfs_drop_extent_cache(BTRFS_I(inode), + async_extent->start, async_extent->start + async_extent->ram_size - 1, 0); goto out_free_reserve; @@ -845,13 +836,12 @@ retry: NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - ret = btrfs_submit_compressed_write(inode, + if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages); - if (ret) { + async_extent->nr_pages)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -882,6 +872,7 @@ out_free: async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | @@ -948,14 +939,16 @@ static noinline int cow_file_range(struct inode *inode, u64 num_bytes; unsigned long ram_size; u64 disk_num_bytes; - u64 cur_alloc_size; + u64 cur_alloc_size = 0; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + unsigned clear_bits; + unsigned long page_ops; + bool extent_reserved = false; int ret = 0; - if (btrfs_is_free_space_inode(inode)) { + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; @@ -965,19 +958,17 @@ static noinline int cow_file_range(struct inode *inode, num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; - /* if this is a small write inside eof, kick off defrag */ - if (num_bytes < SZ_64K && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode); + inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K); if (start == 0) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(root, inode, start, end, 0, 0, - NULL); + ret = cow_file_range_inline(root, inode, start, end, 0, + BTRFS_COMPRESS_NONE, NULL); if (ret == 0) { extent_clear_unlock_delalloc(inode, start, end, delalloc_end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); @@ -996,53 +987,32 @@ static noinline int cow_file_range(struct inode *inode, btrfs_super_total_bytes(fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); - btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), start, + start + num_bytes - 1, 0); while (disk_num_bytes > 0) { - unsigned long op; - cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; + cur_alloc_size = ins.offset; + extent_reserved = true; - em = alloc_extent_map(); - if (!em) { - ret = -ENOMEM; - goto out_reserve; - } - em->start = start; - em->orig_start = em->start; ram_size = ins.offset; - em->len = ins.offset; - em->mod_start = em->start; - em->mod_len = em->len; - - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->orig_block_len = ins.offset; - em->ram_bytes = ram_size; - em->bdev = fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - em->generation = -1; - - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, start, - start + ram_size - 1, 0); - } - if (ret) + em = create_io_em(inode, start, ins.offset, /* len */ + start, /* orig_start */ + ins.objectid, /* block_start */ + ins.offset, /* block_len */ + ins.offset, /* orig_block_len */ + ram_size, /* ram_bytes */ + BTRFS_COMPRESS_NONE, /* compress_type */ + BTRFS_ORDERED_REGULAR /* type */); + if (IS_ERR(em)) goto out_reserve; + free_extent_map(em); - cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); if (ret) @@ -1052,15 +1022,24 @@ static noinline int cow_file_range(struct inode *inode, BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); + /* + * Only drop cache here, and process as normal. + * + * We must not allow extent_clear_unlock_delalloc() + * at out_unlock label to free meta of this ordered + * extent, as its meta should be freed by + * btrfs_finish_ordered_io(). + * + * So we must continue until @start is increased to + * skip current ordered extent. + */ if (ret) - goto out_drop_extent_cache; + btrfs_drop_extent_cache(BTRFS_I(inode), start, + start + ram_size - 1, 0); } btrfs_dec_block_group_reservations(fs_info, ins.objectid); - if (disk_num_bytes < cur_alloc_size) - break; - /* we're not doing compressed IO, don't unlock the first * page (which the caller expects to stay locked), don't * clear any dirty bits and don't set any writeback bits @@ -1068,34 +1047,69 @@ static noinline int cow_file_range(struct inode *inode, * Do set the Private2 bit so we know this page was properly * setup for writepage */ - op = unlock ? PAGE_UNLOCK : 0; - op |= PAGE_SET_PRIVATE2; + page_ops = unlock ? PAGE_UNLOCK : 0; + page_ops |= PAGE_SET_PRIVATE2; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, delalloc_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, - op); - disk_num_bytes -= cur_alloc_size; + page_ops); + if (disk_num_bytes < cur_alloc_size) + disk_num_bytes = 0; + else + disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; + extent_reserved = false; + + /* + * btrfs_reloc_clone_csums() error, since start is increased + * extent_clear_unlock_delalloc() at out_unlock label won't + * free metadata of current ordered extent, we're OK to exit. + */ + if (ret) + goto out_unlock; } out: return ret; out_drop_extent_cache: - btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK; + /* + * If we reserved an extent for our delalloc range (or a subrange) and + * failed to create the respective ordered extent, then it means that + * when we reserved the extent we decremented the extent's size from + * the data space_info's bytes_may_use counter and incremented the + * space_info's bytes_reserved counter by the same amount. We must make + * sure extent_clear_unlock_delalloc() does not try to decrement again + * the data space_info's bytes_may_use counter, therefore we do not pass + * it the flag EXTENT_CLEAR_DATA_RESV. + */ + if (extent_reserved) { + extent_clear_unlock_delalloc(inode, start, + start + cur_alloc_size, + start + cur_alloc_size, + locked_page, + clear_bits, + page_ops); + start += cur_alloc_size; + if (start >= end) + goto out; + } extent_clear_unlock_delalloc(inode, start, end, delalloc_end, locked_page, - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DELALLOC | EXTENT_DEFRAG, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + clear_bits | EXTENT_CLEAR_DATA_RESV, + page_ops); goto out; } @@ -1164,7 +1178,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; - int limit = 10 * SZ_1M; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); @@ -1196,12 +1209,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); - if (atomic_read(&fs_info->async_delalloc_pages) > limit) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->async_delalloc_pages) < - limit)); - } - while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->async_delalloc_pages)) { wait_event(fs_info->async_submit_wait, @@ -1250,11 +1257,11 @@ static noinline int run_delalloc_nocow(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_file_extent_item *fi; struct btrfs_key found_key; + struct extent_map *em; u64 cow_start; u64 cur_offset; u64 extent_end; @@ -1269,7 +1276,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, int nocow; int check_prev = 1; bool nolock; - u64 ino = btrfs_ino(inode); + u64 ino = btrfs_ino(BTRFS_I(inode)); path = btrfs_alloc_path(); if (!path) { @@ -1284,32 +1291,12 @@ static noinline int run_delalloc_nocow(struct inode *inode, return -ENOMEM; } - nolock = btrfs_is_free_space_inode(inode); - - if (nolock) - trans = btrfs_join_transaction_nolock(root); - else - trans = btrfs_join_transaction(root); - - if (IS_ERR(trans)) { - extent_clear_unlock_delalloc(inode, start, end, end, - locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | - PAGE_END_WRITEBACK); - btrfs_free_path(path); - return PTR_ERR(trans); - } - - trans->block_rsv = &fs_info->delalloc_block_rsv; + nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); cow_start = (u64)-1; cur_offset = start; while (1) { - ret = btrfs_lookup_file_extent(trans, root, path, ino, + ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) goto error; @@ -1382,7 +1369,7 @@ next_slot: goto out_check; if (btrfs_extent_readonly(fs_info, disk_bytenr)) goto out_check; - if (btrfs_cross_ref_exist(trans, root, ino, + if (btrfs_cross_ref_exist(root, ino, found_key.offset - extent_offset, disk_bytenr)) goto out_check; @@ -1404,10 +1391,16 @@ next_slot: * either valid or do not exist. */ if (csum_exist_in_range(fs_info, disk_bytenr, - num_bytes)) + num_bytes)) { + if (!nolock) + btrfs_end_write_no_snapshoting(root); goto out_check; - if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) + } + if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) { + if (!nolock) + btrfs_end_write_no_snapshoting(root); goto out_check; + } nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1455,35 +1448,28 @@ out_check: } if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - struct extent_map *em; - struct extent_map_tree *em_tree; - em_tree = &BTRFS_I(inode)->extent_tree; - em = alloc_extent_map(); - BUG_ON(!em); /* -ENOMEM */ - em->start = cur_offset; - em->orig_start = found_key.offset - extent_offset; - em->len = num_bytes; - em->block_len = num_bytes; - em->block_start = disk_bytenr; - em->orig_block_len = disk_num_bytes; - em->ram_bytes = ram_bytes; - em->bdev = fs_info->fs_devices->latest_bdev; - em->mod_start = em->start; - em->mod_len = em->len; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - set_bit(EXTENT_FLAG_FILLING, &em->flags); - em->generation = -1; - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, em->start, - em->start + em->len - 1, 0); + u64 orig_start = found_key.offset - extent_offset; + + em = create_io_em(inode, cur_offset, num_bytes, + orig_start, + disk_bytenr, /* block_start */ + num_bytes, /* block_len */ + disk_num_bytes, /* orig_block_len */ + ram_bytes, BTRFS_COMPRESS_NONE, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + if (!nolock && nocow) + btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(fs_info, + disk_bytenr); + ret = PTR_ERR(em); + goto error; } + free_extent_map(em); + } + + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { type = BTRFS_ORDERED_PREALLOC; } else { type = BTRFS_ORDERED_NOCOW; @@ -1496,15 +1482,14 @@ out_check: BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + BTRFS_DATA_RELOC_TREE_OBJECTID) + /* + * Error handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler + * from freeing metadata of created ordered extent. + */ ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - if (ret) { - if (!nolock && nocow) - btrfs_end_write_no_snapshoting(root); - goto error; - } - } extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, end, @@ -1516,6 +1501,14 @@ out_check: if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; + + /* + * btrfs_reloc_clone_csums() error, now we're OK to call error + * handler, as metadata for created ordered extent will only + * be freed by btrfs_finish_ordered_io(). + */ + if (ret) + goto error; if (cur_offset > end) break; } @@ -1534,10 +1527,6 @@ out_check: } error: - err = btrfs_end_transaction(trans); - if (!ret) - ret = err; - if (ret && cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, end, locked_page, EXTENT_LOCKED | @@ -1573,10 +1562,11 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) /* * extent_io.c call back to do delayed allocation processing */ -static int run_delalloc_range(struct inode *inode, struct page *locked_page, +static int run_delalloc_range(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written) { + struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); @@ -1595,12 +1585,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); } + if (ret) + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } -static void btrfs_split_extent_hook(struct inode *inode, +static void btrfs_split_extent_hook(void *private_data, struct extent_state *orig, u64 split) { + struct inode *inode = private_data; u64 size; /* not delalloc, ignore it */ @@ -1609,7 +1602,7 @@ static void btrfs_split_extent_hook(struct inode *inode, size = orig->end - orig->start + 1; if (size > BTRFS_MAX_EXTENT_SIZE) { - u64 num_extents; + u32 num_extents; u64 new_size; /* @@ -1617,13 +1610,10 @@ static void btrfs_split_extent_hook(struct inode *inode, * applies here, just in reverse. */ new_size = orig->end - split + 1; - num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE); + num_extents = count_max_extents(new_size); new_size = split - orig->start; - num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE); - if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE) >= num_extents) + num_extents += count_max_extents(new_size); + if (count_max_extents(size) >= num_extents) return; } @@ -1638,12 +1628,13 @@ static void btrfs_split_extent_hook(struct inode *inode, * extents, such as when we are doing sequential writes, so we can properly * account for the metadata space we'll need. */ -static void btrfs_merge_extent_hook(struct inode *inode, +static void btrfs_merge_extent_hook(void *private_data, struct extent_state *new, struct extent_state *other) { + struct inode *inode = private_data; u64 new_size, old_size; - u64 num_extents; + u32 num_extents; /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) @@ -1681,14 +1672,10 @@ static void btrfs_merge_extent_hook(struct inode *inode, * this case. */ old_size = other->end - other->start + 1; - num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE); + num_extents = count_max_extents(old_size); old_size = new->end - new->start + 1; - num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE); - - if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE) >= num_extents) + num_extents += count_max_extents(old_size); + if (count_max_extents(new_size) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); @@ -1720,15 +1707,15 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, } static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); spin_lock(&root->delalloc_lock); - if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); + if (!list_empty(&inode->delalloc_inodes)) { + list_del_init(&inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { spin_lock(&fs_info->delalloc_root_lock); @@ -1745,9 +1732,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. */ -static void btrfs_set_bit_hook(struct inode *inode, +static void btrfs_set_bit_hook(void *private_data, struct extent_state *state, unsigned *bits) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1761,7 +1749,7 @@ static void btrfs_set_bit_hook(struct inode *inode, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - bool do_list = !btrfs_is_free_space_inode(inode); + bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; @@ -1775,8 +1763,8 @@ static void btrfs_set_bit_hook(struct inode *inode, if (btrfs_is_testing(fs_info)) return; - __percpu_counter_add(&fs_info->delalloc_bytes, len, - fs_info->delalloc_batch); + percpu_counter_add_batch(&fs_info->delalloc_bytes, len, + fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; if (*bits & EXTENT_DEFRAG) @@ -1786,24 +1774,32 @@ static void btrfs_set_bit_hook(struct inode *inode, btrfs_add_delalloc_inodes(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } + + if (!(state->state & EXTENT_DELALLOC_NEW) && + (*bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - + state->start; + spin_unlock(&BTRFS_I(inode)->lock); + } } /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static void btrfs_clear_bit_hook(struct inode *inode, +static void btrfs_clear_bit_hook(void *private_data, struct extent_state *state, unsigned *bits) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); u64 len = state->end + 1 - state->start; - u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1, - BTRFS_MAX_EXTENT_SIZE); + u32 num_extents = count_max_extents(len); - spin_lock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) - BTRFS_I(inode)->defrag_bytes -= len; - spin_unlock(&BTRFS_I(inode)->lock); + inode->defrag_bytes -= len; + spin_unlock(&inode->lock); /* * set_bit and clear bit hooks normally require _irqsave/restore @@ -1811,15 +1807,15 @@ static void btrfs_clear_bit_hook(struct inode *inode, * bit, which is only set or cleared with irqs on */ if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents -= num_extents; - spin_unlock(&BTRFS_I(inode)->lock); + } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { + spin_lock(&inode->lock); + inode->outstanding_extents -= num_extents; + spin_unlock(&inode->lock); } /* @@ -1827,7 +1823,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, * don't need to call dellalloc_release_metadata if there is an * error. */ - if (*bits & EXTENT_DO_ACCOUNTING && + if (*bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); @@ -1835,22 +1831,30 @@ static void btrfs_clear_bit_hook(struct inode *inode, if (btrfs_is_testing(fs_info)) return; - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE) - && (*bits & (EXTENT_DO_ACCOUNTING | - EXTENT_CLEAR_DATA_RESV))) - btrfs_free_reserved_data_space_noquota(inode, + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && + do_list && !(state->state & EXTENT_NORESERVE) && + (*bits & EXTENT_CLEAR_DATA_RESV)) + btrfs_free_reserved_data_space_noquota( + &inode->vfs_inode, state->start, len); - __percpu_counter_add(&fs_info->delalloc_bytes, -len, - fs_info->delalloc_batch); - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->delalloc_bytes -= len; - if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && + percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, + fs_info->delalloc_batch); + spin_lock(&inode->lock); + inode->delalloc_bytes -= len; + if (do_list && inode->delalloc_bytes == 0 && test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) btrfs_del_delalloc_inode(root, inode); - spin_unlock(&BTRFS_I(inode)->lock); + spin_unlock(&inode->lock); + } + + if ((state->state & EXTENT_DELALLOC_NEW) && + (*bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&inode->lock); + ASSERT(inode->new_delalloc_bytes >= len); + inode->new_delalloc_bytes -= len; + spin_unlock(&inode->lock); } } @@ -1895,11 +1899,12 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { - int ret = 0; + struct inode *inode = private_data; + blk_status_t ret = 0; ret = btrfs_csum_one_bio(inode, bio, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -1914,16 +1919,17 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; + blk_status_t ret; ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1933,20 +1939,21 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { + struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - int ret = 0; + blk_status_t ret = 0; int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - if (btrfs_is_free_space_inode(inode)) + if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; if (bio_op(bio) != REQ_OP_WRITE) { @@ -1970,8 +1977,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, - bio_flags, bio_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, + bio_offset, inode, __btrfs_submit_bio_start, __btrfs_submit_bio_done); goto out; @@ -1985,8 +1992,8 @@ mapit: ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); out: - if (ret < 0) { - bio->bi_error = ret; + if (ret) { + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1997,8 +2004,7 @@ out: * at IO completion time based on sums calculated at bio submission time. */ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, - struct inode *inode, u64 file_offset, - struct list_head *list) + struct inode *inode, struct list_head *list) { struct btrfs_ordered_sum *sum; @@ -2030,6 +2036,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; struct page *page; struct inode *inode; u64 page_start; @@ -2056,7 +2063,7 @@ again: if (PagePrivate2(page)) goto out; - ordered = btrfs_lookup_ordered_range(inode, page_start, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, @@ -2067,7 +2074,7 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); @@ -2087,6 +2094,7 @@ out_page: unlock_page(page); put_page(page); kfree(fixup); + extent_changeset_free(data_reserved); } /* @@ -2138,6 +2146,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + u64 qg_released; int extent_inserted = 0; int ret; @@ -2161,7 +2170,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, goto out; if (!extent_inserted) { - ins.objectid = btrfs_ino(inode); + ins.objectid = btrfs_ino(BTRFS_I(inode)); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; @@ -2193,14 +2202,17 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, - btrfs_ino(inode), file_pos, - ram_bytes, &ins); + /* * Release the reserved range from inode dirty range map, as it is * already moved into delayed_ref_head */ - btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + if (ret < 0) + goto out; + qg_released = ret; + ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); out: btrfs_free_path(path); @@ -2320,7 +2332,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, u64 num_bytes; if (BTRFS_I(inode)->root->root_key.objectid == root_id && - inum == btrfs_ino(inode)) + inum == btrfs_ino(BTRFS_I(inode))) return 0; key.objectid = root_id; @@ -2589,7 +2601,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, if (ret) goto out_free_path; again: - key.objectid = btrfs_ino(inode); + key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; @@ -2768,7 +2780,7 @@ record_old_file_extents(struct inode *inode, if (!path) goto out_kfree; - key.objectid = btrfs_ino(inode); + key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = new->file_pos; @@ -2803,7 +2815,7 @@ record_old_file_extents(struct inode *inode, btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid != btrfs_ino(inode)) + if (key.objectid != btrfs_ino(BTRFS_I(inode))) break; if (key.type != BTRFS_EXTENT_DATA_KEY) break; @@ -2886,17 +2898,25 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) u64 logical_len = ordered_extent->len; bool nolock; bool truncated = false; + bool range_locked = false; + bool clear_new_delalloc_bytes = false; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) + clear_new_delalloc_bytes = true; - nolock = btrfs_is_free_space_inode(inode); + nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; goto out; } - btrfs_free_io_failure_record(inode, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1); + btrfs_free_io_failure_record(BTRFS_I(inode), + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -2914,7 +2934,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * space for NOCOW range. * As NOCOW won't cause a new delayed ref, just free the space */ - btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) @@ -2933,13 +2953,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + range_locked = true; lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, &cached_state); ret = test_range_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 1, cached_state); + EXTENT_DEFRAG, 0, cached_state); if (ret) { u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); if (0 && last_snapshot >= BTRFS_I(inode)->generation) @@ -2958,7 +2979,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; - goto out_unlock; + goto out; } trans->block_rsv = &fs_info->delalloc_block_rsv; @@ -2967,7 +2988,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); - ret = btrfs_mark_extent_written(trans, inode, + ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + logical_len); @@ -2990,26 +3011,38 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto out_unlock; + goto out; } - add_pending_csums(trans, inode, ordered_extent->file_offset, - &ordered_extent->list); + add_pending_csums(trans, inode, &ordered_extent->list); btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); - goto out_unlock; + goto out; } ret = 0; -out_unlock: - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); out: + if (range_locked || clear_new_delalloc_bytes) { + unsigned int clear_bits = 0; + + if (range_locked) + clear_bits |= EXTENT_LOCKED; + if (clear_new_delalloc_bytes) + clear_bits |= EXTENT_DELALLOC_NEW; + clear_extent_bit(&BTRFS_I(inode)->io_tree, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, + clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, + 0, &cached_state, GFP_NOFS); + } + if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, ordered_extent->len); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + ordered_extent->len); if (trans) btrfs_end_transaction(trans); @@ -3024,7 +3057,7 @@ out: clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); /* Drop the cache for the part of the extent we didn't write. */ - btrfs_drop_extent_cache(inode, start, end, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); /* * If the ordered extent had an IOERR or something else went @@ -3072,7 +3105,7 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, +static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { struct inode *inode = page->mapping->host; @@ -3086,9 +3119,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, ClearPagePrivate2(page); if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, end - start + 1, uptodate)) - return 0; + return; - if (btrfs_is_free_space_inode(inode)) { + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { wq = fs_info->endio_freespace_worker; func = btrfs_freespace_write_helper; } else { @@ -3099,8 +3132,6 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &ordered_extent->work); - - return 0; } static int __readpage_endio_check(struct inode *inode, @@ -3123,9 +3154,8 @@ static int __readpage_endio_check(struct inode *inode, kunmap_atomic(kaddr); return 0; zeroit: - btrfs_warn_rl(BTRFS_I(inode)->root->fs_info, - "csum failed ino %llu off %llu csum %u expected csum %u", - btrfs_ino(inode), start, csum, csum_expected); + btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, + io_bio->mirror_num); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); kunmap_atomic(kaddr); @@ -3263,10 +3293,11 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, * NOTE: caller of this function should reserve 5 units of metadata for * this function. */ -int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +int btrfs_orphan_add(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = NULL; int reserve = 0; int insert = 0; @@ -3288,7 +3319,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags)) { + &inode->runtime_flags)) { #if 0 /* * For proper ENOSPC handling, we should do orphan @@ -3305,7 +3336,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) reserve = 1; spin_unlock(&root->orphan_lock); @@ -3316,10 +3347,10 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (ret) { atomic_dec(&root->orphan_inodes); clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); if (insert) clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); return ret; } } @@ -3331,12 +3362,12 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) atomic_dec(&root->orphan_inodes); if (reserve) { clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); btrfs_orphan_release_metadata(inode); } if (ret != -EEXIST) { clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); btrfs_abort_transaction(trans, ret); return ret; } @@ -3361,20 +3392,20 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) * item for this particular inode. */ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; int delete_item = 0; int release_rsv = 0; int ret = 0; spin_lock(&root->orphan_lock); if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) delete_item = 1; if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) + &inode->runtime_flags)) release_rsv = 1; spin_unlock(&root->orphan_lock); @@ -3548,7 +3579,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = PTR_ERR(trans); goto out; } - ret = btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); btrfs_end_transaction(trans); if (ret) { iput(inode); @@ -3557,7 +3588,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = btrfs_truncate(inode); if (ret) - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); } else { nr_unlink++; } @@ -3712,7 +3743,7 @@ static int btrfs_read_locked_inode(struct inode *inode) set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); @@ -3789,7 +3820,7 @@ cache_index: goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); - if (location.objectid != btrfs_ino(inode)) + if (location.objectid != btrfs_ino(BTRFS_I(inode))) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3811,14 +3842,14 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(inode), &first_xattr_slot); + btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", - btrfs_ino(inode), + btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); } btrfs_free_path(path); @@ -3960,7 +3991,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * The data relocation inode should also be directly updated * without delay */ - if (!btrfs_is_free_space_inode(inode) + if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); @@ -3993,7 +4024,8 @@ noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, */ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *dir, struct inode *inode, + struct btrfs_inode *dir, + struct btrfs_inode *inode, const char *name, int name_len) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4040,10 +4072,10 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, * that we delay to delete it, and just do this deletion when * we update the inode item. */ - if (BTRFS_I(inode)->dir_index) { + if (inode->dir_index) { ret = btrfs_delayed_delete_inode_ref(inode); if (!ret) { - index = BTRFS_I(inode)->dir_index; + index = inode->dir_index; goto skip_backref; } } @@ -4064,15 +4096,15 @@ skip_backref: goto err; } - ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, - inode, dir_ino); + ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); if (ret != 0 && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto err; } - ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, - dir, index); + ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, + index); if (ret == -ENOENT) ret = 0; else if (ret) @@ -4082,26 +4114,26 @@ err: if (ret) goto out; - btrfs_i_size_write(dir, dir->i_size - name_len * 2); - inode_inc_iversion(inode); - inode_inc_iversion(dir); - inode->i_ctime = dir->i_mtime = - dir->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, dir); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); + inode_inc_iversion(&inode->vfs_inode); + inode_inc_iversion(&dir->vfs_inode); + inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = + dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + ret = btrfs_update_inode(trans, root, &dir->vfs_inode); out: return ret; } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *dir, struct inode *inode, + struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { int ret; ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) { - drop_nlink(inode); - ret = btrfs_update_inode(trans, root, inode); + drop_nlink(&inode->vfs_inode); + ret = btrfs_update_inode(trans, root, &inode->vfs_inode); } return ret; } @@ -4139,15 +4171,17 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0); + btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + 0); - ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), - dentry->d_name.name, dentry->d_name.len); + ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + BTRFS_I(d_inode(dentry)), dentry->d_name.name, + dentry->d_name.len); if (ret) goto out; if (inode->i_nlink == 0) { - ret = btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) goto out; } @@ -4170,7 +4204,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_key key; u64 index; int ret; - u64 dir_ino = btrfs_ino(dir); + u64 dir_ino = btrfs_ino(BTRFS_I(dir)); path = btrfs_alloc_path(); if (!path) @@ -4222,13 +4256,13 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index); + ret = btrfs_delete_delayed_dir_index(trans, fs_info, BTRFS_I(dir), index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_i_size_write(dir, dir->i_size - name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); dir->i_mtime = dir->i_ctime = current_time(dir); ret = btrfs_update_inode_fallback(trans, root, dir); @@ -4249,14 +4283,14 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) return -EPERM; trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); - if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, root, dir, BTRFS_I(inode)->location.objectid, dentry->d_name.name, @@ -4264,17 +4298,18 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) goto out; } - err = btrfs_orphan_add(trans, inode); + err = btrfs_orphan_add(trans, BTRFS_I(inode)); if (err) goto out; last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), - dentry->d_name.name, dentry->d_name.len); + err = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + BTRFS_I(d_inode(dentry)), dentry->d_name.name, + dentry->d_name.len); if (!err) { - btrfs_i_size_write(inode, 0); + btrfs_i_size_write(BTRFS_I(inode), 0); /* * Propagate the last_unlink_trans value of the deleted dir to * its parent directory. This is to prevent an unrecoverable @@ -4398,7 +4433,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int extent_type = -1; int ret; int err = 0; - u64 ino = btrfs_ino(inode); + u64 ino = btrfs_ino(BTRFS_I(inode)); u64 bytes_deleted = 0; bool be_nice = 0; bool should_throttle = 0; @@ -4410,7 +4445,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * for non-free space inodes and ref cows, we want to back off from * time to time */ - if (!btrfs_is_free_space_inode(inode) && + if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && test_bit(BTRFS_ROOT_REF_COWS, &root->state)) be_nice = 1; @@ -4426,7 +4461,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, */ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == fs_info->tree_root) - btrfs_drop_extent_cache(inode, ALIGN(new_size, + btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size, fs_info->sectorsize), (u64)-1, 0); @@ -4437,7 +4472,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * items. */ if (min_type == 0 && root == BTRFS_I(inode)->root) - btrfs_kill_delayed_inode_items(inode); + btrfs_kill_delayed_inode_items(BTRFS_I(inode)); key.objectid = ino; key.offset = (u64)-1; @@ -4493,28 +4528,25 @@ search_again: if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); + + trace_btrfs_truncate_show_fi_regular( + BTRFS_I(inode), leaf, fi, + found_key.offset); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, path->slots[0], fi); + + trace_btrfs_truncate_show_fi_inline( + BTRFS_I(inode), leaf, fi, path->slots[0], + found_key.offset); } item_end--; } if (found_type > min_type) { del_item = 1; } else { - if (item_end < new_size) { - /* - * With NO_HOLES mode, for the following mapping - * - * [0-4k][hole][8k-12k] - * - * if truncating isize down to 6k, it ends up - * isize being 8k. - */ - if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) - last_size = new_size; + if (item_end < new_size) break; - } if (found_key.offset >= new_size) del_item = 1; else @@ -4697,8 +4729,12 @@ out: btrfs_abort_transaction(trans, ret); } error: - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ASSERT(last_size >= new_size); + if (!err && last_size > new_size) + last_size = new_size; btrfs_ordered_update_i_size(inode, last_size, NULL); + } btrfs_free_path(path); @@ -4734,6 +4770,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4748,7 +4785,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, round_down(from, blocksize), blocksize); if (ret) goto out; @@ -4756,7 +4793,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, round_down(from, blocksize), blocksize); ret = -ENOMEM; @@ -4828,11 +4865,12 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, block_start, + btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); unlock_page(page); put_page(page); out: + extent_changeset_free(data_reserved); return ret; } @@ -4870,8 +4908,8 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, return ret; } - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, - 0, 0, len, 0, len, 0, 0, 0); + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)), + offset, 0, 0, len, 0, len, 0, 0, 0); if (ret) btrfs_abort_transaction(trans, ret); else @@ -4918,7 +4956,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) lock_extent_bits(io_tree, hole_start, block_end - 1, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, hole_start, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start, block_end - hole_start); if (!ordered) break; @@ -4930,7 +4968,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset = hole_start; while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, block_end - cur_offset, 0); if (IS_ERR(em)) { err = PTR_ERR(em); @@ -4947,7 +4985,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_size); if (err) break; - btrfs_drop_extent_cache(inode, cur_offset, + btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { @@ -4973,7 +5011,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) write_unlock(&em_tree->lock); if (err != -EEXIST) break; - btrfs_drop_extent_cache(inode, cur_offset, + btrfs_drop_extent_cache(BTRFS_I(inode), + cur_offset, cur_offset + hole_size - 1, 0); } @@ -5070,7 +5109,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * so we need to guarantee from this point on that everything * will be consistent. */ - ret = btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); btrfs_end_transaction(trans); if (ret) return ret; @@ -5079,14 +5118,21 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); /* Disable nonlocked read DIO to avoid the end less truncate */ - btrfs_inode_block_unlocked_dio(inode); + btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(inode); + btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode); if (ret && inode->i_nlink) { int err; + /* To get a stable disk_i_size */ + err = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (err) { + btrfs_orphan_del(NULL, BTRFS_I(inode)); + return err; + } + /* * failed to truncate, disk_i_size is only adjusted down * as we remove extents, so it should represent the true @@ -5095,11 +5141,11 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) */ trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); return ret; } i_size_write(inode, BTRFS_I(inode)->disk_i_size); - err = btrfs_orphan_del(trans, inode); + err = btrfs_orphan_del(trans, BTRFS_I(inode)); if (err) btrfs_abort_transaction(trans, err); btrfs_end_transaction(trans); @@ -5219,7 +5265,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state->state & EXTENT_DELALLOC) - btrfs_qgroup_free_data(inode, start, end - start + 1); + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -5257,18 +5303,18 @@ void btrfs_evict_inode(struct inode *inode) if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || - btrfs_is_free_space_inode(inode))) + btrfs_is_free_space_inode(BTRFS_I(inode)))) goto no_delete; if (is_bad_inode(inode)) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); goto no_delete; } /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ if (!special_file(inode->i_mode)) btrfs_wait_ordered_range(inode, 0, (u64)-1); - btrfs_free_io_failure_record(inode, 0, (u64)-1); + btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, @@ -5282,22 +5328,22 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - ret = btrfs_commit_inode_delayed_inode(inode); + ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); goto no_delete; } rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); goto no_delete; } rsv->size = min_size; rsv->failfast = 1; global_rsv = &fs_info->global_block_rsv; - btrfs_i_size_write(inode, 0); + btrfs_i_size_write(BTRFS_I(inode), 0); /* * This is a bit simpler than btrfs_truncate since we've already @@ -5332,14 +5378,14 @@ void btrfs_evict_inode(struct inode *inode) btrfs_warn(fs_info, "Could not get space for a delete, will truncate on mount %d", ret); - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); btrfs_free_block_rsv(fs_info, rsv); goto no_delete; } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); btrfs_free_block_rsv(fs_info, rsv); goto no_delete; } @@ -5365,7 +5411,7 @@ void btrfs_evict_inode(struct inode *inode) if (ret) { ret = btrfs_commit_transaction(trans); if (ret) { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); btrfs_free_block_rsv(fs_info, rsv); goto no_delete; } @@ -5394,20 +5440,20 @@ void btrfs_evict_inode(struct inode *inode) */ if (ret == 0) { trans->block_rsv = root->orphan_block_rsv; - btrfs_orphan_del(trans, inode); + btrfs_orphan_del(trans, BTRFS_I(inode)); } else { - btrfs_orphan_del(NULL, inode); + btrfs_orphan_del(NULL, BTRFS_I(inode)); } trans->block_rsv = &fs_info->trans_block_rsv; if (!(root == fs_info->tree_root || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) - btrfs_return_ino(root, btrfs_ino(inode)); + btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode))); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); no_delete: - btrfs_remove_delayed_node(inode); + btrfs_remove_delayed_node(BTRFS_I(inode)); clear_inode(inode); } @@ -5429,8 +5475,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, - namelen, 0); + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), + name, namelen, 0); if (IS_ERR(di)) ret = PTR_ERR(di); @@ -5485,7 +5531,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) goto out; @@ -5520,7 +5566,7 @@ static void inode_tree_add(struct inode *inode) struct rb_node **p; struct rb_node *parent; struct rb_node *new = &BTRFS_I(inode)->rb_node; - u64 ino = btrfs_ino(inode); + u64 ino = btrfs_ino(BTRFS_I(inode)); if (inode_unhashed(inode)) return; @@ -5531,9 +5577,9 @@ static void inode_tree_add(struct inode *inode) parent = *p; entry = rb_entry(parent, struct btrfs_inode, rb_node); - if (ino < btrfs_ino(&entry->vfs_inode)) + if (ino < btrfs_ino(BTRFS_I(&entry->vfs_inode))) p = &parent->rb_left; - else if (ino > btrfs_ino(&entry->vfs_inode)) + else if (ino > btrfs_ino(BTRFS_I(&entry->vfs_inode))) p = &parent->rb_right; else { WARN_ON(!(entry->vfs_inode.i_state & @@ -5593,9 +5639,9 @@ again: prev = node; entry = rb_entry(node, struct btrfs_inode, rb_node); - if (objectid < btrfs_ino(&entry->vfs_inode)) + if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode))) node = node->rb_left; - else if (objectid > btrfs_ino(&entry->vfs_inode)) + else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode))) node = node->rb_right; else break; @@ -5603,7 +5649,7 @@ again: if (!node) { while (prev) { entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(&entry->vfs_inode)) { + if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) { node = prev; break; } @@ -5612,7 +5658,7 @@ again: } while (node) { entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = btrfs_ino(&entry->vfs_inode) + 1; + objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1; inode = igrab(&entry->vfs_inode); if (inode) { spin_unlock(&root->inode_lock); @@ -5796,7 +5842,7 @@ static int btrfs_dentry_delete(const struct dentry *dentry) if (btrfs_root_refs(&root->root_item) == 0) return 1; - if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return 1; } return 0; @@ -5832,7 +5878,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_item *item; struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; @@ -5865,7 +5910,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) key.type = BTRFS_DIR_INDEX_KEY; key.offset = ctx->pos; - key.objectid = btrfs_ino(inode); + key.objectid = btrfs_ino(BTRFS_I(inode)); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -5883,7 +5928,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) continue; } - item = btrfs_item_nr(slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) @@ -5898,7 +5942,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) ctx->pos = found_key.offset; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, di)) + if (verify_dir_item(fs_info, leaf, slot, di)) goto next; name_len = btrfs_dir_name_len(leaf, di); @@ -5974,7 +6018,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; - if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) + if (btrfs_fs_closing(root->fs_info) && + btrfs_is_free_space_inode(BTRFS_I(inode))) nolock = true; if (wbc->sync_mode == WB_SYNC_ALL) { @@ -6054,9 +6099,9 @@ static int btrfs_update_time(struct inode *inode, struct timespec *now, * and then set the in-memory index_cnt variable to reflect * free sequence numbers */ -static int btrfs_set_inode_index_count(struct inode *inode) +static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; struct btrfs_path *path; struct extent_buffer *leaf; @@ -6085,7 +6130,7 @@ static int btrfs_set_inode_index_count(struct inode *inode) * else has to start at 2 */ if (path->slots[0] == 0) { - BTRFS_I(inode)->index_cnt = 2; + inode->index_cnt = 2; goto out; } @@ -6096,11 +6141,11 @@ static int btrfs_set_inode_index_count(struct inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { - BTRFS_I(inode)->index_cnt = 2; + inode->index_cnt = 2; goto out; } - BTRFS_I(inode)->index_cnt = found_key.offset + 1; + inode->index_cnt = found_key.offset + 1; out: btrfs_free_path(path); return ret; @@ -6110,11 +6155,11 @@ out: * helper to find a free sequence number in a given directory. This current * code is very simple, later versions will do smarter things in the btree */ -int btrfs_set_inode_index(struct inode *dir, u64 *index) +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) { int ret = 0; - if (BTRFS_I(dir)->index_cnt == (u64)-1) { + if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); @@ -6123,8 +6168,8 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index) } } - *index = BTRFS_I(dir)->index_cnt; - BTRFS_I(dir)->index_cnt++; + *index = dir->index_cnt; + dir->index_cnt++; return ret; } @@ -6185,7 +6230,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (dir && name) { trace_btrfs_inode_request(dir); - ret = btrfs_set_inode_index(dir, index); + ret = btrfs_set_inode_index(BTRFS_I(dir), index); if (ret) { btrfs_free_path(path); iput(inode); @@ -6294,7 +6339,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret) btrfs_err(fs_info, "error inheriting props for ino %llu (root %llu): %d", - btrfs_ino(inode), root->root_key.objectid, ret); + btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); return inode; @@ -6320,18 +6365,18 @@ static inline u8 btrfs_inode_type(struct inode *inode) * inode to the parent directory. */ int btrfs_add_link(struct btrfs_trans_handle *trans, - struct inode *parent_inode, struct inode *inode, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); int ret = 0; struct btrfs_key key; - struct btrfs_root *root = BTRFS_I(parent_inode)->root; + struct btrfs_root *root = parent_inode->root; u64 ino = btrfs_ino(inode); u64 parent_ino = btrfs_ino(parent_inode); if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { - memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); + memcpy(&key, &inode->root->root_key, sizeof(key)); } else { key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; @@ -6353,7 +6398,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, root, name, name_len, parent_inode, &key, - btrfs_inode_type(inode), index); + btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { @@ -6361,12 +6406,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, return ret; } - btrfs_i_size_write(parent_inode, parent_inode->i_size + + btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name_len * 2); - inode_inc_iversion(parent_inode); - parent_inode->i_mtime = parent_inode->i_ctime = - current_time(parent_inode); - ret = btrfs_update_inode(trans, root, parent_inode); + inode_inc_iversion(&parent_inode->vfs_inode); + parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime = + current_time(&parent_inode->vfs_inode); + ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; @@ -6390,8 +6435,8 @@ fail_dir_item: } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct inode *dir, struct dentry *dentry, - struct inode *inode, int backref, u64 index) + struct btrfs_inode *dir, struct dentry *dentry, + struct btrfs_inode *inode, int backref, u64 index) { int err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, @@ -6427,8 +6472,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(dir), objectid, - mode, &index); + dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, + mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -6447,7 +6492,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock_inode; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), + 0, index); if (err) { goto out_unlock_inode; } else { @@ -6499,8 +6545,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(dir), objectid, - mode, &index); + dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, + mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -6524,7 +6570,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock_inode; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), + 0, index); if (err) goto out_unlock_inode; @@ -6566,7 +6613,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; - err = btrfs_set_inode_index(dir, &index); + err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; @@ -6590,7 +6637,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); + err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), + 1, index); if (err) { drop_inode = 1; @@ -6604,12 +6652,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, * If new hard link count is 1, it's a file created * with open(2) O_TMPFILE flag. */ - err = btrfs_orphan_del(trans, inode); + err = btrfs_orphan_del(trans, BTRFS_I(inode)); if (err) goto fail; } d_instantiate(dentry, inode); - btrfs_log_new_name(trans, inode, NULL, parent); + btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } btrfs_balance_delayed_items(fs_info); @@ -6649,8 +6697,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(dir), objectid, - S_IFDIR | mode, &index); + dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, + S_IFDIR | mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fail; @@ -6665,13 +6713,14 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) goto out_fail_inode; - btrfs_i_size_write(inode, 0); + btrfs_i_size_write(BTRFS_I(inode), 0); err = btrfs_update_inode(trans, root, inode); if (err) goto out_fail_inode; - err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, - dentry->d_name.len, 0, index); + err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + dentry->d_name.name, + dentry->d_name.len, 0, index); if (err) goto out_fail_inode; @@ -6788,6 +6837,20 @@ static noinline int uncompress_inline(struct btrfs_path *path, max_size = min_t(unsigned long, PAGE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); + + /* + * decompression code contains a memset to fill in any space between the end + * of the uncompressed data and the end of max_size in case the decompressed + * data ends up shorter than ram_bytes. That doesn't cover the hole between + * the end of an inline extent and the beginning of the next block, so we + * cover that region here. + */ + + if (max_size + pg_offset < PAGE_SIZE) { + char *map = kmap(page); + memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset); + kunmap(page); + } kfree(tmp); return ret; } @@ -6800,12 +6863,12 @@ static noinline int uncompress_inline(struct btrfs_path *path, * * This also copies inline extents directly into the page. */ - -struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); int ret; int err = 0; u64 extent_start = 0; @@ -6813,13 +6876,13 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, u64 objectid = btrfs_ino(inode); u32 found_type; struct btrfs_path *path = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; struct extent_buffer *leaf; struct btrfs_key found_key; struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_trans_handle *trans = NULL; const bool new_inline = !page || create; @@ -6899,11 +6962,18 @@ again: found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); + + trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, + extent_start); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_end = ALIGN(extent_start + size, fs_info->sectorsize); + + trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, + path->slots[0], + extent_start); } next: if (start >= extent_end) { @@ -6932,7 +7002,8 @@ next: goto not_found_em; } - btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); + btrfs_extent_item_to_extent_map(inode, path, item, + new_inline, em); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -7084,9 +7155,10 @@ out: return em; } -struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) +struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, + struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) { struct extent_map *em; struct extent_map *hole_em = NULL; @@ -7099,19 +7171,17 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag em = btrfs_get_extent(inode, page, pg_offset, start, len, create); if (IS_ERR(em)) return em; - if (em) { - /* - * if our em maps to - * - a hole or - * - a pre-alloc extent, - * there might actually be delalloc bytes behind it. - */ - if (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return em; - else - hole_em = em; - } + /* + * If our em maps to: + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. + */ + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return em; + else + hole_em = em; /* check to see if we've wrapped (len == -1 or similar) */ end = start + len; @@ -7123,7 +7193,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag em = NULL; /* ok, we didn't find anything, lets look for delalloc */ - found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + found = count_range_bits(&inode->io_tree, &range_start, end, len, EXTENT_DELALLOC, 1); found_end = range_start + found; if (found_end < range_start) @@ -7225,9 +7295,11 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, int ret; if (type != BTRFS_ORDERED_NOCOW) { - em = create_pinned_em(inode, start, len, orig_start, - block_start, block_len, orig_block_len, - ram_bytes, type); + em = create_io_em(inode, start, len, orig_start, + block_start, block_len, orig_block_len, + ram_bytes, + BTRFS_COMPRESS_NONE, /* compress_type */ + type); if (IS_ERR(em)) goto out; } @@ -7236,7 +7308,7 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, if (ret) { if (em) { free_extent_map(em); - btrfs_drop_extent_cache(inode, start, + btrfs_drop_extent_cache(BTRFS_I(inode), start, start + len - 1, 0); } em = ERR_PTR(ret); @@ -7264,7 +7336,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, em = btrfs_create_dio_extent(inode, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, - ins.offset, 0); + ins.offset, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) btrfs_free_reserved_extent(fs_info, ins.objectid, @@ -7282,7 +7354,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *ram_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_trans_handle *trans; struct btrfs_path *path; int ret; struct extent_buffer *leaf; @@ -7302,8 +7373,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (!path) return -ENOMEM; - ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), - offset, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, + btrfs_ino(BTRFS_I(inode)), offset, 0); if (ret < 0) goto out; @@ -7319,7 +7390,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != btrfs_ino(inode) || + if (key.objectid != btrfs_ino(BTRFS_I(inode)) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ goto out; @@ -7385,15 +7456,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, * look for other files referencing this extent, if we * find any we must cow */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = 0; - goto out; - } - ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), + ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), key.offset - backref_offset, disk_bytenr); - btrfs_end_transaction(trans); if (ret) { ret = 0; goto out; @@ -7423,11 +7488,11 @@ out: bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) { struct radix_tree_root *root = &inode->i_mapping->page_tree; - int found = false; + bool found = false; void **pagep = NULL; struct page *page = NULL; - int start_idx; - int end_idx; + unsigned long start_idx; + unsigned long end_idx; start_idx = start >> PAGE_SHIFT; @@ -7504,7 +7569,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * doing DIO to, so we need to make sure there's no ordered * extents in this range. */ - ordered = btrfs_lookup_ordered_range(inode, lockstart, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, lockend - lockstart + 1); /* @@ -7570,17 +7635,23 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, return ret; } -static struct extent_map *create_pinned_em(struct inode *inode, u64 start, - u64 len, u64 orig_start, - u64 block_start, u64 block_len, - u64 orig_block_len, u64 ram_bytes, - int type) +/* The callers of this must take lock_extent() */ +static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, + u64 orig_start, u64 block_start, + u64 block_len, u64 orig_block_len, + u64 ram_bytes, int compress_type, + int type) { struct extent_map_tree *em_tree; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + ASSERT(type == BTRFS_ORDERED_PREALLOC || + type == BTRFS_ORDERED_COMPRESSED || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_REGULAR); + em_tree = &BTRFS_I(inode)->extent_tree; em = alloc_extent_map(); if (!em) @@ -7588,8 +7659,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->start = start; em->orig_start = orig_start; - em->mod_start = start; - em->mod_len = len; em->len = len; em->block_len = block_len; em->block_start = block_start; @@ -7598,15 +7667,23 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->ram_bytes = ram_bytes; em->generation = -1; set_bit(EXTENT_FLAG_PINNED, &em->flags); - if (type == BTRFS_ORDERED_PREALLOC) + if (type == BTRFS_ORDERED_PREALLOC) { set_bit(EXTENT_FLAG_FILLING, &em->flags); + } else if (type == BTRFS_ORDERED_COMPRESSED) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } do { - btrfs_drop_extent_cache(inode, em->start, + btrfs_drop_extent_cache(BTRFS_I(inode), em->start, em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); + /* + * The caller has taken lock_extent(), who could race with us + * to add em? + */ } while (ret == -EEXIST); if (ret) { @@ -7614,6 +7691,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, return ERR_PTR(ret); } + /* em got 2 refs now, callers needs to do free_extent_map once. */ return em; } @@ -7621,10 +7699,8 @@ static void adjust_dio_outstanding_extents(struct inode *inode, struct btrfs_dio_data *dio_data, const u64 len) { - unsigned num_extents; + unsigned num_extents = count_max_extents(len); - num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE); /* * If we have an outstanding_extents count still set then we're * within our reservation, otherwise we need to adjust our inode @@ -7687,7 +7763,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto err; } - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -7804,7 +7880,7 @@ unlock: * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ - if (start + len > i_size_read(inode)) + if (!dio_data->overwrite && start + len > i_size_read(inode)) i_size_write(inode, start + len); adjust_dio_outstanding_extents(inode, dio_data, len); @@ -7910,9 +7986,12 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, bio_end_io_t *repair_endio, void *repair_arg) { struct io_failure_record *failrec; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *bio; int isector; int read_mode = 0; + int segs; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -7924,23 +8003,19 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { - free_io_failure(inode, failrec); + free_io_failure(failure_tree, io_tree, failrec); return -EIO; } - if ((failed_bio->bi_vcnt > 1) - || (failed_bio->bi_io_vec->bv_len - > btrfs_inode_sectorsize(inode))) + segs = bio_segments(failed_bio); + if (segs > 1 || + (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; isector >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, pgoff, isector, repair_endio, repair_arg); - if (!bio) { - free_io_failure(inode, failrec); - return -EIO; - } bio_set_op_attrs(bio, REQ_OP_READ, read_mode); btrfs_debug(BTRFS_I(inode)->root->fs_info, @@ -7949,7 +8024,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); if (ret) { - free_io_failure(inode, failrec); + free_io_failure(failure_tree, io_tree, failrec); bio_put(bio); } @@ -7966,20 +8041,25 @@ struct btrfs_retry_complete { static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; - struct inode *inode; + struct inode *inode = done->inode; struct bio_vec *bvec; + struct extent_io_tree *io_tree, *failure_tree; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; ASSERT(bio->bi_vcnt == 1); - inode = bio->bi_io_vec->bv_page->mapping->host; + io_tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); done->uptodate = 1; + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, i) - clean_io_failure(done->inode, done->start, bvec->bv_page, 0); + clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, + io_tree, done->start, bvec->bv_page, + btrfs_ino(BTRFS_I(inode)), 0); end: complete(&done->done); bio_put(bio); @@ -7989,36 +8069,40 @@ static int __btrfs_correct_data_nocsum(struct inode *inode, struct btrfs_io_bio *io_bio) { struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_retry_complete done; u64 start; unsigned int pgoff; u32 sectorsize; int nr_sectors; - int i; int ret; + int err = 0; fs_info = BTRFS_I(inode)->root->fs_info; sectorsize = fs_info->sectorsize; start = io_bio->logical; done.inode = inode; + io_bio->bio.bi_iter = io_bio->iter; - bio_for_each_segment_all(bvec, &io_bio->bio, i) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - pgoff = bvec->bv_offset; + bio_for_each_segment(bvec, &io_bio->bio, iter) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); + pgoff = bvec.bv_offset; next_block_or_try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, pgoff, start, start + sectorsize - 1, io_bio->mirror_num, btrfs_retry_endio_nocsum, &done); - if (ret) - return ret; + if (ret) { + err = ret; + goto next; + } wait_for_completion(&done.done); @@ -8027,46 +8111,53 @@ next_block_or_try_again: goto next_block_or_try_again; } +next: start += sectorsize; - if (nr_sectors--) { + nr_sectors--; + if (nr_sectors) { pgoff += sectorsize; + ASSERT(pgoff < PAGE_SIZE); goto next_block_or_try_again; } } - return 0; + return err; } static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct inode *inode; + struct extent_io_tree *io_tree, *failure_tree; + struct inode *inode = done->inode; struct bio_vec *bvec; - u64 start; int uptodate; int ret; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; uptodate = 1; - start = done->start; - ASSERT(bio->bi_vcnt == 1); - inode = bio->bi_io_vec->bv_page->mapping->host; - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); + ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); + io_tree = &BTRFS_I(inode)->io_tree; + failure_tree = &BTRFS_I(inode)->io_failure_tree; + + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, i) { - ret = __readpage_endio_check(done->inode, io_bio, i, - bvec->bv_page, bvec->bv_offset, - done->start, bvec->bv_len); + ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, + bvec->bv_offset, done->start, + bvec->bv_len); if (!ret) - clean_io_failure(done->inode, done->start, - bvec->bv_page, bvec->bv_offset); + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, io_tree, done->start, + bvec->bv_page, + btrfs_ino(BTRFS_I(inode)), + bvec->bv_offset); else uptodate = 0; } @@ -8077,11 +8168,12 @@ end: bio_put(bio); } -static int __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct bio_vec bvec; + struct bvec_iter iter; struct btrfs_retry_complete done; u64 start; u64 offset = 0; @@ -8089,7 +8181,7 @@ static int __btrfs_subio_endio_read(struct inode *inode, int nr_sectors; unsigned int pgoff; int csum_pos; - int i; + bool uptodate = (err == 0); int ret; fs_info = BTRFS_I(inode)->root->fs_info; @@ -8098,29 +8190,31 @@ static int __btrfs_subio_endio_read(struct inode *inode, err = 0; start = io_bio->logical; done.inode = inode; + io_bio->bio.bi_iter = io_bio->iter; - bio_for_each_segment_all(bvec, &io_bio->bio, i) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + bio_for_each_segment(bvec, &io_bio->bio, iter) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec->bv_offset; + pgoff = bvec.bv_offset; next_block: - csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); - ret = __readpage_endio_check(inode, io_bio, csum_pos, - bvec->bv_page, pgoff, start, - sectorsize); - if (likely(!ret)) - goto next; + if (uptodate) { + csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); + ret = __readpage_endio_check(inode, io_bio, csum_pos, + bvec.bv_page, pgoff, start, sectorsize); + if (likely(!ret)) + goto next; + } try_again: done.uptodate = 0; done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, + ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, pgoff, start, start + sectorsize - 1, io_bio->mirror_num, btrfs_retry_endio, &done); if (ret) { - err = ret; + err = errno_to_blk_status(ret); goto next; } @@ -8136,8 +8230,10 @@ next: ASSERT(nr_sectors); - if (--nr_sectors) { + nr_sectors--; + if (nr_sectors) { pgoff += sectorsize; + ASSERT(pgoff < PAGE_SIZE); goto next_block; } } @@ -8145,8 +8241,8 @@ next: return err; } -static int btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8166,10 +8262,13 @@ static void btrfs_endio_direct_read(struct bio *bio) struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - int err = bio->bi_error; + blk_status_t err = bio->bi_status; - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) { err = btrfs_subio_endio_read(inode, io_bio, err); + if (!err) + bio->bi_status = 0; + } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -8177,25 +8276,34 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); if (io_bio->end_io) - io_bio->end_io(io_bio, err); + io_bio->end_io(io_bio, blk_status_to_errno(err)); bio_put(bio); } -static void btrfs_endio_direct_write_update_ordered(struct inode *inode, - const u64 offset, - const u64 bytes, - const int uptodate) +static void __endio_write_update_ordered(struct inode *inode, + const u64 offset, const u64 bytes, + const bool uptodate) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered = NULL; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; int ret; + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + wq = fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } + again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, @@ -8204,9 +8312,8 @@ again: if (!ret) goto out_test; - btrfs_init_work(&ordered->work, btrfs_endio_write_helper, - finish_ordered_fn, NULL, NULL); - btrfs_queue_work(fs_info->endio_write_workers, &ordered->work); + btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(wq, &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -8224,23 +8331,22 @@ static void btrfs_endio_direct_write(struct bio *bio) struct btrfs_dio_private *dip = bio->bi_private; struct bio *dio_bio = dip->dio_bio; - btrfs_endio_direct_write_update_ordered(dip->inode, - dip->logical_offset, - dip->bytes, - !bio->bi_error); + __endio_write_update_ordered(dip->inode, dip->logical_offset, + dip->bytes, !bio->bi_status); kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); bio_put(bio); } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, +static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { - int ret; + struct inode *inode = private_data; + blk_status_t ret; ret = btrfs_csum_one_bio(inode, bio, offset, 1); BUG_ON(ret); /* -ENOMEM */ return 0; @@ -8249,12 +8355,13 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf, + btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), + bio->bi_opf, (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -8278,31 +8385,21 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_error = 0; + dip->dio_bio->bi_status = 0; bio_endio(dip->orig_bio); } out: bio_put(bio); } -static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, - u64 first_sector, gfp_t gfp_flags) -{ - struct bio *bio; - bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); - if (bio) - bio_associate_current(bio); - return bio; -} - -static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode, +static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 file_offset) { struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - int ret; + blk_status_t ret; /* * We load all the csum data we need when we submit @@ -8333,7 +8430,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; bool write = bio_op(bio) == REQ_OP_WRITE; - int ret; + blk_status_t ret; if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -8350,8 +8447,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, goto map; if (write && async_submit) { - ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0, - file_offset, + ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0, + file_offset, inode, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; @@ -8381,103 +8478,83 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *bio; struct bio *orig_bio = dip->orig_bio; - struct bio_vec *bvec; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; - u64 submit_len = 0; u64 map_length; - u32 blocksize = fs_info->sectorsize; int async_submit = 0; - int nr_sectors; + u64 submit_len; + int clone_offset = 0; + int clone_len; int ret; - int i, j; map_length = orig_bio->bi_iter.bi_size; + submit_len = map_length; ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) return -EIO; - if (map_length >= orig_bio->bi_iter.bi_size) { + if (map_length >= submit_len) { bio = orig_bio; dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; } /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) + if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK) async_submit = 0; else async_submit = 1; - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); - if (!bio) - return -ENOMEM; - - bio->bi_opf = orig_bio->bi_opf; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; + /* bio split */ + ASSERT(map_length <= INT_MAX); atomic_inc(&dip->pending_bios); + do { + clone_len = min_t(int, submit_len, map_length); - bio_for_each_segment_all(bvec, orig_bio, j) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); - i = 0; -next_block: - if (unlikely(map_length < submit_len + blocksize || - bio_add_page(bio, bvec->bv_page, blocksize, - bvec->bv_offset + (i * blocksize)) < blocksize)) { - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the dip might get freed - * before we're done setting it up - */ - atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, - file_offset, skip_sum, - async_submit); - if (ret) { - bio_put(bio); - atomic_dec(&dip->pending_bios); - goto out_err; - } - - start_sector += submit_len >> 9; - file_offset += submit_len; + /* + * This will never fail as it's passing GPF_NOFS and + * the allocation is backed by btrfs_bioset. + */ + bio = btrfs_bio_clone_partial(orig_bio, clone_offset, + clone_len); + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; + + ASSERT(submit_len >= clone_len); + submit_len -= clone_len; + if (submit_len == 0) + break; - submit_len = 0; + /* + * Increase the count before we submit the bio so we know + * the end IO handler won't happen before we increase the + * count. Otherwise, the dip might get freed before we're + * done setting it up. + */ + atomic_inc(&dip->pending_bios); - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, - start_sector, GFP_NOFS); - if (!bio) - goto out_err; - bio->bi_opf = orig_bio->bi_opf; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } - map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), - start_sector << 9, - &map_length, NULL, 0); - if (ret) { - bio_put(bio); - goto out_err; - } + clone_offset += clone_len; + start_sector += clone_len >> 9; + file_offset += clone_len; - goto next_block; - } else { - submit_len += blocksize; - if (--nr_sectors) { - i++; - goto next_block; - } - } - } + map_length = submit_len; + ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), + start_sector << 9, &map_length, NULL, 0); + if (ret) + goto out_err; + } while (submit_len > 0); submit: ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, @@ -8504,19 +8581,15 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, loff_t file_offset) { struct btrfs_dio_private *dip = NULL; - struct bio *io_bio = NULL; - struct btrfs_io_bio *btrfs_bio; + struct bio *bio = NULL; + struct btrfs_io_bio *io_bio; int skip_sum; bool write = (bio_op(dio_bio) == REQ_OP_WRITE); int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); - if (!io_bio) { - ret = -ENOMEM; - goto free_ordered; - } + bio = btrfs_bio_clone(dio_bio); dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { @@ -8529,17 +8602,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, dip->logical_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; - io_bio->bi_private = dip; - dip->orig_bio = io_bio; + bio->bi_private = dip; + dip->orig_bio = bio; dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); - btrfs_bio = btrfs_io_bio(io_bio); - btrfs_bio->logical = file_offset; + io_bio = btrfs_io_bio(bio); + io_bio->logical = file_offset; if (write) { - io_bio->bi_end_io = btrfs_endio_direct_write; + bio->bi_end_io = btrfs_endio_direct_write; } else { - io_bio->bi_end_io = btrfs_endio_direct_read; + bio->bi_end_io = btrfs_endio_direct_read; dip->subio_endio = btrfs_subio_endio_read; } @@ -8562,8 +8635,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, if (!ret) return; - if (btrfs_bio->end_io) - btrfs_bio->end_io(btrfs_bio, ret); + if (io_bio->end_io) + io_bio->end_io(io_bio, ret); free_ordered: /* @@ -8575,35 +8648,34 @@ free_ordered: * same as btrfs_endio_direct_[write|read] because we can't call these * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (io_bio && dip) { - io_bio->bi_error = -EIO; - bio_endio(io_bio); + if (bio && dip) { + bio_io_error(bio); /* - * The end io callbacks free our dip, do the final put on io_bio + * The end io callbacks free our dip, do the final put on bio * and all the cleanup and final put for dio_bio (through * dio_end_io()). */ dip = NULL; - io_bio = NULL; + bio = NULL; } else { if (write) - btrfs_endio_direct_write_update_ordered(inode, + __endio_write_update_ordered(inode, file_offset, dio_bio->bi_iter.bi_size, - 0); + false); else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - dio_bio->bi_error = -EIO; + dio_bio->bi_status = BLK_STS_IOERR; /* * Releases and cleans up our dio_bio, no need to bio_put() * nor bio_endio()/bio_io_error() against dio_bio. */ - dio_end_io(dio_bio, ret); + dio_end_io(dio_bio); } - if (io_bio) - bio_put(io_bio); + if (bio) + bio_put(bio); kfree(dip); } @@ -8647,6 +8719,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_data dio_data = { 0 }; + struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; @@ -8679,15 +8752,18 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { + dio_data.overwrite = 1; inode_unlock(inode); relock = true; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; } - ret = btrfs_delalloc_reserve_space(inode, offset, count); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); if (ret) goto out; - dio_data.outstanding_extents = div64_u64(count + - BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE); + dio_data.outstanding_extents = count_max_extents(count); /* * We need to know how many extents we reserved so that we can @@ -8716,8 +8792,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, offset, - dio_data.reserve); + btrfs_delalloc_release_space(inode, data_reserved, + offset, dio_data.reserve); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -8726,14 +8802,14 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ if (dio_data.unsubmitted_oe_range_start < dio_data.unsubmitted_oe_range_end) - btrfs_endio_direct_write_update_ordered(inode, + __endio_write_update_ordered(inode, dio_data.unsubmitted_oe_range_start, dio_data.unsubmitted_oe_range_end - dio_data.unsubmitted_oe_range_start, - 0); + false); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, offset, - count - (size_t)ret); + btrfs_delalloc_release_space(inode, data_reserved, + offset, count - (size_t)ret); } out: if (wakeup) @@ -8741,6 +8817,7 @@ out: if (relock) inode_lock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -8831,7 +8908,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) return 0; - return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); + return __btrfs_releasepage(page, gfp_flags); } static void btrfs_invalidatepage(struct page *page, unsigned int offset, @@ -8866,7 +8943,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, lock_extent_bits(tree, page_start, page_end, &cached_state); again: start = page_start; - ordered = btrfs_lookup_ordered_range(inode, start, + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, page_end - start + 1); if (ordered) { end = min(page_end, ordered->file_offset + ordered->len - 1); @@ -8877,6 +8954,7 @@ again: if (!inode_evicting) clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); @@ -8930,12 +9008,12 @@ again: * free the entire extent. */ if (PageDirty(page)) - btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); @@ -8964,14 +9042,15 @@ again: * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; char *kaddr; unsigned long zero_start; loff_t size; @@ -8997,10 +9076,10 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(inode, page_start, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, reserved_space); if (!ret) { - ret = file_update_time(vma->vm_file); + ret = file_update_time(vmf->vma->vm_file); reserved = 1; } if (ret) { @@ -9032,7 +9111,8 @@ again: * we can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish */ - ordered = btrfs_lookup_ordered_range(inode, page_start, page_end); + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, + PAGE_SIZE); if (ordered) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -9050,17 +9130,17 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, page_start, - PAGE_SIZE - reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE - reserved_space); } } /* - * XXX - page_mkwrite gets called every time the page is dirtied, even - * if it was already dirty, so for space accounting reasons we need to - * clear any delalloc bits for the range we are fixing to save. There - * is probably a better way to do this, but for now keep consistent with - * prepare_pages in the normal write path. + * page_mkwrite gets called when the page is firstly dirtied after it's + * faulted in, but write(2) could also dirty a page and set delalloc + * bits, thus in this case for space account reason, we still need to + * clear any delalloc bits within this page range since we have to + * reserve data&meta space before lock_page() (see above comments). */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DIRTY | EXTENT_DELALLOC | @@ -9102,13 +9182,16 @@ again: out_unlock: if (!ret) { sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: - btrfs_delalloc_release_space(inode, page_start, reserved_space); + btrfs_delalloc_release_space(inode, data_reserved, page_start, + reserved_space); out_noreserve: sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); return ret; } @@ -9230,7 +9313,7 @@ static int btrfs_truncate(struct inode *inode) if (ret == 0 && inode->i_nlink > 0) { trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_orphan_del(trans, inode); + ret = btrfs_orphan_del(trans, BTRFS_I(inode)); if (ret) err = ret; } @@ -9275,7 +9358,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, inode->i_fop = &btrfs_dir_file_operations; set_nlink(inode, 1); - btrfs_i_size_write(inode, 0); + btrfs_i_size_write(BTRFS_I(inode), 0); unlock_new_inode(inode); err = btrfs_subvol_inherit_props(trans, new_root, parent_root); @@ -9305,6 +9388,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; @@ -9329,8 +9413,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(&ei->io_tree, &inode->i_data); - extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + extent_io_tree_init(&ei->io_tree, inode); + extent_io_tree_init(&ei->io_failure_tree, inode); ei->io_tree.track_uptodate = 1; ei->io_failure_tree.track_uptodate = 1; atomic_set(&ei->sync_writers, 0); @@ -9348,7 +9432,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode) { - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif @@ -9370,6 +9454,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); WARN_ON(BTRFS_I(inode)->defrag_bytes); @@ -9384,7 +9469,7 @@ void btrfs_destroy_inode(struct inode *inode) if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)) { btrfs_info(fs_info, "inode %llu still on the orphan list", - btrfs_ino(inode)); + btrfs_ino(BTRFS_I(inode))); atomic_dec(&root->orphan_inodes); } @@ -9403,7 +9488,7 @@ void btrfs_destroy_inode(struct inode *inode) } btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); free: call_rcu(&inode->i_rcu, btrfs_i_callback); } @@ -9438,7 +9523,6 @@ void btrfs_destroy_cachep(void) rcu_barrier(); kmem_cache_destroy(btrfs_inode_cachep); kmem_cache_destroy(btrfs_trans_handle_cachep); - kmem_cache_destroy(btrfs_transaction_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); } @@ -9458,12 +9542,6 @@ int btrfs_init_cachep(void) if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", - sizeof(struct btrfs_transaction), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); - if (!btrfs_transaction_cachep) - goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_MEM_SPREAD, NULL); @@ -9482,18 +9560,36 @@ fail: return -ENOMEM; } -static int btrfs_getattr(struct vfsmount *mnt, - struct dentry *dentry, struct kstat *stat) +static int btrfs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { u64 delalloc_bytes; - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; + u32 bi_flags = BTRFS_I(inode)->flags; + + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; + stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; + if (bi_flags & BTRFS_INODE_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + if (bi_flags & BTRFS_INODE_COMPRESS) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (bi_flags & BTRFS_INODE_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (bi_flags & BTRFS_INODE_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); - delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + ALIGN(delalloc_bytes, blocksize)) >> 9; @@ -9513,8 +9609,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *old_inode = old_dentry->d_inode; struct timespec ctime = current_time(old_inode); struct dentry *parent; - u64 old_ino = btrfs_ino(old_inode); - u64 new_ino = btrfs_ino(new_inode); + u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); + u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; u64 root_objectid; @@ -9550,10 +9646,10 @@ static int btrfs_rename_exchange(struct inode *old_dir, * We need to find a free sequence number both in the source and * in the destination directory for the exchange. */ - ret = btrfs_set_inode_index(new_dir, &old_idx); + ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); if (ret) goto out_fail; - ret = btrfs_set_inode_index(old_dir, &new_idx); + ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); if (ret) goto out_fail; @@ -9571,7 +9667,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_dentry->d_name.name, new_dentry->d_name.len, old_ino, - btrfs_ino(new_dir), old_idx); + btrfs_ino(BTRFS_I(new_dir)), + old_idx); if (ret) goto out_fail; } @@ -9587,7 +9684,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_dentry->d_name.name, old_dentry->d_name.len, new_ino, - btrfs_ino(old_dir), new_idx); + btrfs_ino(BTRFS_I(old_dir)), + new_idx); if (ret) goto out_fail; } @@ -9603,8 +9701,10 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_inode->i_ctime = ctime; if (old_dentry->d_parent != new_dentry->d_parent) { - btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); - btrfs_record_unlink_dir(trans, new_dir, new_inode, 1); + btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), + BTRFS_I(old_inode), 1); + btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), + BTRFS_I(new_inode), 1); } /* src is a subvolume */ @@ -9615,8 +9715,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_dentry->d_name.name, old_dentry->d_name.len); } else { /* src is an inode */ - ret = __btrfs_unlink_inode(trans, root, old_dir, - old_dentry->d_inode, + ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) @@ -9635,8 +9735,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_dentry->d_name.name, new_dentry->d_name.len); } else { /* dest is an inode */ - ret = __btrfs_unlink_inode(trans, dest, new_dir, - new_dentry->d_inode, + ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, new_dentry->d_name.len); if (!ret) @@ -9647,7 +9747,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_fail; } - ret = btrfs_add_link(trans, new_dir, old_inode, + ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_dentry->d_name.name, new_dentry->d_name.len, 0, old_idx); if (ret) { @@ -9655,7 +9755,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_fail; } - ret = btrfs_add_link(trans, old_dir, new_inode, + ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), old_dentry->d_name.name, old_dentry->d_name.len, 0, new_idx); if (ret) { @@ -9670,13 +9770,15 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (root_log_pinned) { parent = new_dentry->d_parent; - btrfs_log_new_name(trans, old_inode, old_dir, parent); + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + parent); btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { parent = old_dentry->d_parent; - btrfs_log_new_name(trans, new_inode, new_dir, parent); + btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), + parent); btrfs_end_log_trans(dest); dest_log_pinned = false; } @@ -9693,11 +9795,11 @@ out_fail: * allow the tasks to sync it. */ if (ret && (root_log_pinned || dest_log_pinned)) { - if (btrfs_inode_in_log(old_dir, fs_info->generation) || - btrfs_inode_in_log(new_dir, fs_info->generation) || - btrfs_inode_in_log(old_inode, fs_info->generation) || + if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || + btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || + btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || (new_inode && - btrfs_inode_in_log(new_inode, fs_info->generation))) + btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) btrfs_set_log_full_commit(fs_info, trans); if (root_log_pinned) { @@ -9736,7 +9838,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, - btrfs_ino(dir), + btrfs_ino(BTRFS_I(dir)), objectid, S_IFCHR | WHITEOUT_MODE, &index); @@ -9755,8 +9857,8 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_add_nondir(trans, dir, dentry, - inode, 0, index); + ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, + BTRFS_I(inode), 0, index); if (ret) goto out; @@ -9784,10 +9886,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, u64 index = 0; u64 root_objectid; int ret; - u64 old_ino = btrfs_ino(old_inode); + u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; - if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; /* we only allow rename subvolume link between subvolumes */ @@ -9795,7 +9897,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, return -EXDEV; if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || - (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) + (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID)) return -ENOTEMPTY; if (S_ISDIR(old_inode->i_mode) && new_inode && @@ -9855,7 +9957,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (dest != root) btrfs_record_root_in_trans(trans, dest); - ret = btrfs_set_inode_index(new_dir, &index); + ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); if (ret) goto out_fail; @@ -9870,7 +9972,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len, old_ino, - btrfs_ino(new_dir), index); + btrfs_ino(BTRFS_I(new_dir)), index); if (ret) goto out_fail; } @@ -9883,7 +9985,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode->i_ctime = current_time(old_dir); if (old_dentry->d_parent != new_dentry->d_parent) - btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), + BTRFS_I(old_inode), 1); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; @@ -9891,8 +9994,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.name, old_dentry->d_name.len); } else { - ret = __btrfs_unlink_inode(trans, root, old_dir, - d_inode(old_dentry), + ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) @@ -9906,7 +10009,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode) { inode_inc_iversion(new_inode); new_inode->i_ctime = current_time(new_inode); - if (unlikely(btrfs_ino(new_inode) == + if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { root_objectid = BTRFS_I(new_inode)->location.objectid; ret = btrfs_unlink_subvol(trans, dest, new_dir, @@ -9915,20 +10018,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.len); BUG_ON(new_inode->i_nlink == 0); } else { - ret = btrfs_unlink_inode(trans, dest, new_dir, - d_inode(new_dentry), + ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + BTRFS_I(d_inode(new_dentry)), new_dentry->d_name.name, new_dentry->d_name.len); } if (!ret && new_inode->i_nlink == 0) - ret = btrfs_orphan_add(trans, d_inode(new_dentry)); + ret = btrfs_orphan_add(trans, + BTRFS_I(d_inode(new_dentry))); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } } - ret = btrfs_add_link(trans, new_dir, old_inode, + ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_dentry->d_name.name, new_dentry->d_name.len, 0, index); if (ret) { @@ -9942,7 +10046,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (log_pinned) { struct dentry *parent = new_dentry->d_parent; - btrfs_log_new_name(trans, old_inode, old_dir, parent); + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + parent); btrfs_end_log_trans(root); log_pinned = false; } @@ -9969,11 +10074,11 @@ out_fail: * allow the tasks to sync it. */ if (ret && log_pinned) { - if (btrfs_inode_in_log(old_dir, fs_info->generation) || - btrfs_inode_in_log(new_dir, fs_info->generation) || - btrfs_inode_in_log(old_inode, fs_info->generation) || + if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || + btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || + btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || (new_inode && - btrfs_inode_in_log(new_inode, fs_info->generation))) + btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) btrfs_set_log_full_commit(fs_info, trans); btrfs_end_log_trans(root); @@ -10237,8 +10342,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(dir), objectid, - S_IFLNK|S_IRWXUGO, &index); + dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), + objectid, S_IFLNK|S_IRWXUGO, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -10264,7 +10369,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, err = -ENOMEM; goto out_unlock_inode; } - key.objectid = btrfs_ino(inode); + key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(name_len); @@ -10294,7 +10399,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; inode_set_bytes(inode, name_len); - btrfs_i_size_write(inode, name_len); + btrfs_i_size_write(BTRFS_I(inode), name_len); err = btrfs_update_inode(trans, root, inode); /* * Last step, add directory indexes for our symlink inode. This is the @@ -10302,7 +10407,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, * elsewhere above. */ if (!err) - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, + BTRFS_I(inode), 0, index); if (err) { drop_inode = 1; goto out_unlock_inode; @@ -10388,7 +10494,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } - btrfs_drop_extent_cache(inode, cur_offset, + btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + ins.offset -1, 0); em = alloc_extent_map(); @@ -10415,7 +10521,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, write_unlock(&em_tree->lock); if (ret != -EEXIST) break; - btrfs_drop_extent_cache(inode, cur_offset, + btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, 0); } @@ -10452,7 +10558,7 @@ next: btrfs_end_transaction(trans); } if (cur_offset < end) - btrfs_free_reserved_data_space(inode, cur_offset, + btrfs_free_reserved_data_space(inode, NULL, cur_offset, end - cur_offset + 1); return ret; } @@ -10517,7 +10623,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; inode = btrfs_new_inode(trans, root, dir, NULL, 0, - btrfs_ino(dir), objectid, mode, &index); + btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; @@ -10537,7 +10643,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) ret = btrfs_update_inode(trans, root, inode); if (ret) goto out_inode; - ret = btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) goto out_inode; @@ -10567,6 +10673,48 @@ out_inode: } +__attribute__((const)) +static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) +{ + return -EAGAIN; +} + +static struct btrfs_fs_info *iotree_fs_info(void *private_data) +{ + struct inode *inode = private_data; + return btrfs_sb(inode->i_sb); +} + +static void btrfs_check_extent_io_range(void *private_data, const char *caller, + u64 start, u64 end) +{ + struct inode *inode = private_data; + u64 isize; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } +} + +void btrfs_set_range_writeback(void *private_data, u64 start, u64 end) +{ + struct inode *inode = private_data; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + ASSERT(page); /* Pages should be in the extent_io_tree */ + set_page_writeback(page); + put_page(page); + index++; + } +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10605,16 +10753,23 @@ static const struct file_operations btrfs_dir_file_operations = { }; static const struct extent_io_ops btrfs_extent_io_ops = { - .fill_delalloc = run_delalloc_range, + /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, - .merge_bio_hook = btrfs_merge_bio_hook, .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, + .tree_fs_info = iotree_fs_info, + .set_range_writeback = btrfs_set_range_writeback, + + /* optional callbacks */ + .fill_delalloc = run_delalloc_range, .writepage_end_io_hook = btrfs_writepage_end_io_hook, .writepage_start_hook = btrfs_writepage_start_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, .merge_extent_hook = btrfs_merge_extent_hook, .split_extent_hook = btrfs_split_extent_hook, + .check_extent_io_range = btrfs_check_extent_io_range, }; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 33f967d30b2a..fa1b78cf25f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -37,7 +37,7 @@ #include <linux/bit_spinlock.h> #include <linux/security.h> #include <linux/xattr.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/uuid.h> @@ -395,7 +395,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) q = bdev_get_queue(device->bdev); if (blk_queue_discard(q)) { num_devices++; - minlen = min((u64)q->limits.discard_granularity, + minlen = min_t(u64, q->limits.discard_granularity, minlen); } } @@ -434,7 +434,7 @@ int btrfs_is_empty_uuid(u8 *uuid) static noinline int create_subvol(struct inode *dir, struct dentry *dentry, - char *name, int namelen, + const char *name, int namelen, u64 *async_transid, struct btrfs_qgroup_inherit *inherit) { @@ -487,8 +487,7 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(fs_info, &block_rsv, - qgroup_reserved); + btrfs_subvolume_release_metadata(fs_info, &block_rsv); goto fail_free; } trans->block_rsv = &block_rsv; @@ -581,27 +580,27 @@ static noinline int create_subvol(struct inode *dir, /* * insert the directory item */ - ret = btrfs_set_inode_index(dir, &index); + ret = btrfs_set_inode_index(BTRFS_I(dir), &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_insert_dir_item(trans, root, - name, namelen, dir, &key, + name, namelen, BTRFS_I(dir), &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } - btrfs_i_size_write(dir, dir->i_size + namelen * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); ret = btrfs_add_root_ref(trans, fs_info, objectid, root->root_key.objectid, - btrfs_ino(dir), index, name, namelen); + btrfs_ino(BTRFS_I(dir)), index, name, namelen); BUG_ON(ret); ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid, @@ -613,7 +612,7 @@ fail: kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved); + btrfs_subvolume_release_metadata(fs_info, &block_rsv); if (async_transid) { *async_transid = trans->transid; @@ -657,7 +656,7 @@ static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) } static int create_snapshot(struct btrfs_root *root, struct inode *dir, - struct dentry *dentry, char *name, int namelen, + struct dentry *dentry, u64 *async_transid, bool readonly, struct btrfs_qgroup_inherit *inherit) { @@ -670,12 +669,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; - pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL); if (!pending_snapshot) return -ENOMEM; pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), - GFP_NOFS); + GFP_KERNEL); pending_snapshot->path = btrfs_alloc_path(); if (!pending_snapshot->root_item || !pending_snapshot->path) { ret = -ENOMEM; @@ -690,7 +689,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -753,9 +752,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, d_instantiate(dentry, inode); ret = 0; fail: - btrfs_subvolume_release_metadata(fs_info, - &pending_snapshot->block_rsv, - pending_snapshot->qgroup_reserved); + btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); dec_and_free: if (atomic_dec_and_test(&root->will_be_snapshoted)) wake_up_atomic_t(&root->will_be_snapshoted); @@ -835,7 +832,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * inside this filesystem so it's quite a bit simpler. */ static noinline int btrfs_mksubvol(const struct path *parent, - char *name, int namelen, + const char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, struct btrfs_qgroup_inherit *inherit) @@ -874,7 +871,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dir, dentry, name, namelen, + error = create_snapshot(snap_src, dir, dentry, async_transid, readonly, inherit); } else { error = create_subvol(dir, dentry, name, namelen, @@ -941,7 +938,7 @@ static int find_new_extents(struct btrfs_root *root, struct btrfs_file_extent_item *extent; int type; int ret; - u64 ino = btrfs_ino(inode); + u64 ino = btrfs_ino(BTRFS_I(inode)); path = btrfs_alloc_path(); if (!path) @@ -1012,7 +1009,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) /* get the big lock and read metadata off disk */ lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); if (IS_ERR(em)) @@ -1130,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_io_tree *tree; + struct extent_changeset *data_reserved = NULL; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); file_end = (isize - 1) >> PAGE_SHIFT; @@ -1138,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = btrfs_delalloc_reserve_space(inode, + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); if (ret) @@ -1229,7 +1227,7 @@ again: spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, (page_cnt - i_done) << PAGE_SHIFT); } @@ -1250,15 +1248,17 @@ again: unlock_page(pages[i]); put_page(pages[i]); } + extent_changeset_free(data_reserved); return i_done; out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + extent_changeset_free(data_reserved); return ret; } @@ -1507,7 +1507,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1622,13 +1622,13 @@ out_free: kfree(vol_args); out: mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); mnt_drop_write_file(file); return ret; } static noinline int btrfs_ioctl_snap_create_transid(struct file *file, - char *name, unsigned long fd, int subvol, + const char *name, unsigned long fd, int subvol, u64 *transid, bool readonly, struct btrfs_qgroup_inherit *inherit) { @@ -1780,7 +1780,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file, int ret = 0; u64 flags = 0; - if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; down_read(&fs_info->subvol_sem); @@ -1812,7 +1812,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, if (ret) goto out; - if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; goto out_drop_write; } @@ -2446,7 +2446,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out_dput; - if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; } @@ -2497,7 +2497,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - btrfs_record_snapshot_destroy(trans, dir); + btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); ret = btrfs_unlink_subvol(trans, root, dir, dest->root_key.objectid, @@ -2555,7 +2555,7 @@ out_end_trans: err = ret; inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved); + btrfs_subvolume_release_metadata(fs_info, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); if (err) { @@ -2613,9 +2613,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } ret = btrfs_defrag_root(root); - if (ret) - goto out; - ret = btrfs_defrag_root(root->fs_info->extent_root); break; case S_IFREG: if (!(file->f_mode & FMODE_WRITE)) { @@ -2667,7 +2664,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; mutex_lock(&fs_info->volume_mutex); @@ -2686,7 +2683,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return ret; } @@ -2714,7 +2711,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) return -EOPNOTSUPP; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -2727,7 +2724,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) ret = btrfs_rm_device(fs_info, vol_args->name, 0); } mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -2758,7 +2755,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -2778,7 +2775,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out_drop_write: mnt_drop_write_file(file); @@ -3047,11 +3044,21 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, cmp->src_pages = src_pgarr; cmp->dst_pages = dst_pgarr; - ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff); + /* + * If deduping ranges in the same inode, locking rules make it mandatory + * to always lock pages in ascending order to avoid deadlocks with + * concurrent tasks (such as starting writeback/delalloc). + */ + if (src == dst && dst_loff < loff) { + swap(src_pgarr, dst_pgarr); + swap(loff, dst_loff); + } + + ret = gather_extent_pages(src, src_pgarr, cmp->num_pages, loff); if (ret) goto out; - ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff); + ret = gather_extent_pages(dst, dst_pgarr, cmp->num_pages, dst_loff); out: if (ret) @@ -3059,8 +3066,7 @@ out: return 0; } -static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, - u64 dst_loff, u64 len, struct cmp_pages *cmp) +static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) { int ret = 0; int i; @@ -3128,26 +3134,27 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, int ret; u64 len = olen; struct cmp_pages cmp; - int same_inode = 0; + bool same_inode = (src == dst); u64 same_lock_start = 0; u64 same_lock_len = 0; - if (src == dst) - same_inode = 1; - if (len == 0) return 0; - if (same_inode) { + if (same_inode) inode_lock(src); + else + btrfs_double_inode_lock(src, dst); - ret = extent_same_check_offsets(src, loff, &len, olen); - if (ret) - goto out_unlock; - ret = extent_same_check_offsets(src, dst_loff, &len, olen); - if (ret) - goto out_unlock; + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; + ret = extent_same_check_offsets(dst, dst_loff, &len, olen); + if (ret) + goto out_unlock; + + if (same_inode) { /* * Single inode case wants the same checks, except we * don't want our length pushed out past i_size as @@ -3175,16 +3182,6 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, same_lock_start = min_t(u64, loff, dst_loff); same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; - } else { - btrfs_double_inode_lock(src, dst); - - ret = extent_same_check_offsets(src, loff, &len, olen); - if (ret) - goto out_unlock; - - ret = extent_same_check_offsets(dst, dst_loff, &len, olen); - if (ret) - goto out_unlock; } /* don't make the dst file partly checksummed */ @@ -3236,7 +3233,7 @@ again: } /* pass original length for comparison so we stay within i_size */ - ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); + ret = btrfs_cmp_data(olen, &cmp); if (ret == 0) ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); @@ -3304,7 +3301,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, if (endoff > destoff + olen) endoff = destoff + olen; if (endoff > inode->i_size) - btrfs_i_size_write(inode, endoff); + btrfs_i_size_write(BTRFS_I(inode), endoff); ret = btrfs_update_inode(trans, root, inode); if (ret) { @@ -3317,20 +3314,19 @@ out: return ret; } -static void clone_update_extent_map(struct inode *inode, +static void clone_update_extent_map(struct btrfs_inode *inode, const struct btrfs_trans_handle *trans, const struct btrfs_path *path, const u64 hole_offset, const u64 hole_len) { - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; em = alloc_extent_map(); if (!em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); return; } @@ -3344,7 +3340,7 @@ static void clone_update_extent_map(struct inode *inode, if (btrfs_file_extent_type(path->nodes[0], fi) == BTRFS_FILE_EXTENT_INLINE) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); } else { em->start = hole_offset; em->len = hole_len; @@ -3370,8 +3366,7 @@ static void clone_update_extent_map(struct inode *inode, } if (ret) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); } /* @@ -3399,8 +3394,7 @@ static void clone_update_extent_map(struct inode *inode, * data into the destination inode's inline extent if the later is greater then * the former. */ -static int clone_copy_inline_extent(struct inode *src, - struct inode *dst, +static int clone_copy_inline_extent(struct inode *dst, struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_key *new_key, @@ -3420,7 +3414,7 @@ static int clone_copy_inline_extent(struct inode *src, if (new_key->offset > 0) return -EOPNOTSUPP; - key.objectid = btrfs_ino(dst); + key.objectid = btrfs_ino(BTRFS_I(dst)); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -3435,7 +3429,7 @@ static int clone_copy_inline_extent(struct inode *src, goto copy_inline_extent; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(dst) && + if (key.objectid == btrfs_ino(BTRFS_I(dst)) && key.type == BTRFS_EXTENT_DATA_KEY) { ASSERT(key.offset > 0); return -EOPNOTSUPP; @@ -3469,7 +3463,7 @@ static int clone_copy_inline_extent(struct inode *src, } else if (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(dst) && + if (key.objectid == btrfs_ino(BTRFS_I(dst)) && key.type == BTRFS_EXTENT_DATA_KEY) return -EOPNOTSUPP; } @@ -3548,12 +3542,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); - if (!buf) { - buf = vmalloc(fs_info->nodesize); - if (!buf) - return ret; - } + buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!buf) + return ret; path = btrfs_alloc_path(); if (!path) { @@ -3563,7 +3554,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, path->reada = READA_FORWARD; /* clone data */ - key.objectid = btrfs_ino(src); + key.objectid = btrfs_ino(BTRFS_I(src)); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = off; @@ -3606,7 +3597,7 @@ process_slot: btrfs_item_key_to_cpu(leaf, &key, slot); if (key.type > BTRFS_EXTENT_DATA_KEY || - key.objectid != btrfs_ino(src)) + key.objectid != btrfs_ino(BTRFS_I(src))) break; if (key.type == BTRFS_EXTENT_DATA_KEY) { @@ -3659,7 +3650,7 @@ process_slot: path->leave_spinning = 0; memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = btrfs_ino(inode); + new_key.objectid = btrfs_ino(BTRFS_I(inode)); if (off <= key.offset) new_key.offset = key.offset + destoff - off; else @@ -3749,7 +3740,7 @@ process_slot: fs_info, disko, diskl, 0, root->root_key.objectid, - btrfs_ino(inode), + btrfs_ino(BTRFS_I(inode)), new_key.offset - datao); if (ret) { btrfs_abort_transaction(trans, @@ -3779,7 +3770,7 @@ process_slot: size -= skip + trim; datal -= skip + trim; - ret = clone_copy_inline_extent(src, inode, + ret = clone_copy_inline_extent(inode, trans, path, &new_key, drop_start, @@ -3798,11 +3789,12 @@ process_slot: /* If we have an implicit hole (NO_HOLES feature). */ if (drop_start < new_key.offset) - clone_update_extent_map(inode, trans, + clone_update_extent_map(BTRFS_I(inode), trans, NULL, drop_start, new_key.offset - drop_start); - clone_update_extent_map(inode, trans, path, 0, 0); + clone_update_extent_map(BTRFS_I(inode), trans, + path, 0, 0); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -3852,8 +3844,9 @@ process_slot: btrfs_end_transaction(trans); goto out; } - clone_update_extent_map(inode, trans, NULL, last_dest_end, - destoff + len - last_dest_end); + clone_update_extent_map(BTRFS_I(inode), trans, NULL, + last_dest_end, + destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen, no_time_update); } @@ -4449,13 +4442,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (atomic_xchg( - &fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - atomic_set( - &fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -4600,7 +4591,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, out: btrfs_free_path(path); - vfree(inodes); + kvfree(inodes); kfree(loi); return ret; @@ -4650,7 +4641,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); need_unlock = true; @@ -4696,7 +4687,7 @@ again: } locked: - BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); + BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4752,11 +4743,10 @@ locked: do_balance: /* - * Ownership of bctl and mutually_exclusive_operation_running + * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP * goes to to btrfs_balance. bctl is freed in __cancel_balance, * or, if restriper was paused all the way until unmount, in - * free_fs_info. mutually_exclusive_operation_running is - * cleared in __cancel_balance. + * free_fs_info. The flag is cleared in __cancel_balance. */ need_unlock = false; @@ -4776,7 +4766,7 @@ out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); if (need_unlock) - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out: mnt_drop_write_file(file); return ret; @@ -4910,7 +4900,6 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) goto out; } - /* FIXME: check if the IDs really exist */ if (sa->assign) { ret = btrfs_add_qgroup_relation(trans, fs_info, sa->src, sa->dst); @@ -4969,7 +4958,6 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } - /* FIXME: check if the IDs really exist */ if (sa->create) { ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid); } else { @@ -5023,7 +5011,6 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) qgroupid = root->root_key.objectid; } - /* FIXME: check if the IDs really exist */ ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim); err = btrfs_end_transaction(trans); @@ -5129,7 +5116,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, down_write(&fs_info->subvol_sem); - if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; goto out; } @@ -5653,6 +5640,10 @@ long btrfs_ioctl(struct file *file, unsigned int #ifdef CONFIG_COMPAT long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + /* + * These all access 32-bit values anyway so no further + * handling is necessary. + */ switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; @@ -5663,8 +5654,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; - default: - return -ENOIOCTLCMD; } return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 45d26980caf9..d433e75d489a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -18,13 +18,14 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/init.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/lzo.h> +#include <linux/refcount.h> #include "compression.h" #define LZO_LEN 4 @@ -40,9 +41,9 @@ static void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); - vfree(workspace->buf); - vfree(workspace->cbuf); - vfree(workspace->mem); + kvfree(workspace->buf); + kvfree(workspace->cbuf); + kvfree(workspace->mem); kfree(workspace); } @@ -50,13 +51,13 @@ static struct list_head *lzo_alloc_workspace(void) { struct workspace *workspace; - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); - workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); - workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); - workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); + workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); + workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -76,7 +77,7 @@ static inline void write_compress_length(char *buf, size_t len) memcpy(buf, &dlen, LZO_LEN); } -static inline size_t read_compress_length(char *buf) +static inline size_t read_compress_length(const char *buf) { __le32 dlen; @@ -86,13 +87,11 @@ static inline size_t read_compress_length(char *buf) static int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, unsigned long len, + u64 start, struct page **pages, - unsigned long nr_dest_pages, unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) + unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; @@ -102,7 +101,9 @@ static int lzo_compress_pages(struct list_head *ws, struct page *in_page = NULL; struct page *out_page = NULL; unsigned long bytes_left; - + unsigned long len = *total_out; + unsigned long nr_dest_pages = *out_pages; + const unsigned long max_out = nr_dest_pages * PAGE_SIZE; size_t in_len; size_t out_len; char *buf; @@ -141,7 +142,7 @@ static int lzo_compress_pages(struct list_head *ws, ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); if (ret != LZO_E_OK) { - pr_debug("BTRFS: deflate in loop returned %d\n", + pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; goto out; @@ -229,8 +230,10 @@ static int lzo_compress_pages(struct list_head *ws, in_len = min(bytes_left, PAGE_SIZE); } - if (tot_out > tot_in) + if (tot_out >= tot_in) { + ret = -E2BIG; goto out; + } /* store the size of all chunks of compressed data */ cpage_out = kmap(pages[0]); @@ -254,16 +257,13 @@ out: return ret; } -static int lzo_decompress_bio(struct list_head *ws, - struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen) +static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; char *data_in; unsigned long page_in_index = 0; + size_t srclen = cb->compressed_len; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long buf_offset = 0; @@ -278,6 +278,9 @@ static int lzo_decompress_bio(struct list_head *ws, unsigned long tot_len; char *buf; bool may_late_unmap, need_unmap; + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; data_in = kmap(pages_in[0]); tot_len = read_compress_length(data_in); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 041c3326d109..a3aca495e33e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,7 +212,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); /* one ref for the tree */ - atomic_set(&entry->refs, 1); + refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); @@ -358,7 +358,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, out: if (!ret && cached && entry) { *cached = entry; - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; @@ -425,14 +425,14 @@ have_entry: out: if (!ret && cached && entry) { *cached = entry; - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; } /* Needs to either be called under a log transaction or the log_mutex */ -void btrfs_get_logged_extents(struct inode *inode, +void btrfs_get_logged_extents(struct btrfs_inode *inode, struct list_head *logged_list, const loff_t start, const loff_t end) @@ -442,7 +442,7 @@ void btrfs_get_logged_extents(struct inode *inode, struct rb_node *n; struct rb_node *prev; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); n = __tree_search(&tree->tree, end, &prev); if (!n) @@ -456,7 +456,7 @@ void btrfs_get_logged_extents(struct inode *inode, if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) continue; list_add(&ordered->log_list, logged_list); - atomic_inc(&ordered->refs); + refcount_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); } @@ -565,7 +565,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); - if (atomic_dec_and_test(&entry->refs)) { + if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->log_list)); ASSERT(list_empty(&entry->trans_list)); ASSERT(list_empty(&entry->root_extent_list)); @@ -623,7 +623,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; if (trans) - atomic_inc(&trans->use_count); + refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); ASSERT(trans); @@ -663,7 +663,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -671,7 +671,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, LIST_HEAD(skipped); LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; - int count = 0; + u64 count = 0; const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); @@ -690,7 +690,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, list_move_tail(&ordered->root_extent_list, &root->ordered_extents); - atomic_inc(&ordered->refs); + refcount_inc(&ordered->refs); spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, @@ -701,7 +701,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, cond_resched(); spin_lock(&root->ordered_extent_lock); - if (nr != -1) + if (nr != U64_MAX) nr--; count++; } @@ -720,13 +720,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, return count; } -int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, - const u64 range_start, const u64 range_len) +u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; - int done; - int total_done = 0; + u64 total_done = 0; + u64 done; INIT_LIST_HEAD(&splice); @@ -748,9 +748,8 @@ int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, total_done += done; spin_lock(&fs_info->ordered_root_lock); - if (nr != -1) { + if (nr != U64_MAX) { nr -= done; - WARN_ON(nr < 0); } } list_splice_tail(&splice, &fs_info->ordered_roots); @@ -870,7 +869,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (!offset_in_entry(entry, file_offset)) entry = NULL; if (entry) - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); out: spin_unlock_irq(&tree->lock); return entry; @@ -879,15 +878,14 @@ out: /* Since the DIO code tries to lock a wide area we need to look for any ordered * extents that exist in the range, rather than just the start of the range. */ -struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, - u64 file_offset, - u64 len) +struct btrfs_ordered_extent *btrfs_lookup_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) { @@ -912,7 +910,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, } out: if (entry) - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); spin_unlock_irq(&tree->lock); return entry; } @@ -923,7 +921,7 @@ bool btrfs_have_ordered_extents_in_range(struct inode *inode, { struct btrfs_ordered_extent *oe; - oe = btrfs_lookup_ordered_range(inode, file_offset, len); + oe = btrfs_lookup_ordered_range(BTRFS_I(inode), file_offset, len); if (oe) { btrfs_put_ordered_extent(oe); return true; @@ -949,7 +947,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); out: spin_unlock_irq(&tree->lock); return entry; @@ -984,8 +982,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, } disk_i_size = BTRFS_I(inode)->disk_i_size; - /* truncate file */ - if (disk_i_size > i_size) { + /* + * truncate file. + * If ordered is not NULL, then this is called from endio and + * disk_i_size will be updated by either truncate itself or any + * in-flight IOs which are inside the disk_i_size. + * + * Because btrfs_setsize() may set i_size with disk_i_size if truncate + * fails somehow, we need to make sure we have a precise disk_i_size by + * updating it as usual. + * + */ + if (!ordered && disk_i_size > i_size) { BTRFS_I(inode)->disk_i_size = orig_offset; ret = 0; goto out; @@ -1032,25 +1040,22 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, /* We treat this entry as if it doesn't exist */ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) continue; - if (test->file_offset + test->len <= disk_i_size) + + if (entry_end(test) <= disk_i_size) break; if (test->file_offset >= i_size) break; - if (entry_end(test) > disk_i_size) { - /* - * we don't update disk_i_size now, so record this - * undealt i_size. Or we will not know the real - * i_size. - */ - if (test->outstanding_isize < offset) - test->outstanding_isize = offset; - if (ordered && - ordered->outstanding_isize > - test->outstanding_isize) - test->outstanding_isize = - ordered->outstanding_isize; - goto out; - } + + /* + * We don't update disk_i_size now, so record this undealt + * i_size. Or we will not know the real i_size. + */ + if (test->outstanding_isize < offset) + test->outstanding_isize = offset; + if (ordered && + ordered->outstanding_isize > test->outstanding_isize) + test->outstanding_isize = ordered->outstanding_isize; + goto out; } new_i_size = min_t(u64, offset, i_size); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 5f2b0ca28705..56c4c0ee6381 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -75,6 +75,8 @@ struct btrfs_ordered_sum { * in the logging code. */ #define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to * complete in the current transaction. */ +#define BTRFS_ORDERED_REGULAR 12 /* Regular IO for COW */ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -111,7 +113,7 @@ struct btrfs_ordered_extent { int compress_type; /* reference count */ - atomic_t refs; + refcount_t refs; /* the inode we belong to */ struct inode *inode; @@ -187,9 +189,10 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); -struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, - u64 file_offset, - u64 len); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range( + struct btrfs_inode *inode, + u64 file_offset, + u64 len); bool btrfs_have_ordered_extents_in_range(struct inode *inode, u64 file_offset, u64 len); @@ -197,11 +200,11 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, +u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); -int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); -void btrfs_get_logged_extents(struct inode *inode, +void btrfs_get_logged_extents(struct btrfs_inode *inode, struct list_head *logged_list, const loff_t start, const loff_t end); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index cdafbf92ef0c..fcae61e175f3 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -261,8 +261,11 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l) case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); - pr_info("\t\tblock group used %llu\n", - btrfs_disk_block_group_used(l, bi)); + pr_info( + "\t\tblock group used %llu chunk_objectid %llu flags %llu\n", + btrfs_disk_block_group_used(l, bi), + btrfs_disk_block_group_chunk_objectid(l, bi), + btrfs_disk_block_group_flags(l, bi)); break; case BTRFS_CHUNK_ITEM_KEY: print_chunk(l, btrfs_item_ptr(l, i, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index f2621e330954..4b23ae5d0e5c 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -164,6 +164,7 @@ static int iterate_object_props(struct btrfs_root *root, size_t), void *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *name_buf = NULL; char *value_buf = NULL; @@ -214,6 +215,12 @@ static int iterate_object_props(struct btrfs_root *root, name_ptr = (unsigned long)(di + 1); data_ptr = name_ptr + name_len; + if (verify_dir_item(fs_info, leaf, + path->slots[0], di)) { + ret = -EIO; + goto out; + } + if (name_len <= XATTR_BTRFS_PREFIX_LEN || memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, name_ptr, @@ -279,7 +286,7 @@ static void inode_prop_iterator(void *ctx, if (unlikely(ret)) btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", - handler->xattr_name, btrfs_ino(inode), + handler->xattr_name, btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); else set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); @@ -288,7 +295,7 @@ static void inode_prop_iterator(void *ctx, int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 ino = btrfs_ino(inode); + u64 ino = btrfs_ino(BTRFS_I(inode)); int ret; ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 662821f1252c..4ce351efe281 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -47,50 +47,6 @@ * - check all ioctl parameters */ -/* - * one struct for each qgroup, organized in fs_info->qgroup_tree. - */ -struct btrfs_qgroup { - u64 qgroupid; - - /* - * state - */ - u64 rfer; /* referenced */ - u64 rfer_cmpr; /* referenced compressed */ - u64 excl; /* exclusive */ - u64 excl_cmpr; /* exclusive compressed */ - - /* - * limits - */ - u64 lim_flags; /* which limits are set */ - u64 max_rfer; - u64 max_excl; - u64 rsv_rfer; - u64 rsv_excl; - - /* - * reservation tracking - */ - u64 reserved; - - /* - * lists - */ - struct list_head groups; /* groups this group is member of */ - struct list_head members; /* groups that are members of this group */ - struct list_head dirty; /* dirty groups */ - struct rb_node node; /* tree of qgroups */ - - /* - * temp variables for accounting operations - * Refer to qgroup_shared_accounting() for details. - */ - u64 old_refcnt; - u64 new_refcnt; -}; - static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { @@ -319,7 +275,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; - fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); if (!fs_info->qgroup_ulist) { ret = -ENOMEM; goto out; @@ -876,7 +832,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, goto out; } - fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); if (!fs_info->qgroup_ulist) { ret = -ENOMEM; goto out; @@ -1019,7 +975,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, list_del("a_root->dirty_list); btrfs_tree_lock(quota_root->node); - clean_tree_block(trans, fs_info, quota_root->node); + clean_tree_block(fs_info, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); @@ -1038,6 +994,18 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } +static void report_reserved_underflow(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup, + u64 num_bytes) +{ +#ifdef CONFIG_BTRFS_DEBUG + WARN_ON(qgroup->reserved < num_bytes); + btrfs_debug(fs_info, + "qgroup %llu reserved space underflow, have: %llu, to free: %llu", + qgroup->qgroupid, qgroup->reserved, num_bytes); +#endif + qgroup->reserved = 0; +} /* * The easy accounting, if we are adding/removing the only ref for an extent * then this qgroup and all of the parent qgroups get their reference and @@ -1065,8 +1033,13 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; - if (sign > 0) - qgroup->reserved -= num_bytes; + if (sign > 0) { + trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes); + if (qgroup->reserved < num_bytes) + report_reserved_underflow(fs_info, qgroup, num_bytes); + else + qgroup->reserved -= num_bytes; + } qgroup_dirty(fs_info, qgroup); @@ -1086,8 +1059,15 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->rfer_cmpr += sign * num_bytes; WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; - if (sign > 0) - qgroup->reserved -= num_bytes; + if (sign > 0) { + trace_qgroup_update_reserve(fs_info, qgroup, + -(s64)num_bytes); + if (qgroup->reserved < num_bytes) + report_reserved_underflow(fs_info, qgroup, + num_bytes); + else + qgroup->reserved -= num_bytes; + } qgroup->excl_cmpr += sign * num_bytes; qgroup_dirty(fs_info, qgroup); @@ -1156,7 +1136,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; - tmp = ulist_alloc(GFP_NOFS); + tmp = ulist_alloc(GFP_KERNEL); if (!tmp) return -ENOMEM; @@ -1205,7 +1185,7 @@ out: return ret; } -int __del_qgroup_relation(struct btrfs_trans_handle *trans, +static int __del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { struct btrfs_root *quota_root; @@ -1216,7 +1196,7 @@ int __del_qgroup_relation(struct btrfs_trans_handle *trans, int ret = 0; int err; - tmp = ulist_alloc(GFP_NOFS); + tmp = ulist_alloc(GFP_KERNEL); if (!tmp) return -ENOMEM; @@ -1426,37 +1406,6 @@ out: return ret; } -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - u64 qgroup_to_skip; - int ret = 0; - - delayed_refs = &trans->transaction->delayed_refs; - qgroup_to_skip = delayed_refs->qgroup_to_skip; - - /* - * No need to do lock, since this function will only be called in - * btrfs_commit_transaction(). - */ - node = rb_first(&delayed_refs->dirty_extent_root); - while (node) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, - &record->old_roots); - if (ret < 0) - break; - if (qgroup_to_skip) - ulist_del(record->old_roots, qgroup_to_skip, 0); - node = rb_next(node); - } - return ret; -} - int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) @@ -1486,6 +1435,28 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, return 0; } +int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_extent_record *qrecord) +{ + struct ulist *old_root; + u64 bytenr = qrecord->bytenr; + int ret; + + ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root); + if (ret < 0) + return ret; + + /* + * Here we don't need to get the lock of + * trans->transaction->delayed_refs, since inserted qrecord won't + * be deleted, only qrecord->node may be modified (new qrecord insert) + * + * So modifying qrecord->old_roots is safe here + */ + qrecord->old_roots = old_root; + return 0; +} + int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, gfp_t gfp_flag) @@ -1511,9 +1482,11 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, spin_lock(&delayed_refs->lock); ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); spin_unlock(&delayed_refs->lock); - if (ret > 0) + if (ret > 0) { kfree(record); - return 0; + return 0; + } + return btrfs_qgroup_trace_extent_post(fs_info, record); } int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, @@ -1554,6 +1527,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, if (ret) return ret; } + cond_resched(); return 0; } @@ -1571,8 +1545,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, * If we increment the root nodes slot counter past the number of * elements, 1 is returned to signal completion of the search. */ -static int adjust_slots_upwards(struct btrfs_root *root, - struct btrfs_path *path, int root_level) +static int adjust_slots_upwards(struct btrfs_path *path, int root_level) { int level = 0; int nr, slot; @@ -1713,7 +1686,7 @@ walk_down: goto out; /* Nonzero return here means we completed our search */ - ret = adjust_slots_upwards(root, path, root_level); + ret = adjust_slots_upwards(path, root_level); if (ret) break; @@ -1914,6 +1887,35 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, return 0; } +/* + * Check if the @roots potentially is a list of fs tree roots + * + * Return 0 for definitely not a fs/subvol tree roots ulist + * Return 1 for possible fs/subvol tree roots in the list (considering an empty + * one as well) + */ +static int maybe_fs_roots(struct ulist *roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + + /* Empty one, still possible for fs roots */ + if (!roots || roots->nnodes == 0) + return 1; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); + if (!unode) + return 1; + + /* + * If it contains fs tree roots, then it must belong to fs/subvol + * trees. + * If it contains a non-fs tree, it won't be shared with fs/subvol trees. + */ + return is_fstree(unode->val); +} + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -1927,13 +1929,24 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 nr_old_roots = 0; int ret = 0; - if (new_roots) + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + if (new_roots) { + if (!maybe_fs_roots(new_roots)) + goto out_free; nr_new_roots = new_roots->nnodes; - if (old_roots) + } + if (old_roots) { + if (!maybe_fs_roots(old_roots)) + goto out_free; nr_old_roots = old_roots->nnodes; + } - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + /* Quick exit, either not fs tree roots, or won't affect any qgroup */ + if (nr_old_roots == 0 && nr_new_roots == 0) goto out_free; + BUG_ON(!fs_info->quota_root); trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes, @@ -2012,16 +2025,32 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, if (!ret) { /* - * Use (u64)-1 as time_seq to do special search, which + * Old roots should be searched when inserting qgroup + * extent record + */ + if (WARN_ON(!record->old_roots)) { + /* Search commit root to find old_roots */ + ret = btrfs_find_all_roots(NULL, fs_info, + record->bytenr, 0, + &record->old_roots); + if (ret < 0) + goto cleanup; + } + + /* + * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current * root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, (u64)-1, &new_roots); + record->bytenr, SEQ_LAST, &new_roots); if (ret < 0) goto cleanup; - if (qgroup_to_skip) + if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); + ulist_del(record->old_roots, qgroup_to_skip, + 0); + } ret = btrfs_qgroup_account_extent(trans, fs_info, record->bytenr, record->num_bytes, record->old_roots, new_roots); @@ -2170,9 +2199,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, goto out; } - rcu_read_lock(); level_size = fs_info->nodesize; - rcu_read_unlock(); } /* @@ -2306,13 +2333,27 @@ out: return ret; } -static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) +static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) +{ + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && + qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer) + return false; + + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && + qg->reserved + (s64)qg->excl + num_bytes > qg->max_excl) + return false; + + return true; +} + +static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; + int retried = 0; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2322,6 +2363,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) if (num_bytes == 0) return 0; + if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && + capable(CAP_SYS_RESOURCE)) + enforce = false; + +retry: spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; if (!quota_root) @@ -2347,16 +2393,28 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) qg = unode_aux_to_qgroup(unode); - if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && - qg->reserved + (s64)qg->rfer + num_bytes > - qg->max_rfer) { - ret = -EDQUOT; - goto out; - } + if (enforce && !qgroup_check_limits(qg, num_bytes)) { + /* + * Commit the tree and retry, since we may have + * deletions which would free up space. + */ + if (!retried && qg->reserved > 0) { + struct btrfs_trans_handle *trans; - if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && - qg->reserved + (s64)qg->excl + num_bytes > - qg->max_excl) { + spin_unlock(&fs_info->qgroup_lock); + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + retried++; + goto retry; + } ret = -EDQUOT; goto out; } @@ -2379,6 +2437,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) qg = unode_aux_to_qgroup(unode); + trace_qgroup_update_reserve(fs_info, qg, num_bytes); qg->reserved += num_bytes; } @@ -2424,7 +2483,11 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); - qg->reserved -= num_bytes; + trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes); + if (qg->reserved < num_bytes) + report_reserved_underflow(fs_info, qg, num_bytes); + else + qg->reserved -= num_bytes; list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(fs_info->qgroup_ulist, @@ -2439,23 +2502,6 @@ out: spin_unlock(&fs_info->qgroup_lock); } -static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes) -{ - return btrfs_qgroup_free_refroot(root->fs_info, root->objectid, - num_bytes); -} -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) -{ - if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) - return; - btrfs_err(trans->fs_info, - "qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x", - trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", - (u32)(trans->delayed_ref_elem.seq >> 32), - (u32)trans->delayed_ref_elem.seq); - BUG(); -} - /* * returns < 0 on error, 0 when more leafs are to be scanned. * returns 1 when done. @@ -2789,71 +2835,146 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * Return <0 for error (including -EQUOT) * * NOTE: this function may sleep for memory allocation. + * if btrfs_qgroup_reserve_data() is called multiple times with + * same @reserved, caller must ensure when error happens it's OK + * to free *ALL* reserved space. */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_changeset changeset; struct ulist_node *unode; struct ulist_iterator uiter; + struct extent_changeset *reserved; + u64 orig_reserved; + u64 to_reserve; int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || !is_fstree(root->objectid) || len == 0) return 0; - changeset.bytes_changed = 0; - changeset.range_changed = ulist_alloc(GFP_NOFS); + /* @reserved parameter is mandatory for qgroup */ + if (WARN_ON(!reserved_ret)) + return -EINVAL; + if (!*reserved_ret) { + *reserved_ret = extent_changeset_alloc(); + if (!*reserved_ret) + return -ENOMEM; + } + reserved = *reserved_ret; + /* Record already reserved space */ + orig_reserved = reserved->bytes_changed; ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, reserved); + + /* Newly reserved space */ + to_reserve = reserved->bytes_changed - orig_reserved; trace_btrfs_qgroup_reserve_data(inode, start, len, - changeset.bytes_changed, - QGROUP_RESERVE); + to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; - ret = qgroup_reserve(root, changeset.bytes_changed); + ret = qgroup_reserve(root, to_reserve, true); if (ret < 0) goto cleanup; - ulist_free(changeset.range_changed); return ret; cleanup: - /* cleanup already reserved ranges */ + /* cleanup *ALL* already reserved ranges */ ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(changeset.range_changed, &uiter))) + while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, GFP_NOFS); - ulist_free(changeset.range_changed); + extent_changeset_release(reserved); return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, - int free) +/* Free ranges specified by @reserved, normally in error path */ +static int qgroup_free_reserved_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { + struct btrfs_root *root = BTRFS_I(inode)->root; + struct ulist_node *unode; + struct ulist_iterator uiter; struct extent_changeset changeset; - int trace_op = QGROUP_RELEASE; + int freed = 0; int ret; - changeset.bytes_changed = 0; - changeset.range_changed = ulist_alloc(GFP_NOFS); - if (!changeset.range_changed) - return -ENOMEM; + extent_changeset_init(&changeset); + len = round_up(start + len, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(&reserved->range_changed, &uiter))) { + u64 range_start = unode->val; + /* unode->aux is the inclusive end */ + u64 range_len = unode->aux - range_start + 1; + u64 free_start; + u64 free_len; + + extent_changeset_release(&changeset); + + /* Only free range in range [start, start + len) */ + if (range_start >= start + len || + range_start + range_len <= start) + continue; + free_start = max(range_start, start); + free_len = min(start + len, range_start + range_len) - + free_start; + /* + * TODO: To also modify reserved->ranges_reserved to reflect + * the modification. + * + * However as long as we free qgroup reserved according to + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. + */ + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + free_start, free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) + goto out; + freed += changeset.bytes_changed; + } + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + ret = freed; +out: + extent_changeset_release(&changeset); + return ret; +} + +static int __btrfs_qgroup_release_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len, + int free) +{ + struct extent_changeset changeset; + int trace_op = QGROUP_RELEASE; + int ret; + + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) + return qgroup_free_reserved_data(inode, reserved, start, len); + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; - if (free) { - qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + if (free) trace_op = QGROUP_FREE; - } trace_btrfs_qgroup_release_data(inode, start, len, changeset.bytes_changed, trace_op); + if (free) + btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, + BTRFS_I(inode)->root->objectid, + changeset.bytes_changed); + ret = changeset.bytes_changed; out: - ulist_free(changeset.range_changed); + extent_changeset_release(&changeset); return ret; } @@ -2862,14 +2983,17 @@ out: * * Should be called when a range of pages get invalidated before reaching disk. * Or for error cleanup case. + * if @reserved is given, only reserved range in [@start, @start + @len) will + * be freed. * * For data written to disk, use btrfs_qgroup_release_data(). * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); } /* @@ -2889,10 +3013,11 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) */ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } -int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes) +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -2902,26 +3027,28 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - ret = qgroup_reserve(root, num_bytes); + trace_qgroup_meta_reserve(root, (s64)num_bytes); + ret = qgroup_reserve(root, num_bytes, enforce); if (ret < 0) return ret; - atomic_add(num_bytes, &root->qgroup_meta_rsv); + atomic64_add(num_bytes, &root->qgroup_meta_rsv); return ret; } void btrfs_qgroup_free_meta_all(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - int reserved; + u64 reserved; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || !is_fstree(root->objectid)) return; - reserved = atomic_xchg(&root->qgroup_meta_rsv, 0); + reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0); if (reserved == 0) return; - qgroup_free(root, reserved); + trace_qgroup_meta_reserve(root, -(s64)reserved); + btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved); } void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) @@ -2933,9 +3060,10 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) return; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes); - atomic_sub(num_bytes, &root->qgroup_meta_rsv); - qgroup_free(root, num_bytes); + WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); + atomic64_sub(num_bytes, &root->qgroup_meta_rsv); + trace_qgroup_meta_reserve(root, -(s64)num_bytes); + btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes); } /* @@ -2949,23 +3077,22 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) struct ulist_iterator iter; int ret; - changeset.bytes_changed = 0; - changeset.range_changed = ulist_alloc(GFP_NOFS); - if (WARN_ON(!changeset.range_changed)) - return; - + extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { ULIST_ITER_INIT(&iter); - while ((unode = ulist_next(changeset.range_changed, &iter))) { + while ((unode = ulist_next(&changeset.range_changed, &iter))) { btrfs_warn(BTRFS_I(inode)->root->fs_info, "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu", inode->i_ino, unode->val, unode->aux); } - qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, + BTRFS_I(inode)->root->objectid, + changeset.bytes_changed); + } - ulist_free(changeset.range_changed); + extent_changeset_release(&changeset); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 416ae8e1d23c..d9984e87cddf 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -62,6 +62,50 @@ struct btrfs_qgroup_extent_record { }; /* + * one struct for each qgroup, organized in fs_info->qgroup_tree. + */ +struct btrfs_qgroup { + u64 qgroupid; + + /* + * state + */ + u64 rfer; /* referenced */ + u64 rfer_cmpr; /* referenced compressed */ + u64 excl; /* exclusive */ + u64 excl_cmpr; /* exclusive compressed */ + + /* + * limits + */ + u64 lim_flags; /* which limits are set */ + u64 max_rfer; + u64 max_excl; + u64 rsv_rfer; + u64 rsv_excl; + + /* + * reservation tracking + */ + u64 reserved; + + /* + * lists + */ + struct list_head groups; /* groups this group is member of */ + struct list_head members; /* groups that are members of this group */ + struct list_head dirty; /* dirty groups */ + struct rb_node node; /* tree of qgroups */ + + /* + * temp variables for accounting operations + * Refer to qgroup_shared_accounting() for details. + */ + u64 old_refcnt; + u64 new_refcnt; +}; + +/* * For qgroup event trace points only */ #define QGROUP_RESERVE (1<<0) @@ -90,13 +134,13 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); + /* * Inform qgroup to trace one dirty extent, its info is recorded in @record. - * So qgroup can account it at commit trans time. + * So qgroup can account it at transaction committing time. * - * No lock version, caller must acquire delayed ref lock and allocate memory. + * No lock version, caller must acquire delayed ref lock and allocated memory, + * then call btrfs_qgroup_trace_extent_post() after exiting lock context. * * Return 0 for success insert * Return >0 for existing record, caller can free @record safely. @@ -108,11 +152,37 @@ int btrfs_qgroup_trace_extent_nolock( struct btrfs_qgroup_extent_record *record); /* + * Post handler after qgroup_trace_extent_nolock(). + * + * NOTE: Current qgroup does the expensive backref walk at transaction + * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming + * new transaction. + * This is designed to allow btrfs_find_all_roots() to get correct new_roots + * result. + * + * However for old_roots there is no need to do backref walk at that time, + * since we search commit roots to walk backref and result will always be + * correct. + * + * Due to the nature of no lock version, we can't do backref there. + * So we must call btrfs_qgroup_trace_extent_post() after exiting + * spinlock context. + * + * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result + * using current root, then we can move all expensive backref walk out of + * transaction committing, but not now as qgroup accounting will be wrong again. + */ +int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_extent_record *qrecord); + +/* * Inform qgroup to trace one dirty extent, specified by @bytenr and * @num_bytes. * So qgroup can account it at commit trans time. * - * Better encapsulated version. + * Better encapsulated version, with memory allocation and backref walk for + * commit roots. + * So this can sleep. * * Return 0 if the operation is done. * Return <0 for error, like memory allocation failure or invalid parameter @@ -159,17 +229,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes); -/* - * TODO: Add proper trace point for it, as btrfs_qgroup_free() is - * called by everywhere, can't provide good trace for delayed ref case. - */ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { - btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); } -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, @@ -177,11 +242,14 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, #endif /* New io_tree based accurate qgroup reserve API */ -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_reserve_data(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); -int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); -int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes); +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + bool enforce); void btrfs_qgroup_free_meta_all(struct btrfs_root *root); void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes); void btrfs_qgroup_check_reserved_leak(struct inode *inode); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d2a9a1ee5361..208638384cd2 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -31,7 +31,7 @@ #include <linux/hash.h> #include <linux/list_sort.h> #include <linux/raid/xor.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <asm/div64.h> #include "ctree.h" #include "extent_map.h" @@ -149,7 +149,7 @@ struct btrfs_raid_bio { int generic_bio_cnt; - atomic_t refs; + refcount_t refs; atomic_t stripes_pending; @@ -218,12 +218,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) * of a failing mount. */ table_size = sizeof(*table) + sizeof(*h) * num_entries; - table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!table) { - table = vzalloc(table_size); - if (!table) - return -ENOMEM; - } + table = kvzalloc(table_size, GFP_KERNEL); + if (!table) + return -ENOMEM; spin_lock_init(&table->cache_lock); INIT_LIST_HEAD(&table->stripe_cache); @@ -389,7 +386,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (bio_list_empty(&rbio->bio_list)) { if (!list_empty(&rbio->hash_list)) { list_del_init(&rbio->hash_list); - atomic_dec(&rbio->refs); + refcount_dec(&rbio->refs); BUG_ON(!list_empty(&rbio->plug_list)); } } @@ -480,7 +477,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) /* bump our ref if we were not in the list before */ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) - atomic_inc(&rbio->refs); + refcount_inc(&rbio->refs); if (!list_empty(&rbio->stripe_cache)){ list_move(&rbio->stripe_cache, &table->stripe_cache); @@ -677,11 +674,9 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_raid_bio *freeit = NULL; struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; - int walk = 0; spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { - walk++; if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { spin_lock(&cur->bio_list_lock); @@ -691,7 +686,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) test_bit(RBIO_CACHE_BIT, &cur->flags) && !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { list_del_init(&cur->hash_list); - atomic_dec(&cur->refs); + refcount_dec(&cur->refs); steal_rbio(cur, rbio); cache_drop = cur; @@ -740,7 +735,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) } } lockit: - atomic_inc(&rbio->refs); + refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: spin_unlock_irqrestore(&h->lock, flags); @@ -786,7 +781,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } list_del_init(&rbio->hash_list); - atomic_dec(&rbio->refs); + refcount_dec(&rbio->refs); /* * we use the plug list to hold all the rbios @@ -803,7 +798,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) list_del_init(&rbio->plug_list); list_add(&next->hash_list, &h->hash_list); - atomic_inc(&next->refs); + refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); @@ -845,8 +840,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) { int i; - WARN_ON(atomic_read(&rbio->refs) < 0); - if (!atomic_dec_and_test(&rbio->refs)) + if (!refcount_dec_and_test(&rbio->refs)) return; WARN_ON(!list_empty(&rbio->stripe_cache)); @@ -874,7 +868,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; @@ -887,7 +881,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_error = err; + cur->bi_status = err; bio_endio(cur); cur = next; } @@ -900,7 +894,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; int max_errors; if (err) @@ -917,7 +911,7 @@ static void raid_write_end_io(struct bio *bio) max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 0 : rbio->bbio->max_errors; if (atomic_read(&rbio->error) > max_errors) - err = -EIO; + err = BLK_STS_IOERR; rbio_orig_end_io(rbio, err); } @@ -999,7 +993,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->stripe_npages = stripe_npages; rbio->faila = -1; rbio->failb = -1; - atomic_set(&rbio->refs, 1); + refcount_set(&rbio->refs, 1); atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); @@ -1095,7 +1089,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && stripe->dev->bdev && - !last->bi_error && + !last->bi_status && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) @@ -1104,10 +1098,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); - if (!bio) - return -ENOMEM; - + bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; bio->bi_iter.bi_sector = disk_start >> 9; @@ -1145,20 +1136,27 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; - struct bio_vec *bvec; u64 start; unsigned long stripe_offset; unsigned long page_index; - int i; spin_lock_irq(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) { + struct bio_vec bvec; + struct bvec_iter iter; + int i = 0; + start = (u64)bio->bi_iter.bi_sector << 9; stripe_offset = start - rbio->bbio->raid_map[0]; page_index = stripe_offset >> PAGE_SHIFT; - bio_for_each_segment_all(bvec, bio, i) - rbio->bio_pages[page_index + i] = bvec->bv_page; + if (bio_flagged(bio, BIO_CLONED)) + bio->bi_iter = btrfs_io_bio(bio)->iter; + + bio_for_each_segment(bvec, bio, iter) { + rbio->bio_pages[page_index + i] = bvec.bv_page; + i++; + } } spin_unlock_irq(&rbio->bio_list_lock); } @@ -1432,11 +1430,14 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct bio *bio) { - struct bio_vec *bvec; - int i; + struct bio_vec bvec; + struct bvec_iter iter; + + if (bio_flagged(bio, BIO_CLONED)) + bio->bi_iter = btrfs_io_bio(bio)->iter; - bio_for_each_segment_all(bvec, bio, i) - SetPageUptodate(bvec->bv_page); + bio_for_each_segment(bvec, bio, iter) + SetPageUptodate(bvec.bv_page); } /* @@ -1451,7 +1452,7 @@ static void raid_rmw_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -1994,7 +1995,7 @@ static void raid_recover_end_io(struct bio *bio) * we only read stripe pages off the disk, set them * up to date if there were no errors */ - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2120,6 +2121,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_raid_bio *rbio; int ret; + if (generic_io) { + ASSERT(bbio->mirror_num == mirror_num); + btrfs_io_bio(bio)->mirror_num = mirror_num; + } + rbio = alloc_rbio(fs_info, bbio, stripe_len); if (IS_ERR(rbio)) { if (generic_io) @@ -2196,6 +2202,8 @@ static void read_rebuild_work(struct btrfs_work *work) /* * The following code is used to scrub/replace the parity stripe * + * Caller must have already increased bio_counter for getting @bbio. + * * Note: We need make sure all the pages that add into the scrub/replace * raid bio are correct and not be changed during the scrub/replace. That * is those pages just hold metadata or file data with checksum. @@ -2233,6 +2241,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, ASSERT(rbio->stripe_npages == stripe_nsectors); bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + /* + * We have already increased bio_counter when getting bbio, record it + * so we can free it at rbio_orig_end_io(). + */ + rbio->generic_bio_cnt = 1; + return rbio; } @@ -2520,7 +2534,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2675,6 +2689,12 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, return NULL; } + /* + * When we get bbio, we have already increased bio_counter, record it + * so we can free it at rbio_orig_end_io() + */ + rbio->generic_bio_cnt = 1; + return rbio; } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index e88bca87f5d2..ab852b8e3e37 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -66,7 +66,6 @@ struct reada_extctl { struct reada_extent { u64 logical; struct btrfs_key top; - int err; struct list_head extctl; int refcnt; spinlock_t lock; @@ -209,9 +208,9 @@ cleanup: return; } -int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int err) +int btree_readahead_hook(struct extent_buffer *eb, int err) { + struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; struct reada_extent *re; @@ -235,10 +234,10 @@ start_machine: return ret; } -static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev, u64 logical, +static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = dev->fs_info; int ret; struct reada_zone *zone; struct btrfs_block_group_cache *cache = NULL; @@ -270,6 +269,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, if (!zone) return NULL; + ret = radix_tree_preload(GFP_KERNEL); + if (ret) { + kfree(zone); + return NULL; + } + zone->start = start; zone->end = end; INIT_LIST_HEAD(&zone->list); @@ -299,6 +304,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, zone = NULL; } spin_unlock(&fs_info->reada_lock); + radix_tree_preload_end(); return zone; } @@ -313,7 +319,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; - u32 blocksize; u64 length; int real_stripes; int nzones = 0; @@ -334,7 +339,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!re) return NULL; - blocksize = fs_info->nodesize; re->logical = logical; re->top = *top; INIT_LIST_HEAD(&re->extctl); @@ -344,10 +348,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, /* * map block */ - length = blocksize; + length = fs_info->nodesize; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &length, &bbio, 0); - if (ret || !bbio || length < blocksize) + if (ret || !bbio || length < fs_info->nodesize) goto error; if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { @@ -367,7 +371,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!dev->bdev) continue; - zone = reada_find_zone(fs_info, dev, logical, bbio); + zone = reada_find_zone(dev, logical, bbio); if (!zone) continue; @@ -386,6 +390,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, goto error; } + ret = radix_tree_preload(GFP_KERNEL); + if (ret) + goto error; + /* insert extent in reada_tree + all per-device trees, all or nothing */ btrfs_dev_replace_lock(&fs_info->dev_replace, 0); spin_lock(&fs_info->reada_lock); @@ -395,13 +403,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + radix_tree_preload_end(); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + radix_tree_preload_end(); goto error; } + radix_tree_preload_end(); prev_dev = NULL; dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( &fs_info->dev_replace); @@ -639,9 +650,9 @@ static int reada_pick_zone(struct btrfs_device *dev) return 1; } -static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev) +static int reada_start_machine_dev(struct btrfs_device *dev) { + struct btrfs_fs_info *fs_info = dev->fs_info; struct reada_extent *re = NULL; int mirror_num = 0; struct extent_buffer *eb = NULL; @@ -754,8 +765,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) - enqueued += reada_start_machine_dev(fs_info, - device); + enqueued += reada_start_machine_dev(device); } mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 379711048fb0..65661d1aae4e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1548,9 +1548,9 @@ again: prev = node; entry = rb_entry(node, struct btrfs_inode, rb_node); - if (objectid < btrfs_ino(&entry->vfs_inode)) + if (objectid < btrfs_ino(entry)) node = node->rb_left; - else if (objectid > btrfs_ino(&entry->vfs_inode)) + else if (objectid > btrfs_ino(entry)) node = node->rb_right; else break; @@ -1558,7 +1558,7 @@ again: if (!node) { while (prev) { entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(&entry->vfs_inode)) { + if (objectid <= btrfs_ino(entry)) { node = prev; break; } @@ -1573,7 +1573,7 @@ again: return inode; } - objectid = btrfs_ino(&entry->vfs_inode) + 1; + objectid = btrfs_ino(entry) + 1; if (cond_resched_lock(&root->inode_lock)) goto again; @@ -1609,8 +1609,8 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, return -ENOMEM; bytenr -= BTRFS_I(reloc_inode)->index_cnt; - ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode), - bytenr, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, + btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); if (ret < 0) goto out; if (ret > 0) { @@ -1698,11 +1698,11 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (first) { inode = find_next_inode(root, key.objectid); first = 0; - } else if (inode && btrfs_ino(inode) < key.objectid) { + } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { btrfs_add_delayed_iput(inode); inode = find_next_inode(root, key.objectid); } - if (inode && btrfs_ino(inode) == key.objectid) { + if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); WARN_ON(!IS_ALIGNED(key.offset, @@ -1714,8 +1714,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (!ret) continue; - btrfs_drop_extent_cache(inode, key.offset, end, - 1); + btrfs_drop_extent_cache(BTRFS_I(inode), + key.offset, end, 1); unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, end); } @@ -2088,7 +2088,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, inode = find_next_inode(root, objectid); if (!inode) break; - ino = btrfs_ino(inode); + ino = btrfs_ino(BTRFS_I(inode)); if (ino > max_key->objectid) { iput(inode); @@ -2130,7 +2130,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, /* the lock_extent waits for readpage to complete */ lock_extent(&BTRFS_I(inode)->io_tree, start, end); - btrfs_drop_extent_cache(inode, start, end, 1); + btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); unlock_extent(&BTRFS_I(inode)->io_tree, start, end); } return 0; @@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset; + struct extent_changeset *data_reserved = NULL; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, prealloc_start, + ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, prealloc_end + 1 - prealloc_start); if (ret) goto out; @@ -3113,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; if (cur_offset < start) - btrfs_free_reserved_data_space(inode, cur_offset, - start - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); @@ -3125,10 +3126,11 @@ int prealloc_file_extent_cluster(struct inode *inode, nr++; } if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space(inode, cur_offset, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space(inode, data_reserved, + cur_offset, prealloc_end + 1 - cur_offset); out: inode_unlock(inode); + extent_changeset_free(data_reserved); return ret; } @@ -3161,7 +3163,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, free_extent_map(em); break; } - btrfs_drop_extent_cache(inode, start, end, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); } unlock_extent(&BTRFS_I(inode)->io_tree, start, end); return ret; @@ -3203,7 +3205,8 @@ static int relocate_file_extent_cluster(struct inode *inode, index = (cluster->start - offset) >> PAGE_SHIFT; last_index = (cluster->end - offset) >> PAGE_SHIFT; while (index <= last_index) { - ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), + PAGE_SIZE); if (ret) goto out; @@ -3215,7 +3218,7 @@ static int relocate_file_extent_cluster(struct inode *inode, page = find_or_create_page(inode->i_mapping, index, mask); if (!page) { - btrfs_delalloc_release_metadata(inode, + btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE); ret = -ENOMEM; goto out; @@ -3234,7 +3237,7 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!PageUptodate(page)) { unlock_page(page); put_page(page); - btrfs_delalloc_release_metadata(inode, + btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE); ret = -EIO; goto out; @@ -3543,7 +3546,7 @@ truncate: goto out; } - ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode); + ret = btrfs_truncate_free_space_cache(trans, block_group, inode); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -4245,7 +4248,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); BTRFS_I(inode)->index_cnt = group->key.objectid; - err = btrfs_orphan_add(trans, inode); + err = btrfs_orphan_add(trans, BTRFS_I(inode)); out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -4268,8 +4271,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->reloc_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, - fs_info->btree_inode->i_mapping); + extent_io_tree_init(&rc->processed_blocks, NULL); return rc; } @@ -4334,7 +4336,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) rc->block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!rc->block_group); - ret = btrfs_inc_block_group_ro(extent_root, rc->block_group); + ret = btrfs_inc_block_group_ro(fs_info, rc->block_group); if (ret) { err = ret; goto out; @@ -4347,8 +4349,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group, - path); + inode = lookup_free_space_inode(fs_info, rc->block_group, path); btrfs_free_path(path); if (!IS_ERR(inode)) @@ -4372,7 +4373,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); - btrfs_wait_ordered_roots(fs_info, -1, + btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group->key.objectid, rc->block_group->key.offset); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 4c6735491ee0..460db0cb2d07 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -74,7 +74,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, * * If we find something return 0, otherwise > 0, < 0 on error. */ -int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, +int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, struct btrfs_path *path, struct btrfs_root_item *root_item, struct btrfs_key *root_key) { @@ -207,7 +207,7 @@ out: } int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_key *key, struct btrfs_root_item *item) + const struct btrfs_key *key, struct btrfs_root_item *item) { /* * Make sure generation v1 and v2 match. See update_root for details. @@ -337,7 +337,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) /* drop the root item for 'key' from 'root' */ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_path *path; int ret; @@ -390,6 +390,13 @@ again: WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ptr = (unsigned long)(ref + 1); + ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr, + name_len); + if (!ret) { + err = -EIO; + goto out; + } + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); *sequence = btrfs_root_ref_sequence(leaf, ref); @@ -501,8 +508,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_root_item *item = &root->root_item; - struct timespec ct = current_fs_time(root->fs_info->sb); + struct timespec ct; + ktime_get_real_ts(&ct); spin_lock(&root->root_item_lock); btrfs_set_root_ctransid(item, trans->transid); btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9a94670536a6..6f1e4c984b94 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -18,6 +18,7 @@ #include <linux/blkdev.h> #include <linux/ratelimit.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" @@ -64,7 +65,7 @@ struct scrub_ctx; #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_recover { - atomic_t refs; + refcount_t refs; struct btrfs_bio *bbio; u64 map_length; }; @@ -95,7 +96,7 @@ struct scrub_bio { struct scrub_ctx *sctx; struct btrfs_device *dev; struct bio *bio; - int err; + blk_status_t status; u64 logical; u64 physical; #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO @@ -112,7 +113,7 @@ struct scrub_block { struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; - atomic_t refs; /* free mem on transition to zero */ + refcount_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; struct { @@ -140,9 +141,9 @@ struct scrub_parity { int nsectors; - int stripe_len; + u64 stripe_len; - atomic_t refs; + refcount_t refs; struct list_head spages; @@ -161,14 +162,6 @@ struct scrub_parity { unsigned long bitmap[0]; }; -struct scrub_wr_ctx { - struct scrub_bio *wr_curr_bio; - struct btrfs_device *tgtdev; - int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ - atomic_t flush_all_writes; - struct mutex wr_lock; -}; - struct scrub_ctx { struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; struct btrfs_fs_info *fs_info; @@ -183,11 +176,14 @@ struct scrub_ctx { atomic_t cancel_req; int readonly; int pages_per_rd_bio; - u32 sectorsize; - u32 nodesize; int is_dev_replace; - struct scrub_wr_ctx wr_ctx; + + struct scrub_bio *wr_curr_bio; + struct mutex wr_lock; + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ + atomic_t flush_all_writes; + struct btrfs_device *wr_tgtdev; /* * statistics @@ -202,7 +198,7 @@ struct scrub_ctx { * doesn't free the scrub context before or while the workers are * doing the wakeup() call. */ - atomic_t refs; + refcount_t refs; }; struct scrub_fixup_nodatasum { @@ -240,6 +236,13 @@ struct scrub_warning { struct btrfs_device *dev; }; +struct full_stripe_lock { + struct rb_node node; + u64 logical; + u64 refs; + struct mutex mutex; +}; + static void scrub_pending_bio_inc(struct scrub_ctx *sctx); static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); @@ -282,12 +285,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, u64 *extent_physical, struct btrfs_device **extent_dev, int *extent_mirror_num); -static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, - struct scrub_wr_ctx *wr_ctx, - struct btrfs_fs_info *fs_info, - struct btrfs_device *dev, - int is_dev_replace); -static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); @@ -307,7 +304,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { - atomic_inc(&sctx->refs); + refcount_inc(&sctx->refs); atomic_inc(&sctx->bios_in_flight); } @@ -351,6 +348,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) } /* + * Insert new full stripe lock into full stripe locks tree + * + * Return pointer to existing or newly inserted full_stripe_lock structure if + * everything works well. + * Return ERR_PTR(-ENOMEM) if we failed to allocate memory + * + * NOTE: caller must hold full_stripe_locks_root->lock before calling this + * function + */ +static struct full_stripe_lock *insert_full_stripe_lock( + struct btrfs_full_stripe_locks_tree *locks_root, + u64 fstripe_logical) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct full_stripe_lock *entry; + struct full_stripe_lock *ret; + + WARN_ON(!mutex_is_locked(&locks_root->lock)); + + p = &locks_root->root.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct full_stripe_lock, node); + if (fstripe_logical < entry->logical) { + p = &(*p)->rb_left; + } else if (fstripe_logical > entry->logical) { + p = &(*p)->rb_right; + } else { + entry->refs++; + return entry; + } + } + + /* Insert new lock */ + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + ret->logical = fstripe_logical; + ret->refs = 1; + mutex_init(&ret->mutex); + + rb_link_node(&ret->node, parent, p); + rb_insert_color(&ret->node, &locks_root->root); + return ret; +} + +/* + * Search for a full stripe lock of a block group + * + * Return pointer to existing full stripe lock if found + * Return NULL if not found + */ +static struct full_stripe_lock *search_full_stripe_lock( + struct btrfs_full_stripe_locks_tree *locks_root, + u64 fstripe_logical) +{ + struct rb_node *node; + struct full_stripe_lock *entry; + + WARN_ON(!mutex_is_locked(&locks_root->lock)); + + node = locks_root->root.rb_node; + while (node) { + entry = rb_entry(node, struct full_stripe_lock, node); + if (fstripe_logical < entry->logical) + node = node->rb_left; + else if (fstripe_logical > entry->logical) + node = node->rb_right; + else + return entry; + } + return NULL; +} + +/* + * Helper to get full stripe logical from a normal bytenr. + * + * Caller must ensure @cache is a RAID56 block group. + */ +static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, + u64 bytenr) +{ + u64 ret; + + /* + * Due to chunk item size limit, full stripe length should not be + * larger than U32_MAX. Just a sanity check here. + */ + WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX); + + /* + * round_down() can only handle power of 2, while RAID56 full + * stripe length can be 64KiB * n, so we need to manually round down. + */ + ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) * + cache->full_stripe_len + cache->key.objectid; + return ret; +} + +/* + * Lock a full stripe to avoid concurrency of recovery and read + * + * It's only used for profiles with parities (RAID5/6), for other profiles it + * does nothing. + * + * Return 0 if we locked full stripe covering @bytenr, with a mutex held. + * So caller must call unlock_full_stripe() at the same context. + * + * Return <0 if encounters error. + */ +static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, + bool *locked_ret) +{ + struct btrfs_block_group_cache *bg_cache; + struct btrfs_full_stripe_locks_tree *locks_root; + struct full_stripe_lock *existing; + u64 fstripe_start; + int ret = 0; + + *locked_ret = false; + bg_cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg_cache) { + ASSERT(0); + return -ENOENT; + } + + /* Profiles not based on parity don't need full stripe lock */ + if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) + goto out; + locks_root = &bg_cache->full_stripe_locks_root; + + fstripe_start = get_full_stripe_logical(bg_cache, bytenr); + + /* Now insert the full stripe lock */ + mutex_lock(&locks_root->lock); + existing = insert_full_stripe_lock(locks_root, fstripe_start); + mutex_unlock(&locks_root->lock); + if (IS_ERR(existing)) { + ret = PTR_ERR(existing); + goto out; + } + mutex_lock(&existing->mutex); + *locked_ret = true; +out: + btrfs_put_block_group(bg_cache); + return ret; +} + +/* + * Unlock a full stripe. + * + * NOTE: Caller must ensure it's the same context calling corresponding + * lock_full_stripe(). + * + * Return 0 if we unlock full stripe without problem. + * Return <0 for error + */ +static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, + bool locked) +{ + struct btrfs_block_group_cache *bg_cache; + struct btrfs_full_stripe_locks_tree *locks_root; + struct full_stripe_lock *fstripe_lock; + u64 fstripe_start; + bool freeit = false; + int ret = 0; + + /* If we didn't acquire full stripe lock, no need to continue */ + if (!locked) + return 0; + + bg_cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg_cache) { + ASSERT(0); + return -ENOENT; + } + if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) + goto out; + + locks_root = &bg_cache->full_stripe_locks_root; + fstripe_start = get_full_stripe_logical(bg_cache, bytenr); + + mutex_lock(&locks_root->lock); + fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start); + /* Unpaired unlock_full_stripe() detected */ + if (!fstripe_lock) { + WARN_ON(1); + ret = -ENOENT; + mutex_unlock(&locks_root->lock); + goto out; + } + + if (fstripe_lock->refs == 0) { + WARN_ON(1); + btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow", + fstripe_lock->logical); + } else { + fstripe_lock->refs--; + } + + if (fstripe_lock->refs == 0) { + rb_erase(&fstripe_lock->node, &locks_root->root); + freeit = true; + } + mutex_unlock(&locks_root->lock); + + mutex_unlock(&fstripe_lock->mutex); + if (freeit) + kfree(fstripe_lock); +out: + btrfs_put_block_group(bg_cache); + return ret; +} + +/* * used for workers that require transaction commits (i.e., for the * NOCOW case) */ @@ -358,7 +571,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->fs_info; - atomic_inc(&sctx->refs); + refcount_inc(&sctx->refs); /* * increment scrubs_running to prevent cancel requests from * completing as long as a worker is running. we must also @@ -422,8 +635,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - scrub_free_wr_ctx(&sctx->wr_ctx); - /* this can happen when scrub is cancelled */ if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; @@ -443,13 +654,14 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) kfree(sbio); } + kfree(sctx->wr_curr_bio); scrub_free_csums(sctx); kfree(sctx); } static void scrub_put_ctx(struct scrub_ctx *sctx) { - if (atomic_dec_and_test(&sctx->refs)) + if (refcount_dec_and_test(&sctx->refs)) scrub_free_ctx(sctx); } @@ -459,12 +671,11 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->fs_info; - int ret; sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; - atomic_set(&sctx->refs, 1); + refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; @@ -489,8 +700,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx->bios[i]->next_free = -1; } sctx->first_free = 0; - sctx->nodesize = fs_info->nodesize; - sctx->sectorsize = fs_info->sectorsize; atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); @@ -501,12 +710,16 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) spin_lock_init(&sctx->stat_lock); init_waitqueue_head(&sctx->list_wait); - ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, - fs_info->dev_replace.tgtdev, is_dev_replace); - if (ret) { - scrub_free_ctx(sctx); - return ERR_PTR(ret); + WARN_ON(sctx->wr_curr_bio != NULL); + mutex_init(&sctx->wr_lock); + sctx->wr_curr_bio = NULL; + if (is_dev_replace) { + WARN_ON(!fs_info->dev_replace.tgtdev); + sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; + sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; + atomic_set(&sctx->flush_all_writes, 0); } + return sctx; nomem: @@ -521,6 +734,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, u32 nlink; int ret; int i; + unsigned nofs_flag; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; struct scrub_warning *swarn = warn_ctx; @@ -559,7 +773,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(swarn->path); + /* + * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub + * uses GFP_NOFS in this context, so we keep it consistent but it does + * not seem to be strictly necessary. + */ + nofs_flag = memalloc_nofs_save(); ipath = init_ipath(4096, local_root, swarn->path); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { ret = PTR_ERR(ipath); ipath = NULL; @@ -733,7 +954,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) ret = -EIO; goto out; } - ret = repair_io_failure(inode, offset, PAGE_SIZE, + ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE, fixup->logical, page, offset - page_offset(page), fixup->mirror_num); @@ -859,12 +1080,14 @@ out: static inline void scrub_get_recover(struct scrub_recover *recover) { - atomic_inc(&recover->refs); + refcount_inc(&recover->refs); } -static inline void scrub_put_recover(struct scrub_recover *recover) +static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, + struct scrub_recover *recover) { - if (atomic_dec_and_test(&recover->refs)) { + if (refcount_dec_and_test(&recover->refs)) { + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(recover->bbio); kfree(recover); } @@ -894,6 +1117,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int mirror_index; int page_num; int success; + bool full_stripe_locked; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -919,6 +1143,24 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; + /* + * For RAID5/6, race can happen for a different device scrub thread. + * For data corruption, Parity and Data threads will both try + * to recovery the data. + * Race can lead to doubly added csum error, or even unrecoverable + * error. + */ + ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); + if (ret < 0) { + spin_lock(&sctx->stat_lock); + if (ret == -ENOMEM) + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + return ret; + } + if (sctx->is_dev_replace && !is_metadata && !have_csum) { sblocks_for_recheck = NULL; goto nodatasum_case; @@ -1243,7 +1485,7 @@ out: sblock->pagev[page_index]->sblock = NULL; recover = sblock->pagev[page_index]->recover; if (recover) { - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); sblock->pagev[page_index]->recover = NULL; } @@ -1253,6 +1495,9 @@ out: kfree(sblocks_for_recheck); } + ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); + if (ret < 0) + return ret; return 0; } @@ -1332,20 +1577,23 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &mapped_length, &bbio, 0, 1); + logical, &mapped_length, &bbio); if (ret || !bbio || mapped_length < sublen) { btrfs_put_bbio(bbio); + btrfs_bio_counter_dec(fs_info); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { btrfs_put_bbio(bbio); + btrfs_bio_counter_dec(fs_info); return -ENOMEM; } - atomic_set(&recover->refs, 1); + refcount_set(&recover->refs, 1); recover->bbio = bbio; recover->map_length = mapped_length; @@ -1367,7 +1615,7 @@ leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); return -ENOMEM; } scrub_page_get(page); @@ -1409,7 +1657,7 @@ leave_nomem: scrub_get_recover(recover); page->recover = recover; } - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); length -= sublen; logical += sublen; page_index++; @@ -1420,14 +1668,14 @@ leave_nomem: struct scrub_bio_ret { struct completion event; - int error; + blk_status_t status; }; static void scrub_bio_wait_endio(struct bio *bio) { struct scrub_bio_ret *ret = bio->bi_private; - ret->error = bio->bi_error; + ret->status = bio->bi_status; complete(&ret->event); } @@ -1445,7 +1693,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, int ret; init_completion(&done.event); - done.error = 0; + done.status = 0; bio->bi_iter.bi_sector = page->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; @@ -1457,7 +1705,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, return ret; wait_for_completion(&done.event); - if (done.error) + if (done.status) return -EIO; return 0; @@ -1489,24 +1737,23 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!page->page); - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) { - page->io_error = 1; - sblock->no_io_error_seen = 0; - continue; - } + bio = btrfs_io_bio_alloc(1); bio->bi_bdev = page->dev->bdev; bio_add_page(bio, page->page, PAGE_SIZE, 0); if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { - if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) + if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) { + page->io_error = 1; sblock->no_io_error_seen = 0; + } } else { bio->bi_iter.bi_sector = page->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (btrfsic_submit_bio_wait(bio)) + if (btrfsic_submit_bio_wait(bio)) { + page->io_error = 1; sblock->no_io_error_seen = 0; + } } bio_put(bio); @@ -1578,9 +1825,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + bio = btrfs_io_bio_alloc(1); bio->bi_bdev = page_bad->dev->bdev; bio->bi_iter.bi_sector = page_bad->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -1636,7 +1881,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, if (spage->io_error) { void *mapped_buffer = kmap_atomic(spage->page); - memset(mapped_buffer, 0, PAGE_SIZE); + clear_page(mapped_buffer); flush_dcache_page(spage->page); kunmap_atomic(mapped_buffer); } @@ -1646,37 +1891,31 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { - struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; struct scrub_bio *sbio; int ret; - mutex_lock(&wr_ctx->wr_lock); + mutex_lock(&sctx->wr_lock); again: - if (!wr_ctx->wr_curr_bio) { - wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), + if (!sctx->wr_curr_bio) { + sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), GFP_KERNEL); - if (!wr_ctx->wr_curr_bio) { - mutex_unlock(&wr_ctx->wr_lock); + if (!sctx->wr_curr_bio) { + mutex_unlock(&sctx->wr_lock); return -ENOMEM; } - wr_ctx->wr_curr_bio->sctx = sctx; - wr_ctx->wr_curr_bio->page_count = 0; + sctx->wr_curr_bio->sctx = sctx; + sctx->wr_curr_bio->page_count = 0; } - sbio = wr_ctx->wr_curr_bio; + sbio = sctx->wr_curr_bio; if (sbio->page_count == 0) { struct bio *bio; sbio->physical = spage->physical_for_dev_replace; sbio->logical = spage->logical; - sbio->dev = wr_ctx->tgtdev; + sbio->dev = sctx->wr_tgtdev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(GFP_KERNEL, - wr_ctx->pages_per_wr_bio); - if (!bio) { - mutex_unlock(&wr_ctx->wr_lock); - return -ENOMEM; - } + bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio); sbio->bio = bio; } @@ -1685,7 +1924,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -1699,7 +1938,7 @@ again: if (sbio->page_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; - mutex_unlock(&wr_ctx->wr_lock); + mutex_unlock(&sctx->wr_lock); return -EIO; } scrub_wr_submit(sctx); @@ -1709,23 +1948,22 @@ again: sbio->pagev[sbio->page_count] = spage; scrub_page_get(spage); sbio->page_count++; - if (sbio->page_count == wr_ctx->pages_per_wr_bio) + if (sbio->page_count == sctx->pages_per_wr_bio) scrub_wr_submit(sctx); - mutex_unlock(&wr_ctx->wr_lock); + mutex_unlock(&sctx->wr_lock); return 0; } static void scrub_wr_submit(struct scrub_ctx *sctx) { - struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; struct scrub_bio *sbio; - if (!wr_ctx->wr_curr_bio) + if (!sctx->wr_curr_bio) return; - sbio = wr_ctx->wr_curr_bio; - wr_ctx->wr_curr_bio = NULL; + sbio = sctx->wr_curr_bio; + sctx->wr_curr_bio = NULL; WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer @@ -1740,7 +1978,7 @@ static void scrub_wr_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, @@ -1755,7 +1993,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) int i; WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); - if (sbio->err) { + if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; @@ -1829,7 +2067,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) page = sblock->pagev[0]->page; buffer = kmap_atomic(page); - len = sctx->sectorsize; + len = sctx->fs_info->sectorsize; index = 0; for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); @@ -1894,7 +2132,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) sblock->header_error = 1; - len = sctx->nodesize - BTRFS_CSUM_SIZE; + len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; index = 0; @@ -2000,12 +2238,12 @@ static int scrub_checksum_super(struct scrub_block *sblock) static void scrub_block_get(struct scrub_block *sblock) { - atomic_inc(&sblock->refs); + refcount_inc(&sblock->refs); } static void scrub_block_put(struct scrub_block *sblock) { - if (atomic_dec_and_test(&sblock->refs)) { + if (refcount_dec_and_test(&sblock->refs)) { int i; if (sblock->sparity) @@ -2077,10 +2315,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(GFP_KERNEL, - sctx->pages_per_rd_bio); - if (!bio) - return -ENOMEM; + bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio); sbio->bio = bio; } @@ -2089,7 +2324,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -2125,7 +2360,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio) struct scrub_block *sblock = bio->bi_private; struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) sblock->no_io_error_seen = 0; bio_put(bio); @@ -2168,10 +2403,10 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) scrub_block_put(sblock); if (sctx->is_dev_replace && - atomic_read(&sctx->wr_ctx.flush_all_writes)) { - mutex_lock(&sctx->wr_ctx.wr_lock); + atomic_read(&sctx->flush_all_writes)) { + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); } scrub_pending_bio_dec(sctx); @@ -2189,8 +2424,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) int ret; int i; + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio, 0, 1); + &length, &bbio); if (ret || !bbio || !bbio->raid_map) goto bbio_out; @@ -2205,10 +2441,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) goto bbio_out; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - goto bbio_out; - + bio = btrfs_io_bio_alloc(0); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; @@ -2233,6 +2466,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) rbio_out: bio_put(bio); bbio_out: + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(bbio); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2257,7 +2491,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->refs, 1); + refcount_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; @@ -2334,7 +2568,7 @@ static void scrub_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_queue_work(fs_info->scrub_workers, &sbio->work); @@ -2347,7 +2581,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) int i; BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); - if (sbio->err) { + if (sbio->status) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -2374,10 +2608,10 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) spin_unlock(&sctx->list_lock); if (sctx->is_dev_replace && - atomic_read(&sctx->wr_ctx.flush_all_writes)) { - mutex_lock(&sctx->wr_ctx.wr_lock); + atomic_read(&sctx->flush_all_writes)) { + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); } scrub_pending_bio_dec(sctx); @@ -2387,7 +2621,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, u64 start, u64 len) { - u32 offset; + u64 offset; int nsectors; int sectorsize = sparity->sctx->fs_info->sectorsize; @@ -2397,8 +2631,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } start -= sparity->logic_start; - start = div_u64_rem(start, sparity->stripe_len, &offset); - offset /= sectorsize; + start = div64_u64_rem(start, sparity->stripe_len, &offset); + offset = div_u64(offset, sectorsize); nsectors = (int)len / sectorsize; if (offset + nsectors <= sparity->nsectors) { @@ -2472,8 +2706,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) if (!sum) return 0; - index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; - num_sectors = sum->len / sctx->sectorsize; + index = ((u32)(logical - sum->bytenr)) / sctx->fs_info->sectorsize; + num_sectors = sum->len / sctx->fs_info->sectorsize; memcpy(csum, sum->sums + index, sctx->csum_size); if (index == num_sectors - 1) { list_del(&sum->list); @@ -2492,19 +2726,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u32 blocksize; if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed++; sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->nodesize; + blocksize = sctx->fs_info->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; sctx->stat.tree_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; WARN_ON(1); } @@ -2557,7 +2791,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->refs, 1); + refcount_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; sblock->sparity = sparity; @@ -2638,11 +2872,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, } if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->nodesize; + blocksize = sctx->fs_info->nodesize; } else { - blocksize = sctx->sectorsize; + blocksize = sctx->fs_info->sectorsize; WARN_ON(1); } @@ -2696,7 +2930,7 @@ static int get_raid56_logic_offset(u64 physical, int num, for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; - stripe_nr = div_u64(*offset, map->stripe_len); + stripe_nr = div64_u64(*offset, map->stripe_len); stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); /* Work out the disk rotation on this stripe-set */ @@ -2750,7 +2984,7 @@ static void scrub_parity_bio_endio(struct bio *bio) struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); @@ -2767,7 +3001,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_fs_info *fs_info = sctx->fs_info; struct bio *bio; struct btrfs_raid_bio *rbio; - struct scrub_page *spage; struct btrfs_bio *bbio = NULL; u64 length; int ret; @@ -2777,15 +3010,14 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) goto out; length = sparity->logic_end - sparity->logic_start; + + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, - &length, &bbio, 0, 1); + &length, &bbio); if (ret || !bbio || !bbio->raid_map) goto bbio_out; - bio = btrfs_io_bio_alloc(GFP_NOFS, 0); - if (!bio) - goto bbio_out; - + bio = btrfs_io_bio_alloc(0); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; @@ -2797,9 +3029,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) if (!rbio) goto rbio_out; - list_for_each_entry(spage, &sparity->spages, list) - raid56_add_scrub_pages(rbio, spage->page, spage->logical); - scrub_pending_bio_inc(sctx); raid56_parity_submit_scrub_rbio(rbio); return; @@ -2807,6 +3036,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); bbio_out: + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(bbio); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); @@ -2824,12 +3054,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors) static void scrub_parity_get(struct scrub_parity *sparity) { - atomic_inc(&sparity->refs); + refcount_inc(&sparity->refs); } static void scrub_parity_put(struct scrub_parity *sparity) { - if (!atomic_dec_and_test(&sparity->refs)) + if (!refcount_dec_and_test(&sparity->refs)) return; scrub_parity_check_and_repair(sparity); @@ -2881,7 +3111,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->scrub_dev = sdev; sparity->logic_start = logic_start; sparity->logic_end = logic_end; - atomic_set(&sparity->refs, 1); + refcount_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->spages); sparity->dbitmap = sparity->bitmap; sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; @@ -3052,9 +3282,9 @@ out: logic_end - logic_start); scrub_parity_put(sparity); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); btrfs_release_path(path); return ret < 0 ? ret : 0; @@ -3100,7 +3330,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, physical = map->stripes[num].physical; offset = 0; - nstripes = div_u64(length, map->stripe_len); + nstripes = div64_u64(length, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; @@ -3210,14 +3440,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ - atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + atomic_set(&sctx->flush_all_writes, 1); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_set(&sctx->flush_all_writes, 0); scrub_blocked_if_needed(fs_info); } @@ -3424,9 +3654,9 @@ skip: out: /* push queued extents */ scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); blk_finish_plug(&plug); btrfs_free_path(path); @@ -3584,7 +3814,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * -> btrfs_scrub_pause() */ scrub_pause_on(fs_info); - ret = btrfs_inc_block_group_ro(root, cache); + ret = btrfs_inc_block_group_ro(fs_info, cache); if (!ret && is_dev_replace) { /* * If we are doing a device replace wait for any tasks @@ -3606,7 +3836,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ btrfs_wait_block_group_reservations(cache); btrfs_wait_nocow_writers(cache); - ret = btrfs_wait_ordered_roots(fs_info, -1, + ret = btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->key.objectid, cache->key.offset); if (ret > 0) { @@ -3663,11 +3893,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * write requests are really completed when bios_in_flight * changes to 0. */ - atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + atomic_set(&sctx->flush_all_writes, 1); scrub_submit(sctx); - mutex_lock(&sctx->wr_ctx.wr_lock); + mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_ctx.wr_lock); + mutex_unlock(&sctx->wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); @@ -3681,7 +3911,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_set(&sctx->flush_all_writes, 0); scrub_pause_off(fs_info); @@ -4084,34 +4314,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, btrfs_put_bbio(bbio); } -static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, - struct scrub_wr_ctx *wr_ctx, - struct btrfs_fs_info *fs_info, - struct btrfs_device *dev, - int is_dev_replace) -{ - WARN_ON(wr_ctx->wr_curr_bio != NULL); - - mutex_init(&wr_ctx->wr_lock); - wr_ctx->wr_curr_bio = NULL; - if (!is_dev_replace) - return 0; - - WARN_ON(!dev->bdev); - wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; - wr_ctx->tgtdev = dev; - atomic_set(&wr_ctx->flush_all_writes, 0); - return 0; -} - -static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) -{ - mutex_lock(&wr_ctx->wr_lock); - kfree(wr_ctx->wr_curr_bio); - wr_ctx->wr_curr_bio = NULL; - mutex_unlock(&wr_ctx->wr_lock); -} - static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace) { @@ -4240,7 +4442,7 @@ out: scrub_pending_trans_workers_dec(sctx); } -static int check_extent_to_block(struct inode *inode, u64 start, u64 len, +static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len, u64 logical) { struct extent_state *cached_state = NULL; @@ -4250,7 +4452,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len, u64 lockstart = start, lockend = start + len - 1; int ret = 0; - io_tree = &BTRFS_I(inode)->io_tree; + io_tree = &inode->io_tree; lock_extent_bits(io_tree, lockstart, lockend, &cached_state); ordered = btrfs_lookup_ordered_range(inode, lockstart, len); @@ -4329,7 +4531,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, io_tree = &BTRFS_I(inode)->io_tree; nocow_ctx_logical = nocow_ctx->logical; - ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical); + ret = check_extent_to_block(BTRFS_I(inode), offset, len, + nocow_ctx_logical); if (ret) { ret = ret > 0 ? 0 : ret; goto out; @@ -4376,7 +4579,7 @@ again: } } - ret = check_extent_to_block(inode, offset, len, + ret = check_extent_to_block(BTRFS_I(inode), offset, len, nocow_ctx_logical); if (ret) { ret = ret > 0 ? 0 : ret; @@ -4413,7 +4616,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, struct btrfs_device *dev; int ret; - dev = sctx->wr_ctx.tgtdev; + dev = sctx->wr_tgtdev; if (!dev) return -EIO; if (!dev->bdev) { @@ -4421,13 +4624,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, "scrub write_page_nocow(bdev == NULL) is unexpected"); return -EIO; } - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; - } + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d145ce804620..b082210df9c8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1069,6 +1069,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } } + ret = btrfs_is_name_len_valid(eb, path->slots[0], + (unsigned long)(di + 1), name_len + data_len); + if (!ret) { + ret = -EIO; + goto out; + } if (name_len + data_len > buf_len) { buf_len = name_len + data_len; if (is_vmalloc_addr(buf)) { @@ -1083,7 +1089,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, buf = tmp; } if (!buf) { - buf = vmalloc(buf_len); + buf = kvmalloc(buf_len, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto out; @@ -1681,6 +1687,9 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) { int ret; + if (ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + ret = get_cur_inode_state(sctx, ino, gen); if (ret < 0) goto out; @@ -1847,7 +1856,7 @@ out: */ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, const char *name, int name_len, - u64 *who_ino, u64 *who_gen) + u64 *who_ino, u64 *who_gen, u64 *who_mode) { int ret = 0; u64 gen; @@ -1866,7 +1875,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. */ - if (sctx->parent_root) { + if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) @@ -1896,7 +1905,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (other_inode > sctx->send_progress || is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, - who_gen, NULL, NULL, NULL, NULL); + who_gen, who_mode, NULL, NULL, NULL); if (ret < 0) goto out; @@ -1934,6 +1943,19 @@ static int did_overwrite_ref(struct send_ctx *sctx, if (ret <= 0) goto out; + if (dir != BTRFS_FIRST_FREE_OBJECTID) { + ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, + NULL, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + ret = 0; + goto out; + } + if (gen != dir_gen) + goto out; + } + /* check if the ref was overwritten by another ref */ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, &ow_inode, &other_type); @@ -2753,15 +2775,20 @@ out: struct recorded_ref { struct list_head list; - char *dir_path; char *name; struct fs_path *full_path; u64 dir; u64 dir_gen; - int dir_path_len; int name_len; }; +static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +{ + ref->full_path = path; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; +} + /* * We need to process new refs before deleted refs, but compare_tree gives us * everything mixed. So we first record all refs and later process them. @@ -2778,17 +2805,7 @@ static int __record_ref(struct list_head *head, u64 dir, ref->dir = dir; ref->dir_gen = dir_gen; - ref->full_path = path; - - ref->name = (char *)kbasename(ref->full_path->start); - ref->name_len = ref->full_path->end - ref->name; - ref->dir_path = ref->full_path->start; - if (ref->name == ref->full_path->start) - ref->dir_path_len = 0; - else - ref->dir_path_len = ref->full_path->end - - ref->full_path->start - 1 - ref->name_len; - + set_ref_path(ref, path); list_add_tail(&ref->list, head); return 0; } @@ -3530,9 +3547,17 @@ static int is_ancestor(struct btrfs_root *root, struct fs_path *fs_path) { u64 ino = ino2; + bool free_path = false; + int ret = 0; + + if (!fs_path) { + fs_path = fs_path_alloc(); + if (!fs_path) + return -ENOMEM; + free_path = true; + } while (ino > BTRFS_FIRST_FREE_OBJECTID) { - int ret; u64 parent; u64 parent_gen; @@ -3541,13 +3566,18 @@ static int is_ancestor(struct btrfs_root *root, if (ret < 0) { if (ret == -ENOENT && ino == ino2) ret = 0; - return ret; + goto out; + } + if (parent == ino1) { + ret = parent_gen == ino1_gen ? 1 : 0; + goto out; } - if (parent == ino1) - return parent_gen == ino1_gen ? 1 : 0; ino = parent; } - return 0; + out: + if (free_path) + fs_path_free(fs_path); + return ret; } static int wait_for_parent_move(struct send_ctx *sctx, @@ -3556,6 +3586,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, { int ret = 0; u64 ino = parent_ref->dir; + u64 ino_gen = parent_ref->dir_gen; u64 parent_ino_before, parent_ino_after; struct fs_path *path_before = NULL; struct fs_path *path_after = NULL; @@ -3576,6 +3607,8 @@ static int wait_for_parent_move(struct send_ctx *sctx, * at get_cur_path()). */ while (ino > BTRFS_FIRST_FREE_OBJECTID) { + u64 parent_ino_after_gen; + if (is_waiting_for_move(sctx, ino)) { /* * If the current inode is an ancestor of ino in the @@ -3598,7 +3631,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, fs_path_reset(path_after); ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - NULL, path_after); + &parent_ino_after_gen, path_after); if (ret < 0) goto out; ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, @@ -3615,10 +3648,20 @@ static int wait_for_parent_move(struct send_ctx *sctx, if (ino > sctx->cur_ino && (parent_ino_before != parent_ino_after || len1 != len2 || memcmp(path_before->start, path_after->start, len1))) { - ret = 1; - break; + u64 parent_ino_gen; + + ret = get_inode_info(sctx->parent_root, ino, NULL, + &parent_ino_gen, NULL, NULL, NULL, + NULL); + if (ret < 0) + goto out; + if (ino_gen == parent_ino_gen) { + ret = 1; + break; + } } ino = parent_ino_after; + ino_gen = parent_ino_after_gen; } out: @@ -3640,6 +3683,36 @@ out: return ret; } +static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) +{ + int ret; + struct fs_path *new_path; + + /* + * Our reference's name member points to its full_path member string, so + * we use here a new path. + */ + new_path = fs_path_alloc(); + if (!new_path) + return -ENOMEM; + + ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path); + if (ret < 0) { + fs_path_free(new_path); + return ret; + } + ret = fs_path_add(new_path, ref->name, ref->name_len); + if (ret < 0) { + fs_path_free(new_path); + return ret; + } + + fs_path_free(ref->full_path); + set_ref_path(ref, new_path); + + return 0; +} + /* * This does all the move/link/unlink/rmdir magic. */ @@ -3653,10 +3726,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; + u64 ow_mode; int did_overwrite = 0; int is_orphan = 0; u64 last_dir_ino_rm = 0; bool can_rename = true; + bool orphanized_dir = false; + bool orphanized_ancestor = false; btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); @@ -3754,7 +3830,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) */ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, cur->name, cur->name_len, - &ow_inode, &ow_gen); + &ow_inode, &ow_gen, &ow_mode); if (ret < 0) goto out; if (ret) { @@ -3771,6 +3847,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) cur->full_path); if (ret < 0) goto out; + if (S_ISDIR(ow_mode)) + orphanized_dir = true; /* * If ow_inode has its rename operation delayed @@ -3808,9 +3886,16 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * might contain the pre-orphanization name of * ow_inode, which is no longer valid. */ - fs_path_reset(valid_path); - ret = get_cur_path(sctx, sctx->cur_ino, - sctx->cur_inode_gen, valid_path); + ret = is_ancestor(sctx->parent_root, + ow_inode, ow_gen, + sctx->cur_ino, NULL); + if (ret > 0) { + orphanized_ancestor = true; + fs_path_reset(valid_path); + ret = get_cur_path(sctx, sctx->cur_ino, + sctx->cur_inode_gen, + valid_path); + } if (ret < 0) goto out; } else { @@ -3869,6 +3954,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; } else { + /* + * We might have previously orphanized an inode + * which is an ancestor of our current inode, + * so our reference's full path, which was + * computed before any such orphanizations, must + * be updated. + */ + if (orphanized_dir) { + ret = update_ref_path(sctx, cur); + if (ret < 0) + goto out; + } ret = send_link(sctx, cur->full_path, valid_path); if (ret < 0) @@ -3931,6 +4028,18 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; if (!ret) { + /* + * If we orphanized any ancestor before, we need + * to recompute the full path for deleted names, + * since any such path was computed before we + * processed any references and orphanized any + * ancestor inode. + */ + if (orphanized_ancestor) { + ret = update_ref_path(sctx, cur); + if (ret < 0) + goto out; + } ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; @@ -5155,15 +5264,18 @@ static int is_extent_unchanged(struct send_ctx *sctx, while (key.offset < ekey->offset + left_len) { ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); - if (right_type != BTRFS_FILE_EXTENT_REG) { + if (right_type != BTRFS_FILE_EXTENT_REG && + right_type != BTRFS_FILE_EXTENT_INLINE) { ret = 0; goto out; } - right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); - right_len = btrfs_file_extent_num_bytes(eb, ei); - right_offset = btrfs_file_extent_offset(eb, ei); - right_gen = btrfs_file_extent_generation(eb, ei); + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + right_len = btrfs_file_extent_inline_len(eb, slot, ei); + right_len = PAGE_ALIGN(right_len); + } else { + right_len = btrfs_file_extent_num_bytes(eb, ei); + } /* * Are we at extent 8? If yes, we know the extent is changed. @@ -5175,6 +5287,23 @@ static int is_extent_unchanged(struct send_ctx *sctx, goto out; } + /* + * We just wanted to see if when we have an inline extent, what + * follows it is a regular extent (wanted to check the above + * condition for inline extents too). This should normally not + * happen but it's possible for example when we have an inline + * compressed extent representing data with a size matching + * the page size (currently the same as sector size). + */ + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + ret = 0; + goto out; + } + + right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + right_offset = btrfs_file_extent_offset(eb, ei); + right_gen = btrfs_file_extent_generation(eb, ei); + left_offset_fixed = left_offset; if (key.offset < ekey->offset) { /* Fix the right offset for 2a and 7. */ @@ -5277,6 +5406,81 @@ out: return ret; } +static int range_is_hole_in_parent(struct send_ctx *sctx, + const u64 start, + const u64 end) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *root = sctx->parent_root; + u64 search_start = start; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = search_start; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + while (search_start < end) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + u64 extent_end; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid < sctx->cur_ino || + key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + if (key.objectid > sctx->cur_ino || + key.type > BTRFS_EXTENT_DATA_KEY || + key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + u64 size = btrfs_file_extent_inline_len(leaf, slot, fi); + + extent_end = ALIGN(key.offset + size, + root->fs_info->sectorsize); + } else { + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + } + if (extent_end <= start) + goto next; + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { + search_start = extent_end; + goto next; + } + ret = 0; + goto out; +next: + path->slots[0]++; + } + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { @@ -5321,8 +5525,17 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, return ret; } - if (sctx->cur_inode_last_extent < key->offset) - ret = send_hole(sctx, key->offset); + if (sctx->cur_inode_last_extent < key->offset) { + ret = range_is_hole_in_parent(sctx, + sctx->cur_inode_last_extent, + key->offset); + if (ret < 0) + return ret; + else if (ret == 0) + ret = send_hole(sctx, key->offset); + else + ret = 0; + } sctx->cur_inode_last_extent = extent_end; return ret; } @@ -6192,8 +6405,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + /* + * Check that we don't overflow at later allocations, we request + * clone_sources_count + 1 items, and compare to unsigned long inside + * access_ok. + */ if (arg->clone_sources_count > - ULLONG_MAX / sizeof(*arg->clone_sources)) { + ULONG_MAX / sizeof(struct clone_root) - 1) { ret = -EINVAL; goto out; } @@ -6242,22 +6460,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); if (!sctx->send_buf) { - sctx->send_buf = vmalloc(sctx->send_max_size); - if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } - sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); + sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); if (!sctx->read_buf) { - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } sctx->pending_dir_moves = RB_ROOT; @@ -6266,25 +6478,19 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); + sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); if (!sctx->clone_roots) { - sctx->clone_roots = vzalloc(alloc_size); - if (!sctx->clone_roots) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); if (arg->clone_sources_count) { - clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); + clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); if (!clone_sources_tmp) { - clone_sources_tmp = vmalloc(alloc_size); - if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b5ae7d3d1896..12540b6104b5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -265,7 +265,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, function, line, errstr); return; } - ACCESS_ONCE(trans->transaction->aborted) = errno; + WRITE_ONCE(trans->transaction->aborted, errno); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); wake_up(&fs_info->transaction_blocked_wait); @@ -549,16 +549,19 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_ssd: btrfs_set_and_info(info, SSD, "use ssd allocation scheme"); + btrfs_clear_opt(info->mount_opt, NOSSD); break; case Opt_ssd_spread: btrfs_set_and_info(info, SSD_SPREAD, "use spread ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); + btrfs_clear_opt(info->mount_opt, NOSSD); break; case Opt_nossd: btrfs_set_and_info(info, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); + btrfs_clear_opt(info->mount_opt, SSD_SPREAD); break; case Opt_barrier: btrfs_clear_and_info(info, NOBARRIER, @@ -598,18 +601,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } break; case Opt_alloc_start: - num = match_strdup(&args[0]); - if (num) { - mutex_lock(&info->chunk_mutex); - info->alloc_start = memparse(num, NULL); - mutex_unlock(&info->chunk_mutex); - kfree(num); - btrfs_info(info, "allocations start at %llu", - info->alloc_start); - } else { - ret = -ENOMEM; - goto out; - } + btrfs_info(info, + "option alloc_start is obsolete, ignored"); break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL @@ -1114,7 +1107,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec static int btrfs_fill_super(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - void *data, int silent) + void *data) { struct inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1133,6 +1126,13 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_flags |= MS_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; + + err = super_setup_bdi(sb); + if (err) { + btrfs_err(fs_info, "super_setup_bdi failed"); + return err; + } + err = open_ctree(sb, fs_devices, (char *)data); if (err) { btrfs_err(fs_info, "open_ctree failed"); @@ -1154,7 +1154,6 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - save_mount_options(sb, data); cleancache_init_fs(sb); sb->s_flags |= MS_ACTIVE; return 0; @@ -1177,7 +1176,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1222,8 +1221,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",nobarrier"); if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); - if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); @@ -1611,8 +1608,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); btrfs_sb(s)->bdev_holder = fs_type; - error = btrfs_fill_super(s, fs_devices, data, - flags & MS_SILENT ? 1 : 0); + error = btrfs_fill_super(s, fs_devices, data); } if (error) { deactivate_locked_super(s); @@ -1707,7 +1703,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_opts = fs_info->mount_opt; unsigned long old_compress_type = fs_info->compress_type; u64 old_max_inline = fs_info->max_inline; - u64 old_alloc_start = fs_info->alloc_start; int old_thread_pool_size = fs_info->thread_pool_size; unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; @@ -1786,8 +1781,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (fs_info->fs_devices->missing_devices > - fs_info->num_tolerated_disk_barrier_failures && - !(*flags & MS_RDONLY)) { + fs_info->num_tolerated_disk_barrier_failures) { btrfs_warn(fs_info, "too many missing devices, writeable remount is not allowed"); ret = -EACCES; @@ -1847,9 +1841,6 @@ restore: fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; fs_info->max_inline = old_max_inline; - mutex_lock(&fs_info->chunk_mutex); - fs_info->alloc_start = old_alloc_start; - mutex_unlock(&fs_info->chunk_mutex); btrfs_resize_thread_pool(fs_info, old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; @@ -1890,18 +1881,15 @@ static inline void btrfs_descending_sort_devices( static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 *free_bytes) { - struct btrfs_root *root = fs_info->tree_root; struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 skip_space; u64 type; u64 avail_space; - u64 used_space; u64 min_stripe_size; int min_stripes = 1, num_stripes = 1; int i = 0, nr_devices; - int ret; /* * We aren't under the device list lock, so this is racy-ish, but good @@ -1919,12 +1907,12 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, } devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), - GFP_NOFS); + GFP_KERNEL); if (!devices_info) return -ENOMEM; /* calc min stripe number for data space allocation */ - type = btrfs_get_alloc_profile(root, 1); + type = btrfs_data_alloc_profile(fs_info); if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; num_stripes = nr_devices; @@ -1941,8 +1929,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, else min_stripe_size = BTRFS_STRIPE_LEN; - if (fs_info->alloc_start) - mutex_lock(&fs_devices->device_list_mutex); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (!device->in_fs_metadata || !device->bdev || @@ -1965,34 +1951,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, */ skip_space = SZ_1M; - /* user can set the offset in fs_info->alloc_start. */ - if (fs_info->alloc_start && - fs_info->alloc_start + BTRFS_STRIPE_LEN <= - device->total_bytes) { - rcu_read_unlock(); - skip_space = max(fs_info->alloc_start, skip_space); - - /* - * btrfs can not use the free space in - * [0, skip_space - 1], we must subtract it from the - * total. In order to implement it, we account the used - * space in this range first. - */ - ret = btrfs_account_dev_extents_size(device, 0, - skip_space - 1, - &used_space); - if (ret) { - kfree(devices_info); - mutex_unlock(&fs_devices->device_list_mutex); - return ret; - } - - rcu_read_lock(); - - /* calc the free space in [0, skip_space - 1] */ - skip_space -= used_space; - } - /* * we can use the free space in [0, skip_space - 1], subtract * it from the total. @@ -2011,8 +1969,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, i++; } rcu_read_unlock(); - if (fs_info->alloc_start) - mutex_unlock(&fs_devices->device_list_mutex); nr_devices = i; @@ -2049,10 +2005,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, * multiplier to scale the sizes. * * Unused device space usage is based on simulating the chunk allocator - * algorithm that respects the device sizes, order of allocations and the - * 'alloc_start' value, this is a close approximation of the actual use but - * there are other factors that may change the result (like a new metadata - * chunk). + * algorithm that respects the device sizes and order of allocations. This is + * a close approximation of the actual use but there are other factors that may + * change the result (like a new metadata chunk). * * If metadata is exhausted, f_bavail will be 0. */ @@ -2235,7 +2190,7 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - fs_info->fs_frozen = 1; + set_bit(BTRFS_FS_FROZEN, &fs_info->flags); /* * We don't need a barrier here, we'll wait for any transaction that * could be in progress on other threads (and do delayed iputs that @@ -2254,7 +2209,9 @@ static int btrfs_freeze(struct super_block *sb) static int btrfs_unfreeze(struct super_block *sb) { - btrfs_sb(sb)->fs_frozen = 0; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + clear_bit(BTRFS_FS_FROZEN, &fs_info->flags); return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1f157fba8940..c2d5f3580b4c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); +static ssize_t quota_override_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int quota_override; + + quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + return snprintf(buf, PAGE_SIZE, "%d\n", quota_override); +} + +static ssize_t quota_override_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long knob; + int err; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 1) + return -EINVAL; + + if (knob) + set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + else + clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + + return len; +} + +BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), BTRFS_ATTR_PTR(clone_alignment), + BTRFS_ATTR_PTR(quota_override), NULL, }; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ea272432c930..b18ab8f327a5 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -237,7 +237,6 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) { memset(trans, 0, sizeof(*trans)); trans->transid = 1; - INIT_LIST_HEAD(&trans->qgroup_ref_list); trans->type = __TRANS_DUMMY; } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 133753232a94..d06b1c931d05 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -87,7 +87,7 @@ static int test_find_delalloc(u32 sectorsize) return -ENOMEM; } - extent_io_tree_init(&tmp, &inode->i_data); + extent_io_tree_init(&tmp, inode); /* * First go through and create and mark all of our pages dirty, we pin diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 4d0f038e14f1..8c91d03cc82d 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -278,7 +278,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(inode, NULL, 0, 0, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0); if (IS_ERR(em)) { em = NULL; test_msg("Got an error when we shouldn't have\n"); @@ -293,7 +293,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } free_extent_map(em); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); /* * All of the magic numbers are based on the mapping setup in @@ -302,7 +302,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -323,7 +323,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -350,7 +350,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -372,7 +372,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -399,7 +399,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -428,7 +428,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -450,7 +450,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -484,7 +484,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -513,7 +513,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -543,7 +543,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -576,7 +576,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -611,7 +611,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -645,7 +645,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -680,7 +680,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -707,7 +707,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -742,7 +742,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(inode, NULL, 0, offset + 6, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, + sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -769,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, 4096 * 1024, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -802,7 +803,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -885,7 +886,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(inode, NULL, 0, 0, 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -907,7 +908,8 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, sectorsize, 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, + 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0e0508f488b2..f615d59b0489 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -60,8 +60,8 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { void btrfs_put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(atomic_read(&transaction->use_count) == 0); - if (atomic_dec_and_test(&transaction->use_count)) { + WARN_ON(refcount_read(&transaction->use_count) == 0); + if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); if (transaction->delayed_refs.pending_csums) @@ -93,7 +93,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) btrfs_put_block_group_trimming(cache); btrfs_put_block_group(cache); } - kmem_cache_free(btrfs_transaction_cachep, transaction); + kfree(transaction); } } @@ -207,7 +207,7 @@ loop: spin_unlock(&fs_info->trans_lock); return -EBUSY; } - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); @@ -228,7 +228,7 @@ loop: */ BUG_ON(type == TRANS_JOIN_NOLOCK); - cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS); if (!cur_trans) return -ENOMEM; @@ -238,11 +238,11 @@ loop: * someone started a transaction after we unlocked. Make sure * to redo the checks above */ - kmem_cache_free(btrfs_transaction_cachep, cur_trans); + kfree(cur_trans); goto loop; } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); + kfree(cur_trans); return -EROFS; } @@ -257,7 +257,7 @@ loop: * One for this trans handle, one so it will live on until we * commit the transaction. */ - atomic_set(&cur_trans->use_count, 2); + refcount_set(&cur_trans->use_count, 2); atomic_set(&cur_trans->pending_ordered, 0); cur_trans->flags = 0; cur_trans->start_time = get_seconds(); @@ -294,7 +294,7 @@ loop: spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, - fs_info->btree_inode->i_mapping); + fs_info->btree_inode); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -432,7 +432,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->trans_lock); cur_trans = fs_info->running_transaction; if (cur_trans && is_transaction_blocked(cur_trans)) { - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_event(fs_info->transaction_wait, @@ -474,7 +474,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) static struct btrfs_trans_handle * start_transaction(struct btrfs_root *root, unsigned int num_items, - unsigned int type, enum btrfs_reserve_flush_enum flush) + unsigned int type, enum btrfs_reserve_flush_enum flush, + bool enforce_qgroups) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -505,9 +506,10 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * Do the reservation before we join the transaction so we can do all * the appropriate flushing if need be. */ - if (num_items > 0 && root != fs_info->chunk_root) { + if (num_items && root != fs_info->chunk_root) { qgroup_reserved = num_items * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved); + ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved, + enforce_qgroups); if (ret) return ERR_PTR(ret); @@ -570,7 +572,6 @@ again: h->type = type; h->can_flush_pending_bgs = true; - INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); smp_mb(); @@ -613,8 +614,9 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, - BTRFS_RESERVE_FLUSH_ALL); + BTRFS_RESERVE_FLUSH_ALL, true); } + struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( struct btrfs_root *root, unsigned int num_items, @@ -625,7 +627,14 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( u64 num_bytes; int ret; - trans = btrfs_start_transaction(root, num_items); + /* + * We have two callers: unlink and block group removal. The + * former should succeed even if we will temporarily exceed + * quota and the latter operates on the extent root so + * qgroup enforcement is ignored anyway. + */ + trans = start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_ALL, false); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -654,25 +663,25 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, - BTRFS_RESERVE_FLUSH_LIMIT); + BTRFS_RESERVE_FLUSH_LIMIT, true); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN, - BTRFS_RESERVE_NO_FLUSH); + return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH, + true); } struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN_NOLOCK, - BTRFS_RESERVE_NO_FLUSH); + BTRFS_RESERVE_NO_FLUSH, true); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_USERSPACE, - BTRFS_RESERVE_NO_FLUSH); + BTRFS_RESERVE_NO_FLUSH, true); } /* @@ -691,7 +700,7 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_ATTACH, - BTRFS_RESERVE_NO_FLUSH); + BTRFS_RESERVE_NO_FLUSH, true); } /* @@ -707,7 +716,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) struct btrfs_trans_handle *trans; trans = start_transaction(root, 0, TRANS_ATTACH, - BTRFS_RESERVE_NO_FLUSH); + BTRFS_RESERVE_NO_FLUSH, true); if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) btrfs_wait_for_commit(root->fs_info, 0); @@ -734,7 +743,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) list_for_each_entry(t, &fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); ret = 0; break; } @@ -763,7 +772,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) if (t->state == TRANS_STATE_COMPLETED) break; cur_trans = t; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); break; } } @@ -866,14 +875,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (lock && !atomic_read(&info->open_ioctl_trans) && should_end_transaction(trans) && - ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { + READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { spin_lock(&info->trans_lock); if (cur_trans->state == TRANS_STATE_RUNNING) cur_trans->state = TRANS_STATE_BLOCKED; spin_unlock(&info->trans_lock); } - if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { + if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { if (throttle) return btrfs_commit_transaction(trans); else @@ -907,7 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up_process(info->transaction_kthread); err = -EIO; } - assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); if (must_run_delayed_refs) { @@ -1354,12 +1362,8 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * enabled. If this check races with the ioctl, rescan will * kick in anyway. */ - mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; - } - mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * We are going to commit transaction, see btrfs_commit_transaction() @@ -1370,9 +1374,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_fs_roots(trans, fs_info); if (ret) goto out; - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret < 0) - goto out; ret = btrfs_qgroup_account_extents(trans, fs_info); if (ret < 0) goto out; @@ -1499,12 +1500,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* * insert the directory item */ - ret = btrfs_set_inode_index(parent_inode, &index); + ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); BUG_ON(ret); /* -ENOMEM */ /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, - btrfs_ino(parent_inode), + btrfs_ino(BTRFS_I(parent_inode)), dentry->d_name.name, dentry->d_name.len, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { @@ -1598,7 +1599,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_add_root_ref(trans, fs_info, objectid, parent_root->root_key.objectid, - btrfs_ino(parent_inode), index, + btrfs_ino(BTRFS_I(parent_inode)), index, dentry->d_name.name, dentry->d_name.len); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1638,7 +1639,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, parent_root, dentry->d_name.name, dentry->d_name.len, - parent_inode, &key, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); @@ -1647,7 +1648,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - btrfs_i_size_write(parent_inode, parent_inode->i_size + + btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + dentry->d_name.len * 2); parent_inode->i_mtime = parent_inode->i_ctime = current_time(parent_inode); @@ -1833,7 +1834,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, /* take transaction reference */ cur_trans = trans->transaction; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); btrfs_end_transaction(trans); @@ -1922,7 +1923,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static inline void @@ -1940,7 +1941,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) int ret; /* Stop the commit early if ->aborted is set */ - if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + if (unlikely(READ_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; btrfs_end_transaction(trans); return ret; @@ -2009,7 +2010,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&fs_info->trans_lock); - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans); @@ -2029,7 +2030,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (prev_trans->state != TRANS_STATE_COMPLETED) { - atomic_inc(&prev_trans->use_count); + refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_for_commit(prev_trans); @@ -2080,7 +2081,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) atomic_read(&cur_trans->num_writers) == 1); /* ->aborted might be set after the previous check, so check it */ - if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + if (unlikely(READ_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; goto scrub_continue; } @@ -2124,13 +2125,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - /* Reocrd old roots for later qgroup accounting */ - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } - /* * make sure none of the code above managed to slip in a * delayed item @@ -2173,6 +2167,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_free_log_root_tree(trans, fs_info); /* + * commit_fs_roots() can call btrfs_save_ino_cache(), which generates + * new delayed refs. Must handle them or qgroup can be wrong. + */ + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); + goto scrub_continue; + } + + /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. */ @@ -2194,14 +2199,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * The tasks which save the space cache and inode cache may also * update ->aborted, check it. */ - if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + if (unlikely(READ_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } - btrfs_prepare_extent_commit(trans, fs_info); + btrfs_prepare_extent_commit(fs_info); cur_trans = fs_info->running_transaction; @@ -2217,7 +2222,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) switch_commit_roots(cur_trans, fs_info); - assert_qgroups_uptodate(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(fs_info); @@ -2251,7 +2255,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - ret = write_ctree_super(trans, fs_info, 0); + ret = write_all_supers(fs_info, 0); if (ret) { mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; @@ -2300,7 +2304,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * it'll result in deadlock about SB_FREEZE_FS. */ if (current != fs_info->transaction_kthread && - current != fs_info->cleaner_kthread && !fs_info->fs_frozen) + current != fs_info->cleaner_kthread && + !test_bit(BTRFS_FS_FROZEN, &fs_info->flags)) btrfs_run_delayed_iputs(fs_info); return ret; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5dfb5590fff6..c55e44560103 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -18,6 +18,8 @@ #ifndef __BTRFS_TRANSACTION__ #define __BTRFS_TRANSACTION__ + +#include <linux/refcount.h> #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" @@ -49,7 +51,7 @@ struct btrfs_transaction { * transaction can end */ atomic_t num_writers; - atomic_t use_count; + refcount_t use_count; atomic_t pending_ordered; unsigned long flags; @@ -125,8 +127,6 @@ struct btrfs_trans_handle { unsigned int type; struct btrfs_root *root; struct btrfs_fs_info *fs_info; - struct seq_list delayed_ref_elem; - struct list_head qgroup_ref_list; struct list_head new_bgs; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index eeffff84f280..f20ef211a73d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -97,7 +97,7 @@ #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, + struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, const loff_t start, const loff_t end, @@ -631,8 +631,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. */ - ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), - start, 0); + ret = btrfs_lookup_file_extent(trans, root, path, + btrfs_ino(BTRFS_I(inode)), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || @@ -673,6 +673,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, unsigned long dest_offset; struct btrfs_key ins; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) + goto update_inode; + ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); if (ret) @@ -825,6 +829,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } inode_add_bytes(inode, nbytes); +update_inode: ret = btrfs_update_inode(trans, root, inode); out: if (inode) @@ -843,7 +848,7 @@ out: static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_dir_item *di) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -875,7 +880,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name, + name_len); if (ret) goto out; else @@ -991,8 +997,8 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_root *log_root, - struct inode *dir, struct inode *inode, - struct extent_buffer *eb, + struct btrfs_inode *dir, + struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, u64 ref_index, char *name, int namelen, int *search_done) @@ -1047,12 +1053,11 @@ again: parent_objectid, victim_name, victim_name_len)) { - inc_nlink(inode); + inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, root, dir, - inode, victim_name, - victim_name_len); + ret = btrfs_unlink_inode(trans, root, dir, inode, + victim_name, victim_name_len); kfree(victim_name); if (ret) return ret; @@ -1115,16 +1120,16 @@ again: victim_name_len)) { ret = -ENOENT; victim_parent = read_one_inode(root, - parent_objectid); + parent_objectid); if (victim_parent) { - inc_nlink(inode); + inc_nlink(&inode->vfs_inode); btrfs_release_path(path); ret = btrfs_unlink_inode(trans, root, - victim_parent, - inode, - victim_name, - victim_name_len); + BTRFS_I(victim_parent), + inode, + victim_name, + victim_name_len); if (!ret) ret = btrfs_run_delayed_items( trans, @@ -1170,15 +1175,19 @@ next: return 0; } -static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index, - u64 *parent_objectid) +static int extref_get_fields(struct extent_buffer *eb, int slot, + unsigned long ref_ptr, u32 *namelen, char **name, + u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ref_ptr; *namelen = btrfs_inode_extref_name_len(eb, extref); + if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name, + *namelen)) + return -EIO; + *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1193,14 +1202,19 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, return 0; } -static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index) +static int ref_get_fields(struct extent_buffer *eb, int slot, + unsigned long ref_ptr, u32 *namelen, char **name, + u64 *index) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ref_ptr; *namelen = btrfs_inode_ref_name_len(eb, ref); + if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1), + *namelen)) + return -EIO; + *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1275,8 +1289,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index, &parent_objectid); + ret = extref_get_fields(eb, slot, ref_ptr, &namelen, + &name, &ref_index, &parent_objectid); /* * parent object can change from one array * item to another. @@ -1288,15 +1302,16 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index); + ret = ref_get_fields(eb, slot, ref_ptr, &namelen, + &name, &ref_index); } if (ret) goto out; /* if we already have a perfect match, we're done */ - if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - ref_index, name, namelen)) { + if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), + btrfs_ino(BTRFS_I(inode)), ref_index, + name, namelen)) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name @@ -1307,7 +1322,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (!search_done) { ret = __add_inode_ref(trans, root, path, log, - dir, inode, eb, + BTRFS_I(dir), + BTRFS_I(inode), inode_objectid, parent_objectid, ref_index, name, namelen, @@ -1320,8 +1336,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* insert our name */ - ret = btrfs_add_link(trans, dir, inode, name, namelen, - 0, ref_index); + ret = btrfs_add_link(trans, BTRFS_I(dir), + BTRFS_I(inode), + name, namelen, 0, ref_index); if (ret) goto out; @@ -1360,7 +1377,7 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, } static int count_inode_extrefs(struct btrfs_root *root, - struct inode *inode, struct btrfs_path *path) + struct btrfs_inode *inode, struct btrfs_path *path) { int ret = 0; int name_len; @@ -1404,7 +1421,7 @@ static int count_inode_extrefs(struct btrfs_root *root, } static int count_inode_refs(struct btrfs_root *root, - struct inode *inode, struct btrfs_path *path) + struct btrfs_inode *inode, struct btrfs_path *path) { int ret; struct btrfs_key key; @@ -1477,19 +1494,19 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, struct btrfs_path *path; int ret; u64 nlink = 0; - u64 ino = btrfs_ino(inode); + u64 ino = btrfs_ino(BTRFS_I(inode)); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = count_inode_refs(root, inode, path); + ret = count_inode_refs(root, BTRFS_I(inode), path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(root, inode, path); + ret = count_inode_extrefs(root, BTRFS_I(inode), path); if (ret < 0) goto out; @@ -1639,7 +1656,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return -EIO; } - ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, + name_len, 1, index); /* FIXME, put inode into FIXUP list */ @@ -1769,7 +1787,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!exists) goto out; - ret = drop_one_dir_item(trans, root, path, dir, dst_di); + ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di); if (ret) goto out; @@ -1778,7 +1796,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); if (!ret && update_size) { - btrfs_i_size_write(dir, dir->i_size + name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); ret = btrfs_update_inode(trans, root, dir); } kfree(name); @@ -1832,7 +1850,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, di)) + if (verify_dir_item(fs_info, eb, slot, di)) return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); @@ -2008,7 +2026,7 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, di)) { + if (verify_dir_item(fs_info, eb, slot, di)) { ret = -EIO; goto out; } @@ -2052,8 +2070,8 @@ again: } inc_nlink(inode); - ret = btrfs_unlink_inode(trans, root, dir, inode, - name, name_len); + ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + BTRFS_I(inode), name, name_len); if (!ret) ret = btrfs_run_delayed_items(trans, fs_info); kfree(name); @@ -2093,6 +2111,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, struct btrfs_path *path, const u64 ino) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key search_key; struct btrfs_path *log_path; int i; @@ -2134,6 +2153,12 @@ process_leaf: u32 this_len = sizeof(*di) + name_len + data_len; char *name; + ret = verify_dir_item(fs_info, path->nodes[0], + path->slots[0], di); + if (ret) { + ret = -EIO; + goto out; + } name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; @@ -2469,7 +2494,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, fs_info, next); + clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2549,7 +2574,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, fs_info, next); + clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2627,7 +2652,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, fs_info, next); + clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2958,7 +2983,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - ret = write_ctree_super(trans, fs_info, 1); + ret = write_all_supers(fs_info, 1); if (ret) { btrfs_set_log_full_commit(fs_info, trans); btrfs_abort_transaction(trans, ret); @@ -3084,7 +3109,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, - struct inode *dir, u64 index) + struct btrfs_inode *dir, u64 index) { struct btrfs_root *log; struct btrfs_dir_item *di; @@ -3094,14 +3119,14 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); - if (BTRFS_I(dir)->logged_trans < trans->transid) + if (dir->logged_trans < trans->transid) return 0; ret = join_running_log_trans(root); if (ret) return 0; - mutex_lock(&BTRFS_I(dir)->log_mutex); + mutex_lock(&dir->log_mutex); log = root->log_root; path = btrfs_alloc_path(); @@ -3176,7 +3201,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, fail: btrfs_free_path(path); out_unlock: - mutex_unlock(&BTRFS_I(dir)->log_mutex); + mutex_unlock(&dir->log_mutex); if (ret == -ENOSPC) { btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; @@ -3192,25 +3217,25 @@ out_unlock: int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, - struct inode *inode, u64 dirid) + struct btrfs_inode *inode, u64 dirid) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log; u64 index; int ret; - if (BTRFS_I(inode)->logged_trans < trans->transid) + if (inode->logged_trans < trans->transid) return 0; ret = join_running_log_trans(root); if (ret) return 0; log = root->log_root; - mutex_lock(&BTRFS_I(inode)->log_mutex); + mutex_lock(&inode->log_mutex); ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), dirid, &index); - mutex_unlock(&BTRFS_I(inode)->log_mutex); + mutex_unlock(&inode->log_mutex); if (ret == -ENOSPC) { btrfs_set_log_full_commit(fs_info, trans); ret = 0; @@ -3260,7 +3285,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, * to replay anything deleted before the fsync */ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, + struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, int key_type, struct btrfs_log_ctx *ctx, @@ -3450,7 +3475,7 @@ done: * key logged by this transaction. */ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, + struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx) @@ -3464,9 +3489,8 @@ again: min_key = 0; max_key = 0; while (1) { - ret = log_dir_items(trans, root, inode, path, - dst_path, key_type, ctx, min_key, - &max_key); + ret = log_dir_items(trans, root, inode, path, dst_path, key_type, + ctx, min_key, &max_key); if (ret) return ret; if (max_key == (u64)-1) @@ -3595,34 +3619,34 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, static int log_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - struct inode *inode) + struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; int ret; ret = btrfs_insert_empty_item(trans, log, path, - &BTRFS_I(inode)->location, - sizeof(*inode_item)); + &inode->location, sizeof(*inode_item)); if (ret && ret != -EEXIST) return ret; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); + fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, + 0, 0); btrfs_release_path(path); return 0; } static noinline int copy_items(struct btrfs_trans_handle *trans, - struct inode *inode, + struct btrfs_inode *inode, struct btrfs_path *dst_path, struct btrfs_path *src_path, u64 *last_extent, int start_slot, int nr, int inode_only, u64 logged_isize) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); unsigned long src_offset; unsigned long dst_offset; - struct btrfs_root *log = BTRFS_I(inode)->root->log_root; + struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; struct extent_buffer *src = src_path->nodes[0]; @@ -3633,7 +3657,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, char *ins_data; int i; struct list_head ordered_sums; - int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; bool has_extents = false; bool need_find_last_extent = true; bool done = false; @@ -3675,7 +3699,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, dst_path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, - inode, inode_only == LOG_INODE_EXISTS, + &inode->vfs_inode, + inode_only == LOG_INODE_EXISTS, logged_isize); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, @@ -3783,7 +3808,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (need_find_last_extent) { u64 len; - ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path); + ret = btrfs_prev_leaf(inode->root, src_path); if (ret < 0) return ret; if (ret) @@ -3825,8 +3850,8 @@ fill_holes: if (need_find_last_extent) { /* btrfs_prev_leaf could return 1 without releasing the path */ btrfs_release_path(src_path); - ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key, - src_path, 0, 0); + ret = btrfs_search_slot(NULL, inode->root, &first_key, + src_path, 0, 0); if (ret < 0) return ret; ASSERT(ret == 0); @@ -3846,7 +3871,7 @@ fill_holes: u64 extent_end; if (i >= btrfs_header_nritems(src_path->nodes[0])) { - ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path); + ret = btrfs_next_leaf(inode->root, src_path); if (ret < 0) return ret; ASSERT(ret == 0); @@ -3881,8 +3906,7 @@ fill_holes: offset = *last_extent; len = key.offset - *last_extent; ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), - offset, 0, 0, len, 0, len, 0, - 0, 0); + offset, 0, 0, len, 0, len, 0, 0, 0); if (ret) break; *last_extent = extent_end; @@ -4055,7 +4079,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, } static int log_one_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_root *root, + struct btrfs_inode *inode, struct btrfs_root *root, const struct extent_map *em, struct btrfs_path *path, const struct list_head *logged_list, @@ -4072,8 +4096,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int extent_inserted = 0; bool ordered_io_err = false; - ret = wait_ordered_extents(trans, inode, root, em, logged_list, - &ordered_io_err); + ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, + logged_list, &ordered_io_err); if (ret) return ret; @@ -4084,7 +4108,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_init_map_token(&token); - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, em->start + em->len, NULL, 0, 1, sizeof(*fi), &extent_inserted); if (ret) @@ -4150,7 +4174,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct list_head *logged_list, struct btrfs_log_ctx *ctx, @@ -4159,14 +4183,14 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, { struct extent_map *em, *n; struct list_head extents; - struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *tree = &inode->extent_tree; u64 test_gen; int ret = 0; int num = 0; INIT_LIST_HEAD(&extents); - down_write(&BTRFS_I(inode)->dio_sem); + down_write(&inode->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -4188,7 +4212,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; /* Need a ref to keep it from getting evicted from cache */ - atomic_inc(&em->refs); + refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); num++; @@ -4206,7 +4230,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, * without writing to the log tree and the fsync must report the * file data write error and not commit the current transaction. */ - ret = filemap_check_errors(inode->i_mapping); + ret = filemap_check_errors(inode->vfs_inode.i_mapping); if (ret) ctx->io_err = ret; process: @@ -4235,13 +4259,13 @@ process: } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - up_write(&BTRFS_I(inode)->dio_sem); + up_write(&inode->dio_sem); btrfs_release_path(path); return ret; } -static int logged_inode_size(struct btrfs_root *log, struct inode *inode, +static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, struct btrfs_path *path, u64 *size_ret) { struct btrfs_key key; @@ -4279,7 +4303,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, */ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path) { @@ -4374,7 +4398,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, */ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4385,7 +4409,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_root *log = root->log_root; const u64 ino = btrfs_ino(inode); - const u64 i_size = i_size_read(inode); + const u64 i_size = i_size_read(&inode->vfs_inode); if (!btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; @@ -4495,7 +4519,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, static int btrfs_check_ref_name_override(struct extent_buffer *eb, const int slot, const struct btrfs_key *key, - struct inode *inode, + struct btrfs_inode *inode, u64 *other_ino) { int ret; @@ -4538,6 +4562,12 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, this_len = sizeof(*extref) + this_name_len; } + ret = btrfs_is_name_len_valid(eb, slot, name_ptr, + this_name_len); + if (!ret) { + ret = -EIO; + goto out; + } if (this_name_len > name_len) { char *new_name; @@ -4551,9 +4581,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, } read_extent_buffer(eb, name, name_ptr, this_name_len); - di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root, - search_path, parent, - name, this_name_len, 0); + di = btrfs_lookup_dir_item(NULL, inode->root, search_path, + parent, name, this_name_len, 0); if (di && !IS_ERR(di)) { struct btrfs_key di_key; @@ -4596,7 +4625,7 @@ out: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, + struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, const loff_t start, const loff_t end, @@ -4618,7 +4647,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, int ins_nr; bool fast_search = false; u64 ino = btrfs_ino(inode); - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; u64 logged_isize = 0; bool need_log_inode_item = true; @@ -4639,9 +4668,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, /* today the code can only do partial logging of directories */ - if (S_ISDIR(inode->i_mode) || + if (S_ISDIR(inode->vfs_inode.i_mode) || (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags) && + &inode->runtime_flags) && inode_only >= LOG_INODE_EXISTS)) max_key.type = BTRFS_XATTR_ITEM_KEY; else @@ -4654,8 +4683,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * order for the log replay code to mark inodes for link count * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). */ - if (S_ISDIR(inode->i_mode) || - BTRFS_I(inode)->generation > fs_info->last_trans_committed) + if (S_ISDIR(inode->vfs_inode.i_mode) || + inode->generation > fs_info->last_trans_committed) ret = btrfs_commit_inode_delayed_items(trans, inode); else ret = btrfs_commit_inode_delayed_inode(inode); @@ -4668,17 +4697,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_OTHER_INODE) { inode_only = LOG_INODE_EXISTS; - mutex_lock_nested(&BTRFS_I(inode)->log_mutex, - SINGLE_DEPTH_NESTING); + mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); } else { - mutex_lock(&BTRFS_I(inode)->log_mutex); + mutex_lock(&inode->log_mutex); } /* * a brute force approach to making sure we get the most uptodate * copies of everything. */ - if (S_ISDIR(inode->i_mode)) { + if (S_ISDIR(inode->vfs_inode.i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; if (inode_only == LOG_INODE_EXISTS) @@ -4699,31 +4727,30 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * (zeroes), as if an expanding truncate happened, * instead of getting a file of 4Kb only. */ - err = logged_inode_size(log, inode, path, - &logged_isize); + err = logged_inode_size(log, inode, path, &logged_isize); if (err) goto out_unlock; } if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { + &inode->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); while(1) { ret = btrfs_truncate_inode_items(trans, - log, inode, 0, 0); + log, &inode->vfs_inode, 0, 0); if (ret != -EAGAIN) break; } } } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags) || + &inode->runtime_flags) || inode_only == LOG_INODE_EXISTS) { if (inode_only == LOG_INODE_ALL) fast_search = true; @@ -4764,18 +4791,17 @@ again: if ((min_key.type == BTRFS_INODE_REF_KEY || min_key.type == BTRFS_INODE_EXTREF_KEY) && - BTRFS_I(inode)->generation == trans->transid) { + inode->generation == trans->transid) { u64 other_ino = 0; ret = btrfs_check_ref_name_override(path->nodes[0], - path->slots[0], - &min_key, inode, - &other_ino); + path->slots[0], &min_key, inode, + &other_ino); if (ret < 0) { err = ret; goto out_unlock; } else if (ret > 0 && ctx && - other_ino != btrfs_ino(ctx->inode)) { + other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { struct btrfs_key inode_key; struct inode *other_inode; @@ -4823,9 +4849,10 @@ again: * update the log with the new name before we * unpin it. */ - err = btrfs_log_inode(trans, root, other_inode, - LOG_OTHER_INODE, - 0, LLONG_MAX, ctx); + err = btrfs_log_inode(trans, root, + BTRFS_I(other_inode), + LOG_OTHER_INODE, 0, LLONG_MAX, + ctx); iput(other_inode); if (err) goto out_unlock; @@ -4979,25 +5006,25 @@ log_extents: write_unlock(&em_tree->lock); } - if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { ret = log_directory_changes(trans, root, inode, path, dst_path, - ctx); + ctx); if (ret) { err = ret; goto out_unlock; } } - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->logged_trans = trans->transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); out_unlock: if (unlikely(err)) btrfs_put_logged_extents(&logged_list); else btrfs_submit_logged_extents(&logged_list, log); - mutex_unlock(&BTRFS_I(inode)->log_mutex); + mutex_unlock(&inode->log_mutex); btrfs_free_path(path); btrfs_free_path(dst_path); @@ -5021,13 +5048,13 @@ out_unlock: * we logged the inode or it might have also done the unlink). */ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; bool ret = false; - mutex_lock(&BTRFS_I(inode)->log_mutex); - if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) { + mutex_lock(&inode->log_mutex); + if (inode->last_unlink_trans > fs_info->last_trans_committed) { /* * Make sure any commits to the log are forced to be full * commits. @@ -5035,7 +5062,7 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(fs_info, trans); ret = true; } - mutex_unlock(&BTRFS_I(inode)->log_mutex); + mutex_unlock(&inode->log_mutex); return ret; } @@ -5047,14 +5074,14 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, * a full commit is required. */ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, - struct inode *inode, + struct btrfs_inode *inode, struct dentry *parent, struct super_block *sb, u64 last_committed) { int ret = 0; struct dentry *old_parent = NULL; - struct inode *orig_inode = inode; + struct btrfs_inode *orig_inode = inode; /* * for regular files, if its inode is already on disk, we don't @@ -5062,15 +5089,15 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * we can use the last_unlink_trans field to record renames * and other fun in this file. */ - if (S_ISREG(inode->i_mode) && - BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto out; + if (S_ISREG(inode->vfs_inode.i_mode) && + inode->generation <= last_committed && + inode->last_unlink_trans <= last_committed) + goto out; - if (!S_ISDIR(inode->i_mode)) { + if (!S_ISDIR(inode->vfs_inode.i_mode)) { if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) goto out; - inode = d_inode(parent); + inode = BTRFS_I(d_inode(parent)); } while (1) { @@ -5081,7 +5108,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * think this inode has already been logged. */ if (inode != orig_inode) - BTRFS_I(inode)->logged_trans = trans->transid; + inode->logged_trans = trans->transid; smp_mb(); if (btrfs_must_commit_transaction(trans, inode)) { @@ -5093,7 +5120,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, break; if (IS_ROOT(parent)) { - inode = d_inode(parent); + inode = BTRFS_I(d_inode(parent)); if (btrfs_must_commit_transaction(trans, inode)) ret = 1; break; @@ -5102,7 +5129,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, parent = dget_parent(parent); dput(old_parent); old_parent = parent; - inode = d_inode(parent); + inode = BTRFS_I(d_inode(parent)); } dput(old_parent); @@ -5159,7 +5186,7 @@ struct btrfs_dir_list { */ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *start_inode, + struct btrfs_inode *start_inode, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -5237,7 +5264,7 @@ process_leaf: goto next_dir_inode; } - if (btrfs_inode_in_log(di_inode, trans->transid)) { + if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { iput(di_inode); break; } @@ -5245,10 +5272,10 @@ process_leaf: ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, root, di_inode, + ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), log_mode, 0, LLONG_MAX, ctx); if (!ret && - btrfs_must_commit_transaction(trans, di_inode)) + btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; iput(di_inode); if (ret) @@ -5289,14 +5316,14 @@ next_dir_inode: } static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, - struct inode *inode, + struct btrfs_inode *inode, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; const u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); @@ -5365,14 +5392,14 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (ctx) ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, root, dir_inode, + ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), LOG_INODE_ALL, 0, LLONG_MAX, ctx); if (!ret && - btrfs_must_commit_transaction(trans, dir_inode)) + btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) ret = 1; if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, - dir_inode, ctx); + BTRFS_I(dir_inode), ctx); iput(dir_inode); if (ret) goto out; @@ -5392,7 +5419,8 @@ out: * the last committed transaction */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, + struct btrfs_root *root, + struct btrfs_inode *inode, struct dentry *parent, const loff_t start, const loff_t end, @@ -5406,9 +5434,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, int ret = 0; u64 last_committed = fs_info->last_trans_committed; bool log_dentries = false; - struct inode *orig_inode = inode; + struct btrfs_inode *orig_inode = inode; - sb = inode->i_sb; + sb = inode->vfs_inode.i_sb; if (btrfs_test_opt(fs_info, NOTREELOG)) { ret = 1; @@ -5425,14 +5453,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - if (root != BTRFS_I(inode)->root || - btrfs_root_refs(&root->root_item) == 0) { + if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { ret = 1; goto end_no_trans; } - ret = check_parent_dirs_for_sync(trans, inode, parent, - sb, last_committed); + ret = check_parent_dirs_for_sync(trans, inode, parent, sb, + last_committed); if (ret) goto end_no_trans; @@ -5455,14 +5482,14 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * we can use the last_unlink_trans field to record renames * and other fun in this file. */ - if (S_ISREG(inode->i_mode) && - BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) { + if (S_ISREG(inode->vfs_inode.i_mode) && + inode->generation <= last_committed && + inode->last_unlink_trans <= last_committed) { ret = 0; goto end_trans; } - if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) + if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) log_dentries = true; /* @@ -5506,7 +5533,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * but the file inode does not have a matching BTRFS_INODE_REF_KEY item * and has a link count of 2. */ - if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + if (inode->last_unlink_trans > last_committed) { ret = btrfs_log_all_parents(trans, orig_inode, ctx); if (ret) goto end_trans; @@ -5516,14 +5543,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; - inode = d_inode(parent); - if (root != BTRFS_I(inode)->root) + inode = BTRFS_I(d_inode(parent)); + if (root != inode->root) break; - if (BTRFS_I(inode)->generation > last_committed) { + if (inode->generation > last_committed) { ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, - 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); if (ret) goto end_trans; } @@ -5567,8 +5593,8 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, - start, end, 0, ctx); + ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), + parent, start, end, 0, ctx); dput(parent); return ret; @@ -5730,7 +5756,7 @@ error: * inodes, etc) are done. */ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, + struct btrfs_inode *dir, struct btrfs_inode *inode, int for_rename) { /* @@ -5743,23 +5769,23 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * into the file. When the file is logged we check it and * don't log the parents if the file is fully on disk. */ - mutex_lock(&BTRFS_I(inode)->log_mutex); - BTRFS_I(inode)->last_unlink_trans = trans->transid; - mutex_unlock(&BTRFS_I(inode)->log_mutex); + mutex_lock(&inode->log_mutex); + inode->last_unlink_trans = trans->transid; + mutex_unlock(&inode->log_mutex); /* * if this directory was already logged any new * names for this file/dir will get recorded */ smp_mb(); - if (BTRFS_I(dir)->logged_trans == trans->transid) + if (dir->logged_trans == trans->transid) return; /* * if the inode we're about to unlink was logged, * the log will be properly updated for any new names */ - if (BTRFS_I(inode)->logged_trans == trans->transid) + if (inode->logged_trans == trans->transid) return; /* @@ -5776,9 +5802,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, return; record: - mutex_lock(&BTRFS_I(dir)->log_mutex); - BTRFS_I(dir)->last_unlink_trans = trans->transid; - mutex_unlock(&BTRFS_I(dir)->log_mutex); + mutex_lock(&dir->log_mutex); + dir->last_unlink_trans = trans->transid; + mutex_unlock(&dir->log_mutex); } /* @@ -5794,11 +5820,11 @@ record: * parent root and tree of tree roots trees, etc) are done. */ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, - struct inode *dir) + struct btrfs_inode *dir) { - mutex_lock(&BTRFS_I(dir)->log_mutex); - BTRFS_I(dir)->last_unlink_trans = trans->transid; - mutex_unlock(&BTRFS_I(dir)->log_mutex); + mutex_lock(&dir->log_mutex); + dir->last_unlink_trans = trans->transid; + mutex_unlock(&dir->log_mutex); } /* @@ -5809,27 +5835,25 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * full transaction commit is required. */ int btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *old_dir, + struct btrfs_inode *inode, struct btrfs_inode *old_dir, struct dentry *parent) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root * root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_root *root = inode->root; /* * this will force the logging code to walk the dentry chain * up for the file */ - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->last_unlink_trans = trans->transid; + if (S_ISREG(inode->vfs_inode.i_mode)) + inode->last_unlink_trans = trans->transid; /* * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (BTRFS_I(inode)->logged_trans <= - fs_info->last_trans_committed && - (!old_dir || BTRFS_I(old_dir)->logged_trans <= - fs_info->last_trans_committed)) + if (inode->logged_trans <= fs_info->last_trans_committed && + (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) return 0; return btrfs_log_inode_parent(trans, root, inode, parent, 0, diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index ab858e31ccbc..483027f9a7f4 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -48,13 +48,13 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans) { - ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid; + WRITE_ONCE(fs_info->last_trans_log_full_commit, trans->transid); } static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans) { - return ACCESS_ONCE(fs_info->last_trans_log_full_commit) == + return READ_ONCE(fs_info->last_trans_log_full_commit) == trans->transid; } @@ -72,19 +72,19 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, - struct inode *dir, u64 index); + struct btrfs_inode *dir, u64 index); int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, - struct inode *inode, u64 dirid); + struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); int btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, + struct btrfs_inode *dir, struct btrfs_inode *inode, int for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, - struct inode *dir); + struct btrfs_inode *dir); int btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *old_dir, + struct btrfs_inode *inode, struct btrfs_inode *old_dir, struct dentry *parent); #endif diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index b1434bb57e36..d8edf164f81c 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -52,13 +52,13 @@ void ulist_init(struct ulist *ulist) } /** - * ulist_fini - free up additionally allocated memory for the ulist + * ulist_release - free up additionally allocated memory for the ulist * @ulist: the ulist from which to free the additional memory * * This is useful in cases where the base 'struct ulist' has been statically * allocated. */ -static void ulist_fini(struct ulist *ulist) +void ulist_release(struct ulist *ulist) { struct ulist_node *node; struct ulist_node *next; @@ -79,7 +79,7 @@ static void ulist_fini(struct ulist *ulist) */ void ulist_reinit(struct ulist *ulist) { - ulist_fini(ulist); + ulist_release(ulist); ulist_init(ulist); } @@ -105,13 +105,13 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) * ulist_free - free dynamically allocated ulist * @ulist: ulist to free * - * It is not necessary to call ulist_fini before. + * It is not necessary to call ulist_release before. */ void ulist_free(struct ulist *ulist) { if (!ulist) return; - ulist_fini(ulist); + ulist_release(ulist); kfree(ulist); } diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index a01a2c45825f..53c913632733 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -19,9 +19,6 @@ * */ struct ulist_iterator { -#ifdef CONFIG_BTRFS_DEBUG - int i; -#endif struct list_head *cur_list; /* hint to start search */ }; @@ -32,10 +29,6 @@ struct ulist_node { u64 val; /* value to store */ u64 aux; /* auxiliary value saved along with the val */ -#ifdef CONFIG_BTRFS_DEBUG - int seqnum; /* sequence number this node is added */ -#endif - struct list_head list; /* used to link node */ struct rb_node rb_node; /* used to speed up search */ }; @@ -51,6 +44,7 @@ struct ulist { }; void ulist_init(struct ulist *ulist); +void ulist_release(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3c3c69c0eee4..5eb7217738ed 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -134,12 +134,16 @@ const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { }; static int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_device *device); + struct btrfs_fs_info *fs_info); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, + enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, + int mirror_num, int need_raid_map); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -238,6 +242,17 @@ static struct btrfs_device *__alloc_device(void) if (!dev) return ERR_PTR(-ENOMEM); + /* + * Preallocate a bio that's always going to be used for flushing device + * barriers and matches the device lifespan + */ + dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); + if (!dev->flush_bio) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + bio_get(dev->flush_bio); + INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->resized_list); @@ -366,7 +381,7 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) */ blk_start_plug(&plug); - bdi = blk_get_backing_dev_info(device->bdev); + bdi = device->bdev->bd_bdi; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; @@ -834,6 +849,7 @@ static void __free_device(struct work_struct *work) device = container_of(work, struct btrfs_device, rcu_work); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } @@ -1009,14 +1025,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; + if (!blk_queue_nonrot(q)) + fs_devices->rotating = 1; device->bdev = bdev; device->in_fs_metadata = 0; device->mode = flags; - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - fs_devices->rotating = 1; - fs_devices->open_devices++; if (device->writeable && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1350,15 +1365,13 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, int ret; int slot; struct extent_buffer *l; - u64 min_search_start; /* * We don't want to overwrite the superblock on the drive nor any area * used by the boot loader (grub for example), so we make sure to start * at an offset of at least 1MB. */ - min_search_start = max(fs_info->alloc_start, 1024ull * 1024); - search_start = max(search_start, min_search_start); + search_start = max_t(u64, search_start, SZ_1M); path = btrfs_alloc_path(); if (!path) @@ -1726,7 +1739,7 @@ out: * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probe like libblkid. */ -static void update_dev_time(char *path_name) +static void update_dev_time(const char *path_name) { struct file *filp; @@ -1852,7 +1865,8 @@ void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = next_device->bdev; } -int btrfs_rm_device(struct btrfs_fs_info *fs_info, char *device_path, u64 devid) +int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, + u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2092,7 +2106,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, - char *device_path, + const char *device_path, struct btrfs_device **device) { int ret = 0; @@ -2119,7 +2133,7 @@ static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, } int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - char *device_path, + const char *device_path, struct btrfs_device **device) { *device = NULL; @@ -2152,7 +2166,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, * Lookup a device given by device id, or the path if the id is 0. */ int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - char *devpath, struct btrfs_device **device) + const char *devpath, + struct btrfs_device **device) { int ret; @@ -2308,7 +2323,7 @@ error: return ret; } -int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *device_path) +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) { struct btrfs_root *root = fs_info->dev_root; struct request_queue *q; @@ -2382,7 +2397,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *device_path) device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; - device->total_bytes = i_size_read(bdev->bd_inode); + device->total_bytes = round_down(i_size_read(bdev->bd_inode), + fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; device->fs_info = fs_info; @@ -2412,16 +2428,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *device_path) fs_info->fs_devices->total_devices++; fs_info->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!blk_queue_nonrot(q)) fs_info->fs_devices->rotating = 1; tmp = btrfs_super_total_bytes(fs_info->super_copy); btrfs_set_super_total_bytes(fs_info->super_copy, - tmp + device->total_bytes); + round_down(tmp + device->total_bytes, fs_info->sectorsize)); tmp = btrfs_super_num_devices(fs_info->super_copy); btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); @@ -2440,7 +2454,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *device_path) if (seeding_dev) { mutex_lock(&fs_info->chunk_mutex); - ret = init_first_rw_device(trans, fs_info, device); + ret = init_first_rw_device(trans, fs_info); mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2516,7 +2530,7 @@ error: } int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - char *device_path, + const char *device_path, struct btrfs_device *srcdev, struct btrfs_device **device_out) { @@ -2569,7 +2583,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, goto error; } - name = rcu_string_strdup(device_path, GFP_NOFS); + name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { kfree(device); ret = -ENOMEM; @@ -2684,6 +2698,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, if (!device->writeable) return -EACCES; + new_size = round_down(new_size, fs_info->sectorsize); + mutex_lock(&fs_info->chunk_mutex); old_total = btrfs_super_total_bytes(super_copy); diff = new_size - device->total_bytes; @@ -2696,7 +2712,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, fs_devices = fs_info->fs_devices; - btrfs_set_super_total_bytes(super_copy, old_total + diff); + btrfs_set_super_total_bytes(super_copy, + round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; btrfs_device_set_total_bytes(device, new_size); @@ -2794,10 +2811,38 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, return ret; } +static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(fs_info, "unable to find logical %llu length %llu", + logical, length); + return ERR_PTR(-EINVAL); + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, + "found a bad mapping, wanted %llu-%llu, found %llu-%llu", + logical, length, em->start, em->start + em->len); + free_extent_map(em); + return ERR_PTR(-EINVAL); + } + + /* callers are responsible for dropping em's ref. */ + return em; +} + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct extent_map_tree *em_tree; struct extent_map *em; struct map_lookup *map; u64 dev_extent_len = 0; @@ -2805,23 +2850,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em_tree = &fs_info->mapping_tree.map_tree; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - - if (!em || em->start > chunk_offset || - em->start + em->len < chunk_offset) { + em = get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); - if (em) - free_extent_map(em); - return -EINVAL; + return PTR_ERR(em); } map = em->map_lookup; mutex_lock(&fs_info->chunk_mutex); @@ -2849,9 +2886,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_bytes_used(device, device->bytes_used - dev_extent_len); - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += dev_extent_len; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); } @@ -3735,7 +3770,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) if (ret) btrfs_handle_fs_error(fs_info, ret, NULL); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } /* Non-zero return value signifies invalidity */ @@ -3754,6 +3789,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 meta_target, data_target; u64 allowed; int mixed = 0; int ret; @@ -3850,11 +3886,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } while (read_seqretry(&fs_info->profiles_lock, seq)); - if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < - btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < + btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { btrfs_warn(fs_info, "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", - bctl->meta.target, bctl->data.target); + meta_target, data_target); } if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { @@ -3909,7 +3950,7 @@ out: __cancel_balance(fs_info); else { kfree(bctl); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } return ret; } @@ -3999,7 +4040,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); @@ -4362,7 +4403,10 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = btrfs_device_get_total_bytes(device); - u64 diff = old_size - new_size; + u64 diff; + + new_size = round_down(new_size, fs_info->sectorsize); + diff = old_size - new_size; if (device->is_tgtdev_for_dev_replace) return -EINVAL; @@ -4378,9 +4422,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, new_size); if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space -= diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_sub(diff, &fs_info->free_chunk_space); } mutex_unlock(&fs_info->chunk_mutex); @@ -4491,7 +4533,8 @@ again: &fs_info->fs_devices->resized_devices); WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); + btrfs_set_super_total_bytes(super_copy, + round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); /* Now btrfs_update_device() will change the on-disk size. */ @@ -4504,9 +4547,7 @@ done: btrfs_device_set_total_bytes(device, old_size); if (device->writeable) device->fs_devices->total_rw_bytes += diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(diff, &fs_info->free_chunk_space); mutex_unlock(&fs_info->chunk_mutex); } return ret; @@ -4584,8 +4625,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) / sizeof(struct btrfs_stripe) + 1) static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 start, - u64 type) + u64 start, u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; @@ -4785,7 +4825,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = div_u64(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - stripe_size = div_u64(stripe_size, raid_stripe_len); + stripe_size = div64_u64(stripe_size, raid_stripe_len); stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -4833,7 +4873,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ret = add_extent_mapping(em_tree, em, 0); if (!ret) { list_add_tail(&em->list, &trans->transaction->pending_chunks); - atomic_inc(&em->refs); + refcount_inc(&em->refs); } write_unlock(&em_tree->lock); if (ret) { @@ -4852,9 +4892,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); } - spin_lock(&info->free_chunk_lock); - info->free_chunk_space -= (stripe_size * map->num_stripes); - spin_unlock(&info->free_chunk_lock); + atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); free_extent_map(em); check_raid56_incompat_flag(info, type); @@ -4888,7 +4926,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_device *device; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - struct extent_map_tree *em_tree; struct extent_map *em; struct map_lookup *map; size_t item_size; @@ -4897,24 +4934,9 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int i = 0; int ret = 0; - em_tree = &fs_info->mapping_tree.map_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_crit(fs_info, "unable to find logical %Lu len %Lu", - chunk_offset, chunk_size); - return -EINVAL; - } - - if (em->start != chunk_offset || em->len != chunk_size) { - btrfs_crit(fs_info, - "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu", - chunk_offset, chunk_size, em->start, em->len); - free_extent_map(em); - return -EINVAL; - } + em = get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(em)) + return PTR_ERR(em); map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); @@ -5009,29 +5031,26 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); chunk_offset = find_next_chunk(fs_info); - return __btrfs_alloc_chunk(trans, fs_info, chunk_offset, type); + return __btrfs_alloc_chunk(trans, chunk_offset, type); } static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_device *device) + struct btrfs_fs_info *fs_info) { - struct btrfs_root *extent_root = fs_info->extent_root; u64 chunk_offset; u64 sys_chunk_offset; u64 alloc_profile; int ret; chunk_offset = find_next_chunk(fs_info); - alloc_profile = btrfs_get_alloc_profile(extent_root, 0); - ret = __btrfs_alloc_chunk(trans, fs_info, chunk_offset, alloc_profile); + alloc_profile = btrfs_metadata_alloc_profile(fs_info); + ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); if (ret) return ret; sys_chunk_offset = find_next_chunk(fs_info); - alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); - ret = __btrfs_alloc_chunk(trans, fs_info, sys_chunk_offset, - alloc_profile); + alloc_profile = btrfs_system_alloc_profile(fs_info); + ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); return ret; } @@ -5057,15 +5076,12 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct extent_map *em; struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int readonly = 0; int miss_ndevs = 0; int i; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); - if (!em) + em = get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) return 1; map = em->map_lookup; @@ -5119,34 +5135,19 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; int ret; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - - /* - * We could return errors for these cases, but that could get ugly and - * we'd probably do the same thing which is just not do anything else - * and exit, so return 1 so the callers don't try to use other copies. - */ - if (!em) { - btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, - logical+len); - return 1; - } - - if (em->start > logical || em->start + em->len < logical) { - btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu", - logical, logical+len, em->start, - em->start + em->len); - free_extent_map(em); + em = get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + /* + * We could return errors for these cases, but that could get + * ugly and we'd probably do the same thing which is just not do + * anything else and exit, so return 1 so the callers don't try + * to use other copies. + */ return 1; - } map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) @@ -5162,7 +5163,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) free_extent_map(em); btrfs_dev_replace_lock(&fs_info->dev_replace, 0); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && + fs_info->dev_replace.tgtdev) ret++; btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); @@ -5175,15 +5177,11 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; unsigned long len = fs_info->sectorsize; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); + em = get_chunk_map(fs_info, logical, len); + WARN_ON(IS_ERR(em)); - BUG_ON(em->start > logical || em->start + em->len < logical); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); @@ -5191,20 +5189,16 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len, int mirror_num) { struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; int ret = 0; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); + em = get_chunk_map(fs_info, logical, len); + WARN_ON(IS_ERR(em)); - BUG_ON(em->start > logical || em->start + em->len < logical); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; @@ -5297,25 +5291,353 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) GFP_NOFS|__GFP_NOFAIL); atomic_set(&bbio->error, 0); - atomic_set(&bbio->refs, 1); + refcount_set(&bbio->refs, 1); return bbio; } void btrfs_get_bbio(struct btrfs_bio *bbio) { - WARN_ON(!atomic_read(&bbio->refs)); - atomic_inc(&bbio->refs); + WARN_ON(!refcount_read(&bbio->refs)); + refcount_inc(&bbio->refs); } void btrfs_put_bbio(struct btrfs_bio *bbio) { if (!bbio) return; - if (atomic_dec_and_test(&bbio->refs)) + if (refcount_dec_and_test(&bbio->refs)) kfree(bbio); } +/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ +/* + * Please note that, discard won't be sent to target device of device + * replace. + */ +static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, + struct btrfs_bio **bbio_ret) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_bio *bbio; + u64 offset; + u64 stripe_nr; + u64 stripe_nr_end; + u64 stripe_end_offset; + u64 stripe_cnt; + u64 stripe_len; + u64 stripe_offset; + u64 num_stripes; + u32 stripe_index; + u32 factor = 0; + u32 sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + u32 last_stripe = 0; + int ret = 0; + int i; + + /* discard always return a bbio */ + ASSERT(bbio_ret); + + em = get_chunk_map(fs_info, logical, length); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* we don't discard raid56 yet */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EOPNOTSUPP; + goto out; + } + + offset = logical - em->start; + length = min_t(u64, em->len - offset, length); + + stripe_len = map->stripe_len; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + stripe_nr = div64_u64(offset, stripe_len); + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - stripe_nr * stripe_len; + + stripe_nr_end = round_up(offset + length, map->stripe_len); + stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); + stripe_cnt = stripe_nr_end - stripe_nr; + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + length); + /* + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array + */ + num_stripes = 1; + stripe_index = 0; + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + num_stripes = min_t(u64, map->num_stripes, + sub_stripes * stripe_cnt); + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + stripe_index *= sub_stripes; + stripes_per_dev = div_u64_rem(stripe_cnt, factor, + &remaining_stripes); + div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); + last_stripe *= sub_stripes; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + num_stripes = map->num_stripes; + } else { + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); + } + + bbio = alloc_btrfs_bio(num_stripes, 0); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + + /* + * Special for the first stripe and + * the last stripe: + * + * |-------|...|-------| + * |----------| + * off end_off + */ + if (i < sub_stripes) + bbio->stripes[i].length -= + stripe_offset; + + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + sub_stripes - 1)) + bbio->stripes[i].length -= + stripe_end_offset; + + if (i == sub_stripes - 1) + stripe_offset = 0; + } else { + bbio->stripes[i].length = length; + } + + stripe_index++; + if (stripe_index == map->num_stripes) { + stripe_index = 0; + stripe_nr++; + } + } + + *bbio_ret = bbio; + bbio->map_type = map->type; + bbio->num_stripes = num_stripes; +out: + free_extent_map(em); + return ret; +} + +/* + * In dev-replace case, for repair case (that's the only case where the mirror + * is selected explicitly when calling btrfs_map_block), blocks left of the + * left cursor can also be read from the target drive. + * + * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the + * array of stripes. + * For READ, it also needs to be supported using the same mirror number. + * + * If the requested block is not left of the left cursor, EIO is returned. This + * can happen because btrfs_num_copies() returns one more in the dev-replace + * case. + */ +static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, + u64 srcdev_devid, int *mirror_num, + u64 *physical) +{ + struct btrfs_bio *bbio = NULL; + int num_stripes; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + int i; + int ret = 0; + + ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + logical, &length, &bbio, 0, 0); + if (ret) { + ASSERT(bbio == NULL); + return ret; + } + + num_stripes = bbio->num_stripes; + if (*mirror_num > num_stripes) { + /* + * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, + * that means that the requested area is not left of the left + * cursor + */ + btrfs_put_bbio(bbio); + return -EIO; + } + + /* + * process the rest of the function using the mirror_num of the source + * drive. Therefore look it up first. At the end, patch the device + * pointer to the one of the target drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add the + * mirror with the lowest physical address + */ + if (found && + physical_of_found <= bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + + btrfs_put_bbio(bbio); + + ASSERT(found); + if (!found) + return -EIO; + + *mirror_num = index_srcdev + 1; + *physical = physical_of_found; + return ret; +} + +static void handle_ops_on_dev_replace(enum btrfs_map_op op, + struct btrfs_bio **bbio_ret, + struct btrfs_dev_replace *dev_replace, + int *num_stripes_ret, int *max_errors_ret) +{ + struct btrfs_bio *bbio = *bbio_ret; + u64 srcdev_devid = dev_replace->srcdev->devid; + int tgtdev_indexes = 0; + int num_stripes = *num_stripes_ret; + int max_errors = *max_errors_ret; + int i; + + if (op == BTRFS_MAP_WRITE) { + int index_where_to_add; + + /* + * duplicate the write operations while the dev replace + * procedure is running. Since the copying of the old disk to + * the new disk takes place at run time while the filesystem is + * mounted writable, the regular write operations to the old + * disk have to be duplicated to go to the new disk as well. + * + * Note that device->missing is handled by the caller, and that + * the write to the old disk is already set up in the stripes + * array. + */ + index_where_to_add = num_stripes; + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* write to new disk, too */ + struct btrfs_bio_stripe *new = + bbio->stripes + index_where_to_add; + struct btrfs_bio_stripe *old = + bbio->stripes + i; + + new->physical = old->physical; + new->length = old->length; + new->dev = dev_replace->tgtdev; + bbio->tgtdev_map[i] = index_where_to_add; + index_where_to_add++; + max_errors++; + tgtdev_indexes++; + } + } + num_stripes = index_where_to_add; + } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + /* + * During the dev-replace procedure, the target drive can also + * be used to read data in case it is needed to repair a corrupt + * block elsewhere. This is possible if the requested area is + * left of the left cursor. In this area, the target drive is a + * full copy of the source drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it simple, + * only add the mirror with the lowest physical + * address + */ + if (found && + physical_of_found <= + bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + } + if (found) { + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; + + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; + + tgtdev_indexes++; + num_stripes++; + } + } + + *num_stripes_ret = num_stripes; + *max_errors_ret = max_errors; + bbio->num_tgtdevs = tgtdev_indexes; + *bbio_ret = bbio; +} + +static bool need_full_stripe(enum btrfs_map_op op) +{ + return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5324,14 +5646,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; - u64 stripe_end_offset; u64 stripe_nr; - u64 stripe_nr_orig; - u64 stripe_nr_end; u64 stripe_len; u32 stripe_index; int i; @@ -5347,23 +5664,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, *length); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_crit(fs_info, "unable to find logical %llu len %llu", - logical, *length); - return -EINVAL; - } + if (op == BTRFS_MAP_DISCARD) + return __btrfs_map_block_for_discard(fs_info, logical, + *length, bbio_ret); - if (em->start > logical || em->start + em->len < logical) { - btrfs_crit(fs_info, - "found a bad mapping, wanted %Lu, found %Lu-%Lu", - logical, em->start, em->start + em->len); - free_extent_map(em); - return -EINVAL; - } + em = get_chunk_map(fs_info, logical, *length); + if (IS_ERR(em)) + return PTR_ERR(em); map = em->map_lookup; offset = logical - em->start; @@ -5402,14 +5709,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, raid56_full_stripe_start *= full_stripe_len; } - if (op == BTRFS_MAP_DISCARD) { - /* we don't discard raid56 yet */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = -EOPNOTSUPP; - goto out; - } - *length = min_t(u64, em->len - offset, *length); - } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { u64 max_len; /* For writes to RAID[56], allow a full stripeset across all disks. For other RAID types and for RAID[56] reads, just allow a single @@ -5440,105 +5740,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { - /* - * in dev-replace case, for repair case (that's the only - * case where the mirror is selected explicitly when - * calling btrfs_map_block), blocks left of the left cursor - * can also be read from the target drive. - * For REQ_GET_READ_MIRRORS, the target drive is added as - * the last one to the array of stripes. For READ, it also - * needs to be supported using the same mirror number. - * If the requested block is not left of the left cursor, - * EIO is returned. This can happen because btrfs_num_copies() - * returns one more in the dev-replace case. - */ - u64 tmp_length = *length; - struct btrfs_bio *tmp_bbio = NULL; - int tmp_num_stripes; - u64 srcdev_devid = dev_replace->srcdev->devid; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - - ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0, 0); - if (ret) { - WARN_ON(tmp_bbio != NULL); - goto out; - } - - tmp_num_stripes = tmp_bbio->num_stripes; - if (mirror_num > tmp_num_stripes) { - /* - * BTRFS_MAP_GET_READ_MIRRORS does not contain this - * mirror, that means that the requested area - * is not left of the left cursor - */ - ret = -EIO; - btrfs_put_bbio(tmp_bbio); - goto out; - } - - /* - * process the rest of the function using the mirror_num - * of the source drive. Therefore look it up first. - * At the end, patch the device pointer to the one of the - * target drive. - */ - for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) - continue; - - /* - * In case of DUP, in order to keep it simple, only add - * the mirror with the lowest physical address - */ - if (found && - physical_of_found <= tmp_bbio->stripes[i].physical) - continue; - - index_srcdev = i; - found = 1; - physical_of_found = tmp_bbio->stripes[i].physical; - } - - btrfs_put_bbio(tmp_bbio); - - if (!found) { - WARN_ON(1); - ret = -EIO; + !need_full_stripe(op) && dev_replace->tgtdev != NULL) { + ret = get_extra_mirror_from_replace(fs_info, logical, *length, + dev_replace->srcdev->devid, + &mirror_num, + &physical_to_patch_in_first_stripe); + if (ret) goto out; - } - - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; + else + patch_the_first_stripe_for_dev_replace = 1; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } num_stripes = 1; stripe_index = 0; - stripe_nr_orig = stripe_nr; - stripe_nr_end = ALIGN(offset + *length, map->stripe_len); - stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); - stripe_end_offset = stripe_nr_end * map->stripe_len - - (offset + *length); - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (op == BTRFS_MAP_DISCARD) - num_stripes = min_t(u64, map->num_stripes, - stripe_nr_end - stripe_nr_orig); stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS) + if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || - op == BTRFS_MAP_GET_READ_MIRRORS) + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5551,8 +5774,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || - op == BTRFS_MAP_GET_READ_MIRRORS) { + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5568,10 +5790,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->sub_stripes; - else if (op == BTRFS_MAP_DISCARD) - num_stripes = min_t(u64, map->sub_stripes * - (stripe_nr_end - stripe_nr_orig), - map->num_stripes); else if (mirror_num) stripe_index += mirror_num - 1; else { @@ -5589,7 +5807,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ - stripe_nr = div_u64(raid56_full_stripe_start, + stripe_nr = div64_u64(raid56_full_stripe_start, stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. Return all stripes */ @@ -5614,8 +5832,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1) + if ((op != BTRFS_MAP_WRITE && + op != BTRFS_MAP_GET_READ_MIRRORS) && + mirror_num <= 1) mirror_num = 1; } } else { @@ -5637,8 +5856,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { + if (op == BTRFS_MAP_WRITE) num_alloc_stripes <<= 1; if (op == BTRFS_MAP_GET_READ_MIRRORS) num_alloc_stripes++; @@ -5650,14 +5869,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto out; } - if (dev_replace_is_ongoing) + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); /* build raid_map */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - need_raid_map && - ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) || - mirror_num > 1)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + (need_full_stripe(op) || mirror_num > 1)) { u64 tmp; unsigned rot; @@ -5681,173 +5898,27 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, RAID6_Q_STRIPE; } - if (op == BTRFS_MAP_DISCARD) { - u32 factor = 0; - u32 sub_stripes = 0; - u64 stripes_per_dev = 0; - u32 remaining_stripes = 0; - u32 last_stripe = 0; - if (map->type & - (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - if (map->type & BTRFS_BLOCK_GROUP_RAID0) - sub_stripes = 1; - else - sub_stripes = map->sub_stripes; - - factor = map->num_stripes / sub_stripes; - stripes_per_dev = div_u64_rem(stripe_nr_end - - stripe_nr_orig, - factor, - &remaining_stripes); - div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); - last_stripe *= sub_stripes; - } - - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * - map->stripe_len; - - if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; - - /* - * Special for the first stripe and - * the last stripe: - * - * |-------|...|-------| - * |----------| - * off end_off - */ - if (i < sub_stripes) - bbio->stripes[i].length -= - stripe_offset; - - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - sub_stripes - 1)) - bbio->stripes[i].length -= - stripe_end_offset; - - if (i == sub_stripes - 1) - stripe_offset = 0; - } else - bbio->stripes[i].length = *length; - - stripe_index++; - if (stripe_index == map->num_stripes) { - /* This could only happen for RAID0/10 */ - stripe_index = 0; - stripe_nr++; - } - } - } else { - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + - stripe_nr * map->stripe_len; - bbio->stripes[i].dev = - map->stripes[stripe_index].dev; - stripe_index++; - } + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; } - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) max_errors = btrfs_chunk_max_errors(map); if (bbio->raid_map) sort_parity_stripes(bbio, num_stripes); - tgtdev_indexes = 0; - if (dev_replace_is_ongoing && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) && - dev_replace->tgtdev != NULL) { - int index_where_to_add; - u64 srcdev_devid = dev_replace->srcdev->devid; - - /* - * duplicate the write operations while the dev replace - * procedure is running. Since the copying of the old disk - * to the new disk takes place at run time while the - * filesystem is mounted writable, the regular write - * operations to the old disk have to be duplicated to go - * to the new disk as well. - * Note that device->missing is handled by the caller, and - * that the write to the old disk is already set up in the - * stripes array. - */ - index_where_to_add = num_stripes; - for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { - /* write to new disk, too */ - struct btrfs_bio_stripe *new = - bbio->stripes + index_where_to_add; - struct btrfs_bio_stripe *old = - bbio->stripes + i; - - new->physical = old->physical; - new->length = old->length; - new->dev = dev_replace->tgtdev; - bbio->tgtdev_map[i] = index_where_to_add; - index_where_to_add++; - max_errors++; - tgtdev_indexes++; - } - } - num_stripes = index_where_to_add; - } else if (dev_replace_is_ongoing && - op == BTRFS_MAP_GET_READ_MIRRORS && - dev_replace->tgtdev != NULL) { - u64 srcdev_devid = dev_replace->srcdev->devid; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - - /* - * During the dev-replace procedure, the target drive can - * also be used to read data in case it is needed to repair - * a corrupt block elsewhere. This is possible if the - * requested area is left of the left cursor. In this area, - * the target drive is a full copy of the source drive. - */ - for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = bbio->stripes[i].physical; - } - } - if (found) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; - - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; - - tgtdev_indexes++; - num_stripes++; - } + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + need_full_stripe(op)) { + handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, + &max_errors); } *bbio_ret = bbio; @@ -5855,7 +5926,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; - bbio->num_tgtdevs = tgtdev_indexes; /* * this is the case that REQ_READ && dev_replace_is_ongoing && @@ -5888,19 +5958,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num, - int need_raid_map) + struct btrfs_bio **bbio_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, - mirror_num, need_raid_map); + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); } int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map_tree *em_tree = &map_tree->map_tree; struct extent_map *em; struct map_lookup *map; u64 *buf; @@ -5910,24 +5976,11 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 rmap_len; int i, j, nr = 0; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_start, 1); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_err(fs_info, "couldn't find em for chunk %Lu", - chunk_start); + em = get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(em)) return -EIO; - } - if (em->start != chunk_start) { - btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu", - em->start, chunk_start); - free_extent_map(em); - return -EIO; - } map = em->map_lookup; - length = em->len; rmap_len = map->stripe_len; @@ -5951,7 +6004,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, continue; stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div_u64(stripe_nr, map->stripe_len); + stripe_nr = div64_u64(stripe_nr, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; @@ -5996,9 +6049,10 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; - if (bio->bi_error) { + if (bio->bi_status) { atomic_inc(&bbio->error); - if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { + if (bio->bi_status == BLK_STS_IOERR || + bio->bi_status == BLK_STS_TARGET) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; @@ -6036,13 +6090,13 @@ static void btrfs_end_bio(struct bio *bio) * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } else { /* * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_error = 0; + bio->bi_status = 0; } btrfs_end_bbio(bbio, bio); @@ -6153,7 +6207,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; btrfs_end_bbio(bbio, bio); } } @@ -6215,15 +6269,14 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || - (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) { + (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); continue; } - if (dev_nr < total_devs - 1) { - bio = btrfs_bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); /* -ENOMEM */ - } else + if (dev_nr < total_devs - 1) + bio = btrfs_bio_clone(first_bio); + else bio = first_bio; submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, @@ -6638,10 +6691,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device->in_fs_metadata = 1; if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes - - device->bytes_used; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes - device->bytes_used, + &fs_info->free_chunk_space); } ret = 0; return ret; @@ -6958,7 +7009,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, key.offset = device->devid; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn_in_rcu(fs_info, @@ -7106,7 +7158,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, return 0; } -void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path) +void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) { struct buffer_head *bh; struct btrfs_super_block *disk_super; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 24ba6bc3ec34..6f45fd60d15a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -74,6 +74,8 @@ struct btrfs_device { int missing; int can_discard; int is_tgtdev_for_dev_replace; + int last_flush_error; + int flush_bio_sent; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED seqcount_t data_seqcount; @@ -123,7 +125,6 @@ struct btrfs_device { struct list_head resized_list; /* for sending down flush barriers */ - int nobarriers; struct bio *flush_bio; struct completion flush_wait; @@ -280,6 +281,11 @@ struct btrfs_io_bio { u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; u8 *csum_allocated; btrfs_io_bio_end_io_t *end_io; + struct bvec_iter iter; + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_io_bio but relies on bio being last. + */ struct bio bio; }; @@ -298,7 +304,7 @@ struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); struct btrfs_bio { - atomic_t refs; + refcount_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; u64 map_type; /* get from map_lookup->type */ @@ -400,8 +406,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num, - int need_raid_map); + struct btrfs_bio **bbio_ret); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); @@ -422,16 +427,16 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, struct btrfs_device *device, struct btrfs_device *this_dev); int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - char *device_path, + const char *device_path, struct btrfs_device **device); int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - char *devpath, + const char *devpath, struct btrfs_device **device); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); int btrfs_rm_device(struct btrfs_fs_info *fs_info, - char *device_path, u64 devid); + const char *device_path, u64 devid); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, @@ -439,9 +444,9 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); -int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *path); +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - char *device_path, + const char *device_path, struct btrfs_device *srcdev, struct btrfs_device **device_out); int btrfs_balance(struct btrfs_balance_control *bctl, @@ -474,8 +479,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); -void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path); -int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct btrfs_mapping_tree *map_tree, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 9621c7f2503e..2c7e53f9ff1b 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -47,8 +47,8 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name, return -ENOMEM; /* lookup the xattr by name */ - di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, - strlen(name), 0); + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), + name, strlen(name), 0); if (!di) { ret = -ENODATA; goto out; @@ -108,8 +108,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans, path->skip_release_on_error = 1; if (!value) { - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), - name, name_len, -1); + di = btrfs_lookup_xattr(trans, root, path, + btrfs_ino(BTRFS_I(inode)), name, name_len, -1); if (!di && (flags & XATTR_REPLACE)) ret = -ENODATA; else if (IS_ERR(di)) @@ -128,8 +128,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans, */ if (flags & XATTR_REPLACE) { ASSERT(inode_is_locked(inode)); - di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), - name, name_len, 0); + di = btrfs_lookup_xattr(NULL, root, path, + btrfs_ino(BTRFS_I(inode)), name, name_len, 0); if (!di) ret = -ENODATA; else if (IS_ERR(di)) @@ -140,7 +140,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, di = NULL; } - ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, value, size); if (ret == -EOVERFLOW) { /* @@ -278,7 +278,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) * NOTE: we set key.offset = 0; because we want to start with the * first xattr that we find and walk forward */ - key.objectid = btrfs_ino(inode); + key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; @@ -336,7 +336,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) u32 this_len = sizeof(*di) + name_len + data_len; unsigned long name_ptr = (unsigned long)(di + 1); - if (verify_dir_item(fs_info, leaf, di)) { + if (verify_dir_item(fs_info, leaf, slot, di)) { ret = -EIO; goto err; } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index da497f184ff4..c248f9286366 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -24,12 +24,13 @@ #include <linux/slab.h> #include <linux/zlib.h> #include <linux/zutil.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include <linux/init.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/bio.h> +#include <linux/refcount.h> #include "compression.h" struct workspace { @@ -42,7 +43,7 @@ static void zlib_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); - vfree(workspace->strm.workspace); + kvfree(workspace->strm.workspace); kfree(workspace->buf); kfree(workspace); } @@ -52,14 +53,14 @@ static struct list_head *zlib_alloc_workspace(void) struct workspace *workspace; int workspacesize; - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = vmalloc(workspacesize); - workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS); + workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -73,13 +74,11 @@ fail: static int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, unsigned long len, + u64 start, struct page **pages, - unsigned long nr_dest_pages, unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) + unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; @@ -89,6 +88,9 @@ static int zlib_compress_pages(struct list_head *ws, struct page *in_page = NULL; struct page *out_page = NULL; unsigned long bytes_left; + unsigned long len = *total_out; + unsigned long nr_dest_pages = *out_pages; + const unsigned long max_out = nr_dest_pages * PAGE_SIZE; *out_pages = 0; *total_out = 0; @@ -210,10 +212,7 @@ out: return ret; } -static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in, - u64 disk_start, - struct bio *orig_bio, - size_t srclen) +static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; @@ -221,8 +220,12 @@ static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in, char *data_in; size_t total_out = 0; unsigned long page_in_index = 0; + size_t srclen = cb->compressed_len; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; |