diff options
Diffstat (limited to 'fs/btrfs/disk-io.c')
| -rw-r--r-- | fs/btrfs/disk-io.c | 1009 |
1 files changed, 649 insertions, 360 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8e3438672a82..41b718cfea40 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" @@ -42,6 +41,7 @@ #include "block-group.h" #include "discard.h" #include "space-info.h" +#include "zoned.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -109,15 +109,13 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) * just before they are sent down the IO stack. */ struct async_submit_bio { - void *private_data; + struct inode *inode; struct bio *bio; extent_submit_bio_start_t *submit_bio_start; int mirror_num; - /* - * bio_offset is optional, can be used if the pages in the bio - * can't tell us where in the file the bio should go - */ - u64 bio_offset; + + /* Optional parameter for submit_bio_start used by direct io */ + u64 dio_file_offset; struct btrfs_work work; blk_status_t status; }; @@ -150,40 +148,41 @@ struct async_submit_bio { # error # endif +#define DEFINE_LEVEL(stem, level) \ + .names[level] = "btrfs-" stem "-0" #level, + +#define DEFINE_NAME(stem) \ + DEFINE_LEVEL(stem, 0) \ + DEFINE_LEVEL(stem, 1) \ + DEFINE_LEVEL(stem, 2) \ + DEFINE_LEVEL(stem, 3) \ + DEFINE_LEVEL(stem, 4) \ + DEFINE_LEVEL(stem, 5) \ + DEFINE_LEVEL(stem, 6) \ + DEFINE_LEVEL(stem, 7) + static struct btrfs_lockdep_keyset { u64 id; /* root objectid */ - const char *name_stem; /* lock name stem */ - char names[BTRFS_MAX_LEVEL + 1][20]; - struct lock_class_key keys[BTRFS_MAX_LEVEL + 1]; + /* Longest entry: btrfs-free-space-00 */ + char names[BTRFS_MAX_LEVEL][20]; + struct lock_class_key keys[BTRFS_MAX_LEVEL]; } btrfs_lockdep_keysets[] = { - { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, - { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, - { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" }, - { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, - { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, - { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, - { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" }, - { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, - { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, - { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, - { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, - { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" }, - { .id = 0, .name_stem = "tree" }, + { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") }, + { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") }, + { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") }, + { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") }, + { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, + { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, + { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, + { .id = 0, DEFINE_NAME("tree") }, }; -void __init btrfs_init_lockdep(void) -{ - int i, j; - - /* initialize lockdep class names */ - for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) { - struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i]; - - for (j = 0; j < ARRAY_SIZE(ks->names); j++) - snprintf(ks->names[j], sizeof(ks->names[j]), - "btrfs-%s-%02d", ks->name_stem, j); - } -} +#undef DEFINE_LEVEL +#undef DEFINE_NAME void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) @@ -210,15 +209,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; const int num_pages = fs_info->nodesize >> PAGE_SHIFT; + const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - kaddr = page_address(buf->pages[0]); + kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start); crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, - PAGE_SIZE - BTRFS_CSUM_SIZE); + first_page_part - BTRFS_CSUM_SIZE); for (i = 1; i < num_pages; i++) { kaddr = page_address(buf->pages[i]); @@ -248,10 +248,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; - if (need_lock) { + if (need_lock) btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); @@ -280,7 +278,7 @@ out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (need_lock) - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); return ret; } @@ -319,7 +317,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); - if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) + if (memcmp(disk_sb->csum, result, fs_info->csum_size)) return 1; return 0; @@ -443,16 +441,16 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb, } /* - * checksum a dirty tree block before IO. This has extra checks to make sure - * we only fill in the checksum field in the first page of a multi-page block + * Checksum a dirty tree block before IO. This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block. + * For subpage extent buffers we need bvec to also read the offset in the page. */ - -static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) +static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec) { + struct page *page = bvec->bv_page; u64 start = page_offset(page); u64 found_start; u8 result[BTRFS_CSUM_SIZE]; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); struct extent_buffer *eb; int ret; @@ -461,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) return 0; found_start = btrfs_header_bytenr(eb); + + if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + WARN_ON(found_start != 0); + return 0; + } + /* * Please do not consolidate these warnings into a single if. * It is useful to know what went wrong. @@ -489,7 +493,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); return ret; } - write_extent_buffer(eb, result, 0, csum_size); + write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0; } @@ -523,65 +527,37 @@ static int check_tree_block_fsid(struct extent_buffer *eb) return 1; } -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror) +/* Do basic extent buffer checks at read time */ +static int validate_extent_buffer(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; - int found_level; - struct extent_buffer *eb; - struct btrfs_fs_info *fs_info; - u16 csum_size; - int ret = 0; + const u32 csum_size = fs_info->csum_size; + u8 found_level; u8 result[BTRFS_CSUM_SIZE]; - int reads_done; - - if (!page->private) - goto out; - - eb = (struct extent_buffer *)page->private; - fs_info = eb->fs_info; - csum_size = btrfs_super_csum_size(fs_info->super_copy); - - /* the pending IO might have been the only thing that kept this buffer - * in memory. Make sure we have a ref for all this other checks - */ - atomic_inc(&eb->refs); - - reads_done = atomic_dec_and_test(&eb->io_pages); - if (!reads_done) - goto err; - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } + int ret = 0; found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu", eb->start, found_start); ret = -EIO; - goto err; + goto out; } if (check_tree_block_fsid(eb)) { btrfs_err_rl(fs_info, "bad fsid on block %llu", eb->start); ret = -EIO; - goto err; + goto out; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "bad tree block level %d on %llu", (int)btrfs_header_level(eb), eb->start); ret = -EIO; - goto err; + goto out; } - btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), - eb, found_level); - csum_tree_block(eb, result); if (memcmp_extent_buffer(eb, result, 0, csum_size)) { @@ -595,7 +571,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); ret = -EUCLEAN; - goto err; + goto out; } /* @@ -617,6 +593,94 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, btrfs_err(fs_info, "block=%llu read time tree block corruption detected", eb->start); +out: + return ret; +} + +static int validate_subpage_buffer(struct page *page, u64 start, u64 end, + int mirror) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct extent_buffer *eb; + bool reads_done; + int ret = 0; + + /* + * We don't allow bio merge for subpage metadata read, so we should + * only get one eb for each endio hook. + */ + ASSERT(end == start + fs_info->nodesize - 1); + ASSERT(PagePrivate(page)); + + eb = find_extent_buffer(fs_info, start); + /* + * When we are reading one tree block, eb must have been inserted into + * the radix tree. If not, something is wrong. + */ + ASSERT(eb); + + reads_done = atomic_dec_and_test(&eb->io_pages); + /* Subpage read must finish in page read */ + ASSERT(reads_done); + + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { + ret = -EIO; + goto err; + } + ret = validate_extent_buffer(eb); + if (ret < 0) + goto err; + + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) + btree_readahead_hook(eb, ret); + + set_extent_buffer_uptodate(eb); + + free_extent_buffer(eb); + return ret; +err: + /* + * end_bio_extent_readpage decrements io_pages in case of error, + * make sure it has something to decrement. + */ + atomic_inc(&eb->io_pages); + clear_extent_buffer_uptodate(eb); + free_extent_buffer(eb); + return ret; +} + +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, + struct page *page, u64 start, u64 end, + int mirror) +{ + struct extent_buffer *eb; + int ret = 0; + int reads_done; + + ASSERT(page->private); + + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return validate_subpage_buffer(page, start, end, mirror); + + eb = (struct extent_buffer *)page->private; + + /* + * The pending IO might have been the only thing that kept this buffer + * in memory. Make sure we have a ref for all this other checks + */ + atomic_inc(&eb->refs); + + reads_done = atomic_dec_and_test(&eb->io_pages); + if (!reads_done) + goto err; + + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { + ret = -EIO; + goto err; + } + ret = validate_extent_buffer(eb); err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) @@ -632,7 +696,7 @@ err: clear_extent_buffer_uptodate(eb); } free_extent_buffer(eb); -out: + return ret; } @@ -645,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->status = bio->bi_status; - if (bio_op(bio) == REQ_OP_WRITE) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) wq = fs_info->endio_meta_write_workers; else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) @@ -694,8 +758,8 @@ static void run_one_async_start(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->private_data, async->bio, - async->bio_offset); + ret = async->submit_bio_start(async->inode, async->bio, + async->dio_file_offset); if (ret) async->status = ret; } @@ -715,7 +779,7 @@ static void run_one_async_done(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - inode = async->private_data; + inode = async->inode; /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -745,18 +809,19 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, +blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, - u64 bio_offset, void *private_data, + u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) return BLK_STS_RESOURCE; - async->private_data = private_data; + async->inode = inode; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -764,7 +829,7 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); - async->bio_offset = bio_offset; + async->dio_file_offset = dio_file_offset; async->status = 0; @@ -785,7 +850,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); + ret = csum_dirty_buffer(root->fs_info, bvec); if (ret) break; } @@ -793,8 +858,8 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, - u64 bio_offset) +static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset) { /* * when we're called for a write, we're already in the async @@ -806,6 +871,8 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, static int check_async_write(struct btrfs_fs_info *fs_info, struct btrfs_inode *bi) { + if (btrfs_is_zoned(fs_info)) + return 0; if (atomic_read(&bi->sync_writers)) return 0; if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) @@ -820,7 +887,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; - if (bio_op(bio) != REQ_OP_WRITE) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -840,8 +907,8 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ - ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, - 0, inode, btree_submit_bio_start); + ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, + 0, btree_submit_bio_start); } if (ret) @@ -947,47 +1014,33 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct extent_buffer *buf = NULL; - int ret; - - buf = btrfs_find_create_tree_block(fs_info, bytenr); - if (IS_ERR(buf)) - return; - - ret = read_extent_buffer_pages(buf, WAIT_NONE, 0); - if (ret < 0) - free_extent_buffer_stale(buf); - else - free_extent_buffer(buf); -} - struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, - u64 bytenr) + u64 bytenr, u64 owner_root, + int level) { if (btrfs_is_testing(fs_info)) return alloc_test_extent_buffer(fs_info, bytenr); - return alloc_extent_buffer(fs_info, bytenr); + return alloc_extent_buffer(fs_info, bytenr, owner_root, level); } /* * Read tree block at logical address @bytenr and do variant basic but critical * verification. * + * @owner_root: the objectid of the root owner for this block. * @parent_transid: expected transid of this tree block, skip check if 0 * @level: expected level, mandatory check * @first_key: expected key in slot 0, skip check if NULL */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 parent_transid, int level, - struct btrfs_key *first_key) + u64 owner_root, u64 parent_transid, + int level, struct btrfs_key *first_key) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); if (IS_ERR(buf)) return buf; @@ -1012,8 +1065,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -buf->len, fs_info->dirty_metadata_batch); - /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking_write(buf); clear_extent_buffer_dirty(buf); } } @@ -1030,7 +1081,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->orphan_cleanup_state = 0; root->last_trans = 0; - root->highest_objectid = 0; + root->free_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; @@ -1155,7 +1206,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; - goto fail; + goto fail_unlock; } root->node = leaf; @@ -1164,8 +1215,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->commit_root = btrfs_root_node(root); set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - root->root_item.flags = 0; - root->root_item.byte_limit = 0; + btrfs_set_root_flags(&root->root_item, 0); + btrfs_set_root_limit(&root->root_item, 0); btrfs_set_root_bytenr(&root->root_item, leaf->start); btrfs_set_root_generation(&root->root_item, trans->transid); btrfs_set_root_level(&root->root_item, 0); @@ -1177,7 +1228,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, generate_random_guid(root->root_item.uuid); else export_guid(root->root_item.uuid, &guid_null); - root->root_item.drop_level = 0; + btrfs_set_root_drop_level(&root->root_item, 0); + + btrfs_tree_unlock(leaf); key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; @@ -1186,13 +1239,12 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (ret) goto fail; - btrfs_tree_unlock(leaf); - return root; -fail: +fail_unlock: if (leaf) btrfs_tree_unlock(leaf); +fail: btrfs_put_root(root); return ERR_PTR(ret); @@ -1202,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct extent_buffer *leaf; root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); if (!root) @@ -1212,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + return root; +} + +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct extent_buffer *leaf; + /* * DON'T set SHAREABLE bit for log trees. * @@ -1224,16 +1283,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); - if (IS_ERR(leaf)) { - btrfs_put_root(root); - return ERR_CAST(leaf); - } + if (IS_ERR(leaf)) + return PTR_ERR(leaf); root->node = leaf; btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); - return root; + + return 0; } int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, @@ -1244,6 +1302,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); + + if (!btrfs_is_zoned(fs_info)) { + int ret = btrfs_alloc_log_tree_node(trans, log_root); + + if (ret) { + btrfs_put_root(log_root); + return ret; + } + } + WARN_ON(fs_info->log_root_tree); fs_info->log_root_tree = log_root; return 0; @@ -1255,11 +1323,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log_root; struct btrfs_inode_item *inode_item; + int ret; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); + ret = btrfs_alloc_log_tree_node(trans, log_root); + if (ret) { + btrfs_put_root(log_root); + return ret; + } + log_root->last_trans = trans->transid; log_root->root_key.offset = root->root_key.objectid; @@ -1281,57 +1356,61 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) +static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, + struct btrfs_path *path, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; - struct btrfs_path *path; u64 generation; int ret; int level; - path = btrfs_alloc_path(); - if (!path) - return ERR_PTR(-ENOMEM); - root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS); - if (!root) { - ret = -ENOMEM; - goto alloc_fail; - } + if (!root) + return ERR_PTR(-ENOMEM); ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); if (ret) { if (ret > 0) ret = -ENOENT; - goto find_fail; + goto fail; } generation = btrfs_root_generation(&root->root_item); level = btrfs_root_level(&root->root_item); root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), - generation, level, NULL); + key->objectid, generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; - goto find_fail; + goto fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - goto find_fail; + goto fail; } root->commit_root = btrfs_root_node(root); -out: - btrfs_free_path(path); return root; - -find_fail: +fail: btrfs_put_root(root); -alloc_fail: - root = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); +} + +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) +{ + struct btrfs_root *root; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + root = read_tree_root_path(tree_root, path, key); + btrfs_free_path(path); + + return root; } /* @@ -1344,14 +1423,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) int ret; unsigned int nofs_flag; - root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), - GFP_NOFS); - if (!root->free_ino_pinned || !root->free_ino_ctl) { - ret = -ENOMEM; - goto fail; - } - /* * We might be called under a transaction (e.g. indirect backref * resolution) which could deadlock if it triggers memory reclaim @@ -1368,10 +1439,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_check_and_init_root_item(&root->root_item); } - btrfs_init_free_ino_ctl(root); - spin_lock_init(&root->ino_cache_lock); - init_waitqueue_head(&root->ino_cache_wait); - /* * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M @@ -1388,14 +1455,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) } mutex_lock(&root->objectid_mutex); - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); + ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); goto fail; } - ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); mutex_unlock(&root->objectid_mutex); @@ -1419,6 +1485,31 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, return root; } +static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, + u64 objectid) +{ + if (objectid == BTRFS_ROOT_TREE_OBJECTID) + return btrfs_grab_root(fs_info->tree_root); + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + return btrfs_grab_root(fs_info->extent_root); + if (objectid == BTRFS_CHUNK_TREE_OBJECTID) + return btrfs_grab_root(fs_info->chunk_root); + if (objectid == BTRFS_DEV_TREE_OBJECTID) + return btrfs_grab_root(fs_info->dev_root); + if (objectid == BTRFS_CSUM_TREE_OBJECTID) + return btrfs_grab_root(fs_info->csum_root); + if (objectid == BTRFS_QUOTA_TREE_OBJECTID) + return btrfs_grab_root(fs_info->quota_root) ? + fs_info->quota_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_UUID_TREE_OBJECTID) + return btrfs_grab_root(fs_info->uuid_root) ? + fs_info->uuid_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + return btrfs_grab_root(fs_info->free_space_root) ? + fs_info->free_space_root : ERR_PTR(-ENOENT); + return NULL; +} + int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { @@ -1453,7 +1544,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) root = list_first_entry(&fs_info->allocated_roots, struct btrfs_root, leak_list); btrfs_err(fs_info, "leaked root %s refcount %d", - btrfs_root_name(root->root_key.objectid, buf), + btrfs_root_name(&root->root_key, buf), refcount_read(&root->refs)); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); @@ -1466,7 +1557,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->dio_bytes); + percpu_counter_destroy(&fs_info->ordered_bytes); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); @@ -1518,25 +1609,9 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, struct btrfs_key key; int ret; - if (objectid == BTRFS_ROOT_TREE_OBJECTID) - return btrfs_grab_root(fs_info->tree_root); - if (objectid == BTRFS_EXTENT_TREE_OBJECTID) - return btrfs_grab_root(fs_info->extent_root); - if (objectid == BTRFS_CHUNK_TREE_OBJECTID) - return btrfs_grab_root(fs_info->chunk_root); - if (objectid == BTRFS_DEV_TREE_OBJECTID) - return btrfs_grab_root(fs_info->dev_root); - if (objectid == BTRFS_CSUM_TREE_OBJECTID) - return btrfs_grab_root(fs_info->csum_root); - if (objectid == BTRFS_QUOTA_TREE_OBJECTID) - return btrfs_grab_root(fs_info->quota_root) ? - fs_info->quota_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_UUID_TREE_OBJECTID) - return btrfs_grab_root(fs_info->uuid_root) ? - fs_info->uuid_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return btrfs_grab_root(fs_info->free_space_root) ? - fs_info->free_space_root : ERR_PTR(-ENOENT); + root = btrfs_get_global_root(fs_info, objectid); + if (root) + return root; again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { @@ -1622,6 +1697,52 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, } /* + * btrfs_get_fs_root_commit_root - return a root for the given objectid + * @fs_info: the fs_info + * @objectid: the objectid we need to lookup + * + * This is exclusively used for backref walking, and exists specifically because + * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref + * creation time, which means we may have to read the tree_root in order to look + * up a fs root that is not in memory. If the root is not in memory we will + * read the tree root commit root and look up the fs root from there. This is a + * temporary root, it will not be inserted into the radix tree as it doesn't + * have the most uptodate information, it'll simply be discarded once the + * backref code is finished using the root. + */ +struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_root *root; + struct btrfs_key key; + + ASSERT(path->search_commit_root && path->skip_locking); + + /* + * This can return -ENOENT if we ask for a root that doesn't exist, but + * since this is called via the backref walking code we won't be looking + * up a root that doesn't exist, unless there's corruption. So if root + * != NULL just return it. + */ + root = btrfs_get_global_root(fs_info, objectid); + if (root) + return root; + + root = btrfs_lookup_fs_root(fs_info, objectid); + if (root) + return root; + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = read_tree_root_path(fs_info->tree_root, path, &key); + btrfs_release_path(path); + + return root; +} + +/* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens */ @@ -1695,7 +1816,7 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: - clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); + clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) kthread_parkme(); if (kthread_should_stop()) @@ -1715,13 +1836,13 @@ static int transaction_kthread(void *arg) struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; u64 transid; - time64_t now; + time64_t delta; unsigned long delay; bool cannot_commit; do { cannot_commit = false; - delay = HZ * fs_info->commit_interval; + delay = msecs_to_jiffies(fs_info->commit_interval * 1000); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); @@ -1731,12 +1852,13 @@ static int transaction_kthread(void *arg) goto sleep; } - now = ktime_get_seconds(); + delta = ktime_get_seconds() - cur->start_time; if (cur->state < TRANS_STATE_COMMIT_START && - (now < cur->start_time || - now - cur->start_time < fs_info->commit_interval)) { + delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); - delay = HZ * 5; + delay -= msecs_to_jiffies((delta - 1) * 1000); + delay = min(delay, + msecs_to_jiffies(fs_info->commit_interval * 1000)); goto sleep; } transid = cur->transid; @@ -1985,8 +2107,6 @@ void btrfs_put_root(struct btrfs_root *root) free_anon_bdev(root->anon_dev); btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); #ifdef CONFIG_BTRFS_DEBUG spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); @@ -2201,8 +2321,9 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, return -ENOMEM; log_tree_root->node = read_tree_block(fs_info, bytenr, - fs_info->generation + 1, - level, NULL); + BTRFS_TREE_LOG_OBJECTID, + fs_info->generation + 1, level, + NULL); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2247,30 +2368,42 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->extent_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->extent_root = root; location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->dev_root = root; + btrfs_init_devices_late(fs_info); } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->dev_root = root; - btrfs_init_devices_late(fs_info); - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + /* If IGNOREDATACSUMS is set don't bother reading the csum root. */ + if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; + } } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->csum_root = root; /* * This tree can share blocks with some other fs tree during relocation @@ -2279,11 +2412,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_get_fs_root(tree_root->fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID, true); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->data_reloc_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->data_reloc_root = root; location.objectid = BTRFS_QUOTA_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); @@ -2296,9 +2432,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.objectid = BTRFS_UUID_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - if (ret != -ENOENT) - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + if (ret != -ENOENT) + goto out; + } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; @@ -2308,11 +2446,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->free_space_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->free_space_root = root; } return 0; @@ -2373,13 +2514,21 @@ static int validate_super(struct btrfs_fs_info *fs_info, btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } - /* Only PAGE SIZE is supported yet */ - if (sectorsize != PAGE_SIZE) { + + /* + * For 4K page size, we only support 4K sector size. + * For 64K page size, we support read-write for 64K sector size, and + * read-only for 4K sector size. + */ + if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || + (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && + sectorsize != SZ_64K))) { btrfs_err(fs_info, - "sectorsize %llu not supported yet, only support %lu", + "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); ret = -EINVAL; } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid nodesize %llu", nodesize); @@ -2568,6 +2717,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) generation = btrfs_super_generation(sb); level = btrfs_super_root_level(sb); tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), + BTRFS_ROOT_TREE_OBJECTID, generation, level, NULL); if (IS_ERR(tree_root->node)) { handle_error = true; @@ -2591,14 +2741,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user */ - ret = btrfs_find_highest_objectid(tree_root, - &tree_root->highest_objectid); + ret = btrfs_init_root_free_objectid(tree_root); if (ret < 0) { handle_error = true; continue; } - ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); ret = btrfs_read_roots(fs_info); if (ret < 0) { @@ -2640,11 +2789,13 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); + spin_lock_init(&fs_info->treelog_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->zoned_meta_io_lock); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2732,6 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; fs_info->sectorsize = 4096; + fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; spin_lock_init(&fs_info->swapfile_pins_lock); @@ -2748,7 +2900,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); + ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL); if (ret) return ret; @@ -2774,6 +2926,9 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block return -ENOMEM; btrfs_init_delayed_root(fs_info->delayed_root); + if (sb_rdonly(sb)) + set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); + return btrfs_alloc_stripe_hash_table(fs_info); } @@ -2814,6 +2969,110 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } +/* + * Some options only have meaning at mount time and shouldn't persist across + * remounts, or be displayed. Clear these at the end of mount and remount + * code paths. + */ +void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) +{ + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); +} + +/* + * Mounting logic specific to read-write file systems. Shared by open_ctree + * and btrfs_remount when remounting from read-only to read-write. + */ +int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) +{ + int ret; + const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); + bool clear_free_space_tree = false; + + if (btrfs_test_opt(fs_info, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + clear_free_space_tree = true; + } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { + btrfs_warn(fs_info, "free space tree is invalid"); + clear_free_space_tree = true; + } + + if (clear_free_space_tree) { + btrfs_info(fs_info, "clearing free space tree"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); + goto out; + } + } + + ret = btrfs_cleanup_fs_roots(fs_info); + if (ret) + goto out; + + down_read(&fs_info->cleanup_work_sem); + if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || + (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { + up_read(&fs_info->cleanup_work_sem); + goto out; + } + up_read(&fs_info->cleanup_work_sem); + + mutex_lock(&fs_info->cleaner_mutex); + ret = btrfs_recover_relocation(fs_info->tree_root); + mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + btrfs_warn(fs_info, "failed to recover relocation: %d", ret); + goto out; + } + + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, "creating free space tree"); + ret = btrfs_create_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to create free space tree: %d", ret); + goto out; + } + } + + if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) { + ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); + if (ret) + goto out; + } + + ret = btrfs_resume_balance_async(fs_info); + if (ret) + goto out; + + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + btrfs_warn(fs_info, "failed to resume dev_replace"); + goto out; + } + + btrfs_qgroup_rescan_resume(fs_info); + + if (!fs_info->uuid_root) { + btrfs_info(fs_info, "creating UUID tree"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to create the UUID tree %d", ret); + goto out; + } + } + + ret = btrfs_find_orphan_roots(fs_info); +out: + return ret; +} + int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { @@ -2829,7 +3088,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device struct btrfs_root *chunk_root; int ret; int err = -EINVAL; - int clear_free_space_tree = 0; int level; ret = init_mount_fs_info(fs_info, sb); @@ -2882,6 +3140,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } + fs_info->csum_size = btrfs_super_csum_size(disk_super); + ret = btrfs_init_csum_hash(fs_info, csum_type); if (ret) { err = ret; @@ -2996,6 +3256,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* Cache block sizes */ fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; + fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; /* @@ -3026,6 +3288,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } + /* For 4K sector size support, it's only read-only */ + if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) { + if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) { + btrfs_err(fs_info, + "subpage sectorsize %u only supported read-only for page size %lu", + sectorsize, PAGE_SIZE); + err = -EINVAL; + goto fail_alloc; + } + } + ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { err = ret; @@ -3052,6 +3325,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), + BTRFS_CHUNK_TREE_OBJECTID, generation, level, NULL); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { @@ -3075,11 +3349,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Keep the devid that is marked to be the target device for the - * device replace procedure + * At this point we know all the devices that make this filesystem, + * including the seed devices but we don't know yet if the replace + * target is required. So free devices that are not part of this + * filesystem but skip the replace traget device which is checked + * below in btrfs_init_dev_replace(). */ - btrfs_free_extra_devids(fs_devices, 0); - + btrfs_free_extra_devids(fs_devices); if (!fs_devices->latest_bdev) { btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; @@ -3090,6 +3366,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_tree_roots; /* + * Get zone type information of zoned block devices. This will also + * handle emulation of a zoned filesystem if a regular device has the + * zoned incompat feature flag set. + */ + ret = btrfs_get_dev_zone_info_all_devices(fs_info); + if (ret) { + btrfs_err(fs_info, + "zoned: failed to read device zone info: %d", + ret); + goto fail_block_groups; + } + + /* * If we have a uuid root and we're not being told to rescan we need to * check the generation here so we can set the * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the @@ -3126,7 +3415,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_block_groups; } - btrfs_free_extra_devids(fs_devices, 1); + ret = btrfs_check_zoned_mode(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to initialize zoned mode: %d", + ret); + goto fail_block_groups; + } ret = btrfs_sysfs_add_fsid(fs_devices); if (ret) { @@ -3212,26 +3506,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } } - ret = btrfs_find_orphan_roots(fs_info); - if (ret) - goto fail_qgroup; - - if (!sb_rdonly(sb)) { - ret = btrfs_cleanup_fs_roots(fs_info); - if (ret) - goto fail_qgroup; - - mutex_lock(&fs_info->cleaner_mutex); - ret = btrfs_recover_relocation(tree_root); - mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) { - btrfs_warn(fs_info, "failed to recover relocation: %d", - ret); - err = -EINVAL; - goto fail_qgroup; - } - } - fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); @@ -3241,78 +3515,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } if (sb_rdonly(sb)) - return 0; - - if (btrfs_test_opt(fs_info, CLEAR_CACHE) && - btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - clear_free_space_tree = 1; - } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && - !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { - btrfs_warn(fs_info, "free space tree is invalid"); - clear_free_space_tree = 1; - } - - if (clear_free_space_tree) { - btrfs_info(fs_info, "clearing free space tree"); - ret = btrfs_clear_free_space_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to clear free space tree: %d", ret); - close_ctree(fs_info); - return ret; - } - } - - if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && - !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - btrfs_info(fs_info, "creating free space tree"); - ret = btrfs_create_free_space_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to create free space tree: %d", ret); - close_ctree(fs_info); - return ret; - } - } + goto clear_oneshot; - down_read(&fs_info->cleanup_work_sem); - if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || - (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { - up_read(&fs_info->cleanup_work_sem); - close_ctree(fs_info); - return ret; - } - up_read(&fs_info->cleanup_work_sem); - - ret = btrfs_resume_balance_async(fs_info); - if (ret) { - btrfs_warn(fs_info, "failed to resume balance: %d", ret); - close_ctree(fs_info); - return ret; - } - - ret = btrfs_resume_dev_replace_async(fs_info); + ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { - btrfs_warn(fs_info, "failed to resume device replace: %d", ret); close_ctree(fs_info); return ret; } - - btrfs_qgroup_rescan_resume(fs_info); btrfs_discard_resume(fs_info); - if (!fs_info->uuid_root) { - btrfs_info(fs_info, "creating UUID tree"); - ret = btrfs_create_uuid_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to create the UUID tree: %d", ret); - close_ctree(fs_info); - return ret; - } - } else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || - fs_info->generation != - btrfs_super_uuid_tree_generation(disk_super)) { + if (fs_info->uuid_root && + (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || + fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) { btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { @@ -3322,14 +3536,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device return ret; } } - set_bit(BTRFS_FS_OPEN, &fs_info->flags); - /* - * backuproot only affect mount behavior, and if open_ctree succeeded, - * no need to keep the flag - */ - btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + set_bit(BTRFS_FS_OPEN, &fs_info->flags); +clear_oneshot: + btrfs_clear_oneshot_options(fs_info); return 0; fail_qgroup: @@ -3410,10 +3621,17 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, { struct btrfs_super_block *super; struct page *page; - u64 bytenr; + u64 bytenr, bytenr_orig; struct address_space *mapping = bdev->bd_inode->i_mapping; + int ret; + + bytenr_orig = btrfs_sb_offset(copy_num); + ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); + if (ret == -ENOENT) + return ERR_PTR(-EINVAL); + else if (ret) + return ERR_PTR(ret); - bytenr = btrfs_sb_offset(copy_num); if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) return ERR_PTR(-EINVAL); @@ -3427,7 +3645,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, return ERR_PTR(-ENODATA); } - if (btrfs_super_bytenr(super) != bytenr) { + if (btrfs_super_bytenr(super) != bytenr_orig) { btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } @@ -3482,7 +3700,8 @@ static int write_dev_supers(struct btrfs_device *device, SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; int errors = 0; - u64 bytenr; + int ret; + u64 bytenr, bytenr_orig; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3494,12 +3713,22 @@ static int write_dev_supers(struct btrfs_device *device, struct bio *bio; struct btrfs_super_block *disk_super; - bytenr = btrfs_sb_offset(i); + bytenr_orig = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); + if (ret == -ENOENT) { + continue; + } else if (ret < 0) { + btrfs_err(device->fs_info, + "couldn't get super block location for mirror %d", + i); + errors++; + continue; + } if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; - btrfs_set_super_bytenr(sb, bytenr); + btrfs_set_super_bytenr(sb, bytenr_orig); crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, @@ -3544,6 +3773,7 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_opf |= REQ_FUA; btrfsic_submit_bio(bio); + btrfs_advance_sb_log(device, i); } return errors < i ? 0 : -1; } @@ -3560,6 +3790,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) int i; int errors = 0; bool primary_failed = false; + int ret; u64 bytenr; if (max_mirrors == 0) @@ -3568,7 +3799,15 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) for (i = 0; i < max_mirrors; i++) { struct page *page; - bytenr = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(device, i, READ, &bytenr); + if (ret == -ENOENT) { + break; + } else if (ret < 0) { + errors++; + if (i == 0) + primary_failed = true; + continue; + } if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; @@ -3882,14 +4121,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, } } - if (root->free_ino_pinned) - __btrfs_remove_free_space_cache(root->free_ino_pinned); - if (root->free_ino_ctl) - __btrfs_remove_free_space_cache(root->free_ino_ctl); - if (root->ino_cache_inode) { - iput(root->ino_cache_inode); - root->ino_cache_inode = NULL; - } if (drop_ref) btrfs_put_root(root); } @@ -4001,6 +4232,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); + cancel_work_sync(&fs_info->preempt_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4053,9 +4285,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_sum(&fs_info->delalloc_bytes)); } - if (percpu_counter_sum(&fs_info->dio_bytes)) + if (percpu_counter_sum(&fs_info->ordered_bytes)) btrfs_info(fs_info, "at unmount dio bytes count %lld", - percpu_counter_sum(&fs_info->dio_bytes)); + percpu_counter_sum(&fs_info->ordered_bytes)); btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); @@ -4069,6 +4301,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); + /* We shouldn't have any transaction open at this point */ + ASSERT(list_empty(&fs_info->trans_list)); + clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); btrfs_free_fs_roots(fs_info); @@ -4112,8 +4347,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_fs_info *fs_info; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); int was_dirty; @@ -4126,8 +4360,6 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif - root = BTRFS_I(buf->pages[0]->mapping->host)->root; - fs_info = root->fs_info; btrfs_assert_tree_locked(buf); if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", @@ -4576,6 +4808,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); + btrfs_free_redirty_list(cur_trans); + cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } @@ -4632,3 +4866,58 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } + +int btrfs_init_root_free_objectid(struct btrfs_root *root) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); /* Corruption */ + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + root->free_objectid = max_t(u64, found_key.objectid + 1, + BTRFS_FIRST_FREE_OBJECTID); + } else { + root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + mutex_lock(&root->objectid_mutex); + + if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + btrfs_warn(root->fs_info, + "the objectid of root %llu reaches its highest value", + root->root_key.objectid); + ret = -ENOSPC; + goto out; + } + + *objectid = root->free_objectid++; + ret = 0; +out: + mutex_unlock(&root->objectid_mutex); + return ret; +} |