aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c602
1 files changed, 399 insertions, 203 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1af28b066b42..0888d484df80 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -23,7 +23,7 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "volumes.h"
+#include "bio.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
@@ -43,6 +43,15 @@
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "defrag.h"
+#include "uuid-tree.h"
+#include "relocation.h"
+#include "scrub.h"
+#include "super.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -75,12 +84,12 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
- struct inode *inode;
+ struct btrfs_inode *inode;
struct bio *bio;
- extent_submit_bio_start_t *submit_bio_start;
+ enum btrfs_wq_submit_cmd submit_cmd;
int mirror_num;
- /* Optional parameter for submit_bio_start used by direct io */
+ /* Optional parameter for used by direct io */
u64 dio_file_offset;
struct btrfs_work work;
blk_status_t status;
@@ -131,8 +140,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
@@ -145,8 +153,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 1;
clear_extent_buffer_uptodate(eb);
out:
- unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
return ret;
}
@@ -167,11 +175,9 @@ static bool btrfs_supported_super_csum(u16 csum_type)
* Return 0 if the superblock checksum type matches the checksum value of that
* algorithm. Pass the raw disk superblock data.
*/
-static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
- char *raw_disk_sb)
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb)
{
- struct btrfs_super_block *disk_sb =
- (struct btrfs_super_block *)raw_disk_sb;
char result[BTRFS_CSUM_SIZE];
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@ -182,7 +188,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
* filled with zeros and is included in the checksum.
*/
- crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
+ crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
if (memcmp(disk_sb->csum, result, fs_info->csum_size))
@@ -249,40 +255,54 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
return ret;
}
+static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u64 start = eb->start;
+ int i, num_pages = num_extent_pages(eb);
+ int ret = 0;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE,
+ start, p, start - page_offset(p), mirror_num);
+ if (ret)
+ break;
+ start += PAGE_SIZE;
+ }
+
+ return ret;
+}
+
/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
*
- * @parent_transid: expected transid, skip check if 0
- * @level: expected level, mandatory check
- * @first_key: expected key of first slot, skip check if NULL
+ * @check: expected tree parentness check, see the comments of the
+ * structure for details.
*/
int btrfs_read_extent_buffer(struct extent_buffer *eb,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+ struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct extent_io_tree *io_tree;
int failed = 0;
int ret;
int num_copies = 0;
int mirror_num = 0;
int failed_mirror = 0;
- io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+ ASSERT(check);
+
while (1) {
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
- ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
- if (!ret) {
- if (verify_parent_transid(io_tree, eb,
- parent_transid, 0))
- ret = -EIO;
- else if (btrfs_verify_level_key(eb, level,
- first_key, parent_transid))
- ret = -EUCLEAN;
- else
- break;
- }
+ ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check);
+ if (!ret)
+ break;
num_copies = btrfs_num_copies(fs_info,
eb->start, eb->len);
@@ -458,7 +478,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
}
/* Do basic extent buffer checks at read time */
-static int validate_extent_buffer(struct extent_buffer *eb)
+static int validate_extent_buffer(struct extent_buffer *eb,
+ struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
@@ -468,6 +489,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
const u8 *header_csum;
int ret = 0;
+ ASSERT(check);
+
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
btrfs_err_rl(fs_info,
@@ -506,6 +529,45 @@ static int validate_extent_buffer(struct extent_buffer *eb)
goto out;
}
+ if (found_level != check->level) {
+ ret = -EIO;
+ goto out;
+ }
+ if (unlikely(check->transid &&
+ btrfs_header_generation(eb) != check->transid)) {
+ btrfs_err_rl(eb->fs_info,
+"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
+ eb->start, eb->read_mirror, check->transid,
+ btrfs_header_generation(eb));
+ ret = -EIO;
+ goto out;
+ }
+ if (check->has_first_key) {
+ struct btrfs_key *expect_key = &check->first_key;
+ struct btrfs_key found_key;
+
+ if (found_level)
+ btrfs_node_key_to_cpu(eb, &found_key, 0);
+ else
+ btrfs_item_key_to_cpu(eb, &found_key, 0);
+ if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
+ btrfs_err(fs_info,
+"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
+ eb->start, check->transid,
+ expect_key->objectid,
+ expect_key->type, expect_key->offset,
+ found_key.objectid, found_key.type,
+ found_key.offset);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ if (check->owner_root) {
+ ret = btrfs_check_eb_owner(eb, check->owner_root);
+ if (ret < 0)
+ goto out;
+ }
+
/*
* If this is a leaf block and it is corrupt, set the corrupt bit so
* that we don't try and read the other copies of this block, just
@@ -530,13 +592,15 @@ out:
}
static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
- int mirror)
+ int mirror, struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct extent_buffer *eb;
bool reads_done;
int ret = 0;
+ ASSERT(check);
+
/*
* We don't allow bio merge for subpage metadata read, so we should
* only get one eb for each endio hook.
@@ -560,7 +624,7 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
ret = -EIO;
goto err;
}
- ret = validate_extent_buffer(eb);
+ ret = validate_extent_buffer(eb, check);
if (ret < 0)
goto err;
@@ -590,7 +654,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ASSERT(page->private);
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return validate_subpage_buffer(page, start, end, mirror);
+ return validate_subpage_buffer(page, start, end, mirror,
+ &bbio->parent_check);
eb = (struct extent_buffer *)page->private;
@@ -609,7 +674,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ret = -EIO;
goto err;
}
- ret = validate_extent_buffer(eb);
+ ret = validate_extent_buffer(eb, &bbio->parent_check);
err:
if (ret) {
/*
@@ -631,8 +696,18 @@ static void run_one_async_start(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- ret = async->submit_bio_start(async->inode, async->bio,
- async->dio_file_offset);
+ switch (async->submit_cmd) {
+ case WQ_SUBMIT_METADATA:
+ ret = btree_submit_bio_start(async->bio);
+ break;
+ case WQ_SUBMIT_DATA:
+ ret = btrfs_submit_bio_start(async->inode, async->bio);
+ break;
+ case WQ_SUBMIT_DATA_DIO:
+ ret = btrfs_submit_bio_start_direct_io(async->inode,
+ async->bio, async->dio_file_offset);
+ break;
+ }
if (ret)
async->status = ret;
}
@@ -647,16 +722,14 @@ static void run_one_async_start(struct btrfs_work *work)
*/
static void run_one_async_done(struct btrfs_work *work)
{
- struct async_submit_bio *async;
- struct inode *inode;
-
- async = container_of(work, struct async_submit_bio, work);
- inode = async->inode;
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ struct btrfs_inode *inode = async->inode;
+ struct btrfs_bio *bbio = btrfs_bio(async->bio);
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
- async->bio->bi_status = async->status;
- bio_endio(async->bio);
+ btrfs_bio_end_io(bbio, async->status);
return;
}
@@ -666,7 +739,7 @@ static void run_one_async_done(struct btrfs_work *work)
* This changes nothing when cgroups aren't in use.
*/
async->bio->bi_opf |= REQ_CGROUP_PUNT;
- btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
+ btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -684,11 +757,10 @@ static void run_one_async_free(struct btrfs_work *work)
* - true if the work has been succesfuly submitted
* - false in case of error
*/
-bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
- u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start)
+bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -698,7 +770,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
async->inode = inode;
async->bio = bio;
async->mirror_num = mirror_num;
- async->submit_bio_start = submit_bio_start;
+ async->submit_cmd = cmd;
btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
run_one_async_free);
@@ -732,8 +804,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
-static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
- u64 dio_file_offset)
+blk_status_t btree_submit_bio_start(struct bio *bio)
{
/*
* when we're called for a write, we're already in the async
@@ -754,12 +825,14 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
return true;
}
-void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
+void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t ret;
bio->bi_opf |= REQ_META;
+ bbio->is_metadata = 1;
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
btrfs_submit_bio(fs_info, bio, mirror_num);
@@ -770,14 +843,13 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
* Kthread helpers are used to submit writes so that checksumming can
* happen in parallel across all CPUs.
*/
- if (should_async_write(fs_info, BTRFS_I(inode)) &&
- btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
+ if (should_async_write(fs_info, inode) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA))
return;
ret = btree_csum_one_bio(bio);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(bbio, ret);
return;
}
@@ -924,28 +996,28 @@ struct extent_buffer *btrfs_find_create_tree_block(
* Read tree block at logical address @bytenr and do variant basic but critical
* verification.
*
- * @owner_root: the objectid of the root owner for this block.
- * @parent_transid: expected transid of this tree block, skip check if 0
- * @level: expected level, mandatory check
- * @first_key: expected key in slot 0, skip check if NULL
+ * @check: expected tree parentness check, see comments of the
+ * structure for details.
*/
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 owner_root, u64 parent_transid,
- int level, struct btrfs_key *first_key)
+ struct btrfs_tree_parent_check *check)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ ASSERT(check);
+
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
+ check->level);
if (IS_ERR(buf))
return buf;
- ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
+ ret = btrfs_read_extent_buffer(buf, check);
if (ret) {
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
- if (btrfs_check_eb_owner(buf, owner_root)) {
+ if (btrfs_check_eb_owner(buf, check->owner_root)) {
free_extent_buffer_stale(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -1032,9 +1104,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
- IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
+ IO_TREE_ROOT_DIRTY_LOG_PAGES);
extent_io_tree_init(fs_info, &root->log_csum_range,
- IO_TREE_LOG_CSUM_RANGE, NULL);
+ IO_TREE_LOG_CSUM_RANGE);
}
spin_lock_init(&root->root_item_lock);
@@ -1172,6 +1244,13 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
return btrfs_global_root(fs_info, &key);
}
+struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
+ return fs_info->block_group_root;
+ return btrfs_extent_root(fs_info, 0);
+}
+
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@@ -1202,7 +1281,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
- goto fail_unlock;
+ goto fail;
}
root->node = leaf;
@@ -1237,9 +1316,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
return root;
-fail_unlock:
- if (leaf)
- btrfs_tree_unlock(leaf);
fail:
btrfs_put_root(root);
@@ -1357,6 +1433,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
struct btrfs_key *key)
{
struct btrfs_root *root;
+ struct btrfs_tree_parent_check check = { 0 };
struct btrfs_fs_info *fs_info = tree_root->fs_info;
u64 generation;
int ret;
@@ -1376,9 +1453,11 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
level = btrfs_root_level(&root->root_item);
- root->node = read_tree_block(fs_info,
- btrfs_root_bytenr(&root->root_item),
- key->objectid, generation, level, NULL);
+ check.level = level;
+ check.transid = generation;
+ check.owner_root = key->objectid;
+ root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
+ &check);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
@@ -1524,6 +1603,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
if (objectid == BTRFS_UUID_TREE_OBJECTID)
return btrfs_grab_root(fs_info->uuid_root) ?
fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->block_group_root) ?
+ fs_info->block_group_root : ERR_PTR(-ENOENT);
if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
struct btrfs_root *root = btrfs_global_root(fs_info, &key);
@@ -1980,14 +2062,7 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
- btrfs_set_backup_block_group_root(root_backup,
- info->block_group_root->node->start);
- btrfs_set_backup_block_group_root_gen(root_backup,
- btrfs_header_generation(info->block_group_root->node));
- btrfs_set_backup_block_group_root_level(root_backup,
- btrfs_header_level(info->block_group_root->node));
- } else {
+ if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
@@ -2093,8 +2168,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->workers);
if (fs_info->endio_workers)
destroy_workqueue(fs_info->endio_workers);
- if (fs_info->endio_raid56_workers)
- destroy_workqueue(fs_info->endio_raid56_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
if (fs_info->compressed_write_workers)
@@ -2225,6 +2298,8 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
struct inode *inode = fs_info->btree_inode;
+ unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
+ fs_info->tree_root);
inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
set_nlink(inode, 1);
@@ -2238,8 +2313,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_BTREE_INODE_IO, inode);
- BTRFS_I(inode)->io_tree.track_uptodate = false;
+ IO_TREE_BTREE_INODE_IO);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
@@ -2247,7 +2321,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
BTRFS_I(inode)->location.type = 0;
BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
- btrfs_insert_inode_hash(inode);
+ __insert_inode_hash(inode, hash);
}
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2266,6 +2340,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
mutex_init(&fs_info->qgroup_rescan_lock);
}
@@ -2298,8 +2373,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
alloc_workqueue("btrfs-endio-meta", flags, max_active);
- fs_info->endio_raid56_workers =
- alloc_workqueue("btrfs-endio-raid56", flags, max_active);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
@@ -2321,7 +2394,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->compressed_write_workers &&
- fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_write_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
@@ -2357,6 +2430,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
int ret;
+ struct btrfs_tree_parent_check check = { 0 };
struct btrfs_root *log_tree_root;
struct btrfs_super_block *disk_super = fs_info->super_copy;
u64 bytenr = btrfs_super_log_root(disk_super);
@@ -2372,10 +2446,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
if (!log_tree_root)
return -ENOMEM;
- log_tree_root->node = read_tree_block(fs_info, bytenr,
- BTRFS_TREE_LOG_OBJECTID,
- fs_info->generation + 1, level,
- NULL);
+ check.level = level;
+ check.transid = fs_info->generation + 1;
+ check.owner_root = BTRFS_TREE_LOG_OBJECTID;
+ log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@@ -2529,10 +2603,24 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (ret)
return ret;
- location.objectid = BTRFS_DEV_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+ location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->block_group_root = root;
+ }
+ }
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
@@ -2544,7 +2632,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->dev_root = root;
}
/* Initialize fs_info for all devices in any case */
- btrfs_init_devices_late(fs_info);
+ ret = btrfs_init_devices_late(fs_info);
+ if (ret)
+ goto out;
/*
* This tree can share blocks with some other fs tree during relocation
@@ -2600,8 +2690,8 @@ out:
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
-static int validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num)
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
u64 sectorsize = btrfs_super_sectorsize(sb);
@@ -2703,6 +2793,18 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ /*
+ * Artificial requirement for block-group-tree to force newer features
+ * (free-space-tree, no-holes) so the test matrix is smaller.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+ !btrfs_fs_incompat(fs_info, NO_HOLES))) {
+ btrfs_err(fs_info,
+ "block-group-tree feature requires fres-space-tree and no-holes");
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
@@ -2785,7 +2887,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
*/
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
- return validate_super(fs_info, fs_info->super_copy, 0);
+ return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
}
/*
@@ -2799,7 +2901,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = validate_super(fs_info, sb, -1);
+ ret = btrfs_validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
@@ -2825,10 +2927,14 @@ out:
static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
{
+ struct btrfs_tree_parent_check check = {
+ .level = level,
+ .transid = gen,
+ .owner_root = root->root_key.objectid
+ };
int ret = 0;
- root->node = read_tree_block(root->fs_info, bytenr,
- root->root_key.objectid, gen, level, NULL);
+ root->node = read_tree_block(root->fs_info, bytenr, &check);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
@@ -2860,17 +2966,7 @@ static int load_important_roots(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info, "couldn't read tree root");
return ret;
}
-
- if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
- return 0;
-
- bytenr = btrfs_super_block_group_root(sb);
- gen = btrfs_super_block_group_root_generation(sb);
- level = btrfs_super_block_group_root_level(sb);
- ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
- if (ret)
- btrfs_warn(fs_info, "couldn't read block group root");
- return ret;
+ return 0;
}
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
@@ -2882,16 +2978,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
int ret = 0;
int i;
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- struct btrfs_root *root;
-
- root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
- GFP_KERNEL);
- if (!root)
- return -ENOMEM;
- fs_info->block_group_root = root;
- }
-
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
if (handle_error) {
if (!IS_ERR(tree_root->node))
@@ -2990,6 +3076,19 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
+ btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
+ BTRFS_LOCKDEP_TRANS_COMPLETED);
+
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -3043,7 +3142,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
- IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
+ IO_TREE_FS_EXCLUDED_EXTENTS);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -3279,6 +3378,112 @@ out:
return ret;
}
+/*
+ * Do various sanity and dependency checks of different features.
+ *
+ * This is the place for less strict checks (like for subpage or artificial
+ * feature dependencies).
+ *
+ * For strict checks or possible corruption detection, see
+ * btrfs_validate_super().
+ *
+ * This should be called after btrfs_parse_options(), as some mount options
+ * (space cache related) can modify on-disk format like free space tree and
+ * screw up certain feature dependencies.
+ */
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 incompat = btrfs_super_incompat_flags(disk_super);
+ const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
+ const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
+
+ if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ btrfs_err(fs_info,
+ "cannot mount because of unknown incompat features (0x%llx)",
+ incompat);
+ return -EINVAL;
+ }
+
+ /* Runtime limitation for mixed block groups. */
+ if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (fs_info->sectorsize != fs_info->nodesize)) {
+ btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+ fs_info->nodesize, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* Mixed backref is an always-enabled feature. */
+ incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+
+ /* Set compression related flags just in case. */
+ if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
+
+ /*
+ * An ancient flag, which should really be marked deprecated.
+ * Such runtime limitation doesn't really need a incompat flag.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
+ incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+
+ if (compat_ro_unsupp && !sb_rdonly(sb)) {
+ btrfs_err(fs_info,
+ "cannot mount read-write because of unknown compat_ro features (0x%llx)",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata writes, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * Artificial limitations for block group tree, to force
+ * block-group-tree to rely on no-holes and free-space-tree.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
+ !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
+ btrfs_err(fs_info,
+"block-group-tree feature requires no-holes and free-space-tree features");
+ return -EINVAL;
+ }
+
+ /*
+ * Subpage runtime limitation on v1 cache.
+ *
+ * V1 space cache still has some hard codeed PAGE_SIZE usage, while
+ * we're already defaulting to v2 cache, no need to bother v1 as it's
+ * going to be deprecated anyway.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ btrfs_warn(fs_info,
+ "v1 space cache is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* This can be called by remount, we need to protect the super block. */
+ spin_lock(&fs_info->super_lock);
+ btrfs_set_super_incompat_flags(disk_super, incompat);
+ spin_unlock(&fs_info->super_lock);
+
+ return 0;
+}
+
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
char *options)
{
@@ -3359,7 +3564,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* We want to check superblock checksum, the type is stored inside.
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
- if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
+ if (btrfs_check_super_csum(fs_info, disk_super)) {
btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
btrfs_release_disk_super(disk_super);
@@ -3428,72 +3633,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- features = btrfs_super_incompat_flags(disk_super) &
- ~BTRFS_FEATURE_INCOMPAT_SUPP;
- if (features) {
- btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (0x%llx)",
- features);
- err = -EINVAL;
- goto fail_alloc;
- }
-
- features = btrfs_super_incompat_flags(disk_super);
- features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
- else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
-
- /*
- * Flag our filesystem as having big metadata blocks if they are bigger
- * than the page size.
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
-
- /*
- * mixed block groups end up with duplicate but slightly offset
- * extent buffers for the same range. It leads to corruptions
- */
- if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != nodesize)) {
- btrfs_err(fs_info,
-"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
- nodesize, sectorsize);
- goto fail_alloc;
- }
-
- /*
- * Needn't use the lock because there is no other task which will
- * update the flag.
- */
- btrfs_set_super_incompat_flags(disk_super, features);
-
- features = btrfs_super_compat_ro_flags(disk_super) &
- ~BTRFS_FEATURE_COMPAT_RO_SUPP;
- if (!sb_rdonly(sb) && features) {
- btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (0x%llx)",
- features);
- err = -EINVAL;
- goto fail_alloc;
- }
- /*
- * We have unsupported RO compat features, although RO mounted, we
- * should not cause any metadata write, including log replay.
- * Or we could screw up whatever the new feature requires.
- */
- if (unlikely(features && btrfs_super_log_root(disk_super) &&
- !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
- btrfs_err(fs_info,
-"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
- features);
- err = -EINVAL;
+ ret = btrfs_check_features(fs_info, sb);
+ if (ret < 0) {
+ err = ret;
goto fail_alloc;
}
-
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
@@ -3683,10 +3828,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Mount does not set all options immediately, we can do it now and do
- * not have to wait for transaction commit
+ * For devices supporting discard turn on discard=async automatically,
+ * unless it's already set or disabled. This could be turned off by
+ * nodiscard for the same mount.
*/
- btrfs_apply_pending_changes(fs_info);
+ if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
+ btrfs_test_opt(fs_info, NODISCARD)) &&
+ fs_info->fs_devices->discardable) {
+ btrfs_set_and_info(fs_info, DISCARD_ASYNC,
+ "auto enabling async discard");
+ btrfs_clear_opt(fs_info->mount_opt, NODISCARD);
+ }
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
@@ -3815,7 +3968,7 @@ static void btrfs_end_super_write(struct bio *bio)
if (bio->bi_status) {
btrfs_warn_rl_in_rcu(device->fs_info,
"lost page write due to IO error on %s (%d)",
- rcu_str_deref(device->name),
+ btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
ClearPageUptodate(page);
SetPageError(page);
@@ -3833,7 +3986,7 @@ static void btrfs_end_super_write(struct bio *bio)
}
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num)
+ int copy_num, bool drop_cache)
{
struct btrfs_super_block *super;
struct page *page;
@@ -3851,6 +4004,19 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
+
+ /*
+ * Drop the page of the primary superblock, so later read will
+ * always read from the device.
+ */
+ invalidate_inode_pages2_range(mapping,
+ bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+ }
+
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return ERR_CAST(page);
@@ -3882,7 +4048,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- super = btrfs_read_dev_one_super(bdev, i);
+ super = btrfs_read_dev_one_super(bdev, i, false);
if (IS_ERR(super))
continue;
@@ -4475,6 +4641,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
/*
+ * If we had UNFINISHED_DROPS we could still be processing them, so
+ * clear that bit and wake up relocation so it can stop.
+ * We must do this before stopping the block group reclaim task, because
+ * at btrfs_relocate_block_group() we wait for this bit, and after the
+ * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
+ * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
+ * return 1.
+ */
+ btrfs_wake_unfinished_drop(fs_info);
+
+ /*
* We may have the reclaim task running and relocating a data block group,
* in which case it may create delayed iputs. So stop it before we park
* the cleaner kthread otherwise we can get new delayed iputs after
@@ -4492,12 +4669,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*/
kthread_park(fs_info->cleaner_kthread);
- /*
- * If we had UNFINISHED_DROPS we could still be processing them, so
- * clear that bit and wake up relocation so it can stop.
- */
- btrfs_wake_unfinished_drop(fs_info);
-
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
@@ -4520,6 +4691,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
+ /*
+ * After we parked the cleaner kthread, ordered extents may have
+ * completed and created new delayed iputs. If one of the async reclaim
+ * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+ * can hang forever trying to stop it, because if a delayed iput is
+ * added after it ran btrfs_run_delayed_iputs() and before it called
+ * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
+ * no one else to run iputs.
+ *
+ * So wait for all ongoing ordered extents to complete and then run
+ * delayed iputs. This works because once we reach this point no one
+ * can either create new ordered extents nor create delayed iputs
+ * through some other means.
+ *
+ * Also note that btrfs_wait_ordered_roots() is not safe here, because
+ * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
+ * but the delayed iput for the respective inode is made only when doing
+ * the final btrfs_put_ordered_extent() (which must happen at
+ * btrfs_finish_ordered_io() when we are unmounting).
+ */
+ btrfs_flush_workqueue(fs_info->endio_write_workers);
+ /* Ordered extents for free space inodes. */
+ btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ btrfs_run_delayed_iputs(fs_info);
+
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);