aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c1009
1 files changed, 649 insertions, 360 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8e3438672a82..41b718cfea40 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,7 +29,6 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
@@ -42,6 +41,7 @@
#include "block-group.h"
#include "discard.h"
#include "space-info.h"
+#include "zoned.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -109,15 +109,13 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
- void *private_data;
+ struct inode *inode;
struct bio *bio;
extent_submit_bio_start_t *submit_bio_start;
int mirror_num;
- /*
- * bio_offset is optional, can be used if the pages in the bio
- * can't tell us where in the file the bio should go
- */
- u64 bio_offset;
+
+ /* Optional parameter for submit_bio_start used by direct io */
+ u64 dio_file_offset;
struct btrfs_work work;
blk_status_t status;
};
@@ -150,40 +148,41 @@ struct async_submit_bio {
# error
# endif
+#define DEFINE_LEVEL(stem, level) \
+ .names[level] = "btrfs-" stem "-0" #level,
+
+#define DEFINE_NAME(stem) \
+ DEFINE_LEVEL(stem, 0) \
+ DEFINE_LEVEL(stem, 1) \
+ DEFINE_LEVEL(stem, 2) \
+ DEFINE_LEVEL(stem, 3) \
+ DEFINE_LEVEL(stem, 4) \
+ DEFINE_LEVEL(stem, 5) \
+ DEFINE_LEVEL(stem, 6) \
+ DEFINE_LEVEL(stem, 7)
+
static struct btrfs_lockdep_keyset {
u64 id; /* root objectid */
- const char *name_stem; /* lock name stem */
- char names[BTRFS_MAX_LEVEL + 1][20];
- struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
+ /* Longest entry: btrfs-free-space-00 */
+ char names[BTRFS_MAX_LEVEL][20];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
- { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
- { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
- { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
- { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
- { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
- { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
- { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" },
- { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
- { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
- { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
- { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" },
- { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
- { .id = 0, .name_stem = "tree" },
+ { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
+ { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
+ { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
+ { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
+ { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
+ { .id = 0, DEFINE_NAME("tree") },
};
-void __init btrfs_init_lockdep(void)
-{
- int i, j;
-
- /* initialize lockdep class names */
- for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
- struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
-
- for (j = 0; j < ARRAY_SIZE(ks->names); j++)
- snprintf(ks->names[j], sizeof(ks->names[j]),
- "btrfs-%s-%02d", ks->name_stem, j);
- }
-}
+#undef DEFINE_LEVEL
+#undef DEFINE_NAME
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
int level)
@@ -210,15 +209,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
+ const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- kaddr = page_address(buf->pages[0]);
+ kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
- PAGE_SIZE - BTRFS_CSUM_SIZE);
+ first_page_part - BTRFS_CSUM_SIZE);
for (i = 1; i < num_pages; i++) {
kaddr = page_address(buf->pages[i]);
@@ -248,10 +248,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- if (need_lock) {
+ if (need_lock)
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- }
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
@@ -280,7 +278,7 @@ out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
if (need_lock)
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
return ret;
}
@@ -319,7 +317,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
- if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
+ if (memcmp(disk_sb->csum, result, fs_info->csum_size))
return 1;
return 0;
@@ -443,16 +441,16 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
}
/*
- * checksum a dirty tree block before IO. This has extra checks to make sure
- * we only fill in the checksum field in the first page of a multi-page block
+ * Checksum a dirty tree block before IO. This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block.
+ * For subpage extent buffers we need bvec to also read the offset in the page.
*/
-
-static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
{
+ struct page *page = bvec->bv_page;
u64 start = page_offset(page);
u64 found_start;
u8 result[BTRFS_CSUM_SIZE];
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
struct extent_buffer *eb;
int ret;
@@ -461,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
return 0;
found_start = btrfs_header_bytenr(eb);
+
+ if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
+ WARN_ON(found_start != 0);
+ return 0;
+ }
+
/*
* Please do not consolidate these warnings into a single if.
* It is useful to know what went wrong.
@@ -489,7 +493,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
return ret;
}
- write_extent_buffer(eb, result, 0, csum_size);
+ write_extent_buffer(eb, result, 0, fs_info->csum_size);
return 0;
}
@@ -523,65 +527,37 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
return 1;
}
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int mirror)
+/* Do basic extent buffer checks at read time */
+static int validate_extent_buffer(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
- int found_level;
- struct extent_buffer *eb;
- struct btrfs_fs_info *fs_info;
- u16 csum_size;
- int ret = 0;
+ const u32 csum_size = fs_info->csum_size;
+ u8 found_level;
u8 result[BTRFS_CSUM_SIZE];
- int reads_done;
-
- if (!page->private)
- goto out;
-
- eb = (struct extent_buffer *)page->private;
- fs_info = eb->fs_info;
- csum_size = btrfs_super_csum_size(fs_info->super_copy);
-
- /* the pending IO might have been the only thing that kept this buffer
- * in memory. Make sure we have a ref for all this other checks
- */
- atomic_inc(&eb->refs);
-
- reads_done = atomic_dec_and_test(&eb->io_pages);
- if (!reads_done)
- goto err;
-
- eb->read_mirror = mirror;
- if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
- ret = -EIO;
- goto err;
- }
+ int ret = 0;
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
eb->start, found_start);
ret = -EIO;
- goto err;
+ goto out;
}
if (check_tree_block_fsid(eb)) {
btrfs_err_rl(fs_info, "bad fsid on block %llu",
eb->start);
ret = -EIO;
- goto err;
+ goto out;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "bad tree block level %d on %llu",
(int)btrfs_header_level(eb), eb->start);
ret = -EIO;
- goto err;
+ goto out;
}
- btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
- eb, found_level);
-
csum_tree_block(eb, result);
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
@@ -595,7 +571,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb));
ret = -EUCLEAN;
- goto err;
+ goto out;
}
/*
@@ -617,6 +593,94 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
btrfs_err(fs_info,
"block=%llu read time tree block corruption detected",
eb->start);
+out:
+ return ret;
+}
+
+static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
+ int mirror)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct extent_buffer *eb;
+ bool reads_done;
+ int ret = 0;
+
+ /*
+ * We don't allow bio merge for subpage metadata read, so we should
+ * only get one eb for each endio hook.
+ */
+ ASSERT(end == start + fs_info->nodesize - 1);
+ ASSERT(PagePrivate(page));
+
+ eb = find_extent_buffer(fs_info, start);
+ /*
+ * When we are reading one tree block, eb must have been inserted into
+ * the radix tree. If not, something is wrong.
+ */
+ ASSERT(eb);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ /* Subpage read must finish in page read */
+ ASSERT(reads_done);
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
+ }
+ ret = validate_extent_buffer(eb);
+ if (ret < 0)
+ goto err;
+
+ if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+ btree_readahead_hook(eb, ret);
+
+ set_extent_buffer_uptodate(eb);
+
+ free_extent_buffer(eb);
+ return ret;
+err:
+ /*
+ * end_bio_extent_readpage decrements io_pages in case of error,
+ * make sure it has something to decrement.
+ */
+ atomic_inc(&eb->io_pages);
+ clear_extent_buffer_uptodate(eb);
+ free_extent_buffer(eb);
+ return ret;
+}
+
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+ struct page *page, u64 start, u64 end,
+ int mirror)
+{
+ struct extent_buffer *eb;
+ int ret = 0;
+ int reads_done;
+
+ ASSERT(page->private);
+
+ if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ return validate_subpage_buffer(page, start, end, mirror);
+
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * The pending IO might have been the only thing that kept this buffer
+ * in memory. Make sure we have a ref for all this other checks
+ */
+ atomic_inc(&eb->refs);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ if (!reads_done)
+ goto err;
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
+ }
+ ret = validate_extent_buffer(eb);
err:
if (reads_done &&
test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -632,7 +696,7 @@ err:
clear_extent_buffer_uptodate(eb);
}
free_extent_buffer(eb);
-out:
+
return ret;
}
@@ -645,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio)
fs_info = end_io_wq->info;
end_io_wq->status = bio->bi_status;
- if (bio_op(bio) == REQ_OP_WRITE) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
wq = fs_info->endio_meta_write_workers;
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
@@ -694,8 +758,8 @@ static void run_one_async_start(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- ret = async->submit_bio_start(async->private_data, async->bio,
- async->bio_offset);
+ ret = async->submit_bio_start(async->inode, async->bio,
+ async->dio_file_offset);
if (ret)
async->status = ret;
}
@@ -715,7 +779,7 @@ static void run_one_async_done(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- inode = async->private_data;
+ inode = async->inode;
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
@@ -745,18 +809,19 @@ static void run_one_async_free(struct btrfs_work *work)
kfree(async);
}
-blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
- u64 bio_offset, void *private_data,
+ u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start)
{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
return BLK_STS_RESOURCE;
- async->private_data = private_data;
+ async->inode = inode;
async->bio = bio;
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
@@ -764,7 +829,7 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
run_one_async_free);
- async->bio_offset = bio_offset;
+ async->dio_file_offset = dio_file_offset;
async->status = 0;
@@ -785,7 +850,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
+ ret = csum_dirty_buffer(root->fs_info, bvec);
if (ret)
break;
}
@@ -793,8 +858,8 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
-static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
- u64 bio_offset)
+static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
+ u64 dio_file_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -806,6 +871,8 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
static int check_async_write(struct btrfs_fs_info *fs_info,
struct btrfs_inode *bi)
{
+ if (btrfs_is_zoned(fs_info))
+ return 0;
if (atomic_read(&bi->sync_writers))
return 0;
if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
@@ -820,7 +887,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
int async = check_async_write(fs_info, BTRFS_I(inode));
blk_status_t ret;
- if (bio_op(bio) != REQ_OP_WRITE) {
+ if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
@@ -840,8 +907,8 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
- ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
- 0, inode, btree_submit_bio_start);
+ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
+ 0, btree_submit_bio_start);
}
if (ret)
@@ -947,47 +1014,33 @@ static const struct address_space_operations btree_aops = {
.set_page_dirty = btree_set_page_dirty,
};
-void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
- struct extent_buffer *buf = NULL;
- int ret;
-
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
- if (IS_ERR(buf))
- return;
-
- ret = read_extent_buffer_pages(buf, WAIT_NONE, 0);
- if (ret < 0)
- free_extent_buffer_stale(buf);
- else
- free_extent_buffer(buf);
-}
-
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
- u64 bytenr)
+ u64 bytenr, u64 owner_root,
+ int level)
{
if (btrfs_is_testing(fs_info))
return alloc_test_extent_buffer(fs_info, bytenr);
- return alloc_extent_buffer(fs_info, bytenr);
+ return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
}
/*
* Read tree block at logical address @bytenr and do variant basic but critical
* verification.
*
+ * @owner_root: the objectid of the root owner for this block.
* @parent_transid: expected transid of this tree block, skip check if 0
* @level: expected level, mandatory check
* @first_key: expected key in slot 0, skip check if NULL
*/
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+ u64 owner_root, u64 parent_transid,
+ int level, struct btrfs_key *first_key)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
if (IS_ERR(buf))
return buf;
@@ -1012,8 +1065,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
-buf->len,
fs_info->dirty_metadata_batch);
- /* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking_write(buf);
clear_extent_buffer_dirty(buf);
}
}
@@ -1030,7 +1081,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->orphan_cleanup_state = 0;
root->last_trans = 0;
- root->highest_objectid = 0;
+ root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
@@ -1155,7 +1206,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
- goto fail;
+ goto fail_unlock;
}
root->node = leaf;
@@ -1164,8 +1215,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->commit_root = btrfs_root_node(root);
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- root->root_item.flags = 0;
- root->root_item.byte_limit = 0;
+ btrfs_set_root_flags(&root->root_item, 0);
+ btrfs_set_root_limit(&root->root_item, 0);
btrfs_set_root_bytenr(&root->root_item, leaf->start);
btrfs_set_root_generation(&root->root_item, trans->transid);
btrfs_set_root_level(&root->root_item, 0);
@@ -1177,7 +1228,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
generate_random_guid(root->root_item.uuid);
else
export_guid(root->root_item.uuid, &guid_null);
- root->root_item.drop_level = 0;
+ btrfs_set_root_drop_level(&root->root_item, 0);
+
+ btrfs_tree_unlock(leaf);
key.objectid = objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1186,13 +1239,12 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (ret)
goto fail;
- btrfs_tree_unlock(leaf);
-
return root;
-fail:
+fail_unlock:
if (leaf)
btrfs_tree_unlock(leaf);
+fail:
btrfs_put_root(root);
return ERR_PTR(ret);
@@ -1202,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
- struct extent_buffer *leaf;
root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
if (!root)
@@ -1212,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+ return root;
+}
+
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct extent_buffer *leaf;
+
/*
* DON'T set SHAREABLE bit for log trees.
*
@@ -1224,16 +1283,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
- if (IS_ERR(leaf)) {
- btrfs_put_root(root);
- return ERR_CAST(leaf);
- }
+ if (IS_ERR(leaf))
+ return PTR_ERR(leaf);
root->node = leaf;
btrfs_mark_buffer_dirty(root->node);
btrfs_tree_unlock(root->node);
- return root;
+
+ return 0;
}
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -1244,6 +1302,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
+
+ if (!btrfs_is_zoned(fs_info)) {
+ int ret = btrfs_alloc_log_tree_node(trans, log_root);
+
+ if (ret) {
+ btrfs_put_root(log_root);
+ return ret;
+ }
+ }
+
WARN_ON(fs_info->log_root_tree);
fs_info->log_root_tree = log_root;
return 0;
@@ -1255,11 +1323,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log_root;
struct btrfs_inode_item *inode_item;
+ int ret;
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
+ ret = btrfs_alloc_log_tree_node(trans, log_root);
+ if (ret) {
+ btrfs_put_root(log_root);
+ return ret;
+ }
+
log_root->last_trans = trans->transid;
log_root->root_key.offset = root->root_key.objectid;
@@ -1281,57 +1356,61 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key)
+static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
- struct btrfs_path *path;
u64 generation;
int ret;
int level;
- path = btrfs_alloc_path();
- if (!path)
- return ERR_PTR(-ENOMEM);
-
root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
- if (!root) {
- ret = -ENOMEM;
- goto alloc_fail;
- }
+ if (!root)
+ return ERR_PTR(-ENOMEM);
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto find_fail;
+ goto fail;
}
generation = btrfs_root_generation(&root->root_item);
level = btrfs_root_level(&root->root_item);
root->node = read_tree_block(fs_info,
btrfs_root_bytenr(&root->root_item),
- generation, level, NULL);
+ key->objectid, generation, level, NULL);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
- goto find_fail;
+ goto fail;
} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
- goto find_fail;
+ goto fail;
}
root->commit_root = btrfs_root_node(root);
-out:
- btrfs_free_path(path);
return root;
-
-find_fail:
+fail:
btrfs_put_root(root);
-alloc_fail:
- root = ERR_PTR(ret);
- goto out;
+ return ERR_PTR(ret);
+}
+
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
+{
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ root = read_tree_root_path(tree_root, path, key);
+ btrfs_free_path(path);
+
+ return root;
}
/*
@@ -1344,14 +1423,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
int ret;
unsigned int nofs_flag;
- root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
- root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
- GFP_NOFS);
- if (!root->free_ino_pinned || !root->free_ino_ctl) {
- ret = -ENOMEM;
- goto fail;
- }
-
/*
* We might be called under a transaction (e.g. indirect backref
* resolution) which could deadlock if it triggers memory reclaim
@@ -1368,10 +1439,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
btrfs_check_and_init_root_item(&root->root_item);
}
- btrfs_init_free_ino_ctl(root);
- spin_lock_init(&root->ino_cache_lock);
- init_waitqueue_head(&root->ino_cache_wait);
-
/*
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
@@ -1388,14 +1455,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
}
mutex_lock(&root->objectid_mutex);
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(root);
if (ret) {
mutex_unlock(&root->objectid_mutex);
goto fail;
}
- ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+ ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
mutex_unlock(&root->objectid_mutex);
@@ -1419,6 +1485,31 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
return root;
}
+static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ if (objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->tree_root);
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->extent_root);
+ if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->chunk_root);
+ if (objectid == BTRFS_DEV_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->dev_root);
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->csum_root);
+ if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->quota_root) ?
+ fs_info->quota_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_UUID_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->uuid_root) ?
+ fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->free_space_root) ?
+ fs_info->free_space_root : ERR_PTR(-ENOENT);
+ return NULL;
+}
+
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root)
{
@@ -1453,7 +1544,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
root = list_first_entry(&fs_info->allocated_roots,
struct btrfs_root, leak_list);
btrfs_err(fs_info, "leaked root %s refcount %d",
- btrfs_root_name(root->root_key.objectid, buf),
+ btrfs_root_name(&root->root_key, buf),
refcount_read(&root->refs));
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
@@ -1466,7 +1557,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
- percpu_counter_destroy(&fs_info->dio_bytes);
+ percpu_counter_destroy(&fs_info->ordered_bytes);
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
btrfs_free_csum_hash(fs_info);
btrfs_free_stripe_hash_table(fs_info);
@@ -1518,25 +1609,9 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
int ret;
- if (objectid == BTRFS_ROOT_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->tree_root);
- if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->extent_root);
- if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->chunk_root);
- if (objectid == BTRFS_DEV_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->dev_root);
- if (objectid == BTRFS_CSUM_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->csum_root);
- if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->quota_root) ?
- fs_info->quota_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_UUID_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->uuid_root) ?
- fs_info->uuid_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->free_space_root) ?
- fs_info->free_space_root : ERR_PTR(-ENOENT);
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
@@ -1622,6 +1697,52 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
}
/*
+ * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * @fs_info: the fs_info
+ * @objectid: the objectid we need to lookup
+ *
+ * This is exclusively used for backref walking, and exists specifically because
+ * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
+ * creation time, which means we may have to read the tree_root in order to look
+ * up a fs root that is not in memory. If the root is not in memory we will
+ * read the tree root commit root and look up the fs root from there. This is a
+ * temporary root, it will not be inserted into the radix tree as it doesn't
+ * have the most uptodate information, it'll simply be discarded once the
+ * backref code is finished using the root.
+ */
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid)
+{
+ struct btrfs_root *root;
+ struct btrfs_key key;
+
+ ASSERT(path->search_commit_root && path->skip_locking);
+
+ /*
+ * This can return -ENOENT if we ask for a root that doesn't exist, but
+ * since this is called via the backref walking code we won't be looking
+ * up a root that doesn't exist, unless there's corruption. So if root
+ * != NULL just return it.
+ */
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ root = btrfs_lookup_fs_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = read_tree_root_path(fs_info->tree_root, path, &key);
+ btrfs_release_path(path);
+
+ return root;
+}
+
+/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
*/
@@ -1695,7 +1816,7 @@ static int cleaner_kthread(void *arg)
*/
btrfs_delete_unused_bgs(fs_info);
sleep:
- clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
+ clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
if (kthread_should_park())
kthread_parkme();
if (kthread_should_stop())
@@ -1715,13 +1836,13 @@ static int transaction_kthread(void *arg)
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
u64 transid;
- time64_t now;
+ time64_t delta;
unsigned long delay;
bool cannot_commit;
do {
cannot_commit = false;
- delay = HZ * fs_info->commit_interval;
+ delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
@@ -1731,12 +1852,13 @@ static int transaction_kthread(void *arg)
goto sleep;
}
- now = ktime_get_seconds();
+ delta = ktime_get_seconds() - cur->start_time;
if (cur->state < TRANS_STATE_COMMIT_START &&
- (now < cur->start_time ||
- now - cur->start_time < fs_info->commit_interval)) {
+ delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
- delay = HZ * 5;
+ delay -= msecs_to_jiffies((delta - 1) * 1000);
+ delay = min(delay,
+ msecs_to_jiffies(fs_info->commit_interval * 1000));
goto sleep;
}
transid = cur->transid;
@@ -1985,8 +2107,6 @@ void btrfs_put_root(struct btrfs_root *root)
free_anon_bdev(root->anon_dev);
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
- kfree(root->free_ino_ctl);
- kfree(root->free_ino_pinned);
#ifdef CONFIG_BTRFS_DEBUG
spin_lock(&root->fs_info->fs_roots_radix_lock);
list_del_init(&root->leak_list);
@@ -2201,8 +2321,9 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return -ENOMEM;
log_tree_root->node = read_tree_block(fs_info, bytenr,
- fs_info->generation + 1,
- level, NULL);
+ BTRFS_TREE_LOG_OBJECTID,
+ fs_info->generation + 1, level,
+ NULL);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@@ -2247,30 +2368,42 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->extent_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->extent_root = root;
location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->dev_root = root;
+ btrfs_init_devices_late(fs_info);
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->dev_root = root;
- btrfs_init_devices_late(fs_info);
- location.objectid = BTRFS_CSUM_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ /* If IGNOREDATACSUMS is set don't bother reading the csum root. */
+ if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
+ location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->csum_root = root;
+ }
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->csum_root = root;
/*
* This tree can share blocks with some other fs tree during relocation
@@ -2279,11 +2412,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
root = btrfs_get_fs_root(tree_root->fs_info,
BTRFS_DATA_RELOC_TREE_OBJECTID, true);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->data_reloc_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->data_reloc_root = root;
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
@@ -2296,9 +2432,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
location.objectid = BTRFS_UUID_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- if (ret != -ENOENT)
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ if (ret != -ENOENT)
+ goto out;
+ }
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->uuid_root = root;
@@ -2308,11 +2446,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->free_space_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->free_space_root = root;
}
return 0;
@@ -2373,13 +2514,21 @@ static int validate_super(struct btrfs_fs_info *fs_info,
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
ret = -EINVAL;
}
- /* Only PAGE SIZE is supported yet */
- if (sectorsize != PAGE_SIZE) {
+
+ /*
+ * For 4K page size, we only support 4K sector size.
+ * For 64K page size, we support read-write for 64K sector size, and
+ * read-only for 4K sector size.
+ */
+ if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
+ (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
+ sectorsize != SZ_64K))) {
btrfs_err(fs_info,
- "sectorsize %llu not supported yet, only support %lu",
+ "sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
ret = -EINVAL;
}
+
if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
@@ -2568,6 +2717,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
generation = btrfs_super_generation(sb);
level = btrfs_super_root_level(sb);
tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
+ BTRFS_ROOT_TREE_OBJECTID,
generation, level, NULL);
if (IS_ERR(tree_root->node)) {
handle_error = true;
@@ -2591,14 +2741,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
* No need to hold btrfs_root::objectid_mutex since the fs
* hasn't been fully initialised and we are the only user
*/
- ret = btrfs_find_highest_objectid(tree_root,
- &tree_root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(tree_root);
if (ret < 0) {
handle_error = true;
continue;
}
- ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+ ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
ret = btrfs_read_roots(fs_info);
if (ret < 0) {
@@ -2640,11 +2789,13 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
+ spin_lock_init(&fs_info->treelog_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
+ mutex_init(&fs_info->zoned_meta_io_lock);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2732,6 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
/* Usable values until the real ones are cached from the superblock */
fs_info->nodesize = 4096;
fs_info->sectorsize = 4096;
+ fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2748,7 +2900,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
- ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -2774,6 +2926,9 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
return -ENOMEM;
btrfs_init_delayed_root(fs_info->delayed_root);
+ if (sb_rdonly(sb))
+ set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
+
return btrfs_alloc_stripe_hash_table(fs_info);
}
@@ -2814,6 +2969,110 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
return 0;
}
+/*
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount
+ * code paths.
+ */
+void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
+{
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+}
+
+/*
+ * Mounting logic specific to read-write file systems. Shared by open_ctree
+ * and btrfs_remount when remounting from read-only to read-write.
+ */
+int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
+ bool clear_free_space_tree = false;
+
+ if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ clear_free_space_tree = true;
+ } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
+ btrfs_warn(fs_info, "free space tree is invalid");
+ clear_free_space_tree = true;
+ }
+
+ if (clear_free_space_tree) {
+ btrfs_info(fs_info, "clearing free space tree");
+ ret = btrfs_clear_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to clear free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ if (ret)
+ goto out;
+
+ down_read(&fs_info->cleanup_work_sem);
+ if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+ (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
+ up_read(&fs_info->cleanup_work_sem);
+ goto out;
+ }
+ up_read(&fs_info->cleanup_work_sem);
+
+ mutex_lock(&fs_info->cleaner_mutex);
+ ret = btrfs_recover_relocation(fs_info->tree_root);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
+ goto out;
+ }
+
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info, "creating free space tree");
+ ret = btrfs_create_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to create free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
+ ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret)
+ goto out;
+
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info, "failed to resume dev_replace");
+ goto out;
+ }
+
+ btrfs_qgroup_rescan_resume(fs_info);
+
+ if (!fs_info->uuid_root) {
+ btrfs_info(fs_info, "creating UUID tree");
+ ret = btrfs_create_uuid_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to create the UUID tree %d", ret);
+ goto out;
+ }
+ }
+
+ ret = btrfs_find_orphan_roots(fs_info);
+out:
+ return ret;
+}
+
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
char *options)
{
@@ -2829,7 +3088,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
struct btrfs_root *chunk_root;
int ret;
int err = -EINVAL;
- int clear_free_space_tree = 0;
int level;
ret = init_mount_fs_info(fs_info, sb);
@@ -2882,6 +3140,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
+ fs_info->csum_size = btrfs_super_csum_size(disk_super);
+
ret = btrfs_init_csum_hash(fs_info, csum_type);
if (ret) {
err = ret;
@@ -2996,6 +3256,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
/* Cache block sizes */
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
/*
@@ -3026,6 +3288,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
+ /* For 4K sector size support, it's only read-only */
+ if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
+ if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
+ btrfs_err(fs_info,
+ "subpage sectorsize %u only supported read-only for page size %lu",
+ sectorsize, PAGE_SIZE);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+ }
+
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3052,6 +3325,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
chunk_root->node = read_tree_block(fs_info,
btrfs_super_chunk_root(disk_super),
+ BTRFS_CHUNK_TREE_OBJECTID,
generation, level, NULL);
if (IS_ERR(chunk_root->node) ||
!extent_buffer_uptodate(chunk_root->node)) {
@@ -3075,11 +3349,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Keep the devid that is marked to be the target device for the
- * device replace procedure
+ * At this point we know all the devices that make this filesystem,
+ * including the seed devices but we don't know yet if the replace
+ * target is required. So free devices that are not part of this
+ * filesystem but skip the replace traget device which is checked
+ * below in btrfs_init_dev_replace().
*/
- btrfs_free_extra_devids(fs_devices, 0);
-
+ btrfs_free_extra_devids(fs_devices);
if (!fs_devices->latest_bdev) {
btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
@@ -3090,6 +3366,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_tree_roots;
/*
+ * Get zone type information of zoned block devices. This will also
+ * handle emulation of a zoned filesystem if a regular device has the
+ * zoned incompat feature flag set.
+ */
+ ret = btrfs_get_dev_zone_info_all_devices(fs_info);
+ if (ret) {
+ btrfs_err(fs_info,
+ "zoned: failed to read device zone info: %d",
+ ret);
+ goto fail_block_groups;
+ }
+
+ /*
* If we have a uuid root and we're not being told to rescan we need to
* check the generation here so we can set the
* BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
@@ -3126,7 +3415,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_block_groups;
}
- btrfs_free_extra_devids(fs_devices, 1);
+ ret = btrfs_check_zoned_mode(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to initialize zoned mode: %d",
+ ret);
+ goto fail_block_groups;
+ }
ret = btrfs_sysfs_add_fsid(fs_devices);
if (ret) {
@@ -3212,26 +3506,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
}
- ret = btrfs_find_orphan_roots(fs_info);
- if (ret)
- goto fail_qgroup;
-
- if (!sb_rdonly(sb)) {
- ret = btrfs_cleanup_fs_roots(fs_info);
- if (ret)
- goto fail_qgroup;
-
- mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(tree_root);
- mutex_unlock(&fs_info->cleaner_mutex);
- if (ret < 0) {
- btrfs_warn(fs_info, "failed to recover relocation: %d",
- ret);
- err = -EINVAL;
- goto fail_qgroup;
- }
- }
-
fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
@@ -3241,78 +3515,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
if (sb_rdonly(sb))
- return 0;
-
- if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
- btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- clear_free_space_tree = 1;
- } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
- !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
- btrfs_warn(fs_info, "free space tree is invalid");
- clear_free_space_tree = 1;
- }
-
- if (clear_free_space_tree) {
- btrfs_info(fs_info, "clearing free space tree");
- ret = btrfs_clear_free_space_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to clear free space tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- }
-
- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
- !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- btrfs_info(fs_info, "creating free space tree");
- ret = btrfs_create_free_space_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create free space tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- }
+ goto clear_oneshot;
- down_read(&fs_info->cleanup_work_sem);
- if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
- (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
- up_read(&fs_info->cleanup_work_sem);
- close_ctree(fs_info);
- return ret;
- }
- up_read(&fs_info->cleanup_work_sem);
-
- ret = btrfs_resume_balance_async(fs_info);
- if (ret) {
- btrfs_warn(fs_info, "failed to resume balance: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
-
- ret = btrfs_resume_dev_replace_async(fs_info);
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
- btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
close_ctree(fs_info);
return ret;
}
-
- btrfs_qgroup_rescan_resume(fs_info);
btrfs_discard_resume(fs_info);
- if (!fs_info->uuid_root) {
- btrfs_info(fs_info, "creating UUID tree");
- ret = btrfs_create_uuid_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create the UUID tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- } else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
- fs_info->generation !=
- btrfs_super_uuid_tree_generation(disk_super)) {
+ if (fs_info->uuid_root &&
+ (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
+ fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
btrfs_info(fs_info, "checking UUID tree");
ret = btrfs_check_uuid_tree(fs_info);
if (ret) {
@@ -3322,14 +3536,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
return ret;
}
}
- set_bit(BTRFS_FS_OPEN, &fs_info->flags);
- /*
- * backuproot only affect mount behavior, and if open_ctree succeeded,
- * no need to keep the flag
- */
- btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+clear_oneshot:
+ btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
@@ -3410,10 +3621,17 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
{
struct btrfs_super_block *super;
struct page *page;
- u64 bytenr;
+ u64 bytenr, bytenr_orig;
struct address_space *mapping = bdev->bd_inode->i_mapping;
+ int ret;
+
+ bytenr_orig = btrfs_sb_offset(copy_num);
+ ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
+ if (ret == -ENOENT)
+ return ERR_PTR(-EINVAL);
+ else if (ret)
+ return ERR_PTR(ret);
- bytenr = btrfs_sb_offset(copy_num);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
return ERR_PTR(-EINVAL);
@@ -3427,7 +3645,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
return ERR_PTR(-ENODATA);
}
- if (btrfs_super_bytenr(super) != bytenr) {
+ if (btrfs_super_bytenr(super) != bytenr_orig) {
btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
@@ -3482,7 +3700,8 @@ static int write_dev_supers(struct btrfs_device *device,
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
int i;
int errors = 0;
- u64 bytenr;
+ int ret;
+ u64 bytenr, bytenr_orig;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
@@ -3494,12 +3713,22 @@ static int write_dev_supers(struct btrfs_device *device,
struct bio *bio;
struct btrfs_super_block *disk_super;
- bytenr = btrfs_sb_offset(i);
+ bytenr_orig = btrfs_sb_offset(i);
+ ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
+ if (ret == -ENOENT) {
+ continue;
+ } else if (ret < 0) {
+ btrfs_err(device->fs_info,
+ "couldn't get super block location for mirror %d",
+ i);
+ errors++;
+ continue;
+ }
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
- btrfs_set_super_bytenr(sb, bytenr);
+ btrfs_set_super_bytenr(sb, bytenr_orig);
crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
@@ -3544,6 +3773,7 @@ static int write_dev_supers(struct btrfs_device *device,
bio->bi_opf |= REQ_FUA;
btrfsic_submit_bio(bio);
+ btrfs_advance_sb_log(device, i);
}
return errors < i ? 0 : -1;
}
@@ -3560,6 +3790,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
int i;
int errors = 0;
bool primary_failed = false;
+ int ret;
u64 bytenr;
if (max_mirrors == 0)
@@ -3568,7 +3799,15 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
for (i = 0; i < max_mirrors; i++) {
struct page *page;
- bytenr = btrfs_sb_offset(i);
+ ret = btrfs_sb_log_location(device, i, READ, &bytenr);
+ if (ret == -ENOENT) {
+ break;
+ } else if (ret < 0) {
+ errors++;
+ if (i == 0)
+ primary_failed = true;
+ continue;
+ }
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
@@ -3882,14 +4121,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
}
}
- if (root->free_ino_pinned)
- __btrfs_remove_free_space_cache(root->free_ino_pinned);
- if (root->free_ino_ctl)
- __btrfs_remove_free_space_cache(root->free_ino_ctl);
- if (root->ino_cache_inode) {
- iput(root->ino_cache_inode);
- root->ino_cache_inode = NULL;
- }
if (drop_ref)
btrfs_put_root(root);
}
@@ -4001,6 +4232,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
+ cancel_work_sync(&fs_info->preempt_reclaim_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4053,9 +4285,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
- if (percpu_counter_sum(&fs_info->dio_bytes))
+ if (percpu_counter_sum(&fs_info->ordered_bytes))
btrfs_info(fs_info, "at unmount dio bytes count %lld",
- percpu_counter_sum(&fs_info->dio_bytes));
+ percpu_counter_sum(&fs_info->ordered_bytes));
btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -4069,6 +4301,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
+ /* We shouldn't have any transaction open at this point */
+ ASSERT(list_empty(&fs_info->trans_list));
+
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
free_root_pointers(fs_info, true);
btrfs_free_fs_roots(fs_info);
@@ -4112,8 +4347,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info = buf->fs_info;
u64 transid = btrfs_header_generation(buf);
int was_dirty;
@@ -4126,8 +4360,6 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
- root = BTRFS_I(buf->pages[0]->mapping->host)->root;
- fs_info = root->fs_info;
btrfs_assert_tree_locked(buf);
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
@@ -4576,6 +4808,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
+ btrfs_free_redirty_list(cur_trans);
+
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
}
@@ -4632,3 +4866,58 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
+
+int btrfs_init_root_free_objectid(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *l;
+ struct btrfs_key search_key;
+ struct btrfs_key found_key;
+ int slot;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+ search_key.type = -1;
+ search_key.offset = (u64)-1;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0); /* Corruption */
+ if (path->slots[0] > 0) {
+ slot = path->slots[0] - 1;
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ root->free_objectid = max_t(u64, found_key.objectid + 1,
+ BTRFS_FIRST_FREE_OBJECTID);
+ } else {
+ root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
+{
+ int ret;
+ mutex_lock(&root->objectid_mutex);
+
+ if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ btrfs_warn(root->fs_info,
+ "the objectid of root %llu reaches its highest value",
+ root->root_key.objectid);
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ *objectid = root->free_objectid++;
+ ret = 0;
+out:
+ mutex_unlock(&root->objectid_mutex);
+ return ret;
+}