Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--  fs/btrfs/extent_io.c  1427
1 file changed, 1028 insertions(+), 399 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 60f5f68d892d..4dfb3ead1175 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -24,6 +24,9 @@
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
+#include "subpage.h"
+#include "zoned.h"
+#include "block-group.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -142,7 +145,7 @@ struct extent_page_data {
unsigned int sync_io:1;
};
-static int add_extent_changeset(struct extent_state *state, unsigned bits,
+static int add_extent_changeset(struct extent_state *state, u32 bits,
struct extent_changeset *changeset,
int set)
{
@@ -389,16 +392,16 @@ do_insert:
}
/**
- * __etree_search - searche @tree for an entry that contains @offset. Such
- * entry would have entry->start <= offset && entry->end >= offset.
+ * Search @tree for an entry that contains @offset. Such entry would have
+ * entry->start <= offset && entry->end >= offset.
*
- * @tree - the tree to search
- * @offset - offset that should fall within an entry in @tree
- * @next_ret - pointer to the first entry whose range ends after @offset
- * @prev - pointer to the first entry whose range begins before @offset
- * @p_ret - pointer where new node should be anchored (used when inserting an
- * entry in the tree)
- * @parent_ret - points to entry which would have been the parent of the entry,
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ * @p_ret: pointer where new node should be anchored (used when inserting an
+ * entry in the tree)
+ * @parent_ret: points to entry which would have been the parent of the entry,
* containing @offset
*
* This function returns a pointer to the entry that contains @offset byte
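A minimal standalone sketch of the containment contract described above, assuming only the kernel rbtree API and the start/end/rb_node fields of struct extent_state; this is illustrative, not the kernel helper itself:

/*
 * Illustrative sketch: return the range entry containing @offset, or NULL.
 * Mirrors the containment contract documented for the tree search above.
 */
static struct extent_state *range_search(struct rb_root *root, u64 offset)
{
	struct rb_node *node = root->rb_node;

	while (node) {
		struct extent_state *entry = rb_entry(node, struct extent_state,
						      rb_node);

		if (offset < entry->start)
			node = node->rb_left;
		else if (offset > entry->end)
			node = node->rb_right;
		else
			return entry;	/* entry->start <= offset <= entry->end */
	}
	return NULL;
}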
@@ -530,7 +533,7 @@ static void merge_state(struct extent_io_tree *tree,
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits,
+ struct extent_state *state, u32 *bits,
struct extent_changeset *changeset);
/*
@@ -547,7 +550,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned *bits, struct extent_changeset *changeset)
+ u32 *bits, struct extent_changeset *changeset)
{
struct rb_node *node;
@@ -628,11 +631,11 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, int wake,
+ u32 *bits, int wake,
struct extent_changeset *changeset)
{
struct extent_state *next;
- unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
int ret;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
@@ -676,9 +679,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
- struct inode *inode = tree->private_data;
-
- btrfs_panic(btrfs_sb(inode->i_sb), err,
+ btrfs_panic(tree->fs_info, err,
"locking error: extent tree was modified by another thread while locked");
}
@@ -695,9 +696,9 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
+ u32 bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
@@ -868,7 +869,7 @@ static void wait_on_state(struct extent_io_tree *tree,
* The tree lock is taken by this function
*/
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits)
+ u32 bits)
{
struct extent_state *state;
struct rb_node *node;
@@ -915,9 +916,9 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, struct extent_changeset *changeset)
+ u32 *bits, struct extent_changeset *changeset)
{
- unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
if (tree->private_data && is_data_inode(tree->private_data))
@@ -961,12 +962,10 @@ static void cache_state(struct extent_state *state,
*
* [start, end] is inclusive This takes the tree lock.
*/
-
-static int __must_check
-__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned exclusive_bits,
- u64 *failed_start, struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ u32 exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask,
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -980,6 +979,10 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+ if (exclusive_bits)
+ ASSERT(failed_start);
+ else
+ ASSERT(failed_start == NULL);
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
@@ -1179,15 +1182,6 @@ out:
}
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 * failed_start,
- struct extent_state **cached_state, gfp_t mask)
-{
- return __set_extent_bit(tree, start, end, bits, 0, failed_start,
- cached_state, mask, NULL);
-}
-
-
/**
* convert_extent_bit - convert all bits in a given range from one bit to
* another
@@ -1207,7 +1201,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* All allocations are done with GFP_NOFS.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
+ u32 bits, u32 clear_bits,
struct extent_state **cached_state)
{
struct extent_state *state;
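A hedged usage sketch for the conversion helper, based only on the signature shown above; the particular bits are just an example of the atomic set+clear a caller would want (GFP_NOFS allocation happens internally, as noted):

/*
 * Illustrative sketch: atomically switch a range from EXTENT_DIRTY to
 * EXTENT_NEED_WAIT in one pass, instead of a separate clear + set that could
 * let another thread observe the intermediate state.
 */
static int demo_convert_range(struct extent_io_tree *tree, u64 start, u64 end,
			      struct extent_state **cached)
{
	return convert_extent_bit(tree, start, end, EXTENT_NEED_WAIT,
				  EXTENT_DIRTY, cached);
}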
@@ -1408,7 +1402,7 @@ out:
/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset)
+ u32 bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCKED yet, as current changeset will
@@ -1418,19 +1412,19 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
BUG_ON(bits & EXTENT_LOCKED);
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- changeset);
+ return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
+ changeset);
}
int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits)
+ u32 bits)
{
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
- GFP_NOWAIT, NULL);
+ return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
+ GFP_NOWAIT, NULL);
}
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
+ u32 bits, int wake, int delete,
struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, bits, wake, delete,
@@ -1438,7 +1432,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset)
+ u32 bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCKED case, same reason as
@@ -1461,9 +1455,9 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u64 failed_start;
while (1) {
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
- EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS, NULL);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -1479,8 +1473,8 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
int err;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS, NULL);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, GFP_NOFS, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
@@ -1526,8 +1520,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
* nothing was found after 'start'
*/
static struct extent_state *
-find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, unsigned bits)
+find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
{
struct rb_node *node;
struct extent_state *state;
@@ -1554,14 +1547,15 @@ out:
}
/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
+ * Find the first offset in the io tree with one or more @bits set.
*
- * If nothing was found, 1 is returned. If found something, return 0.
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
+ u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state)
{
struct extent_state *state;
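A short sketch of the usual caller pattern for the 0 = found / 1 = nothing convention documented above; the bit chosen is only an example:

/* Illustrative sketch: visit every range with EXTENT_DIRTY set, in order. */
static void demo_walk_dirty_ranges(struct extent_io_tree *tree)
{
	u64 start = 0;
	u64 found_start;
	u64 found_end;

	while (!find_first_extent_bit(tree, start, &found_start, &found_end,
				      EXTENT_DIRTY, NULL)) {
		/* [found_start, found_end] is an inclusive range */
		start = found_end + 1;
	}
}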
@@ -1597,12 +1591,13 @@ out:
}
/**
- * find_contiguous_extent_bit: find a contiguous area of bits
- * @tree - io tree to check
- * @start - offset to start the search from
- * @start_ret - the first offset we found with the bits set
- * @end_ret - the final contiguous range of the bits that were set
- * @bits - bits to look for
+ * Find a contiguous area of bits
+ *
+ * @tree: io tree to check
+ * @start: offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret: the final contiguous range of the bits that were set
+ * @bits: bits to look for
*
* set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
* to set bits appropriately, and then merge them again. During this time it
@@ -1612,7 +1607,7 @@ out:
* returned will be the full contiguous area with the bits set.
*/
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits)
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
int ret = 1;
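A hedged sketch contrasting this helper with find_first_extent_bit(): the caller gets the full contiguous run even if the underlying states were temporarily split. The bit and the 0/1 return convention are assumptions drawn from the surrounding code:

/*
 * Illustrative sketch: report one full contiguous EXTENT_DELALLOC run at or
 * after @start (assumed to return 0 when found, 1 when nothing was found).
 */
static int demo_delalloc_run(struct extent_io_tree *tree, u64 start,
			     u64 *run_start, u64 *run_end)
{
	return find_contiguous_extent_bit(tree, start, run_start, run_end,
					  EXTENT_DELALLOC);
}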
@@ -1634,14 +1629,14 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
}
/**
- * find_first_clear_extent_bit - find the first range that has @bits not set.
- * This range could start before @start.
+ * Find the first range that has @bits not set. This range could start before
+ * @start.
*
- * @tree - the tree to search
- * @start - the offset at/after which the found extent should start
- * @start_ret - records the beginning of the range
- * @end_ret - records the end of the range (inclusive)
- * @bits - the set of bits which must be unset
+ * @tree: the tree to search
+ * @start: offset at/after which the found extent should start
+ * @start_ret: records the beginning of the range
+ * @end_ret: records the end of the range (inclusive)
+ * @bits: the set of bits which must be unset
*
* Since unallocated range is also considered one which doesn't have the bits
* set it's possible that @end_ret contains -1, this happens in case the range
@@ -1649,7 +1644,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
* trim @end_ret to the appropriate size.
*/
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits)
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
struct rb_node *node, *prev = NULL, *next;
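A hedged sketch of consuming the output described above, including the open-ended -1 case; CHUNK_ALLOCATED and the trimming context are assumptions about a typical caller, not something mandated by this hunk:

/*
 * Illustrative sketch: find the first hole without CHUNK_ALLOCATED set at or
 * after @start, clamping the open-ended case to the device size.
 */
static void demo_find_hole(struct extent_io_tree *tree, u64 start,
			   u64 device_size, u64 *hole_start, u64 *hole_end)
{
	find_first_clear_extent_bit(tree, start, hole_start, hole_end,
				    CHUNK_ALLOCATED);
	if (*hole_end == (u64)-1)
		*hole_end = device_size - 1;
}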
@@ -1946,7 +1941,7 @@ static int __process_pages_contig(struct address_space *mapping,
unsigned long page_ops, pgoff_t *index_ret)
{
unsigned long nr_pages = end_index - start_index + 1;
- unsigned long pages_locked = 0;
+ unsigned long pages_processed = 0;
pgoff_t index = start_index;
struct page *pages[16];
unsigned ret;
@@ -1981,13 +1976,13 @@ static int __process_pages_contig(struct address_space *mapping,
if (locked_page && pages[i] == locked_page) {
put_page(pages[i]);
- pages_locked++;
+ pages_processed++;
continue;
}
- if (page_ops & PAGE_CLEAR_DIRTY)
+ if (page_ops & PAGE_START_WRITEBACK) {
clear_page_dirty_for_io(pages[i]);
- if (page_ops & PAGE_SET_WRITEBACK)
set_page_writeback(pages[i]);
+ }
if (page_ops & PAGE_SET_ERROR)
SetPageError(pages[i]);
if (page_ops & PAGE_END_WRITEBACK)
@@ -2006,7 +2001,7 @@ static int __process_pages_contig(struct address_space *mapping,
}
}
put_page(pages[i]);
- pages_locked++;
+ pages_processed++;
}
nr_pages -= ret;
index += ret;
@@ -2014,14 +2009,13 @@ static int __process_pages_contig(struct address_space *mapping,
}
out:
if (err && index_ret)
- *index_ret = start_index + pages_locked - 1;
+ *index_ret = start_index + pages_processed - 1;
return err;
}
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned clear_bits,
- unsigned long page_ops)
+ u32 clear_bits, unsigned long page_ops)
{
clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
@@ -2037,7 +2031,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- unsigned bits, int contig)
+ u32 bits, int contig)
{
struct rb_node *node;
struct extent_state *state;
@@ -2157,7 +2151,7 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled, struct extent_state *cached)
+ u32 bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
@@ -2266,6 +2260,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_repair_one_zone(fs_info, logical);
+
bio = btrfs_io_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
@@ -2642,7 +2639,7 @@ static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
}
blk_status_t btrfs_submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u64 phy_offset,
+ struct bio *failed_bio, u32 bio_offset,
struct page *page, unsigned int pgoff,
u64 start, u64 end, int failed_mirror,
submit_bio_hook_t *submit_bio_hook)
@@ -2652,7 +2649,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
- const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits;
+ const int icsum = bio_offset >> fs_info->sectorsize_bits;
bool need_validation;
struct bio *repair_bio;
struct btrfs_io_bio *repair_io_bio;
@@ -2685,7 +2682,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
repair_bio->bi_private = failed_bio->bi_private;
if (failed_io_bio->csum) {
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
repair_io_bio->csum = repair_io_bio->csum_inline;
memcpy(repair_io_bio->csum,
@@ -2742,6 +2739,7 @@ static void end_bio_extent_writepage(struct bio *bio)
u64 start;
u64 end;
struct bvec_iter_all iter_all;
+ bool first_bvec = true;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
@@ -2768,6 +2766,11 @@ static void end_bio_extent_writepage(struct bio *bio)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
+ if (first_bvec) {
+ btrfs_record_physical_zoned(inode, start, bio);
+ first_bvec = false;
+ }
+
end_extent_writepage(page, error, start, end);
end_page_writeback(page);
}
@@ -2775,16 +2778,111 @@ static void end_bio_extent_writepage(struct bio *bio)
bio_put(bio);
}
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
- int uptodate)
+/*
+ * Record previously processed extent range
+ *
+ * This allows endio_readpage_release_extent() to handle a full extent range,
+ * reducing the number of extent io operations.
+ */
+struct processed_extent {
+ struct btrfs_inode *inode;
+ /* Start of the range in @inode */
+ u64 start;
+ /* End of the range in @inode */
+ u64 end;
+ bool uptodate;
+};
+
+/*
+ * Try to release processed extent range
+ *
+ * May not release the extent range right now if the current range is
+ * contiguous to processed extent.
+ *
+ * Will release the processed extent when @inode or @uptodate differs, or when
+ * the range is no longer contiguous to the processed range.
+ *
+ * Passing @inode == NULL will force processed extent to be released.
+ */
+static void endio_readpage_release_extent(struct processed_extent *processed,
+ struct btrfs_inode *inode, u64 start, u64 end,
+ bool uptodate)
{
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ struct extent_io_tree *tree;
- if (uptodate && tree->track_uptodate)
- set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(tree, start, end, &cached);
+ /* The first extent, initialize @processed */
+ if (!processed->inode)
+ goto update;
+
+ /*
+ * Contiguous to processed extent, just update the end.
+ *
+ * Several things to notice:
+ *
+ * - bio can be merged as long as on-disk bytenr is contiguous
+ * This means we can have pages belonging to other inodes, thus we need
+ * to check if the inode still matches.
+ * - bvec can contain range beyond current page for multi-page bvec
+ * Thus we need to do processed->end + 1 >= start check
+ */
+ if (processed->inode == inode && processed->uptodate == uptodate &&
+ processed->end + 1 >= start && end >= processed->end) {
+ processed->end = end;
+ return;
+ }
+
+ tree = &processed->inode->io_tree;
+ /*
+ * The current range is not contiguous with the processed range, so
+ * release the processed range now.
+ */
+ if (processed->uptodate && tree->track_uptodate)
+ set_extent_uptodate(tree, processed->start, processed->end,
+ &cached, GFP_ATOMIC);
+ unlock_extent_cached_atomic(tree, processed->start, processed->end,
+ &cached);
+
+update:
+ /* Update processed to current range */
+ processed->inode = inode;
+ processed->start = start;
+ processed->end = end;
+ processed->uptodate = uptodate;
+}
+
+static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
+{
+ ASSERT(PageLocked(page));
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page));
+ btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+}
+
+static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+
+ if (uptodate) {
+ btrfs_page_set_uptodate(fs_info, page, start, len);
+ } else {
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
+ }
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ unlock_page(page);
+ else if (is_data_inode(page->mapping->host))
+ /*
+ * For subpage data, unlock the page if we're the last reader.
+ * For subpage metadata, page lock is not utilized for read.
+ */
+ btrfs_subpage_end_reader(fs_info, page, start, len);
}
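A simplified, hedged model of the subpage reader counting that begin_page_read()/end_page_read() rely on; the real bookkeeping lives in struct btrfs_subpage in subpage.c, and the names below are hypothetical:

/*
 * Simplified model: every in-flight sector read holds one reference and the
 * page may only be unlocked by the reader that drops the count to zero.
 */
struct demo_subpage {
	atomic_t readers;
};

static void demo_start_reader(struct demo_subpage *sp, int nr_sectors)
{
	atomic_add(nr_sectors, &sp->readers);
}

static bool demo_end_reader(struct demo_subpage *sp, int nr_sectors)
{
	/* True when we were the last reader and may unlock the page */
	return atomic_sub_and_test(nr_sectors, &sp->readers);
}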
/*
@@ -2804,12 +2902,12 @@ static void end_bio_extent_readpage(struct bio *bio)
int uptodate = !bio->bi_status;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree, *failure_tree;
- u64 offset = 0;
- u64 start;
- u64 end;
- u64 len;
- u64 extent_start = 0;
- u64 extent_len = 0;
+ struct processed_extent processed = { 0 };
+ /*
+ * The offset to the beginning of a bio. Since one bio can never be
+ * larger than UINT_MAX, u32 here is enough.
+ */
+ u32 bio_offset = 0;
int mirror;
int ret;
struct bvec_iter_all iter_all;
@@ -2819,42 +2917,48 @@ static void end_bio_extent_readpage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 start;
+ u64 end;
+ u32 len;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- (u64)bio->bi_iter.bi_sector, bio->bi_status,
+ bio->bi_iter.bi_sector, bio->bi_status,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
- /* We always issue full-page reads, but if some block
- * in a page fails to read, blk_update_request() will
- * advance bv_offset and adjust bv_len to compensate.
- * Print a warning for nonzero offsets, and an error
- * if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
- btrfs_err(fs_info,
- "partial page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else
- btrfs_info(fs_info,
- "incomplete page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- }
-
- start = page_offset(page);
- end = start + bvec->bv_offset + bvec->bv_len - 1;
+ /*
+ * We always issue full-sector reads, but if some block in a
+ * page fails to read, blk_update_request() will advance
+ * bv_offset and adjust bv_len to compensate. Print a warning
+ * for unaligned offsets, and an error if they don't add up to
+ * a full sector.
+ */
+ if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ btrfs_err(fs_info,
+ "partial page read in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
+ sectorsize))
+ btrfs_info(fs_info,
+ "incomplete page read with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+
+ start = page_offset(page) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
len = bvec->bv_len;
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
if (is_data_inode(inode))
- ret = btrfs_verify_data_csum(io_bio, offset, page,
- start, end, mirror);
+ ret = btrfs_verify_data_csum(io_bio,
+ bio_offset, page, start, end,
+ mirror);
else
ret = btrfs_validate_metadata_buffer(io_bio,
- offset, page, start, end, mirror);
+ page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2879,12 +2983,14 @@ static void end_bio_extent_readpage(struct bio *bio)
* If it can't handle the error it will return -EIO and
* we remain responsible for that page.
*/
- if (!btrfs_submit_read_repair(inode, bio, offset, page,
+ if (!btrfs_submit_read_repair(inode, bio, bio_offset,
+ page,
start - page_offset(page),
start, end, mirror,
btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
- offset += len;
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
continue;
}
} else {
@@ -2908,40 +3014,17 @@ readpage_ok:
off = offset_in_page(i_size);
if (page->index == end_index && off)
zero_user_segment(page, off, PAGE_SIZE);
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
}
- unlock_page(page);
- offset += len;
-
- if (unlikely(!uptodate)) {
- if (extent_len) {
- endio_readpage_release_extent(tree,
- extent_start,
- extent_len, 1);
- extent_start = 0;
- extent_len = 0;
- }
- endio_readpage_release_extent(tree, start,
- end - start + 1, 0);
- } else if (!extent_len) {
- extent_start = start;
- extent_len = end + 1 - start;
- } else if (extent_start + extent_len == start) {
- extent_len += end + 1 - start;
- } else {
- endio_readpage_release_extent(tree, extent_start,
- extent_len, uptodate);
- extent_start = start;
- extent_len = end + 1 - start;
- }
- }
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
- if (extent_len)
- endio_readpage_release_extent(tree, extent_start, extent_len,
- uptodate);
+ /* Update page status and unlock */
+ end_page_read(page, uptodate, start, len);
+ endio_readpage_release_extent(&processed, BTRFS_I(inode),
+ start, end, uptodate);
+ }
+ /* Release the last extent */
+ endio_readpage_release_extent(&processed, NULL, 0, 0, false);
btrfs_io_bio_free_csum(io_bio);
bio_put(bio);
}
@@ -3011,14 +3094,67 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
return bio;
}
+/**
+ * Attempt to add a page to bio
+ *
+ * @bio: destination bio
+ * @page: page to add to the bio
+ * @disk_bytenr: logical bytenr of the new bio, also used to check whether we
+ * are adding a contiguous page to the previous one
+ * @pg_offset: starting offset in the page
+ * @size: portion of page that we want to write
+ * @prev_bio_flags: flags of previous bio to see if we can merge the current one
+ * @bio_flags: flags of the current bio to see if we can merge them
+ * @return: true if page was added, false otherwise
+ *
+ * Attempt to add a page to bio considering stripe alignment etc.
+ *
+ * Return true if the page was successfully added. Otherwise, return false.
+ */
+static bool btrfs_bio_add_page(struct bio *bio, struct page *page,
+ u64 disk_bytenr, unsigned int size,
+ unsigned int pg_offset,
+ unsigned long prev_bio_flags,
+ unsigned long bio_flags)
+{
+ const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
+ bool contig;
+ int ret;
+
+ if (prev_bio_flags != bio_flags)
+ return false;
+
+ if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
+ contig = bio->bi_iter.bi_sector == sector;
+ else
+ contig = bio_end_sector(bio) == sector;
+ if (!contig)
+ return false;
+
+ if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags))
+ return false;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct page *first_page = bio_first_bvec_all(bio)->bv_page;
+
+ if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size))
+ return false;
+ ret = bio_add_zone_append_page(bio, page, size, pg_offset);
+ } else {
+ ret = bio_add_page(bio, page, size, pg_offset);
+ }
+
+ return ret == size;
+}
+
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
* @page: page to add to the bio
+ * @disk_bytenr: logical bytenr where the write will be
+ * @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
- * @size: portion of page that we want to write
- * @offset: starting offset in the page
* @bio_ret: must be valid pointer, newly allocated bio will be stored there
* @end_io_func: end_io callback for new bio
* @mirror_num: desired mirror to read/write
@@ -3027,7 +3163,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
*/
static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
- struct page *page, u64 offset,
+ struct page *page, u64 disk_bytenr,
size_t size, unsigned long pg_offset,
struct bio **bio_ret,
bio_end_io_t end_io_func,
@@ -3038,28 +3174,18 @@ static int submit_extent_page(unsigned int opf,
{
int ret = 0;
struct bio *bio;
- size_t page_size = min_t(size_t, size, PAGE_SIZE);
- sector_t sector = offset >> 9;
- struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+ size_t io_size = min_t(size_t, size, PAGE_SIZE);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct extent_io_tree *tree = &inode->io_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
ASSERT(bio_ret);
if (*bio_ret) {
- bool contig;
- bool can_merge = true;
-
bio = *bio_ret;
- if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
- contig = bio->bi_iter.bi_sector == sector;
- else
- contig = bio_end_sector(bio) == sector;
-
- if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
- can_merge = false;
-
- if (prev_bio_flags != bio_flags || !contig || !can_merge ||
- force_bio_submit ||
- bio_add_page(bio, page, page_size, pg_offset) < page_size) {
+ if (force_bio_submit ||
+ !btrfs_bio_add_page(bio, page, disk_bytenr, io_size,
+ pg_offset, prev_bio_flags, bio_flags)) {
ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
if (ret < 0) {
*bio_ret = NULL;
@@ -3068,13 +3194,13 @@ static int submit_extent_page(unsigned int opf,
bio = NULL;
} else {
if (wbc)
- wbc_account_cgroup_owner(wbc, page, page_size);
+ wbc_account_cgroup_owner(wbc, page, io_size);
return 0;
}
}
- bio = btrfs_bio_alloc(offset);
- bio_add_page(bio, page, page_size, pg_offset);
+ bio = btrfs_bio_alloc(disk_bytenr);
+ bio_add_page(bio, page, io_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
bio->bi_write_hint = page->mapping->host->i_write_hint;
@@ -3082,10 +3208,25 @@ static int submit_extent_page(unsigned int opf,
if (wbc) {
struct block_device *bdev;
- bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
+ bdev = fs_info->fs_devices->latest_bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, page, page_size);
+ wbc_account_cgroup_owner(wbc, page, io_size);
+ }
+ if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct extent_map *em;
+ struct map_lookup *map;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ /* We only support single profile for now */
+ ASSERT(map->num_stripes == 1);
+ btrfs_io_bio(bio)->device = map->stripes[0].dev;
+
+ free_extent_map(em);
}
*bio_ret = bio;
@@ -3093,19 +3234,78 @@ static int submit_extent_page(unsigned int opf,
return ret;
}
-static void attach_extent_buffer_page(struct extent_buffer *eb,
- struct page *page)
+static int attach_extent_buffer_page(struct extent_buffer *eb,
+ struct page *page,
+ struct btrfs_subpage *prealloc)
{
- if (!PagePrivate(page))
- attach_page_private(page, eb);
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ int ret = 0;
+
+ /*
+ * If the page is mapped to btree inode, we should hold the private
+ * lock to prevent race.
+ * For cloned or dummy extent buffers, their pages are not mapped and
+ * will not race with any other ebs.
+ */
+ if (page->mapping)
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ if (!PagePrivate(page))
+ attach_page_private(page, eb);
+ else
+ WARN_ON(page->private != (unsigned long)eb);
+ return 0;
+ }
+
+ /* Already mapped, just free prealloc */
+ if (PagePrivate(page)) {
+ btrfs_free_subpage(prealloc);
+ return 0;
+ }
+
+ if (prealloc)
+ /* Has preallocated memory for subpage */
+ attach_page_private(page, prealloc);
else
- WARN_ON(page->private != (unsigned long)eb);
+ /* Do new allocation to attach subpage */
+ ret = btrfs_attach_subpage(fs_info, page,
+ BTRFS_SUBPAGE_METADATA);
+ return ret;
+}
+
+int set_page_extent_mapped(struct page *page)
+{
+ struct btrfs_fs_info *fs_info;
+
+ ASSERT(page->mapping);
+
+ if (PagePrivate(page))
+ return 0;
+
+ fs_info = btrfs_sb(page->mapping->host->i_sb);
+
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
+
+ attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ return 0;
}
-void set_page_extent_mapped(struct page *page)
+void clear_page_extent_mapped(struct page *page)
{
+ struct btrfs_fs_info *fs_info;
+
+ ASSERT(page->mapping);
+
if (!PagePrivate(page))
- attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ return;
+
+ fs_info = btrfs_sb(page->mapping->host->i_sb);
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return btrfs_detach_subpage(fs_info, page);
+
+ detach_page_private(page);
}
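Since set_page_extent_mapped() can now fail (the subpage attach allocates memory), here is a minimal sketch of the caller pattern the rest of this patch applies; the caller name is hypothetical:

/* Illustrative sketch: callers must now check the attach result. */
static int demo_prepare_page(struct page *page)
{
	int ret;

	ret = set_page_extent_mapped(page);
	if (ret < 0)
		return ret;
	/* page->private now holds EXTENT_PAGE_PRIVATE or a btrfs_subpage */
	return 0;
}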
static struct extent_map *
@@ -3146,6 +3346,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 start = page_offset(page);
const u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
@@ -3158,17 +3359,23 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
int nr = 0;
size_t pg_offset = 0;
size_t iosize;
- size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
unsigned long this_bio_flag = 0;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- set_page_extent_mapped(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_extent(tree, start, end);
+ btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
+ unlock_page(page);
+ goto out;
+ }
if (!PageUptodate(page)) {
if (cleancache_get_page(page) == 0) {
BUG_ON(blocksize != PAGE_SIZE);
unlock_extent(tree, start, end);
+ unlock_page(page);
goto out;
}
}
@@ -3185,9 +3392,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
kunmap_atomic(userpage);
}
}
+ begin_page_read(fs_info, page);
while (cur <= end) {
bool force_bio_submit = false;
- u64 offset;
+ u64 disk_bytenr;
if (cur >= last_byte) {
char *userpage;
@@ -3202,13 +3410,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
cur + iosize - 1, &cached);
+ end_page_read(page, true, cur, iosize);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
- SetPageError(page);
unlock_extent(tree, cur, end);
+ end_page_read(page, false, cur, end + 1 - cur);
break;
}
extent_offset = cur - em->start;
@@ -3224,13 +3433,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
- if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
- disk_io_size = em->block_len;
- offset = em->block_start;
- } else {
- offset = em->block_start + extent_offset;
- disk_io_size = iosize;
- }
+ if (this_bio_flag & EXTENT_BIO_COMPRESSED)
+ disk_bytenr = em->block_start;
+ else
+ disk_bytenr = em->block_start + extent_offset;
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
@@ -3294,6 +3500,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
cur + iosize - 1, &cached);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3303,6 +3510,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3311,15 +3519,15 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* to date. Error out
*/
if (block_start == EXTENT_MAP_INLINE) {
- SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1);
+ end_page_read(page, false, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- page, offset, disk_io_size,
+ page, disk_bytenr, iosize,
pg_offset, bio,
end_bio_extent_readpage, 0,
*bio_flags,
@@ -3329,19 +3537,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
nr++;
*bio_flags = this_bio_flag;
} else {
- SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1);
+ end_page_read(page, false, cur, iosize);
goto out;
}
cur = cur + iosize;
pg_offset += iosize;
}
out:
- if (!nr) {
- if (!PageError(page))
- SetPageUptodate(page);
- unlock_page(page);
- }
return ret;
}
@@ -3461,23 +3664,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
unsigned long nr_written,
int *nr_ret)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
- u64 end;
+ u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
u64 extent_offset;
u64 block_start;
- u64 iosize;
struct extent_map *em;
- size_t pg_offset = 0;
- size_t blocksize;
int ret = 0;
int nr = 0;
+ u32 opf = REQ_OP_WRITE;
const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;
- ret = btrfs_writepage_cow_fixup(page, start, page_end);
+ ret = btrfs_writepage_cow_fixup(page, start, end);
if (ret) {
/* Fixup worker will requeue */
redirty_page_for_writepage(wbc, page);
@@ -3492,16 +3693,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
*/
update_nr_written(wbc, nr_written + 1);
- end = page_end;
- blocksize = inode->vfs_inode.i_sb->s_blocksize;
-
while (cur <= end) {
+ u64 disk_bytenr;
u64 em_end;
- u64 offset;
+ u32 iosize;
if (cur >= i_size) {
- btrfs_writepage_endio_finish_ordered(page, cur,
- page_end, 1);
+ btrfs_writepage_endio_finish_ordered(page, cur, end, 1);
break;
}
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
@@ -3513,13 +3711,20 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
extent_offset = cur - em->start;
em_end = extent_map_end(em);
- BUG_ON(em_end <= cur);
- BUG_ON(end < cur);
- iosize = min(em_end - cur, end - cur + 1);
- iosize = ALIGN(iosize, blocksize);
- offset = em->block_start + extent_offset;
+ ASSERT(cur <= em_end);
+ ASSERT(cur < end);
+ ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
+ ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
block_start = em->block_start;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ disk_bytenr = em->block_start + extent_offset;
+
+ /* Note that em_end from extent_map_end() is exclusive */
+ iosize = min(em_end, end + 1) - cur;
+
+ if (btrfs_use_zone_append(inode, em))
+ opf = REQ_OP_ZONE_APPEND;
+
free_extent_map(em);
em = NULL;
@@ -3535,7 +3740,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
btrfs_writepage_endio_finish_ordered(page, cur,
cur + iosize - 1, 1);
cur += iosize;
- pg_offset += iosize;
continue;
}
@@ -3546,9 +3750,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
page->index, cur, end);
}
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- page, offset, iosize, pg_offset,
- &epd->bio,
+ ret = submit_extent_page(opf | write_flags, wbc, page,
+ disk_bytenr, iosize,
+ cur - page_offset(page), &epd->bio,
end_bio_extent_writepage,
0, 0, 0, false);
if (ret) {
@@ -3557,8 +3761,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
end_page_writeback(page);
}
- cur = cur + iosize;
- pg_offset += iosize;
+ cur += iosize;
nr++;
}
*nr_ret = nr;
@@ -3611,7 +3814,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
flush_dcache_page(page);
}
- set_page_extent_mapped(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ SetPageError(page);
+ goto done;
+ }
if (!epd->extent_locked) {
ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
@@ -3656,11 +3863,14 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
}
/*
- * Lock eb pages and flush the bio if we can't the locks
+ * Lock extent buffer status and pages for writeback.
+ *
+ * May try to flush write bio if we can't get the lock.
*
- * Return 0 if nothing went wrong
- * Return >0 is same as 0, except bio is not submitted
- * Return <0 if something went wrong, no page is locked
+ * Return 0 if the extent buffer doesn't need to be submitted.
+ * (E.g. the extent buffer is not dirty)
+ * Return >0 if the extent buffer is submitted to bio.
+ * Return <0 if something went wrong, no page is locked.
*/
static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
struct extent_page_data *epd)
@@ -3868,7 +4078,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
- u64 offset = eb->start;
+ u64 disk_bytenr = eb->start;
u32 nritems;
int i, num_pages;
unsigned long start, end;
@@ -3901,7 +4111,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- p, offset, PAGE_SIZE, 0,
+ p, disk_bytenr, PAGE_SIZE, 0,
&epd->bio,
end_bio_extent_buffer_writepage,
0, 0, 0, false);
@@ -3914,7 +4124,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
ret = -EIO;
break;
}
- offset += PAGE_SIZE;
+ disk_bytenr += PAGE_SIZE;
update_nr_written(wbc, 1);
unlock_page(p);
}
@@ -3930,10 +4140,100 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
return ret;
}
+/*
+ * Submit all page(s) of one extent buffer.
+ *
+ * @page: the page of one extent buffer
+ * @eb_context: to determine if we need to submit this page, if current page
+ * belongs to this eb, we don't need to submit
+ *
+ * The caller should pass each page in bytenr order, and here we use
+ * @eb_context to determine if we have submitted pages of one extent buffer.
+ *
+ * If we have, we just skip until we hit a new page that doesn't belong to
+ * current @eb_context.
+ *
+ * If not, we submit all the page(s) of the extent buffer.
+ *
+ * Return >0 if we have submitted the extent buffer successfully.
+ * Return 0 if we don't need to submit the page, as it's already submitted by
+ * previous call.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_page(struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ struct extent_buffer **eb_context)
+{
+ struct address_space *mapping = page->mapping;
+ struct btrfs_block_group *cache = NULL;
+ struct extent_buffer *eb;
+ int ret;
+
+ if (!PagePrivate(page))
+ return 0;
+
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * Shouldn't happen and normally this would be a BUG_ON but no point
+ * crashing the machine for something we can survive anyway.
+ */
+ if (WARN_ON(!eb)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ if (eb == *eb_context) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
+ return 0;
+
+ if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
+ /*
+ * If for_sync, this hole will be filled with
+ * trasnsaction commit.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ ret = -EAGAIN;
+ else
+ ret = 0;
+ free_extent_buffer(eb);
+ return ret;
+ }
+
+ *eb_context = eb;
+
+ ret = lock_extent_buffer_for_io(eb, epd);
+ if (ret <= 0) {
+ btrfs_revert_meta_write_pointer(cache, eb);
+ if (cache)
+ btrfs_put_block_group(cache);
+ free_extent_buffer(eb);
+ return ret;
+ }
+ if (cache)
+ btrfs_put_block_group(cache);
+ ret = write_one_eb(eb, wbc, epd);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
+ return 1;
+}
+
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_buffer *eb, *prev_eb = NULL;
+ struct extent_buffer *eb_context = NULL;
struct extent_page_data epd = {
.bio = NULL,
.extent_locked = 0,
@@ -3968,6 +4268,7 @@ int btree_write_cache_pages(struct address_space *mapping,
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
+ btrfs_zoned_meta_io_lock(fs_info);
retry:
if (wbc->sync_mode == WB_SYNC_ALL)
tag_pages_for_writeback(mapping, index, end);
@@ -3979,55 +4280,13 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (!PagePrivate(page))
+ ret = submit_eb_page(page, wbc, &epd, &eb_context);
+ if (ret == 0)
continue;
-
- spin_lock(&mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- eb = (struct extent_buffer *)page->private;
-
- /*
- * Shouldn't happen and normally this would be a BUG_ON
- * but no sense in crashing the users box for something
- * we can survive anyway.
- */
- if (WARN_ON(!eb)) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- if (eb == prev_eb) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->private_lock);
- if (!ret)
- continue;
-
- prev_eb = eb;
- ret = lock_extent_buffer_for_io(eb, &epd);
- if (!ret) {
- free_extent_buffer(eb);
- continue;
- } else if (ret < 0) {
- done = 1;
- free_extent_buffer(eb);
- break;
- }
-
- ret = write_one_eb(eb, wbc, &epd);
- if (ret) {
+ if (ret < 0) {
done = 1;
- free_extent_buffer(eb);
break;
}
- free_extent_buffer(eb);
/*
* the filesystem may choose to bump up nr_to_write.
@@ -4048,10 +4307,9 @@ retry:
index = 0;
goto retry;
}
- ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
- return ret;
+ goto out;
}
/*
* If something went wrong, don't allow any metadata write bio to be
@@ -4086,14 +4344,17 @@ retry:
ret = -EROFS;
end_write_bio(&epd, ret);
}
+out:
+ btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
/**
- * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * Walk the list of dirty pages of the given address space and write all of them.
+ *
* @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @data: data passed to __extent_writepage function
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @epd: holds context for the write, namely the bio
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
@@ -4382,14 +4643,22 @@ int extent_invalidatepage(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+ /* This function is only called for the btree inode */
+ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
+
start += ALIGN(offset, blocksize);
if (start > end)
return 0;
lock_extent_bits(tree, start, end, &cached_state);
wait_on_page_writeback(page);
- clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
+
+ /*
+ * Currently for btree io tree, only EXTENT_LOCKED is utilized,
+ * so here we only need to unlock the extent range to free any
+ * existing extent state.
+ */
+ unlock_extent_cached(tree, start, end, &cached_state);
return 0;
}
@@ -4409,12 +4678,14 @@ static int try_release_extent_state(struct extent_io_tree *tree,
ret = 0;
} else {
/*
- * at this point we can safely clear everything except the
- * locked bit and the nodatasum bit
+ * At this point we can safely clear everything except the
+ * locked bit, the nodatasum bit and the delalloc new bit.
+ * The delalloc new bit will be cleared by ordered extent
+ * completion.
*/
ret = __clear_extent_bit(tree, start, end,
- ~(EXTENT_LOCKED | EXTENT_NODATASUM),
- 0, 0, NULL, mask, NULL);
+ ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
+ 0, 0, NULL, mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -4691,7 +4962,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
roots = ulist_alloc(GFP_KERNEL);
tmp_ulist = ulist_alloc(GFP_KERNEL);
@@ -4883,25 +5153,39 @@ int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-/*
- * Release all pages attached to the extent buffer.
- */
-static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
{
- int i;
- int num_pages;
- int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+ struct btrfs_subpage *subpage;
- BUG_ON(extent_buffer_under_io(eb));
+ lockdep_assert_held(&page->mapping->private_lock);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *page = eb->pages[i];
+ if (PagePrivate(page)) {
+ subpage = (struct btrfs_subpage *)page->private;
+ if (atomic_read(&subpage->eb_refs))
+ return true;
+ }
+ return false;
+}
- if (!page)
- continue;
+static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+
+ /*
+ * For mapped eb, we're going to change the page private, which should
+ * be done under the private_lock.
+ */
+ if (mapped)
+ spin_lock(&page->mapping->private_lock);
+
+ if (!PagePrivate(page)) {
if (mapped)
- spin_lock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->private_lock);
+ return;
+ }
+
+ if (fs_info->sectorsize == PAGE_SIZE) {
/*
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
@@ -4920,9 +5204,49 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
*/
detach_page_private(page);
}
-
if (mapped)
spin_unlock(&page->mapping->private_lock);
+ return;
+ }
+
+ /*
+ * For subpage, we can have dummy eb with page private. In this case,
+ * we can directly detach the private as such a page is only attached
+ * to one dummy eb, with no sharing.
+ */
+ if (!mapped) {
+ btrfs_detach_subpage(fs_info, page);
+ return;
+ }
+
+ btrfs_page_dec_eb_refs(fs_info, page);
+
+ /*
+ * We can only detach the page private if there are no other ebs in the
+ * page range.
+ */
+ if (!page_range_has_eb(fs_info, page))
+ btrfs_detach_subpage(fs_info, page);
+
+ spin_unlock(&page->mapping->private_lock);
+}
+
+/* Release all pages attached to the extent buffer */
+static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+{
+ int i;
+ int num_pages;
+
+ ASSERT(!extent_buffer_under_io(eb));
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ struct page *page = eb->pages[i];
+
+ if (!page)
+ continue;
+
+ detach_extent_buffer_page(eb, page);
/* One for when we allocated the page */
put_page(page);
@@ -4950,33 +5274,17 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->len = len;
eb->fs_info = fs_info;
eb->bflags = 0;
- rwlock_init(&eb->lock);
- atomic_set(&eb->blocking_readers, 0);
- eb->blocking_writers = 0;
- eb->lock_recursed = false;
- init_waitqueue_head(&eb->write_lock_wq);
- init_waitqueue_head(&eb->read_lock_wq);
+ init_rwsem(&eb->lock);
btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
&fs_info->allocated_ebs);
+ INIT_LIST_HEAD(&eb->release_list);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
atomic_set(&eb->io_pages, 0);
- /*
- * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
- */
- BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
- > MAX_INLINE_EXTENT_BUFFER_SIZE);
- BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
-
-#ifdef CONFIG_BTRFS_DEBUG
- eb->spinning_writers = 0;
- atomic_set(&eb->spinning_readers, 0);
- atomic_set(&eb->read_locks, 0);
- eb->write_locks = 0;
-#endif
+ ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
@@ -4992,21 +5300,32 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
if (new == NULL)
return NULL;
+ /*
+ * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
+ * btrfs_release_extent_buffer() has different behavior for
+ * UNMAPPED subpage extent buffers.
+ */
+ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+
for (i = 0; i < num_pages; i++) {
+ int ret;
+
p = alloc_page(GFP_NOFS);
if (!p) {
btrfs_release_extent_buffer(new);
return NULL;
}
- attach_extent_buffer_page(new, p);
+ ret = attach_extent_buffer_page(new, p, NULL);
+ if (ret < 0) {
+ put_page(p);
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
WARN_ON(PageDirty(p));
- SetPageUptodate(p);
new->pages[i] = p;
copy_page(page_address(p), page_address(src->pages[i]));
}
-
- set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
- set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+ set_extent_buffer_uptodate(new);
return new;
}
@@ -5024,9 +5343,14 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
+ int ret;
+
eb->pages[i] = alloc_page(GFP_NOFS);
if (!eb->pages[i])
goto err;
+ ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
+ if (ret < 0)
+ goto err;
}
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
@@ -5034,8 +5358,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
err:
- for (; i > 0; i--)
+ for (; i > 0; i--) {
+ detach_extent_buffer_page(eb, eb->pages[i - 1]);
__free_page(eb->pages[i - 1]);
+ }
__free_extent_buffer(eb);
return NULL;
}
@@ -5105,7 +5431,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> PAGE_SHIFT);
+ start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
/*
@@ -5157,7 +5483,7 @@ again:
}
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_SHIFT, eb);
+ start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5177,8 +5503,40 @@ free_eb:
}
#endif
+static struct extent_buffer *grab_extent_buffer(
+ struct btrfs_fs_info *fs_info, struct page *page)
+{
+ struct extent_buffer *exists;
+
+ /*
+ * For subpage case, we completely rely on radix tree to ensure we
+ * don't try to insert two ebs for the same bytenr. So here we always
+ * return NULL and just continue.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return NULL;
+
+ /* Page not yet attached to an extent buffer */
+ if (!PagePrivate(page))
+ return NULL;
+
+ /*
+ * We could have already allocated an eb for this page and attached one
+ * so lets see if we can get a ref on the existing eb, and if we can we
+ * know it's good and we can just return that one, else we know we can
+ * just overwrite page->private.
+ */
+ exists = (struct extent_buffer *)page->private;
+ if (atomic_inc_not_zero(&exists->refs))
+ return exists;
+
+ WARN_ON(PageDirty(page));
+ detach_page_private(page);
+ return NULL;
+}
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+ u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
int num_pages;
@@ -5196,6 +5554,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return ERR_PTR(-EINVAL);
}
+ if (fs_info->sectorsize < PAGE_SIZE &&
+ offset_in_page(start) + len > PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "tree block crosses page boundary, start %llu nodesize %lu",
+ start, len);
+ return ERR_PTR(-EINVAL);
+ }
+
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
@@ -5203,44 +5569,62 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return ERR_PTR(-ENOMEM);
+ btrfs_set_buffer_lockdep_class(owner_root, eb, level);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++, index++) {
+ struct btrfs_subpage *prealloc = NULL;
+
p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
if (!p) {
exists = ERR_PTR(-ENOMEM);
goto free_eb;
}
- spin_lock(&mapping->private_lock);
- if (PagePrivate(p)) {
- /*
- * We could have already allocated an eb for this page
- * and attached one so lets see if we can get a ref on
- * the existing eb, and if we can we know it's good and
- * we can just return that one, else we know we can just
- * overwrite page->private.
- */
- exists = (struct extent_buffer *)p->private;
- if (atomic_inc_not_zero(&exists->refs)) {
- spin_unlock(&mapping->private_lock);
- unlock_page(p);
- put_page(p);
- mark_extent_buffer_accessed(exists, p);
- goto free_eb;
- }
- exists = NULL;
+ /*
+ * Preallocate page->private for subpage case, so that we won't
+ * allocate memory with private_lock held. The memory will be
+ * freed by attach_extent_buffer_page() or freed manually if
+ * we exit earlier.
+ *
+ * Although we have ensured one subpage eb can only have one
+ * page, this may change in the future for 16K page size
+ * support, so we still preallocate the memory in the loop.
+ */
+ ret = btrfs_alloc_subpage(fs_info, &prealloc,
+ BTRFS_SUBPAGE_METADATA);
+ if (ret < 0) {
+ unlock_page(p);
+ put_page(p);
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
- /*
- * Do this so attach doesn't complain and we need to
- * drop the ref the old guy had.
- */
- ClearPagePrivate(p);
- WARN_ON(PageDirty(p));
+ spin_lock(&mapping->private_lock);
+ exists = grab_extent_buffer(fs_info, p);
+ if (exists) {
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
put_page(p);
+ mark_extent_buffer_accessed(exists, p);
+ btrfs_free_subpage(prealloc);
+ goto free_eb;
}
- attach_extent_buffer_page(eb, p);
+ /* Should not fail, as we have preallocated the memory */
+ ret = attach_extent_buffer_page(eb, p, prealloc);
+ ASSERT(!ret);
+ /*
+ * Record that we have an extra eb under allocation, so that
+ * detach_extent_buffer_page() won't release the page private
+ * while the eb hasn't yet been inserted into the radix tree.
+ *
+ * The ref will be decreased when the eb releases the page, in
+ * detach_extent_buffer_page(), thus it needs no special handling
+ * in the error path.
+ */
+ btrfs_page_inc_eb_refs(fs_info, p);
spin_unlock(&mapping->private_lock);
+
WARN_ON(PageDirty(p));
eb->pages[i] = p;
if (!PageUptodate(p))
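
The preallocation in the hunk above follows a common kernel pattern: anything that may sleep (here btrfs_alloc_subpage()) is done before the spinlock is taken, and the buffer is simply freed if another thread won the race. A small userspace illustration of the same pattern, with hypothetical names not taken from this patch:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_spinlock_t lock;
static void *shared_private;	/* stands in for page->private */

/* Allocate outside the lock, attach under the lock, free if we lost the race. */
static void attach_private(size_t size)
{
	void *prealloc = malloc(size);		/* may block: must happen unlocked */

	pthread_spin_lock(&lock);
	if (!shared_private) {
		shared_private = prealloc;	/* we won: ownership transferred */
		prealloc = NULL;
	}
	pthread_spin_unlock(&lock);

	free(prealloc);				/* no-op if we won, cleanup if we lost */
}

int main(void)
{
	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	attach_private(64);
	attach_private(64);			/* second call loses and frees its buffer */
	printf("attached: %p\n", shared_private);
	free(shared_private);
	pthread_spin_destroy(&lock);
	return 0;
}
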
@@ -5265,7 +5649,7 @@ again:
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_SHIFT, eb);
+ start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5321,7 +5705,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_lock(&fs_info->buffer_lock);
radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> PAGE_SHIFT);
+ eb->start >> fs_info->sectorsize_bits);
spin_unlock(&fs_info->buffer_lock);
} else {
spin_unlock(&eb->refs_lock);
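
Both hunks above switch the buffer_radix index from start >> PAGE_SHIFT to start >> fs_info->sectorsize_bits, because with subpage support several tree blocks can live in one page and must not collide in the radix tree. A quick arithmetic check in userspace, assuming 64K pages and 4K sectorsize (values chosen for illustration, not taken from this patch):

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 16;		/* 64K pages */
	const unsigned int sectorsize_bits = 12;	/* 4K sectorsize */
	const unsigned long long starts[] = { 0x10000, 0x14000, 0x18000 };

	for (int i = 0; i < 3; i++)
		printf("eb at %#llx: page index %llu, sector index %llu\n",
		       starts[i],
		       starts[i] >> page_shift,
		       starts[i] >> sectorsize_bits);
	/*
	 * All three ebs share page index 1 and would collide, but they get
	 * distinct sector indexes 16, 20 and 24.
	 */
	return 0;
}
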
@@ -5446,33 +5830,103 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
- int i;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page;
int num_pages;
+ int i;
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (page)
- ClearPageUptodate(page);
+ btrfs_page_clear_uptodate(fs_info, page,
+ eb->start, eb->len);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
- int i;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page;
int num_pages;
+ int i;
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- SetPageUptodate(page);
+ btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
}
}
+static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct extent_io_tree *io_tree;
+ struct page *page = eb->pages[0];
+ struct bio *bio = NULL;
+ int ret = 0;
+
+ ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
+ ASSERT(PagePrivate(page));
+ io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+
+ if (wait == WAIT_NONE) {
+ ret = try_lock_extent(io_tree, eb->start,
+ eb->start + eb->len - 1);
+ if (ret <= 0)
+ return ret;
+ } else {
+ ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = 0;
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
+ PageUptodate(page) ||
+ btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ return ret;
+ }
+
+ clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = 0;
+ atomic_set(&eb->io_pages, 1);
+ check_buffer_tree_ref(eb);
+ btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
+
+ ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start,
+ eb->len, eb->start - page_offset(page), &bio,
+ end_bio_extent_readpage, mirror_num, 0, 0,
+ true);
+ if (ret) {
+ /*
+ * In the endio function, if we hit something wrong we will
+ * increase io_pages, so here we need to decrease it in the
+ * error path.
+ */
+ atomic_dec(&eb->io_pages);
+ }
+ if (bio) {
+ int tmp;
+
+ tmp = submit_one_bio(bio, mirror_num, 0);
+ if (tmp < 0)
+ return tmp;
+ }
+ if (ret || wait != WAIT_COMPLETE)
+ return ret;
+
+ wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ ret = -EIO;
+ return ret;
+}
+
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
{
int i;
@@ -5489,10 +5943,20 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
+ if (eb->fs_info->sectorsize < PAGE_SIZE)
+ return read_extent_buffer_subpage(eb, wait, mirror_num);
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (wait == WAIT_NONE) {
+ /*
+ * WAIT_NONE is only utilized by readahead. If we can't
+ * acquire the lock atomically it means either the eb
+ * is being read out or under modification.
+ * Either way the eb will be or has been cached, so
+ * readahead can exit safely.
+ */
if (!trylock_page(page))
goto unlock_exit;
} else {
@@ -5622,12 +6086,12 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5652,13 +6116,13 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5687,13 +6151,13 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
int ret = 0;
if (check_eb_range(eb, start, len))
return -EINVAL;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5719,7 +6183,7 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
char *kaddr;
WARN_ON(!PageUptodate(eb->pages[0]));
- kaddr = page_address(eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
BTRFS_FSID_SIZE);
}
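
The helpers get_eb_page_index() and get_eb_offset_in_page() used in the hunks above are introduced elsewhere in this series (in the header); based on how they are used here, they presumably boil down to something like the sketch below, folding the eb's in-page offset in for the subpage case (a hedged guess for illustration, not the actual definitions from the patch):

/* Illustrative sketch only, assuming definitions along these lines. */
static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
					   unsigned long offset)
{
	/*
	 * For sectorsize == PAGE_SIZE, eb->start is page aligned, so this is
	 * just offset_in_page(offset); for subpage it also adds the eb's
	 * offset inside its single page.
	 */
	return offset_in_page(eb->start + offset);
}

static inline unsigned long get_eb_page_index(unsigned long offset)
{
	/* For subpage, the eb never crosses a page, so this is always 0. */
	return offset >> PAGE_SHIFT;
}
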
@@ -5729,7 +6193,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
char *kaddr;
WARN_ON(!PageUptodate(eb->pages[0]));
- kaddr = page_address(eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
BTRFS_FSID_SIZE);
}
@@ -5742,12 +6206,14 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
+
+ WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5771,12 +6237,12 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
size_t offset;
struct page *page;
char *kaddr;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5800,10 +6266,20 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
ASSERT(dst->len == src->len);
- num_pages = num_extent_pages(dst);
- for (i = 0; i < num_pages; i++)
- copy_page(page_address(dst->pages[i]),
- page_address(src->pages[i]));
+ if (dst->fs_info->sectorsize == PAGE_SIZE) {
+ num_pages = num_extent_pages(dst);
+ for (i = 0; i < num_pages; i++)
+ copy_page(page_address(dst->pages[i]),
+ page_address(src->pages[i]));
+ } else {
+ size_t src_offset = get_eb_offset_in_page(src, 0);
+ size_t dst_offset = get_eb_offset_in_page(dst, 0);
+
+ ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
+ memcpy(page_address(dst->pages[0]) + dst_offset,
+ page_address(src->pages[0]) + src_offset,
+ src->len);
+ }
}
void copy_extent_buffer(const struct extent_buffer *dst,
@@ -5816,7 +6292,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
size_t offset;
struct page *page;
char *kaddr;
- unsigned long i = dst_offset >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(dst_offset);
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(src, src_offset, len))
@@ -5824,7 +6300,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
WARN_ON(src->len != dst_len);
- offset = offset_in_page(dst_offset);
+ offset = get_eb_offset_in_page(dst, dst_offset);
while (len > 0) {
page = dst->pages[i];
@@ -5868,7 +6344,7 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start + byte_offset;
+ offset = start + offset_in_page(eb->start) + byte_offset;
*page_index = offset >> PAGE_SHIFT;
*page_offset = offset_in_page(offset);
@@ -6022,11 +6498,11 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
return;
while (len > 0) {
- dst_off_in_page = offset_in_page(dst_offset);
- src_off_in_page = offset_in_page(src_offset);
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
+ src_off_in_page = get_eb_offset_in_page(dst, src_offset);
- dst_i = dst_offset >> PAGE_SHIFT;
- src_i = src_offset >> PAGE_SHIFT;
+ dst_i = get_eb_page_index(dst_offset);
+ src_i = get_eb_page_index(src_offset);
cur = min(len, (unsigned long)(PAGE_SIZE -
src_off_in_page));
@@ -6062,11 +6538,11 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
return;
}
while (len > 0) {
- dst_i = dst_end >> PAGE_SHIFT;
- src_i = src_end >> PAGE_SHIFT;
+ dst_i = get_eb_page_index(dst_end);
+ src_i = get_eb_page_index(src_end);
- dst_off_in_page = offset_in_page(dst_end);
- src_off_in_page = offset_in_page(src_end);
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
+ src_off_in_page = get_eb_offset_in_page(dst, src_end);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
@@ -6080,13 +6556,115 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
+static struct extent_buffer *get_next_extent_buffer(
+ struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+ struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
+ struct extent_buffer *found = NULL;
+ u64 page_start = page_offset(page);
+ int ret;
+ int i;
+
+ ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
+ ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
+ lockdep_assert_held(&fs_info->buffer_lock);
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
+ bytenr >> fs_info->sectorsize_bits,
+ PAGE_SIZE / fs_info->nodesize);
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ break;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ break;
+ }
+ }
+ return found;
+}
+
+static int try_release_subpage_extent_buffer(struct page *page)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ u64 cur = page_offset(page);
+ const u64 end = page_offset(page) + PAGE_SIZE;
+ int ret;
+
+ while (cur < end) {
+ struct extent_buffer *eb = NULL;
+
+ /*
+ * Unlike try_release_extent_buffer(), which uses page->private
+ * to grab the buffer, in the subpage case we rely on the radix
+ * tree, thus we need to ensure radix tree consistency.
+ *
+ * We also want an atomic snapshot of the radix tree, thus we
+ * go with a spinlock rather than RCU.
+ */
+ spin_lock(&fs_info->buffer_lock);
+ eb = get_next_extent_buffer(fs_info, page, cur);
+ if (!eb) {
+ /* No more eb in the page range after or at cur */
+ spin_unlock(&fs_info->buffer_lock);
+ break;
+ }
+ cur = eb->start + eb->len;
+
+ /*
+ * The same as in try_release_extent_buffer(): hold the refs_lock
+ * to ensure the eb won't disappear out from under us.
+ */
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&fs_info->buffer_lock);
+ break;
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ /*
+ * If the tree ref isn't set then we know the ref on this eb is
+ * a real ref, so just return; this eb will likely be freed
+ * soon anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ break;
+ }
+
+ /*
+ * Here we don't care about the return value, as we will always
+ * check the page private at the end, and
+ * release_extent_buffer() will release the refs_lock.
+ */
+ release_extent_buffer(eb);
+ }
+ /*
+ * Finally, check whether we have cleared the page private: if we have
+ * released all ebs in the page, the page private should be cleared by now.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page))
+ ret = 1;
+ else
+ ret = 0;
+ spin_unlock(&page->mapping->private_lock);
+ return ret;
+}
+
int try_release_extent_buffer(struct page *page)
{
struct extent_buffer *eb;
+ if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ return try_release_subpage_extent_buffer(page);
+
/*
- * We need to make sure nobody is attaching this page to an eb right
- * now.
+	 * We need to make sure nobody is changing page->private, as we rely on
+	 * page->private as the pointer to the extent buffer.
*/
spin_lock(&page->mapping->private_lock);
if (!PagePrivate(page)) {
@@ -6121,3 +6699,54 @@ int try_release_extent_buffer(struct page *page)
return release_extent_buffer(eb);
}
+
+/*
+ * btrfs_readahead_tree_block - attempt to readahead a child block
+ * @fs_info: the fs_info
+ * @bytenr: bytenr to read
+ * @owner_root: objectid of the root that owns this eb
+ * @gen: generation for the uptodate check, can be 0
+ * @level: level for the eb
+ *
+ * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
+ * normal uptodate check of the eb, without checking the generation. If we have
+ * to read the block we will not block on anything.
+ */
+void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root, u64 gen, int level)
+{
+ struct extent_buffer *eb;
+ int ret;
+
+ eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ if (IS_ERR(eb))
+ return;
+
+ if (btrfs_buffer_uptodate(eb, gen, 1)) {
+ free_extent_buffer(eb);
+ return;
+ }
+
+ ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
+ if (ret < 0)
+ free_extent_buffer_stale(eb);
+ else
+ free_extent_buffer(eb);
+}
+
+/*
+ * btrfs_readahead_node_child - readahead a node's child block
+ * @node: parent node we're reading from
+ * @slot: slot in the parent node for the child we want to read
+ *
+ * A helper for btrfs_readahead_tree_block(): we simply read the bytenr
+ * pointed to by the given slot in the provided node.
+ */
+void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
+{
+ btrfs_readahead_tree_block(node->fs_info,
+ btrfs_node_blockptr(node, slot),
+ btrfs_header_owner(node),
+ btrfs_node_ptr_generation(node, slot),
+ btrfs_header_level(node) - 1);
+}
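
As a usage note, a caller walking the tree could kick off readahead for every child of an internal node with a loop like the following (hypothetical caller sketch, not part of this patch):

static void readahead_all_children(struct extent_buffer *node)
{
	int nritems = btrfs_header_nritems(node);
	int slot;

	/* Leaves have no child blocks to read ahead. */
	if (btrfs_header_level(node) == 0)
		return;

	for (slot = 0; slot < nritems; slot++)
		btrfs_readahead_node_child(node, slot);
}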