aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/extent_io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--fs/btrfs/extent_io.c729
1 files changed, 414 insertions, 315 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4ac383a3a649..0aff9b278c19 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void)
pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
state->start, state->end, state->state,
extent_state_in_tree(state),
- atomic_read(&state->refs));
+ refcount_read(&state->refs));
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
}
@@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void)
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree, u64 start, u64 end)
{
- struct inode *inode;
- u64 isize;
-
- if (!tree->mapping)
- return;
-
- inode = tree->mapping->host;
- isize = i_size_read(inode);
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(inode), isize, start, end);
- }
+ if (tree->ops && tree->ops->check_extent_io_range)
+ tree->ops->check_extent_io_range(tree->private_data, caller,
+ start, end);
}
#else
#define btrfs_leak_debug_add(new, head) do {} while (0)
@@ -144,7 +134,7 @@ static void add_extent_changeset(struct extent_state *state, unsigned bits,
if (!set && (state->state & bits) == 0)
return;
changeset->bytes_changed += state->end - state->start + 1;
- ret = ulist_add(changeset->range_changed, state->start, state->end,
+ ret = ulist_add(&changeset->range_changed, state->start, state->end,
GFP_ATOMIC);
/* ENOMEM */
BUG_ON(ret < 0);
@@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
- if (!tree->mapping)
- return NULL;
- return btrfs_sb(tree->mapping->host->i_sb);
+ if (tree->ops)
+ return tree->ops->tree_fs_info(tree->private_data);
+ return NULL;
}
int __init extent_io_init(void)
@@ -174,7 +164,8 @@ int __init extent_io_init(void)
goto free_state_cache;
btrfs_bioset = bioset_create(BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio));
+ offsetof(struct btrfs_io_bio, bio),
+ BIOSET_NEED_BVECS);
if (!btrfs_bioset)
goto free_buffer_cache;
@@ -213,19 +204,24 @@ void extent_io_exit(void)
}
void extent_io_tree_init(struct extent_io_tree *tree,
- struct address_space *mapping)
+ void *private_data)
{
tree->state = RB_ROOT;
tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
- tree->mapping = mapping;
+ tree->private_data = private_data;
}
static struct extent_state *alloc_extent_state(gfp_t mask)
{
struct extent_state *state;
+ /*
+ * The given mask might be not appropriate for the slab allocator,
+ * drop the unsupported bits
+ */
+ mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
state = kmem_cache_alloc(extent_state_cache, mask);
if (!state)
return state;
@@ -233,7 +229,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
btrfs_leak_debug_add(&state->leak_list, &states);
- atomic_set(&state->refs, 1);
+ refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
return state;
@@ -243,7 +239,7 @@ void free_extent_state(struct extent_state *state)
{
if (!state)
return;
- if (atomic_dec_and_test(&state->refs)) {
+ if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del(&state->leak_list);
trace_free_extent_state(state, _RET_IP_);
@@ -364,8 +360,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
struct extent_state *other)
{
if (tree->ops && tree->ops->merge_extent_hook)
- tree->ops->merge_extent_hook(tree->mapping->host, new,
- other);
+ tree->ops->merge_extent_hook(tree->private_data, new, other);
}
/*
@@ -416,14 +411,14 @@ static void set_state_cb(struct extent_io_tree *tree,
struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->set_bit_hook)
- tree->ops->set_bit_hook(tree->mapping->host, state, bits);
+ tree->ops->set_bit_hook(tree->private_data, state, bits);
}
static void clear_state_cb(struct extent_io_tree *tree,
struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
- tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
+ tree->ops->clear_bit_hook(tree->private_data, state, bits);
}
static void set_state_bits(struct extent_io_tree *tree,
@@ -472,7 +467,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
u64 split)
{
if (tree->ops && tree->ops->split_extent_hook)
- tree->ops->split_extent_hook(tree->mapping->host, orig, split);
+ tree->ops->split_extent_hook(tree->private_data, orig, split);
}
/*
@@ -635,7 +630,7 @@ again:
if (cached && extent_state_in_tree(cached) &&
cached->start <= start && cached->end > start) {
if (clear)
- atomic_dec(&cached->refs);
+ refcount_dec(&cached->refs);
state = cached;
goto hit_next;
}
@@ -787,7 +782,7 @@ process_node:
if (state->state & bits) {
start = state->start;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
wait_on_state(tree, state);
free_extent_state(state);
goto again;
@@ -828,7 +823,7 @@ static void cache_state_if_flags(struct extent_state *state,
if (cached_ptr && !(*cached_ptr)) {
if (!flags || (state->state & flags)) {
*cached_ptr = state;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
}
}
}
@@ -1396,17 +1391,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
*/
static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
-
- while (index <= end_index) {
- page = find_get_page(tree->mapping, index);
- BUG_ON(!page); /* Pages should be in the extent_io_tree */
- set_page_writeback(page);
- put_page(page);
- index++;
- }
+ tree->ops->set_range_writeback(tree->private_data, start, end);
}
/* find the first state struct with 'bits' set after 'start', and
@@ -1532,7 +1517,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
if (!found) {
*start = state->start;
*cached_state = state;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
}
found++;
*end = state->end;
@@ -1549,33 +1534,24 @@ out:
return found;
}
+static int __process_pages_contig(struct address_space *mapping,
+ struct page *locked_page,
+ pgoff_t start_index, pgoff_t end_index,
+ unsigned long page_ops, pgoff_t *index_ret);
+
static noinline void __unlock_for_delalloc(struct inode *inode,
struct page *locked_page,
u64 start, u64 end)
{
- int ret;
- struct page *pages[16];
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- unsigned long nr_pages = end_index - index + 1;
- int i;
+ ASSERT(locked_page);
if (index == locked_page->index && end_index == index)
return;
- while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long, nr_pages,
- ARRAY_SIZE(pages)), pages);
- for (i = 0; i < ret; i++) {
- if (pages[i] != locked_page)
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
- nr_pages -= ret;
- index += ret;
- cond_resched();
- }
+ __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
+ PAGE_UNLOCK, NULL);
}
static noinline int lock_delalloc_pages(struct inode *inode,
@@ -1584,59 +1560,19 @@ static noinline int lock_delalloc_pages(struct inode *inode,
u64 delalloc_end)
{
unsigned long index = delalloc_start >> PAGE_SHIFT;
- unsigned long start_index = index;
+ unsigned long index_ret = index;
unsigned long end_index = delalloc_end >> PAGE_SHIFT;
- unsigned long pages_locked = 0;
- struct page *pages[16];
- unsigned long nrpages;
int ret;
- int i;
- /* the caller is responsible for locking the start index */
+ ASSERT(locked_page);
if (index == locked_page->index && index == end_index)
return 0;
- /* skip the page at the start index */
- nrpages = end_index - index + 1;
- while (nrpages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long,
- nrpages, ARRAY_SIZE(pages)), pages);
- if (ret == 0) {
- ret = -EAGAIN;
- goto done;
- }
- /* now we have an array of pages, lock them all */
- for (i = 0; i < ret; i++) {
- /*
- * the caller is taking responsibility for
- * locked_page
- */
- if (pages[i] != locked_page) {
- lock_page(pages[i]);
- if (!PageDirty(pages[i]) ||
- pages[i]->mapping != inode->i_mapping) {
- ret = -EAGAIN;
- unlock_page(pages[i]);
- put_page(pages[i]);
- goto done;
- }
- }
- put_page(pages[i]);
- pages_locked++;
- }
- nrpages -= ret;
- index += ret;
- cond_resched();
- }
- ret = 0;
-done:
- if (ret && pages_locked) {
- __unlock_for_delalloc(inode, locked_page,
- delalloc_start,
- ((u64)(start_index + pages_locked - 1)) <<
- PAGE_SHIFT);
- }
+ ret = __process_pages_contig(inode->i_mapping, locked_page, index,
+ end_index, PAGE_LOCK, &index_ret);
+ if (ret == -EAGAIN)
+ __unlock_for_delalloc(inode, locked_page, delalloc_start,
+ (u64)index_ret << PAGE_SHIFT);
return ret;
}
@@ -1726,37 +1662,48 @@ out_failed:
return found;
}
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
- u64 delalloc_end, struct page *locked_page,
- unsigned clear_bits,
- unsigned long page_ops)
+static int __process_pages_contig(struct address_space *mapping,
+ struct page *locked_page,
+ pgoff_t start_index, pgoff_t end_index,
+ unsigned long page_ops, pgoff_t *index_ret)
{
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- int ret;
+ unsigned long nr_pages = end_index - start_index + 1;
+ unsigned long pages_locked = 0;
+ pgoff_t index = start_index;
struct page *pages[16];
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- unsigned long nr_pages = end_index - index + 1;
+ unsigned ret;
+ int err = 0;
int i;
- clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
- if (page_ops == 0)
- return;
+ if (page_ops & PAGE_LOCK) {
+ ASSERT(page_ops == PAGE_LOCK);
+ ASSERT(index_ret && *index_ret == start_index);
+ }
if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
- mapping_set_error(inode->i_mapping, -EIO);
+ mapping_set_error(mapping, -EIO);
while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
+ ret = find_get_pages_contig(mapping, index,
min_t(unsigned long,
nr_pages, ARRAY_SIZE(pages)), pages);
- for (i = 0; i < ret; i++) {
+ if (ret == 0) {
+ /*
+ * Only if we're going to lock these pages,
+ * can we find nothing at @index.
+ */
+ ASSERT(page_ops & PAGE_LOCK);
+ err = -EAGAIN;
+ goto out;
+ }
+ for (i = 0; i < ret; i++) {
if (page_ops & PAGE_SET_PRIVATE2)
SetPagePrivate2(pages[i]);
if (pages[i] == locked_page) {
put_page(pages[i]);
+ pages_locked++;
continue;
}
if (page_ops & PAGE_CLEAR_DIRTY)
@@ -1769,12 +1716,40 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
end_page_writeback(pages[i]);
if (page_ops & PAGE_UNLOCK)
unlock_page(pages[i]);
+ if (page_ops & PAGE_LOCK) {
+ lock_page(pages[i]);
+ if (!PageDirty(pages[i]) ||
+ pages[i]->mapping != mapping) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ err = -EAGAIN;
+ goto out;
+ }
+ }
put_page(pages[i]);
+ pages_locked++;
}
nr_pages -= ret;
index += ret;
cond_resched();
}
+out:
+ if (err && index_ret)
+ *index_ret = start_index + pages_locked - 1;
+ return err;
+}
+
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+ u64 delalloc_end, struct page *locked_page,
+ unsigned clear_bits,
+ unsigned long page_ops)
+{
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
+ NULL, GFP_NOFS);
+
+ __process_pages_contig(inode->i_mapping, locked_page,
+ start >> PAGE_SHIFT, end >> PAGE_SHIFT,
+ page_ops, NULL);
}
/*
@@ -1965,11 +1940,12 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
SetPageUptodate(page);
}
-int free_io_failure(struct inode *inode, struct io_failure_record *rec)
+int free_io_failure(struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree,
+ struct io_failure_record *rec)
{
int ret;
int err = 0;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
set_state_failrec(failure_tree, rec->start, NULL);
ret = clear_extent_bits(failure_tree, rec->start,
@@ -1978,7 +1954,7 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
if (ret)
err = ret;
- ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+ ret = clear_extent_bits(io_tree, rec->start,
rec->start + rec->len - 1,
EXTENT_DAMAGED);
if (ret && !err)
@@ -1998,28 +1974,21 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
- struct page *page, unsigned int pg_offset, int mirror_num)
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
u64 sector;
struct btrfs_bio *bbio = NULL;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int ret;
ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
BUG_ON(!mirror_num);
- /* we can't repair anything in raid56 yet */
- if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
- return 0;
-
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio)
- return -EIO;
+ bio = btrfs_io_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
@@ -2029,17 +1998,35 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
* read repair operation.
*/
btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bbio, mirror_num);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
+ if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) {
+ /*
+ * Note that we don't use BTRFS_MAP_WRITE because it's supposed
+ * to update all raid stripes, but here we just want to correct
+ * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
+ * stripe's dev and sector.
+ */
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &map_length, &bbio, 0);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ bio_put(bio);
+ return -EIO;
+ }
+ ASSERT(bbio->mirror_num == 1);
+ } else {
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
+ &map_length, &bbio, mirror_num);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ bio_put(bio);
+ return -EIO;
+ }
+ BUG_ON(mirror_num != bbio->mirror_num);
}
- BUG_ON(mirror_num != bbio->mirror_num);
- sector = bbio->stripes[mirror_num-1].physical >> 9;
+
+ sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
bio->bi_iter.bi_sector = sector;
- dev = bbio->stripes[mirror_num-1].dev;
+ dev = bbio->stripes[bbio->mirror_num - 1].dev;
btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
btrfs_bio_counter_dec(fs_info);
@@ -2060,7 +2047,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
- btrfs_ino(inode), start,
+ ino, start,
rcu_str_deref(dev->name), sector);
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
@@ -2080,8 +2067,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
- ret = repair_io_failure(fs_info->btree_inode, start,
- PAGE_SIZE, start, p,
+ ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
start - page_offset(p), mirror_num);
if (ret)
break;
@@ -2095,24 +2081,24 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
-int clean_io_failure(struct inode *inode, u64 start, struct page *page,
- unsigned int pg_offset)
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree, u64 start,
+ struct page *page, u64 ino, unsigned int pg_offset)
{
u64 private;
struct io_failure_record *failrec;
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct extent_state *state;
int num_copies;
int ret;
private = 0;
- ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
- (u64)-1, 1, EXTENT_DIRTY, 0);
+ ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
+ EXTENT_DIRTY, 0);
if (!ret)
return 0;
- ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start,
- &failrec);
+ ret = get_state_failrec(failure_tree, start, &failrec);
if (ret)
return 0;
@@ -2128,25 +2114,25 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
if (fs_info->sb->s_flags & MS_RDONLY)
goto out;
- spin_lock(&BTRFS_I(inode)->io_tree.lock);
- state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+ spin_lock(&io_tree->lock);
+ state = find_first_extent_bit_state(io_tree,
failrec->start,
EXTENT_LOCKED);
- spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+ spin_unlock(&io_tree->lock);
if (state && state->start <= failrec->start &&
state->end >= failrec->start + failrec->len - 1) {
num_copies = btrfs_num_copies(fs_info, failrec->logical,
failrec->len);
if (num_copies > 1) {
- repair_io_failure(inode, start, failrec->len,
- failrec->logical, page,
- pg_offset, failrec->failed_mirror);
+ repair_io_failure(fs_info, ino, start, failrec->len,
+ failrec->logical, page, pg_offset,
+ failrec->failed_mirror);
}
}
out:
- free_io_failure(inode, failrec);
+ free_io_failure(failure_tree, io_tree, failrec);
return 0;
}
@@ -2157,9 +2143,9 @@ out:
* - under ordered extent
* - the inode is freeing
*/
-void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct extent_io_tree *failure_tree = &inode->io_failure_tree;
struct io_failure_record *failrec;
struct extent_state *state, *next;
@@ -2272,7 +2258,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
return 0;
}
-int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
struct io_failure_record *failrec, int failed_mirror)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2288,7 +2274,7 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
btrfs_debug(fs_info,
"Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
num_copies, failrec->this_mirror, failed_mirror);
- return 0;
+ return false;
}
/*
@@ -2329,10 +2315,10 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
btrfs_debug(fs_info,
"Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
num_copies, failrec->this_mirror, failed_mirror);
- return 0;
+ return false;
}
- return 1;
+ return true;
}
@@ -2346,10 +2332,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
struct btrfs_io_bio *btrfs_failed_bio;
struct btrfs_io_bio *btrfs_bio;
- bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
- if (!bio)
- return NULL;
-
+ bio = btrfs_io_bio_alloc(1);
bio->bi_end_io = endio_func;
bio->bi_iter.bi_sector = failrec->logical >> 9;
bio->bi_bdev = fs_info->fs_devices->latest_bdev;
@@ -2387,8 +2370,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
struct io_failure_record *failrec;
struct inode *inode = page->mapping->host;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct bio *bio;
int read_mode = 0;
+ blk_status_t status;
int ret;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2397,9 +2382,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
if (ret)
return ret;
- ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
- if (!ret) {
- free_io_failure(inode, failrec);
+ if (!btrfs_check_repairable(inode, failed_bio, failrec,
+ failed_mirror)) {
+ free_io_failure(failure_tree, tree, failrec);
return -EIO;
}
@@ -2411,21 +2396,18 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
start - page_offset(page),
(int)phy_offset, failed_bio->bi_end_io,
NULL);
- if (!bio) {
- free_io_failure(inode, failrec);
- return -EIO;
- }
bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
btrfs_debug(btrfs_sb(inode->i_sb),
"Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
read_mode, failrec->this_mirror, failrec->in_validation);
- ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror,
+ status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
failrec->bio_flags, 0);
- if (ret) {
- free_io_failure(inode, failrec);
+ if (status) {
+ free_io_failure(failure_tree, tree, failrec);
bio_put(bio);
+ ret = blk_status_to_errno(status);
}
return ret;
@@ -2441,17 +2423,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (tree->ops && tree->ops->writepage_end_io_hook) {
- ret = tree->ops->writepage_end_io_hook(page, start,
- end, NULL, uptodate);
- if (ret)
- uptodate = 0;
- }
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, start, end, NULL,
+ uptodate);
if (!uptodate) {
ClearPageUptodate(page);
SetPageError(page);
- ret = ret < 0 ? ret : -EIO;
+ ret = err < 0 ? err : -EIO;
mapping_set_error(page->mapping, ret);
}
}
@@ -2467,11 +2446,13 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
*/
static void end_bio_extent_writepage(struct bio *bio)
{
+ int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
u64 start;
u64 end;
int i;
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
@@ -2496,7 +2477,7 @@ static void end_bio_extent_writepage(struct bio *bio)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
- end_extent_writepage(page, bio->bi_error, start, end);
+ end_extent_writepage(page, error, start, end);
end_page_writeback(page);
}
@@ -2529,9 +2510,9 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- int uptodate = !bio->bi_error;
+ int uptodate = !bio->bi_status;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct extent_io_tree *tree;
+ struct extent_io_tree *tree, *failure_tree;
u64 offset = 0;
u64 start;
u64 end;
@@ -2542,6 +2523,7 @@ static void end_bio_extent_readpage(struct bio *bio)
int ret;
int i;
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
@@ -2549,9 +2531,10 @@ static void end_bio_extent_readpage(struct bio *bio)
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- (u64)bio->bi_iter.bi_sector, bio->bi_error,
+ (u64)bio->bi_iter.bi_sector, bio->bi_status,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
+ failure_tree = &BTRFS_I(inode)->io_failure_tree;
/* We always issue full-page reads, but if some block
* in a page fails to read, blk_update_request() will
@@ -2574,42 +2557,54 @@ static void end_bio_extent_readpage(struct bio *bio)
len = bvec->bv_len;
mirror = io_bio->mirror_num;
- if (likely(uptodate && tree->ops &&
- tree->ops->readpage_end_io_hook)) {
+ if (likely(uptodate && tree->ops)) {
ret = tree->ops->readpage_end_io_hook(io_bio, offset,
page, start, end,
mirror);
if (ret)
uptodate = 0;
else
- clean_io_failure(inode, start, page, 0);
+ clean_io_failure(BTRFS_I(inode)->root->fs_info,
+ failure_tree, tree, start,
+ page,
+ btrfs_ino(BTRFS_I(inode)), 0);
}
if (likely(uptodate))
goto readpage_ok;
- if (tree->ops && tree->ops->readpage_io_failed_hook) {
+ if (tree->ops) {
ret = tree->ops->readpage_io_failed_hook(page, mirror);
- if (!ret && !bio->bi_error)
- uptodate = 1;
- } else {
+ if (ret == -EAGAIN) {
+ /*
+ * Data inode's readpage_io_failed_hook() always
+ * returns -EAGAIN.
+ *
+ * The generic bio_readpage_error handles errors
+ * the following way: If possible, new read
+ * requests are created and submitted and will
+ * end up in end_bio_extent_readpage as well (if
+ * we're lucky, not in the !uptodate case). In
+ * that case it returns 0 and we just go on with
+ * the next page in our bio. If it can't handle
+ * the error it will return -EIO and we remain
+ * responsible for that page.
+ */
+ ret = bio_readpage_error(bio, offset, page,
+ start, end, mirror);
+ if (ret == 0) {
+ uptodate = !bio->bi_status;
+ offset += len;
+ continue;
+ }
+ }
+
/*
- * The generic bio_readpage_error handles errors the
- * following way: If possible, new read requests are
- * created and submitted and will end up in
- * end_bio_extent_readpage as well (if we're lucky, not
- * in the !uptodate case). In that case it returns 0 and
- * we just go on with the next page in our bio. If it
- * can't handle the error it will return -EIO and we
- * remain responsible for that page.
+ * metadata's readpage_io_failed_hook() always returns
+ * -EIO and fixes nothing. -EIO is also returned if
+ * data inode error could not be fixed.
*/
- ret = bio_readpage_error(bio, offset, page, start, end,
- mirror);
- if (ret == 0) {
- uptodate = !bio->bi_error;
- offset += len;
- continue;
- }
+ ASSERT(ret == -EIO);
}
readpage_ok:
if (likely(uptodate)) {
@@ -2656,77 +2651,80 @@ readpage_ok:
endio_readpage_release_extent(tree, extent_start, extent_len,
uptodate);
if (io_bio->end_io)
- io_bio->end_io(io_bio, bio->bi_error);
+ io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
bio_put(bio);
}
/*
- * this allocates from the btrfs_bioset. We're returning a bio right now
- * but you can call btrfs_io_bio for the appropriate container_of magic
+ * Initialize the members up to but not including 'bio'. Use after allocating a
+ * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
+ * 'bio' because use of __GFP_ZERO is not supported.
*/
-struct bio *
-btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
- gfp_t gfp_flags)
+static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
{
- struct btrfs_io_bio *btrfs_bio;
- struct bio *bio;
-
- bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
+ memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+}
- if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2)) {
- bio = bio_alloc_bioset(gfp_flags,
- nr_vecs, btrfs_bioset);
- }
- }
+/*
+ * The following helpers allocate a bio. As it's backed by a bioset, it'll
+ * never fail. We're returning a bio right now but you can call btrfs_io_bio
+ * for the appropriate container_of magic
+ */
+struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
+{
+ struct bio *bio;
- if (bio) {
- bio->bi_bdev = bdev;
- bio->bi_iter.bi_sector = first_sector;
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_bio->csum = NULL;
- btrfs_bio->csum_allocated = NULL;
- btrfs_bio->end_io = NULL;
- }
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset);
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = first_byte >> 9;
+ btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
}
-struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+struct bio *btrfs_bio_clone(struct bio *bio)
{
struct btrfs_io_bio *btrfs_bio;
struct bio *new;
- new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
- if (new) {
- btrfs_bio = btrfs_io_bio(new);
- btrfs_bio->csum = NULL;
- btrfs_bio->csum_allocated = NULL;
- btrfs_bio->end_io = NULL;
- }
+ /* Bio allocation backed by a bioset does not fail */
+ new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset);
+ btrfs_bio = btrfs_io_bio(new);
+ btrfs_io_bio_init(btrfs_bio);
+ btrfs_bio->iter = bio->bi_iter;
return new;
}
-/* this also allocates from the btrfs_bioset */
-struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
{
- struct btrfs_io_bio *btrfs_bio;
struct bio *bio;
- bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
- if (bio) {
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_bio->csum = NULL;
- btrfs_bio->csum_allocated = NULL;
- btrfs_bio->end_io = NULL;
- }
+ /* Bio allocation backed by a bioset does not fail */
+ bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
+ btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
}
+struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
+{
+ struct bio *bio;
+ struct btrfs_io_bio *btrfs_bio;
+
+ /* this will never fail when it's backed by a bioset */
+ bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset);
+ ASSERT(bio);
+
+ btrfs_bio = btrfs_io_bio(bio);
+ btrfs_io_bio_init(btrfs_bio);
+
+ bio_trim(bio, offset >> 9, size >> 9);
+ btrfs_bio->iter = bio->bi_iter;
+ return bio;
+}
static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags)
{
- int ret = 0;
+ blk_status_t ret = 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct page *page = bvec->bv_page;
struct extent_io_tree *tree = bio->bi_private;
@@ -2737,14 +2735,14 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
bio->bi_private = NULL;
bio_get(bio);
- if (tree->ops && tree->ops->submit_bio_hook)
- ret = tree->ops->submit_bio_hook(page->mapping->host, bio,
+ if (tree->ops)
+ ret = tree->ops->submit_bio_hook(tree->private_data, bio,
mirror_num, bio_flags, start);
else
btrfsic_submit_bio(bio);
bio_put(bio);
- return ret;
+ return blk_status_to_errno(ret);
}
static int merge_bio(struct extent_io_tree *tree, struct page *page,
@@ -2752,7 +2750,7 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page,
unsigned long bio_flags)
{
int ret = 0;
- if (tree->ops && tree->ops->merge_bio_hook)
+ if (tree->ops)
ret = tree->ops->merge_bio_hook(page, offset, size, bio,
bio_flags);
return ret;
@@ -2765,7 +2763,6 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
size_t size, unsigned long offset,
struct block_device *bdev,
struct bio **bio_ret,
- unsigned long max_pages,
bio_end_io_t end_io_func,
int mirror_num,
unsigned long prev_bio_flags,
@@ -2802,14 +2799,11 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
}
}
- bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES,
- GFP_NOFS | __GFP_HIGH);
- if (!bio)
- return -ENOMEM;
-
+ bio = btrfs_bio_alloc(bdev, sector << 9);
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
+ bio->bi_write_hint = page->mapping->host->i_write_hint;
bio_set_op_attrs(bio, op, op_flags);
if (wbc) {
wbc_init_bio(wbc, bio);
@@ -2856,7 +2850,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
em = *em_cached;
if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
return em;
}
@@ -2864,10 +2858,10 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = get_extent(inode, page, pg_offset, start, len, 0);
+ em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
*em_cached = em;
}
return em;
@@ -2931,7 +2925,6 @@ static int __do_readpage(struct extent_io_tree *tree,
}
}
while (cur <= end) {
- unsigned long pnr = (last_byte >> PAGE_SHIFT) + 1;
bool force_bio_submit = false;
if (cur >= last_byte) {
@@ -3066,10 +3059,9 @@ static int __do_readpage(struct extent_io_tree *tree,
continue;
}
- pnr -= page->index;
ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL,
page, sector, disk_io_size, pg_offset,
- bdev, bio, pnr,
+ bdev, bio,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag,
@@ -3110,7 +3102,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
inode = pages[0]->mapping->host;
while (1) {
lock_extent(tree, start, end);
- ordered = btrfs_lookup_ordered_range(inode, start,
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
end - start + 1);
if (!ordered)
break;
@@ -3182,7 +3174,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
while (1) {
lock_extent(tree, start, end);
- ordered = btrfs_lookup_ordered_range(inode, start,
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
PAGE_SIZE);
if (!ordered)
break;
@@ -3210,7 +3202,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
return ret;
}
-static void update_nr_written(struct page *page, struct writeback_control *wbc,
+static void update_nr_written(struct writeback_control *wbc,
unsigned long nr_written)
{
wbc->nr_to_write -= nr_written;
@@ -3330,7 +3322,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
u64 block_start;
u64 iosize;
sector_t sector;
- struct extent_state *cached_state = NULL;
struct extent_map *em;
struct block_device *bdev;
size_t pg_offset = 0;
@@ -3349,10 +3340,9 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
else
redirty_page_for_writepage(wbc, page);
- update_nr_written(page, wbc, nr_written);
+ update_nr_written(wbc, nr_written);
unlock_page(page);
- ret = 1;
- goto done_unlocked;
+ return 1;
}
}
@@ -3360,7 +3350,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
- update_nr_written(page, wbc, nr_written + 1);
+ update_nr_written(wbc, nr_written + 1);
end = page_end;
if (i_size <= start) {
@@ -3374,7 +3364,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
while (cur <= end) {
u64 em_end;
- unsigned long max_nr;
if (cur >= i_size) {
if (tree->ops && tree->ops->writepage_end_io_hook)
@@ -3382,7 +3371,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page_end, NULL, 1);
break;
}
- em = epd->get_extent(inode, page, pg_offset, cur,
+ em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur,
end - cur + 1, 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
@@ -3431,8 +3420,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
continue;
}
- max_nr = (i_size >> PAGE_SHIFT) + 1;
-
set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
@@ -3442,11 +3429,14 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
page, sector, iosize, pg_offset,
- bdev, &epd->bio, max_nr,
+ bdev, &epd->bio,
end_bio_extent_writepage,
0, 0, 0, false);
- if (ret)
+ if (ret) {
SetPageError(page);
+ if (PageWriteback(page))
+ end_page_writeback(page);
+ }
cur = cur + iosize;
pg_offset += iosize;
@@ -3454,11 +3444,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
}
done:
*nr_ret = nr;
-
-done_unlocked:
-
- /* drop our reference on any cached states */
- free_extent_state(cached_state);
return ret;
}
@@ -3590,9 +3575,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
spin_unlock(&eb->refs_lock);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
- __percpu_counter_add(&fs_info->dirty_metadata_bytes,
- -eb->len,
- fs_info->dirty_metadata_batch);
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ -eb->len,
+ fs_info->dirty_metadata_batch);
ret = 1;
} else {
spin_unlock(&eb->refs_lock);
@@ -3693,6 +3678,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
struct extent_buffer *eb;
int i, done;
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
@@ -3700,7 +3686,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
BUG_ON(!eb);
done = atomic_dec_and_test(&eb->io_pages);
- if (bio->bi_error ||
+ if (bio->bi_status ||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
set_btree_ioerr(page);
@@ -3750,7 +3736,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
*/
start = btrfs_item_nr_offset(nritems);
- end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb);
+ end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb);
memzero_extent_buffer(eb, start, end - start);
}
@@ -3761,20 +3747,21 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
p, offset >> 9, PAGE_SIZE, 0, bdev,
- &epd->bio, -1,
+ &epd->bio,
end_bio_extent_buffer_writepage,
0, epd->bio_flags, bio_flags, false);
epd->bio_flags = bio_flags;
if (ret) {
set_btree_ioerr(p);
- end_page_writeback(p);
+ if (PageWriteback(p))
+ end_page_writeback(p);
if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
end_extent_buffer_writeback(eb);
ret = -EIO;
break;
}
offset += PAGE_SIZE;
- update_nr_written(p, wbc, 1);
+ update_nr_written(wbc, 1);
unlock_page(p);
}
@@ -3926,8 +3913,7 @@ retry:
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
*/
-static int extent_write_cache_pages(struct extent_io_tree *tree,
- struct address_space *mapping,
+static int extent_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc,
writepage_t writepage, void *data,
void (*flush_fn)(void *))
@@ -4168,8 +4154,7 @@ int extent_writepages(struct extent_io_tree *tree,
.bio_flags = 0,
};
- ret = extent_write_cache_pages(tree, mapping, wbc,
- __extent_writepage, &epd,
+ ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd,
flush_write_bio);
flush_epd_write_bio(&epd);
return ret;
@@ -4264,8 +4249,6 @@ static int try_release_extent_state(struct extent_map_tree *map,
EXTENT_IOBITS, 0, NULL))
ret = 0;
else {
- if ((mask & GFP_NOFS) == GFP_NOFS)
- mask = GFP_NOFS;
/*
* at this point we can safely clear everything except the
* locked bit and the nodatasum bit
@@ -4354,7 +4337,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = get_extent(inode, NULL, 0, offset, len, 0);
+ em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4373,6 +4356,119 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
return NULL;
}
+/*
+ * To cache previous fiemap extent
+ *
+ * Will be used for merging fiemap extent
+ */
+struct fiemap_cache {
+ u64 offset;
+ u64 phys;
+ u64 len;
+ u32 flags;
+ bool cached;
+};
+
+/*
+ * Helper to submit fiemap extent.
+ *
+ * Will try to merge current fiemap extent specified by @offset, @phys,
+ * @len and @flags with cached one.
+ * And only when we fails to merge, cached one will be submitted as
+ * fiemap extent.
+ *
+ * Return value is the same as fiemap_fill_next_extent().
+ */
+static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ u64 offset, u64 phys, u64 len, u32 flags)
+{
+ int ret = 0;
+
+ if (!cache->cached)
+ goto assign;
+
+ /*
+ * Sanity check, extent_fiemap() should have ensured that new
+ * fiemap extent won't overlap with cahced one.
+ * Not recoverable.
+ *
+ * NOTE: Physical address can overlap, due to compression
+ */
+ if (cache->offset + cache->len > offset) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ /*
+ * Only merges fiemap extents if
+ * 1) Their logical addresses are continuous
+ *
+ * 2) Their physical addresses are continuous
+ * So truly compressed (physical size smaller than logical size)
+ * extents won't get merged with each other
+ *
+ * 3) Share same flags except FIEMAP_EXTENT_LAST
+ * So regular extent won't get merged with prealloc extent
+ */
+ if (cache->offset + cache->len == offset &&
+ cache->phys + cache->len == phys &&
+ (cache->flags & ~FIEMAP_EXTENT_LAST) ==
+ (flags & ~FIEMAP_EXTENT_LAST)) {
+ cache->len += len;
+ cache->flags |= flags;
+ goto try_submit_last;
+ }
+
+ /* Not mergeable, need to submit cached one */
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+ cache->len, cache->flags);
+ cache->cached = false;
+ if (ret)
+ return ret;
+assign:
+ cache->cached = true;
+ cache->offset = offset;
+ cache->phys = phys;
+ cache->len = len;
+ cache->flags = flags;
+try_submit_last:
+ if (cache->flags & FIEMAP_EXTENT_LAST) {
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset,
+ cache->phys, cache->len, cache->flags);
+ cache->cached = false;
+ }
+ return ret;
+}
+
+/*
+ * Emit last fiemap cache
+ *
+ * The last fiemap cache may still be cached in the following case:
+ * 0 4k 8k
+ * |<- Fiemap range ->|
+ * |<------------ First extent ----------->|
+ *
+ * In this case, the first extent range will be cached but not emitted.
+ * So we must emit it before ending extent_fiemap().
+ */
+static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
+{
+ int ret;
+
+ if (!cache->cached)
+ return 0;
+
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+ cache->len, cache->flags);
+ cache->cached = false;
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent)
{
@@ -4390,6 +4486,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct fiemap_cache cache = { 0 };
int end = 0;
u64 em_start = 0;
u64 em_len = 0;
@@ -4410,8 +4507,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
- 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path,
+ btrfs_ino(BTRFS_I(inode)), -1, 0);
if (ret < 0) {
btrfs_free_path(path);
return ret;
@@ -4426,7 +4523,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
found_type = found_key.type;
/* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(inode) ||
+ if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/* have to trust i_size as the end */
last = (u64)-1;
@@ -4535,8 +4632,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* lookup stuff.
*/
ret = btrfs_check_shared(trans, root->fs_info,
- root->objectid,
- btrfs_ino(inode), bytenr);
+ root->objectid,
+ btrfs_ino(BTRFS_I(inode)), bytenr);
if (trans)
btrfs_end_transaction(trans);
if (ret < 0)
@@ -4569,8 +4666,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
flags |= FIEMAP_EXTENT_LAST;
end = 1;
}
- ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
- em_len, flags);
+ ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
+ em_len, flags);
if (ret) {
if (ret == 1)
ret = 0;
@@ -4578,6 +4675,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
}
out_free:
+ if (!ret)
+ ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache);
free_extent_map(em);
out:
btrfs_free_path(path);