aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c1404
1 files changed, 722 insertions, 682 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5e3fccddde0c..753db965f7c0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,14 +39,12 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "bio.h"
#include "compression.h"
#include "locking.h"
-#include "free-space-cache.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
@@ -114,6 +112,15 @@ struct data_reloc_warn {
int mirror_num;
};
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -247,7 +254,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- inode->root->root_key.objectid, btrfs_ino(inode), file_off,
+ btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
@@ -257,7 +264,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
logical += file_off;
btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
btrfs_ino(inode), file_off, logical,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -324,15 +331,15 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
const u32 csum_size = root->fs_info->csum_size;
/* For data reloc tree, it's better to do a backref lookup instead. */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
/* Output without objectid, which is more meaningful */
- if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
+ if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(root), btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -340,7 +347,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
} else {
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(root), btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -447,8 +454,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
- btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
- offset, bytes);
+ btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
+ page_folio(page), offset, bytes);
put_page(page);
}
@@ -505,12 +512,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, bool extent_inserted,
size_t size, size_t compressed_size,
int compress_type,
- struct page **compressed_pages,
+ struct folio *compressed_folio,
bool update_i_size)
{
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct page *page = NULL;
+ const u32 sectorsize = trans->fs_info->sectorsize;
char *kaddr;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
@@ -518,10 +526,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t cur_size = size;
u64 i_size;
- ASSERT((compressed_size > 0 && compressed_pages) ||
- (compressed_size == 0 && !compressed_pages));
+ /*
+ * The decompressed size must still be no larger than a sector. Under
+ * heavy race, we can have size == 0 passed in, but that shouldn't be a
+ * big deal and we can continue the insertion.
+ */
+ ASSERT(size <= sectorsize);
- if (compressed_size && compressed_pages)
+ /*
+ * The compressed size also needs to be no larger than a sector.
+ * That's also why we only need one page as the parameter.
+ */
+ if (compressed_folio)
+ ASSERT(compressed_size <= sectorsize);
+ else
+ ASSERT(compressed_size == 0);
+
+ if (compressed_size && compressed_folio)
cur_size = compressed_size;
if (!extent_inserted) {
@@ -549,21 +570,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
ptr = btrfs_file_extent_inline_start(ei);
if (compress_type != BTRFS_COMPRESS_NONE) {
- struct page *cpage;
- int i = 0;
- while (compressed_size > 0) {
- cpage = compressed_pages[i];
- cur_size = min_t(unsigned long, compressed_size,
- PAGE_SIZE);
-
- kaddr = kmap_local_page(cpage);
- write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_local(kaddr);
+ kaddr = kmap_local_folio(compressed_folio, 0);
+ write_extent_buffer(leaf, kaddr, ptr, compressed_size);
+ kunmap_local(kaddr);
- i++;
- ptr += cur_size;
- compressed_size -= cur_size;
- }
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
@@ -604,17 +614,62 @@ fail:
return ret;
}
+static bool can_cow_file_range_inline(struct btrfs_inode *inode,
+ u64 offset, u64 size,
+ size_t compressed_size)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 data_len = (compressed_size ?: size);
+
+ /* Inline extents must start at offset 0. */
+ if (offset != 0)
+ return false;
+
+ /*
+ * Due to the page size limit, for subpage we can only trigger the
+ * writeback for the dirty sectors of page, that means data writeback
+ * is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (fs_info->sectorsize != PAGE_SIZE)
+ return false;
+
+ /* Inline extents are limited to sectorsize. */
+ if (size > fs_info->sectorsize)
+ return false;
+
+ /* We cannot exceed the maximum inline data size. */
+ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ return false;
+
+ /* We cannot exceed the user specified max_inline size. */
+ if (data_len > fs_info->max_inline)
+ return false;
+
+ /* Inline extents must be the entirety of the file. */
+ if (size < i_size_read(&inode->vfs_inode))
+ return false;
+
+ return true;
+}
/*
* conditionally insert an inline extent into the file. This
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
+ *
+ * If being used directly, you must have already checked we're allowed to cow
+ * the range by getting true from can_cow_file_range_inline().
*/
-static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
- size_t compressed_size,
- int compress_type,
- struct page **compressed_pages,
- bool update_i_size)
+static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
+ u64 size, size_t compressed_size,
+ int compress_type,
+ struct folio *compressed_folio,
+ bool update_i_size)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
@@ -624,18 +679,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
int ret;
struct btrfs_path *path;
- /*
- * We can create an inline extent if it ends at or beyond the current
- * i_size, is no larger than a sector (decompressed), and the (possibly
- * compressed) data fits in a leaf and the configured maximum inline
- * size.
- */
- if (size < i_size_read(&inode->vfs_inode) ||
- size > fs_info->sectorsize ||
- data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
- data_len > fs_info->max_inline)
- return 1;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -661,7 +704,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
- compressed_pages, update_i_size);
+ compressed_folio, update_i_size);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -688,18 +731,50 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
}
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
+ u64 end,
+ size_t compressed_size,
+ int compress_type,
+ struct folio *compressed_folio,
+ bool update_i_size)
+{
+ struct extent_state *cached = NULL;
+ unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
+ u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
+ int ret;
+
+ if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
+ return 1;
+
+ lock_extent(&inode->io_tree, offset, end, &cached);
+ ret = __cow_file_range_inline(inode, offset, size, compressed_size,
+ compress_type, compressed_folio,
+ update_i_size);
+ if (ret > 0) {
+ unlock_extent(&inode->io_tree, offset, end, &cached);
+ return ret;
+ }
+
+ extent_clear_unlock_delalloc(inode, offset, end, NULL, &cached,
+ clear_flags,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ return ret;
+}
+
struct async_extent {
u64 start;
u64 ram_size;
u64 compressed_size;
- struct page **pages;
- unsigned long nr_pages;
+ struct folio **folios;
+ unsigned long nr_folios;
int compress_type;
struct list_head list;
};
@@ -724,19 +799,20 @@ struct async_cow {
static noinline int add_async_extent(struct async_chunk *cow,
u64 start, u64 ram_size,
u64 compressed_size,
- struct page **pages,
- unsigned long nr_pages,
+ struct folio **folios,
+ unsigned long nr_folios,
int compress_type)
{
struct async_extent *async_extent;
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
- BUG_ON(!async_extent); /* -ENOMEM */
+ if (!async_extent)
+ return -ENOMEM;
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
- async_extent->pages = pages;
- async_extent->nr_pages = nr_pages;
+ async_extent->folios = folios;
+ async_extent->nr_folios = nr_folios;
async_extent->compress_type = compress_type;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
@@ -840,8 +916,8 @@ static void compress_file_range(struct btrfs_work *work)
u64 actual_end;
u64 i_size;
int ret = 0;
- struct page **pages;
- unsigned long nr_pages;
+ struct folio **folios;
+ unsigned long nr_folios;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
unsigned int poff;
@@ -871,9 +947,9 @@ static void compress_file_range(struct btrfs_work *work)
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
- pages = NULL;
- nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
+ folios = NULL;
+ nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
/*
* we don't want to send crud past the end of i_size through
@@ -922,8 +998,8 @@ again:
if (!inode_need_compress(inode, start, end))
goto cleanup_and_bail_uncompressed;
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages) {
+ folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
+ if (!folios) {
/*
* Memory allocation failure is not a fatal error, we can fall
* back to uncompressed code.
@@ -937,9 +1013,9 @@ again:
compress_type = inode->prop_compress;
/* Compression level is applied here. */
- ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
- mapping, start, pages, &nr_pages, &total_in,
- &total_compressed);
+ ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
+ mapping, start, folios, &nr_folios, &total_in,
+ &total_compressed);
if (ret)
goto mark_incompressible;
@@ -949,7 +1025,7 @@ again:
*/
poff = offset_in_page(total_compressed);
if (poff)
- memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
+ folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
/*
* Try to create an inline extent.
@@ -960,43 +1036,16 @@ again:
* Check cow_file_range() for why we don't even try to create inline
* extent for the subpage case.
*/
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
- if (total_in < actual_end) {
- ret = cow_file_range_inline(inode, actual_end, 0,
- BTRFS_COMPRESS_NONE, NULL,
- false);
- } else {
- ret = cow_file_range_inline(inode, actual_end,
- total_compressed,
- compress_type, pages,
- false);
- }
- if (ret <= 0) {
- unsigned long clear_flags = EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING;
-
- if (ret < 0)
- mapping_set_error(mapping, -EIO);
-
- /*
- * inline extent creation worked or returned error,
- * we don't need to create any more async work items.
- * Unlock and free up our temp pages.
- *
- * We use DO_ACCOUNTING here because we need the
- * delalloc_release_metadata to be done _after_ we drop
- * our outstanding extent for clearing delalloc for this
- * range.
- */
- extent_clear_unlock_delalloc(inode, start, end,
- NULL,
- clear_flags,
- PAGE_UNLOCK |
- PAGE_START_WRITEBACK |
- PAGE_END_WRITEBACK);
- goto free_pages;
- }
+ if (total_in < actual_end)
+ ret = cow_file_range_inline(inode, start, end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
+ else
+ ret = cow_file_range_inline(inode, start, end, total_compressed,
+ compress_type, folios[0], false);
+ if (ret <= 0) {
+ if (ret < 0)
+ mapping_set_error(mapping, -EIO);
+ goto free_pages;
}
/*
@@ -1018,8 +1067,9 @@ again:
* The async work queues will take care of doing actual allocation on
* disk for these compressed pages, and will submit the bios.
*/
- add_async_extent(async_chunk, start, total_in, total_compressed, pages,
- nr_pages, compress_type);
+ ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
+ nr_folios, compress_type);
+ BUG_ON(ret);
if (start + total_in < end) {
start += total_in;
cond_resched();
@@ -1031,15 +1081,16 @@ mark_incompressible:
if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
inode->flags |= BTRFS_INODE_NOCOMPRESS;
cleanup_and_bail_uncompressed:
- add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
- BTRFS_COMPRESS_NONE);
+ ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
+ BTRFS_COMPRESS_NONE);
+ BUG_ON(ret);
free_pages:
- if (pages) {
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ if (folios) {
+ for (i = 0; i < nr_folios; i++) {
+ WARN_ON(folios[i]->mapping);
+ btrfs_free_compr_folio(folios[i]);
}
- kfree(pages);
+ kfree(folios);
}
}
@@ -1047,16 +1098,16 @@ static void free_async_extent_pages(struct async_extent *async_extent)
{
int i;
- if (!async_extent->pages)
+ if (!async_extent->folios)
return;
- for (i = 0; i < async_extent->nr_pages; i++) {
- WARN_ON(async_extent->pages[i]->mapping);
- put_page(async_extent->pages[i]);
+ for (i = 0; i < async_extent->nr_folios; i++) {
+ WARN_ON(async_extent->folios[i]->mapping);
+ btrfs_free_compr_folio(async_extent->folios[i]);
}
- kfree(async_extent->pages);
- async_extent->nr_pages = 0;
- async_extent->pages = NULL;
+ kfree(async_extent->folios);
+ async_extent->nr_folios = 0;
+ async_extent->folios = NULL;
}
static void submit_uncompressed_range(struct btrfs_inode *inode,
@@ -1103,6 +1154,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct btrfs_ordered_extent *ordered;
struct btrfs_key ins;
struct page *locked_page = NULL;
+ struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
u64 start = async_extent->start;
@@ -1122,7 +1174,6 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
if (!(start >= locked_page_end || end <= locked_page_start))
locked_page = async_chunk->locked_page;
}
- lock_extent(io_tree, start, end, NULL);
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
submit_uncompressed_range(inode, async_extent, locked_page);
@@ -1135,15 +1186,17 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
0, *alloc_hint, &ins, 1, 1);
if (ret) {
/*
- * Here we used to try again by going back to non-compressed
- * path for ENOSPC. But we can't reserve space even for
- * compressed size, how could it work for uncompressed size
- * which requires larger size? So here we directly go error
- * path.
+ * We can't reserve contiguous space for the compressed size.
+ * Unlikely, but it's possible that we could have enough
+ * non-contiguous space for the uncompressed size instead. So
+ * fall back to uncompressed.
*/
- goto out_free;
+ submit_uncompressed_range(inode, async_extent, locked_page);
+ goto done;
}
+ lock_extent(io_tree, start, end, &cached);
+
/* Here we're doing allocation and writeback of the compressed pages */
em = create_io_em(inode, start,
async_extent->ram_size, /* len */
@@ -1177,11 +1230,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
/* Clear dirty, set writeback and unlock the pages. */
extent_clear_unlock_delalloc(inode, start, end,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_START_WRITEBACK);
btrfs_submit_compressed_write(ordered,
- async_extent->pages, /* compressed_pages */
- async_extent->nr_pages,
+ async_extent->folios, /* compressed_folios */
+ async_extent->nr_folios,
async_chunk->write_flags, true);
*alloc_hint = ins.objectid + ins.offset;
done:
@@ -1193,10 +1246,10 @@ done:
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
-out_free:
mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ NULL, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
@@ -1206,7 +1259,7 @@ out_free:
kthread_associate_blkcg(NULL);
btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
- root->root_key.objectid, btrfs_ino(inode), start,
+ btrfs_root_id(root), btrfs_ino(inode), start,
async_extent->ram_size, ret);
kfree(async_extent);
}
@@ -1278,6 +1331,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_state *cached = NULL;
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
@@ -1303,53 +1357,21 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- /*
- * Due to the page size limit, for subpage we can only trigger the
- * writeback for the dirty sectors of page, that means data writeback
- * is doing more writeback than what we want.
- *
- * This is especially unexpected for some call sites like fallocate,
- * where we only increase i_size after everything is done.
- * This means we can trigger inline extent even if we didn't want to.
- * So here we skip inline extent creation completely.
- */
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
- u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
- end + 1);
-
+ if (!no_inline) {
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, actual_end, 0,
+ ret = cow_file_range_inline(inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
- if (ret == 0) {
- /*
- * We use DO_ACCOUNTING here because we need the
- * delalloc_release_metadata to be run _after_ we drop
- * our outstanding extent for clearing delalloc for this
- * range.
- */
- extent_clear_unlock_delalloc(inode, start, end,
- locked_page,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
+ if (ret <= 0) {
/*
- * locked_page is locked by the caller of
- * writepage_delalloc(), not locked by
- * __process_pages_contig().
+ * We succeeded, return 1 so the caller knows we're done
+ * with this page and already handled the IO.
*
- * We can't let __process_pages_contig() to unlock it,
- * as it doesn't have any subpage::writers recorded.
- *
- * Here we manually unlock the page, since the caller
- * can't determine if it's an inline extent or a
- * compressed extent.
+ * If there was an error then cow_file_range_inline() has
+ * already done the cleanup.
*/
- unlock_page(locked_page);
- ret = 1;
+ if (ret == 0)
+ ret = 1;
goto done;
- } else if (ret < 0) {
- goto out_unlock;
}
}
@@ -1409,6 +1431,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
extent_reserved = true;
ram_size = ins.offset;
+
+ lock_extent(&inode->io_tree, start, start + ram_size - 1,
+ &cached);
+
em = create_io_em(inode, start, ins.offset, /* len */
start, /* orig_start */
ins.objectid, /* block_start */
@@ -1418,6 +1444,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
BTRFS_COMPRESS_NONE, /* compress_type */
BTRFS_ORDERED_REGULAR /* type */);
if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, start,
+ start + ram_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
@@ -1428,6 +1456,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
0, 1 << BTRFS_ORDERED_REGULAR,
BTRFS_COMPRESS_NONE);
if (IS_ERR(ordered)) {
+ unlock_extent(&inode->io_tree, start,
+ start + ram_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1467,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
- locked_page,
+ locked_page, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
if (num_bytes < cur_alloc_size)
@@ -1526,10 +1556,17 @@ out_unlock:
if (!locked_page)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
- locked_page, 0, page_ops);
+ locked_page, NULL, 0, page_ops);
}
/*
+ * At this point we're unlocked, we want to make sure we're only
+ * clearing these flags under the extent lock, so lock the rest of the
+ * range and clear everything up.
+ */
+ lock_extent(&inode->io_tree, start, end, NULL);
+
+ /*
* For the range (2). If we reserved an extent for our delalloc range
* (or a subrange) and failed to create the respective ordered extent,
* then it means that when we reserved the extent we decremented the
@@ -1542,7 +1579,7 @@ out_unlock:
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
- locked_page,
+ locked_page, &cached,
clear_bits,
page_ops);
start += cur_alloc_size;
@@ -1557,7 +1594,7 @@ out_unlock:
if (start < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
extent_clear_unlock_delalloc(inode, start, end, locked_page,
- clear_bits, page_ops);
+ &cached, clear_bits, page_ops);
}
return ret;
}
@@ -1630,7 +1667,6 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
if (!ctx)
return false;
- unlock_extent(&inode->io_tree, start, end, NULL);
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
async_chunk = ctx->chunks;
@@ -1724,29 +1760,6 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
return 1;
}
-static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num_bytes, bool nowait)
-{
- struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
- struct btrfs_ordered_sum *sums;
- int ret;
- LIST_HEAD(list);
-
- ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
- &list, 0, nowait);
- if (ret == 0 && list_empty(&list))
- return 0;
-
- while (!list_empty(&list)) {
- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
- list_del(&sums->list);
- kfree(sums);
- }
- if (ret < 0)
- return ret;
- return 1;
-}
-
static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const u64 start, const u64 end)
{
@@ -1754,6 +1767,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_state *cached_state = NULL;
u64 range_start = start;
u64 count;
int ret;
@@ -1790,6 +1804,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
* group that contains that extent to RO mode and therefore force COW
* when starting writeback.
*/
+ lock_extent(io_tree, start, end, &cached_state);
count = count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0, NULL);
if (count > 0 || is_space_ino || is_reloc_ino) {
@@ -1808,6 +1823,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
NULL);
}
+ unlock_extent(io_tree, start, end, &cached_state);
/*
* Don't try to create inline extents, as a mix of inline extent that
@@ -1861,6 +1877,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *fi;
+ struct btrfs_root *csum_root;
u64 extent_end;
u8 extent_type;
int can_nocow = 0;
@@ -1921,7 +1938,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
if (args->free_path) {
/*
* We don't need the path anymore, plus through the
- * csum_exist_in_range() call below we will end up allocating
+ * btrfs_lookup_csums_list() call below we will end up allocating
* another path. So free the path to avoid unnecessary extra
* memory usage.
*/
@@ -1942,8 +1959,11 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* Force COW if csums exist in the range. This ensures that csums for a
* given extent are either valid or do not exist.
*/
- ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
- nowait);
+
+ csum_root = btrfs_csum_root(root->fs_info, args->disk_bytenr);
+ ret = btrfs_lookup_csums_list(csum_root, args->disk_bytenr,
+ args->disk_bytenr + args->num_bytes - 1,
+ NULL, nowait);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -1993,12 +2013,13 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
nocow_args.end = end;
nocow_args.writeback_path = true;
- while (1) {
+ while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
+ struct extent_state *cached_state = NULL;
u64 extent_end;
u64 ram_bytes;
u64 nocow_end;
@@ -2136,6 +2157,8 @@ must_cow:
}
nocow_end = cur_offset + nocow_args.num_bytes - 1;
+ lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
+
is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
if (is_prealloc) {
u64 orig_start = found_key.offset - nocow_args.extent_offset;
@@ -2149,6 +2172,8 @@ must_cow:
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, cur_offset,
+ nocow_end, &cached_state);
btrfs_dec_nocow_writers(nocow_bg);
ret = PTR_ERR(em);
goto error;
@@ -2169,6 +2194,8 @@ must_cow:
btrfs_drop_extent_map_range(inode, cur_offset,
nocow_end, false);
}
+ unlock_extent(&inode->io_tree, cur_offset,
+ nocow_end, &cached_state);
ret = PTR_ERR(ordered);
goto error;
}
@@ -2183,8 +2210,8 @@ must_cow:
btrfs_put_ordered_extent(ordered);
extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC |
+ locked_page, &cached_state,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_ORDERED);
@@ -2197,8 +2224,6 @@ must_cow:
*/
if (ret)
goto error;
- if (cur_offset > end)
- break;
}
btrfs_release_path(path);
@@ -2224,13 +2249,23 @@ error:
*/
if (cow_start != (u64)-1)
cur_offset = cow_start;
- if (cur_offset < end)
+
+ /*
+ * We need to lock the extent here because we're clearing DELALLOC and
+ * we're not locked at this point.
+ */
+ if (cur_offset < end) {
+ struct extent_state *cached = NULL;
+
+ lock_extent(&inode->io_tree, cur_offset, end, &cached);
extent_clear_unlock_delalloc(inode, cur_offset, end,
- locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_DEFRAG |
+ locked_page, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
+ }
btrfs_free_path(path);
return ret;
}
@@ -2293,6 +2328,8 @@ void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 size;
+ lockdep_assert_held(&inode->io_tree.lock);
+
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return;
@@ -2331,6 +2368,8 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
u64 new_size, old_size;
u32 num_extents;
+ lockdep_assert_held(&inode->io_tree.lock);
+
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
return;
@@ -2378,55 +2417,50 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
spin_unlock(&inode->lock);
}
-static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
- struct btrfs_inode *inode)
+static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&root->delalloc_lock);
- if (list_empty(&inode->delalloc_inodes)) {
- list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
- set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
- root->nr_delalloc_inodes++;
- if (root->nr_delalloc_inodes == 1) {
- spin_lock(&fs_info->delalloc_root_lock);
- BUG_ON(!list_empty(&root->delalloc_root));
- list_add_tail(&root->delalloc_root,
- &fs_info->delalloc_roots);
- spin_unlock(&fs_info->delalloc_root_lock);
- }
+ ASSERT(list_empty(&inode->delalloc_inodes));
+ list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
+ root->nr_delalloc_inodes++;
+ if (root->nr_delalloc_inodes == 1) {
+ spin_lock(&fs_info->delalloc_root_lock);
+ ASSERT(list_empty(&root->delalloc_root));
+ list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
}
spin_unlock(&root->delalloc_lock);
}
-void __btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode)
+void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ lockdep_assert_held(&root->delalloc_lock);
+
+ /*
+ * We may be called after the inode was already deleted from the list,
+ * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
+ * and then later through btrfs_clear_delalloc_extent() while the inode
+ * still has ->delalloc_bytes > 0.
+ */
if (!list_empty(&inode->delalloc_inodes)) {
list_del_init(&inode->delalloc_inodes);
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags);
root->nr_delalloc_inodes--;
if (!root->nr_delalloc_inodes) {
ASSERT(list_empty(&root->delalloc_inodes));
spin_lock(&fs_info->delalloc_root_lock);
- BUG_ON(list_empty(&root->delalloc_root));
+ ASSERT(!list_empty(&root->delalloc_root));
list_del_init(&root->delalloc_root);
spin_unlock(&fs_info->delalloc_root_lock);
}
}
}
-static void btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode)
-{
- spin_lock(&root->delalloc_lock);
- __btrfs_del_delalloc_inode(root, inode);
- spin_unlock(&root->delalloc_lock);
-}
-
/*
* Properly track delayed allocation bytes in the inode and to maintain the
* list of inodes that have pending delalloc work to be done.
@@ -2436,6 +2470,8 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ lockdep_assert_held(&inode->io_tree.lock);
+
if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
/*
@@ -2444,10 +2480,9 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
* bit, which is only set or cleared with irqs on
*/
if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
- struct btrfs_root *root = inode->root;
u64 len = state->end + 1 - state->start;
+ u64 prev_delalloc_bytes;
u32 num_extents = count_max_extents(fs_info, len);
- bool do_list = !btrfs_is_free_space_inode(inode);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, num_extents);
@@ -2460,13 +2495,20 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
fs_info->delalloc_batch);
spin_lock(&inode->lock);
+ prev_delalloc_bytes = inode->delalloc_bytes;
inode->delalloc_bytes += len;
if (bits & EXTENT_DEFRAG)
inode->defrag_bytes += len;
- if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags))
- btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&inode->lock);
+
+ /*
+ * We don't need to be under the protection of the inode's lock,
+ * because we are called while holding the inode's io_tree lock
+ * and are therefore protected against concurrent calls of this
+ * function and btrfs_clear_delalloc_extent().
+ */
+ if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
+ btrfs_add_delalloc_inode(inode);
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
@@ -2488,6 +2530,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(fs_info, len);
+ lockdep_assert_held(&inode->io_tree.lock);
+
if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
spin_lock(&inode->lock);
inode->defrag_bytes -= len;
@@ -2501,7 +2545,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
*/
if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = inode->root;
- bool do_list = !btrfs_is_free_space_inode(inode);
+ u64 new_delalloc_bytes;
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, -num_extents);
@@ -2514,14 +2558,15 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
*/
if (bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(inode, len, false);
+ btrfs_delalloc_release_metadata(inode, len, true);
/* For sanity tests. */
if (btrfs_is_testing(fs_info))
return;
if (!btrfs_is_data_reloc_root(root) &&
- do_list && !(state->state & EXTENT_NORESERVE) &&
+ !btrfs_is_free_space_inode(inode) &&
+ !(state->state & EXTENT_NORESERVE) &&
(bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
@@ -2529,11 +2574,20 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
fs_info->delalloc_batch);
spin_lock(&inode->lock);
inode->delalloc_bytes -= len;
- if (do_list && inode->delalloc_bytes == 0 &&
- test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags))
- btrfs_del_delalloc_inode(root, inode);
+ new_delalloc_bytes = inode->delalloc_bytes;
spin_unlock(&inode->lock);
+
+ /*
+ * We don't need to be under the protection of the inode's lock,
+ * because we are called while holding the inode's io_tree lock
+ * and are therefore protected against concurrent calls of this
+ * function and btrfs_set_delalloc_extent().
+ */
+ if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
+ spin_lock(&root->delalloc_lock);
+ btrfs_del_delalloc_inode(inode);
+ spin_unlock(&root->delalloc_lock);
+ }
}
if ((state->state & EXTENT_DELALLOC_NEW) &&
@@ -2623,7 +2677,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
u64 em_len;
int ret = 0;
- em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+ em = btrfs_get_extent(inode, NULL, search_start, search_len);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -2793,7 +2847,7 @@ out_page:
PAGE_SIZE, !ret);
clear_page_dirty_for_io(page);
}
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2820,7 +2874,7 @@ out_page:
int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_writepage_fixup *fixup;
/* This page has ordered extent covering it already */
@@ -2848,7 +2902,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
* page->mapping outside of the page lock.
*/
ihold(inode);
- btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+ btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->page = page;
@@ -3118,8 +3172,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
- ordered_extent->num_bytes, trans->transid);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = unpin_extent_cache(inode, ordered_extent->file_offset,
+ ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3148,7 +3207,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_abort_transaction(trans, ret);
goto out;
}
- ret = 0;
out:
clear_extent_bit(&inode->io_tree, start, end, clear_bits,
&cached_state);
@@ -3167,16 +3225,30 @@ out:
* set the mapping error, so we need to set it if we're the ones
* marking this ordered extent as failed.
*/
- if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
- &ordered_extent->flags))
- mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+ if (ret)
+ btrfs_mark_ordered_extent_error(ordered_extent);
if (truncated)
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop extent maps for the part of the extent we didn't write. */
- btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+ /*
+ * Drop extent maps for the part of the extent we didn't write.
+ *
+ * We have an exception here for the free_space_inode, this is
+ * because when we do btrfs_get_extent() on the free space inode
+ * we will search the commit root. If this is a new block group
+ * we won't find anything, and we will trip over the assert in
+ * writepage where we do ASSERT(em->block_start !=
+ * EXTENT_MAP_HOLE).
+ *
+ * Theoretically we could also skip this for any NOCOW extent as
+ * we don't mess with the extent map tree in the NOCOW case, but
+ * for now simply skip this if we are the free space inode.
+ */
+ if (!btrfs_is_free_space_inode(inode))
+ btrfs_drop_extent_map_range(inode, unwritten_start,
+ end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -3208,7 +3280,7 @@ out:
* Actually free the qgroup rsv which was released when
* the ordered extent was created.
*/
- btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
ordered_extent->qgroup_rsv,
BTRFS_QGROUP_RSV_DATA);
}
@@ -3230,7 +3302,7 @@ out:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
- if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
+ if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) &&
!test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
list_empty(&ordered->bioc_list))
btrfs_finish_ordered_zoned(ordered);
@@ -3715,7 +3787,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
static int btrfs_read_locked_inode(struct inode *inode,
struct btrfs_path *in_path)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_path *path = in_path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
@@ -3796,7 +3868,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in delayed_nodes_tree.
+ * in the delayed_nodes xarray.
*/
if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -3875,7 +3947,7 @@ cache_acl:
btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
btrfs_ino(BTRFS_I(inode)),
- root->root_key.objectid, ret);
+ btrfs_root_id(root), ret);
}
if (path != in_path)
btrfs_free_path(path);
@@ -4234,7 +4306,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
/* This needs to handle no-key deletions later on */
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
- objectid = inode->root->root_key.objectid;
+ objectid = btrfs_root_id(inode->root);
} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
objectid = inode->location.objectid;
} else {
@@ -4292,7 +4364,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
} else {
ret = btrfs_del_root_ref(trans, objectid,
- root->root_key.objectid, dir_ino,
+ btrfs_root_id(root), dir_ino,
&index, &fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -4342,7 +4414,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
dir_id, &name, 0);
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.objectid == root->root_key.objectid) {
+ if (key.objectid == btrfs_root_id(root)) {
ret = -EPERM;
btrfs_err(fs_info,
"deleting default subvolume %llu is not allowed",
@@ -4352,21 +4424,27 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
btrfs_release_path(path);
}
- key.objectid = root->root_key.objectid;
+ key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_REF_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist a root
+ * with such id, but this is out of valid range.
+ */
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = 0;
if (path->slots[0] > 0) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == root->root_key.objectid &&
- key.type == BTRFS_ROOT_REF_KEY)
+ if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
out:
@@ -4378,77 +4456,42 @@ out:
static void btrfs_prune_dentries(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct rb_node *node;
- struct rb_node *prev;
- struct btrfs_inode *entry;
- struct inode *inode;
- u64 objectid = 0;
+ struct btrfs_inode *inode;
+ u64 min_ino = 0;
if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- entry = rb_entry(node, struct btrfs_inode, rb_node);
-
- if (objectid < btrfs_ino(entry))
- node = node->rb_left;
- else if (objectid > btrfs_ino(entry))
- node = node->rb_right;
- else
- break;
- }
- if (!node) {
- while (prev) {
- entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= btrfs_ino(entry)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
- while (node) {
- entry = rb_entry(node, struct btrfs_inode, rb_node);
- objectid = btrfs_ino(entry) + 1;
- inode = igrab(&entry->vfs_inode);
- if (inode) {
- spin_unlock(&root->inode_lock);
- if (atomic_read(&inode->i_count) > 1)
- d_prune_aliases(inode);
- /*
- * btrfs_drop_inode will have it removed from the inode
- * cache when its usage count hits zero.
- */
- iput(inode);
- cond_resched();
- spin_lock(&root->inode_lock);
- goto again;
- }
-
- if (cond_resched_lock(&root->inode_lock))
- goto again;
+ inode = btrfs_find_first_inode(root, min_ino);
+ while (inode) {
+ if (atomic_read(&inode->vfs_inode.i_count) > 1)
+ d_prune_aliases(&inode->vfs_inode);
- node = rb_next(node);
+ min_ino = btrfs_ino(inode) + 1;
+ /*
+ * btrfs_drop_inode() will have it removed from the inode
+ * cache when its usage count hits zero.
+ */
+ iput(&inode->vfs_inode);
+ cond_resched();
+ inode = btrfs_find_first_inode(root, min_ino);
}
- spin_unlock(&root->inode_lock);
}
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode = d_inode(dentry);
struct btrfs_root *dest = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv block_rsv;
u64 root_flags;
+ u64 qgroup_reserved = 0;
int ret;
+ down_write(&fs_info->subvol_sem);
+
/*
* Don't allow to delete a subvolume with send in progress. This is
* inside the inode lock so the error handling that has to drop the bit
@@ -4459,26 +4502,26 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
- dest->root_key.objectid);
- return -EPERM;
+ btrfs_root_id(dest));
+ ret = -EPERM;
+ goto out_up_write;
}
if (atomic_read(&dest->nr_swapfiles)) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
- root->root_key.objectid);
- return -EPERM;
+ btrfs_root_id(root));
+ ret = -EPERM;
+ goto out_up_write;
}
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- down_write(&fs_info->subvol_sem);
-
ret = may_destroy_subvol(dest);
if (ret)
- goto out_up_write;
+ goto out_undead;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
@@ -4488,13 +4531,21 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
if (ret)
- goto out_up_write;
+ goto out_undead;
+ qgroup_reserved = block_rsv.qgroup_rsv_reserved;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_release;
}
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+ qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
@@ -4520,7 +4571,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
fs_info->tree_root,
- dest->root_key.objectid);
+ btrfs_root_id(dest));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4528,8 +4579,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
}
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
- BTRFS_UUID_KEY_SUBVOL,
- dest->root_key.objectid);
+ BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4538,7 +4588,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_uuid_tree_remove(trans,
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- dest->root_key.objectid);
+ btrfs_root_id(dest));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4553,16 +4603,20 @@ out_end_trans:
ret = btrfs_end_transaction(trans);
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(root, &block_rsv);
-out_up_write:
- up_write(&fs_info->subvol_sem);
+ btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
+ if (qgroup_reserved)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
+out_undead:
if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- } else {
+ }
+out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (!ret) {
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
@@ -4575,7 +4629,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- int err = 0;
+ int ret = 0;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
struct fscrypt_name fname;
@@ -4591,33 +4645,33 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
}
- err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
- if (err)
- return err;
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
/* This needs to handle no-key deletions later on */
trans = __unlink_start_trans(BTRFS_I(dir));
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_notrans;
}
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out;
}
- err = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (err)
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret)
goto out;
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name);
- if (!err) {
+ if (!ret) {
btrfs_i_size_write(BTRFS_I(inode), 0);
/*
* Propagate the last_unlink_trans value of the deleted dir to
@@ -4639,7 +4693,7 @@ out_notrans:
btrfs_btree_balance_dirty(fs_info);
fscrypt_free_filename(&fname);
- return err;
+ return ret;
}
/*
@@ -4667,7 +4721,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
- struct page *page;
+ struct folio *folio;
gfp_t mask = btrfs_alloc_write_mask(mapping);
size_t write_bytes = blocksize;
int ret = 0;
@@ -4699,8 +4753,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
goto out;
}
again:
- page = find_or_create_page(mapping, index, mask);
- if (!page) {
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+ if (IS_ERR(folio)) {
btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
@@ -4708,15 +4763,15 @@ again:
goto out;
}
- if (!PageUptodate(page)) {
- ret = btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ ret = btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto out_unlock;
}
@@ -4725,22 +4780,22 @@ again:
/*
* We unlock the page after the io is completed and then re-lock it
* above. release_folio() could have come in between that and cleared
- * PagePrivate(), but left the page in the mapping. Set the page mapped
+ * folio private, but left the page in the mapping. Set the page mapped
* here to make sure it's properly set for the subpage stuff.
*/
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto out_unlock;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
unlock_extent(io_tree, block_start, block_end, &cached_state);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -4761,15 +4816,17 @@ again:
if (!len)
len = blocksize - offset;
if (front)
- memzero_page(page, (block_start - page_offset(page)),
- offset);
+ folio_zero_range(folio, block_start - folio_pos(folio),
+ offset);
else
- memzero_page(page, (block_start - page_offset(page)) + offset,
- len);
- }
- btrfs_page_clear_checked(fs_info, page, block_start,
- block_end + 1 - block_start);
- btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+ folio_zero_range(folio,
+ (block_start - folio_pos(folio)) + offset,
+ len);
+ }
+ btrfs_folio_clear_checked(fs_info, folio, block_start,
+ block_end + 1 - block_start);
+ btrfs_folio_set_dirty(fs_info, folio, block_start,
+ block_end + 1 - block_start);
unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -4785,8 +4842,8 @@ out_unlock:
block_start, blocksize, true);
}
btrfs_delalloc_release_extents(inode, blocksize);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out:
if (only_release_metadata)
btrfs_check_nocow_unlock(inode);
@@ -4860,16 +4917,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
u64 last_byte;
u64 cur_offset;
u64 hole_size;
- int err = 0;
+ int ret = 0;
/*
* If our size started in the middle of a block we need to zero out the
* rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
- err = btrfs_truncate_block(inode, oldsize, 0, 0);
- if (err)
- return err;
+ ret = btrfs_truncate_block(inode, oldsize, 0, 0);
+ if (ret)
+ return ret;
if (size <= hole_start)
return 0;
@@ -4878,10 +4935,9 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
&cached_state);
cur_offset = hole_start;
while (1) {
- em = btrfs_get_extent(inode, NULL, 0, cur_offset,
- block_end - cur_offset);
+ em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
if (IS_ERR(em)) {
- err = PTR_ERR(em);
+ ret = PTR_ERR(em);
em = NULL;
break;
}
@@ -4889,16 +4945,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
struct extent_map *hole_em;
- err = maybe_insert_hole(inode, cur_offset, hole_size);
- if (err)
+ ret = maybe_insert_hole(inode, cur_offset, hole_size);
+ if (ret)
break;
- err = btrfs_inode_set_file_extent_range(inode,
+ ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
- if (err)
+ if (ret)
break;
hole_em = alloc_extent_map();
@@ -4917,15 +4973,14 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = btrfs_get_fs_generation(fs_info);
- err = btrfs_replace_extent_map_range(inode, hole_em, true);
+ ret = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
} else {
- err = btrfs_inode_set_file_extent_range(inode,
+ ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
- if (err)
+ if (ret)
break;
}
next:
@@ -4937,7 +4992,7 @@ next:
}
free_extent_map(em);
unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
- return err;
+ return ret;
}
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
@@ -4991,7 +5046,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
if (btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode,
@@ -5132,7 +5187,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
*/
if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
- end - start + 1);
+ end - start + 1, NULL);
clear_extent_bit(io_tree, start, end,
EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
@@ -5194,7 +5249,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
void btrfs_evict_inode(struct inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv = NULL;
@@ -5208,11 +5263,12 @@ void btrfs_evict_inode(struct inode *inode)
return;
}
+ fs_info = inode_to_fs_info(inode);
evict_inode_truncate_pages(inode);
if (inode->i_nlink &&
((btrfs_root_refs(&root->root_item) != 0 &&
- root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+ btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
btrfs_is_free_space_inode(BTRFS_I(inode))))
goto out;
@@ -5224,7 +5280,7 @@ void btrfs_evict_inode(struct inode *inode)
if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
- root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
+ btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
goto out;
}
@@ -5396,7 +5452,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
}
err = -ENOENT;
- key.objectid = dir->root->root_key.objectid;
+ key.objectid = btrfs_root_id(dir->root);
key.type = BTRFS_ROOT_REF_KEY;
key.offset = location->objectid;
@@ -5505,7 +5561,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
BTRFS_I(inode)->location.offset = 0;
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
- BUG_ON(args->root && !BTRFS_I(inode)->root);
if (args->root && args->root == args->root->fs_info->tree_root &&
args->ino != BTRFS_BTREE_INODE_OBJECTID)
@@ -5633,7 +5688,7 @@ static inline u8 btrfs_inode_type(struct inode *inode)
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
@@ -6172,7 +6227,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct inode *dir = args->dir;
struct inode *inode = args->inode;
const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
struct btrfs_key *location;
@@ -6217,6 +6272,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * We don't have any capability xattrs set here yet, shortcut any
+ * queries for the xattrs here. If we add them later via the inode
+ * security init path or any other path this flag will be cleared.
+ */
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ /*
* Subvolumes don't inherit flags from their parent directory.
* Originally this was probably by accident, but we probably can't
* change it now without compatibility issues.
@@ -6349,8 +6411,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_err(fs_info,
"error inheriting props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
- ret);
+ btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
}
/*
@@ -6423,7 +6484,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, key.objectid,
- root->root_key.objectid, parent_ino,
+ btrfs_root_id(root), parent_ino,
index, name);
} else if (add_backref) {
ret = btrfs_insert_inode_ref(trans, root, name,
@@ -6466,7 +6527,7 @@ fail_dir_item:
u64 local_index;
int err;
err = btrfs_del_root_ref(trans, key.objectid,
- root->root_key.objectid, parent_ino,
+ btrfs_root_id(root), parent_ino,
&local_index, name);
if (err)
btrfs_abort_transaction(trans, err);
@@ -6487,7 +6548,7 @@ fail_dir_item:
static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
struct inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_new_inode_args new_inode_args = {
.dir = dir,
@@ -6557,14 +6618,14 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct fscrypt_name fname;
u64 index;
int err;
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
- if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
+ if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6721,7 +6782,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
*
* @inode: file to search in
* @page: page to read extent data into if the extent is inline
- * @pg_offset: offset into @page to copy to
* @start: file offset
* @len: length of range starting at @start
*
@@ -6735,8 +6795,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
* Return: ERR_PTR on error, non-NULL extent_map on success.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len)
+ struct page *page, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
@@ -6879,7 +6938,6 @@ next:
* ensured by tree-checker and inline extent creation path.
* Thus all members representing file offsets should be zero.
*/
- ASSERT(pg_offset == 0);
ASSERT(extent_start == 0);
ASSERT(em->start == 0);
@@ -6914,7 +6972,7 @@ insert:
}
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
@@ -6983,8 +7041,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
int ret;
alloc_hint = get_extent_allocation_hint(inode, start, len);
+again:
ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
0, alloc_hint, &ins, 1, 1);
+ if (ret == -EAGAIN) {
+ ASSERT(btrfs_is_zoned(fs_info));
+ wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ goto again;
+ }
if (ret)
return ERR_PTR(ret);
@@ -7036,7 +7101,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes, bool nowait, bool strict)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct can_nocow_file_extent_args nocow_args = { 0 };
struct btrfs_path *path;
int ret;
@@ -7234,11 +7299,49 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
struct extent_map *em;
int ret;
+ /*
+ * Note the missing NOCOW type.
+ *
+ * For pure NOCOW writes, we should not create an io extent map, but
+ * just reusing the existing one.
+ * Only PREALLOC writes (NOCOW write into preallocated range) can
+ * create an io extent map.
+ */
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
type == BTRFS_ORDERED_COMPRESSED ||
- type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
+ switch (type) {
+ case BTRFS_ORDERED_PREALLOC:
+ /* Uncompressed extents. */
+ ASSERT(block_len == len);
+
+ /* We're only referring part of a larger preallocated extent. */
+ ASSERT(block_len <= ram_bytes);
+ break;
+ case BTRFS_ORDERED_REGULAR:
+ /* Uncompressed extents. */
+ ASSERT(block_len == len);
+
+ /* COW results a new extent matching our file extent size. */
+ ASSERT(orig_block_len == len);
+ ASSERT(ram_bytes == len);
+
+ /* Since it's a new extent, we should not have any offset. */
+ ASSERT(orig_start == start);
+ break;
+ case BTRFS_ORDERED_COMPRESSED:
+ /* Must be compressed. */
+ ASSERT(compress_type != BTRFS_COMPRESS_NONE);
+
+ /*
+ * Encoded write can make us to refer to part of the
+ * uncompressed extent.
+ */
+ ASSERT(len <= ram_bytes);
+ break;
+ }
+
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7251,13 +7354,9 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
- if (type == BTRFS_ORDERED_PREALLOC) {
- set_bit(EXTENT_FLAG_FILLING, &em->flags);
- } else if (type == BTRFS_ORDERED_COMPRESSED) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
+ em->flags |= EXTENT_FLAG_PINNED;
+ if (type == BTRFS_ORDERED_COMPRESSED)
+ extent_map_set_compression(em, compress_type);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
@@ -7277,7 +7376,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
unsigned int iomap_flags)
{
const bool nowait = (iomap_flags & IOMAP_NOWAIT);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_map *em = *map;
int type;
u64 block_start, orig_start, orig_block_len, ram_bytes;
@@ -7297,10 +7396,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
* just use the extent.
*
*/
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ if ((em->flags & EXTENT_FLAG_PREALLOC) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
type = BTRFS_ORDERED_PREALLOC;
else
type = BTRFS_ORDERED_NOCOW;
@@ -7417,7 +7516,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
struct iomap *srcmap)
{
struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = iter->private;
@@ -7515,7 +7614,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (ret < 0)
goto err;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto unlock_err;
@@ -7535,7 +7634,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ if (extent_map_is_compressed(em) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
/*
@@ -7631,7 +7730,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* that, since we have locked only the parts we are performing I/O in.
*/
if ((em->block_start == EXTENT_MAP_HOLE) ||
- (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else {
@@ -7795,6 +7894,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
int ret;
ret = fiemap_prep(inode, fieinfo, start, &len, 0);
@@ -7820,18 +7920,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
- return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
-}
+ btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
-static int btrfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return extent_writepages(mapping, wbc);
-}
+ /*
+ * We did an initial flush to avoid holding the inode's lock while
+ * triggering writeback and waiting for the completion of IO and ordered
+ * extents. Now after we locked the inode we do it again, because it's
+ * possible a new write may have happened in between those two steps.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+ if (ret) {
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+ }
+ }
-static void btrfs_readahead(struct readahead_control *rac)
-{
- extent_readahead(rac);
+ ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ return ret;
}
/*
@@ -7843,14 +7951,15 @@ static void btrfs_readahead(struct readahead_control *rac)
*/
static void wait_subpage_spinlock(struct page *page)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = page_to_fs_info(page);
+ struct folio *folio = page_folio(page);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, page->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7869,13 +7978,12 @@ static void wait_subpage_spinlock(struct page *page)
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- int ret = try_release_extent_mapping(&folio->page, gfp_flags);
-
- if (ret == 1) {
+ if (try_release_extent_mapping(&folio->page, gfp_flags)) {
wait_subpage_spinlock(&folio->page);
clear_page_extent_mapped(&folio->page);
+ return true;
}
- return ret;
+ return false;
}
static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
@@ -7909,7 +8017,7 @@ static int btrfs_migrate_folio(struct address_space *mapping,
static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
- struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
+ struct btrfs_inode *inode = folio_to_inode(folio);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
@@ -7988,7 +8096,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+ if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -7997,7 +8105,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*/
goto next;
}
- btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+ btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8052,7 +8160,7 @@ next:
* reserved data space.
* Since the IO will never happen for this page.
*/
- btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
+ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
if (!inode_evicting) {
clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_UPTODATE |
@@ -8067,176 +8175,12 @@ next:
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
- btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+ btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
clear_page_extent_mapped(&folio->page);
}
-/*
- * btrfs_page_mkwrite() is not allowed to change the file size as it gets
- * called from a page fault handler when a page is first dirtied. Hence we must
- * be careful to check for EOF conditions here. We set the page up correctly
- * for a written page which means we get ENOSPC checking when writing into
- * holes and correct delalloc and unwritten extent mapping on filesystems that
- * support these features.
- *
- * We are not allowed to take the i_mutex here so we have to play games to
- * protect against truncate races as the page could now be beyond EOF. Because
- * truncate_setsize() writes the inode size before removing pages, once we have
- * the page lock we can determine safely if the page is beyond EOF. If it is not
- * beyond EOF, then the page is guaranteed safe against truncation until we
- * unlock the page.
- */
-vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
-{
- struct page *page = vmf->page;
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_changeset *data_reserved = NULL;
- unsigned long zero_start;
- loff_t size;
- vm_fault_t ret;
- int ret2;
- int reserved = 0;
- u64 reserved_space;
- u64 page_start;
- u64 page_end;
- u64 end;
-
- reserved_space = PAGE_SIZE;
-
- sb_start_pagefault(inode->i_sb);
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- end = page_end;
-
- /*
- * Reserving delalloc space after obtaining the page lock can lead to
- * deadlock. For example, if a dirty page is locked by this function
- * and the call to btrfs_delalloc_reserve_space() ends up triggering
- * dirty page write out, then the btrfs_writepages() function could
- * end up waiting indefinitely to get a lock on the page currently
- * being processed by btrfs_page_mkwrite() function.
- */
- ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- page_start, reserved_space);
- if (!ret2) {
- ret2 = file_update_time(vmf->vma->vm_file);
- reserved = 1;
- }
- if (ret2) {
- ret = vmf_error(ret2);
- if (reserved)
- goto out;
- goto out_noreserve;
- }
-
- ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
-again:
- down_read(&BTRFS_I(inode)->i_mmap_lock);
- lock_page(page);
- size = i_size_read(inode);
-
- if ((page->mapping != inode->i_mapping) ||
- (page_start >= size)) {
- /* page got truncated out from underneath us */
- goto out_unlock;
- }
- wait_on_page_writeback(page);
-
- lock_extent(io_tree, page_start, page_end, &cached_state);
- ret2 = set_page_extent_mapped(page);
- if (ret2 < 0) {
- ret = vmf_error(ret2);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- goto out_unlock;
- }
-
- /*
- * we can't set the delalloc bits if there are pending ordered
- * extents. Drop our locks and wait for them to finish
- */
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
- PAGE_SIZE);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- unlock_page(page);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
- btrfs_start_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
-
- if (page->index == ((size - 1) >> PAGE_SHIFT)) {
- reserved_space = round_up(size - page_start,
- fs_info->sectorsize);
- if (reserved_space < PAGE_SIZE) {
- end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, page_start,
- PAGE_SIZE - reserved_space, true);
- }
- }
-
- /*
- * page_mkwrite gets called when the page is firstly dirtied after it's
- * faulted in, but write(2) could also dirty a page and set delalloc
- * bits, thus in this case for space account reason, we still need to
- * clear any delalloc bits within this page range since we have to
- * reserve data&meta space before lock_page() (see above comments).
- */
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
-
- ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
- &cached_state);
- if (ret2) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- ret = VM_FAULT_SIGBUS;
- goto out_unlock;
- }
-
- /* page is wholly or partially inside EOF */
- if (page_start + PAGE_SIZE > size)
- zero_start = offset_in_page(size);
- else
- zero_start = PAGE_SIZE;
-
- if (zero_start != PAGE_SIZE)
- memzero_page(page, zero_start, PAGE_SIZE - zero_start);
-
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
- btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
- btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
-
- btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
-
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
-
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return VM_FAULT_LOCKED;
-
-out_unlock:
- unlock_page(page);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
-out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
- reserved_space, (ret != 0));
-out_noreserve:
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return ret;
-}
-
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
{
struct btrfs_truncate_control control = {
@@ -8455,10 +8399,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
+ struct extent_io_tree *file_extent_tree = NULL;
+
+ /* Self tests may pass a NULL fs_info. */
+ if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+ if (!file_extent_tree)
+ return NULL;
+ }
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
- if (!ei)
+ if (!ei) {
+ kfree(file_extent_tree);
return NULL;
+ }
ei->root = NULL;
ei->generation = 0;
@@ -8494,10 +8448,18 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
+
+ /* This io tree sets the valid inode. */
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
- extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT);
+
+ ei->file_extent_tree = file_extent_tree;
+ if (file_extent_tree) {
+ extent_io_tree_init(fs_info, ei->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT);
+ /* Lockdep class is set only for the file extent tree. */
+ lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
+ }
mutex_init(&ei->log_mutex);
spin_lock_init(&ei->ordered_tree_lock);
ei->ordered_tree = RB_ROOT;
@@ -8514,12 +8476,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
void btrfs_test_destroy_inode(struct inode *inode)
{
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
void btrfs_free_inode(struct inode *inode)
{
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@@ -8616,7 +8580,7 @@ int __init btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
init_once);
if (!btrfs_inode_cachep)
goto fail;
@@ -8639,7 +8603,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
u64 delalloc_bytes;
u64 inode_bytes;
struct inode *inode = d_inode(path->dentry);
- u32 blocksize = inode->i_sb->s_blocksize;
+ u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
u32 bi_flags = BTRFS_I(inode)->flags;
u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
@@ -8665,6 +8629,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
+ stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->result_mask |= STATX_SUBVOL;
+
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
inode_bytes = inode_get_bytes(inode);
@@ -8679,7 +8646,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_dir,
struct dentry *new_dentry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
struct btrfs_trans_handle *trans;
unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -8931,7 +8898,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
struct btrfs_new_inode_args whiteout_args = {
.dir = old_dir,
.dentry = old_dentry,
@@ -9373,7 +9340,7 @@ out:
static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
@@ -9484,7 +9451,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
- int qgroup_released;
+ u64 qgroup_released = 0;
int ret;
memset(&stack_fi, 0, sizeof(stack_fi));
@@ -9497,9 +9464,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
/* Encryption and other encoding is reserved and all 0 */
- qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
- if (qgroup_released < 0)
- return ERR_PTR(qgroup_released);
+ ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
+ if (ret < 0)
+ return ERR_PTR(ret);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
@@ -9544,7 +9511,7 @@ free_qgroup:
* or we leak qgroup data reservation.
*/
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid, qgroup_released,
+ btrfs_root_id(inode->root), qgroup_released,
BTRFS_QGROUP_RSV_DATA);
return ERR_PTR(ret);
}
@@ -9554,7 +9521,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
loff_t actual_len, u64 *alloc_hint,
struct btrfs_trans_handle *trans)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
@@ -9625,7 +9592,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
@@ -9706,7 +9673,7 @@ static int btrfs_permission(struct mnt_idmap *idmap,
static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode;
@@ -9778,7 +9745,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- btrfs_page_set_writeback(fs_info, page, start, len);
+ /* This is for data, which doesn't yet support larger folio. */
+ ASSERT(folio_order(page_folio(page)) == 0);
+ btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
put_page(page);
index++;
}
@@ -9987,7 +9956,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- ret = btrfs_alloc_page_array(nr_pages, pages);
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
if (ret) {
ret = -ENOMEM;
goto out;
@@ -10078,7 +10047,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
cond_resched();
}
- em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+ em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_unlock_extent;
@@ -10106,12 +10075,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->len = min_t(u64, extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ (em->flags & EXTENT_FLAG_PREALLOC)) {
disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ } else if (extent_map_is_compressed(em)) {
disk_bytenr = em->block_start;
/*
* Bail if the buffer isn't large enough to return the whole
@@ -10126,7 +10095,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- em->compress_type);
+ extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
@@ -10190,8 +10159,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
size_t orig_count;
u64 start, end;
u64 num_bytes, ram_bytes, disk_num_bytes;
- unsigned long nr_pages, i;
- struct page **pages;
+ unsigned long nr_folios, i;
+ struct folio **folios;
struct btrfs_key ins;
bool extent_reserved = false;
struct extent_map *em;
@@ -10222,6 +10191,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
return -EINVAL;
+ /*
+ * Compressed extents should always have checksums, so error out if we
+ * have a NOCOW file or inode was created while mounted with NODATASUM.
+ */
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ return -EINVAL;
+
orig_count = iov_iter_count(from);
/* The extent size must be sane. */
@@ -10273,24 +10249,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
* isn't.
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
- nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
- pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
- if (!pages)
+ nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+ folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!folios)
return -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_folios; i++) {
size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
char *kaddr;
- pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!pages[i]) {
+ folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
+ if (!folios[i]) {
ret = -ENOMEM;
- goto out_pages;
+ goto out_folios;
}
- kaddr = kmap_local_page(pages[i]);
+ kaddr = kmap_local_folio(folios[i], 0);
if (copy_from_iter(kaddr, bytes, from) != bytes) {
kunmap_local(kaddr);
ret = -EFAULT;
- goto out_pages;
+ goto out_folios;
}
if (bytes < PAGE_SIZE)
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
@@ -10302,12 +10278,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
if (ret)
- goto out_pages;
+ goto out_folios;
ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
if (ret)
- goto out_pages;
+ goto out_folios;
lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
@@ -10335,10 +10311,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
goto out_qgroup_free_data;
/* Try an inline extent first. */
- if (start == 0 && encoded->unencoded_len == encoded->len &&
- encoded->unencoded_offset == 0) {
- ret = cow_file_range_inline(inode, encoded->len, orig_count,
- compression, pages, true);
+ if (encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0 &&
+ can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
+ ret = __cow_file_range_inline(inode, start, encoded->len,
+ orig_count, compression, folios[0],
+ true);
if (ret <= 0) {
if (ret == 0)
ret = orig_count;
@@ -10382,7 +10360,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
btrfs_delalloc_release_extents(inode, num_bytes);
- btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
+ btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
ret = orig_count;
goto out;
@@ -10394,7 +10372,7 @@ out_delalloc_release:
btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
out_qgroup_free_data:
if (ret < 0)
- btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
+ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
out_free_data_space:
/*
* If btrfs_reserve_extent() succeeded, then we already decremented
@@ -10404,12 +10382,12 @@ out_free_data_space:
btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
unlock_extent(io_tree, start, end, &cached_state);
-out_pages:
- for (i = 0; i < nr_pages; i++) {
- if (pages[i])
- __free_page(pages[i]);
+out_folios:
+ for (i = 0; i < nr_folios; i++) {
+ if (folios[i])
+ __folio_put(folios[i]);
}
- kvfree(pages);
+ kvfree(folios);
out:
if (ret >= 0)
iocb->ki_pos += encoded->len;
@@ -10557,6 +10535,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
+ struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
@@ -10635,7 +10614,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
- root->root_key.objectid);
+ btrfs_root_id(root));
return -EPERM;
}
atomic_inc(&root->nr_swapfiles);
@@ -10650,7 +10629,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct btrfs_block_group *bg;
u64 len = isize - start;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -10673,7 +10652,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ if (extent_map_is_compressed(em)) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
@@ -10696,13 +10675,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
- em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
goto out;
}
- if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
btrfs_warn(fs_info,
"swapfile must have single data profile");
ret = -EINVAL;
@@ -10710,23 +10689,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
}
if (device == NULL) {
- device = em->map_lookup->stripes[0].dev;
+ device = map->stripes[0].dev;
ret = btrfs_add_swapfile_pin(inode, device, false);
if (ret == 1)
ret = 0;
else if (ret)
goto out;
- } else if (device != em->map_lookup->stripes[0].dev) {
+ } else if (device != map->stripes[0].dev) {
btrfs_warn(fs_info, "swapfile must be on one device");
ret = -EINVAL;
goto out;
}
- physical_block_start = (em->map_lookup->stripes[0].physical +
- (logical_block_start - em->start));
- len = min(len, em->len - (logical_block_start - em->start));
- free_extent_map(em);
- em = NULL;
+ physical_block_start = (map->stripes[0].physical +
+ (logical_block_start - map->start));
+ len = min(len, map->chunk_len - (logical_block_start - map->start));
+ btrfs_free_chunk_map(map);
+ map = NULL;
bg = btrfs_lookup_block_group(fs_info, logical_block_start);
if (!bg) {
@@ -10779,6 +10758,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
+ if (!IS_ERR_OR_NULL(map))
+ btrfs_free_chunk_map(map);
unlock_extent(io_tree, 0, isize - 1, &cached_state);
@@ -10859,7 +10840,7 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
if (ordered) {
btrfs_err(root->fs_info,
"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
- start, end, btrfs_ino(inode), root->root_key.objectid,
+ start, end, btrfs_ino(inode), btrfs_root_id(root),
ordered->file_offset,
ordered->file_offset + ordered->num_bytes - 1);
btrfs_put_ordered_extent(ordered);
@@ -10868,6 +10849,65 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
ASSERT(ordered == NULL);
}
+/*
+ * Find the first inode with a minimum number.
+ *
+ * @root: The root to search for.
+ * @min_ino: The minimum inode number.
+ *
+ * Find the first inode in the @root with a number >= @min_ino and return it.
+ * Returns NULL if no such inode found.
+ */
+struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *inode;
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ inode = rb_entry(node, struct btrfs_inode, rb_node);
+ if (min_ino < btrfs_ino(inode))
+ node = node->rb_left;
+ else if (min_ino > btrfs_ino(inode))
+ node = node->rb_right;
+ else
+ break;
+ }
+
+ if (!node) {
+ while (prev) {
+ inode = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (min_ino <= btrfs_ino(inode)) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+
+ while (node) {
+ inode = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (igrab(&inode->vfs_inode)) {
+ spin_unlock(&root->inode_lock);
+ return inode;
+ }
+
+ min_ino = btrfs_ino(inode) + 1;
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+
+ return NULL;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -10923,7 +10963,7 @@ static const struct address_space_operations btrfs_aops = {
.release_folio = btrfs_release_folio,
.migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
};