aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/accessors.c101
-rw-r--r--fs/btrfs/accessors.h54
-rw-r--r--fs/btrfs/acl.c1
-rw-r--r--fs/btrfs/acl.h11
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/async-thread.h3
-rw-r--r--fs/btrfs/backref.c179
-rw-r--r--fs/btrfs/backref.h136
-rw-r--r--fs/btrfs/bio.c34
-rw-r--r--fs/btrfs/bio.h6
-rw-r--r--fs/btrfs/block-group.c265
-rw-r--r--fs/btrfs/block-group.h27
-rw-r--r--fs/btrfs/block-rsv.c14
-rw-r--r--fs/btrfs/block-rsv.h39
-rw-r--r--fs/btrfs/btrfs_inode.h43
-rw-r--r--fs/btrfs/compression.c271
-rw-r--r--fs/btrfs/compression.h55
-rw-r--r--fs/btrfs/ctree.c124
-rw-r--r--fs/btrfs/ctree.h45
-rw-r--r--fs/btrfs/defrag.c119
-rw-r--r--fs/btrfs/defrag.h10
-rw-r--r--fs/btrfs/delalloc-space.c33
-rw-r--r--fs/btrfs/delalloc-space.h4
-rw-r--r--fs/btrfs/delayed-inode.c135
-rw-r--r--fs/btrfs/delayed-inode.h21
-rw-r--r--fs/btrfs/delayed-ref.c364
-rw-r--r--fs/btrfs/delayed-ref.h220
-rw-r--r--fs/btrfs/dev-replace.c71
-rw-r--r--fs/btrfs/dev-replace.h4
-rw-r--r--fs/btrfs/dir-item.h6
-rw-r--r--fs/btrfs/disk-io.c388
-rw-r--r--fs/btrfs/disk-io.h25
-rw-r--r--fs/btrfs/export.c20
-rw-r--r--fs/btrfs/export.h4
-rw-r--r--fs/btrfs/extent-io-tree.c171
-rw-r--r--fs/btrfs/extent-io-tree.h25
-rw-r--r--fs/btrfs/extent-tree.c637
-rw-r--r--fs/btrfs/extent-tree.h13
-rw-r--r--fs/btrfs/extent_io.c1775
-rw-r--r--fs/btrfs/extent_io.h131
-rw-r--r--fs/btrfs/extent_map.c528
-rw-r--r--fs/btrfs/extent_map.h150
-rw-r--r--fs/btrfs/file-item.c109
-rw-r--r--fs/btrfs/file-item.h16
-rw-r--r--fs/btrfs/file.c403
-rw-r--r--fs/btrfs/file.h15
-rw-r--r--fs/btrfs/free-space-cache.c24
-rw-r--r--fs/btrfs/free-space-cache.h15
-rw-r--r--fs/btrfs/free-space-tree.c56
-rw-r--r--fs/btrfs/free-space-tree.h6
-rw-r--r--fs/btrfs/fs.h80
-rw-r--r--fs/btrfs/inode-item.c17
-rw-r--r--fs/btrfs/inode-item.h5
-rw-r--r--fs/btrfs/inode.c1404
-rw-r--r--fs/btrfs/ioctl.c330
-rw-r--r--fs/btrfs/ioctl.h9
-rw-r--r--fs/btrfs/locking.c29
-rw-r--r--fs/btrfs/locking.h24
-rw-r--r--fs/btrfs/lru_cache.c2
-rw-r--r--fs/btrfs/lru_cache.h7
-rw-r--r--fs/btrfs/lzo.c125
-rw-r--r--fs/btrfs/messages.c6
-rw-r--r--fs/btrfs/messages.h2
-rw-r--r--fs/btrfs/misc.h2
-rw-r--r--fs/btrfs/ordered-data.c31
-rw-r--r--fs/btrfs/ordered-data.h23
-rw-r--r--fs/btrfs/orphan.c1
-rw-r--r--fs/btrfs/orphan.h5
-rw-r--r--fs/btrfs/print-tree.h3
-rw-r--r--fs/btrfs/props.c5
-rw-r--r--fs/btrfs/props.h7
-rw-r--r--fs/btrfs/qgroup.c316
-rw-r--r--fs/btrfs/qgroup.h27
-rw-r--r--fs/btrfs/raid-stripe-tree.c3
-rw-r--r--fs/btrfs/raid-stripe-tree.h5
-rw-r--r--fs/btrfs/raid56.c41
-rw-r--r--fs/btrfs/raid56.h11
-rw-r--r--fs/btrfs/rcu-string.h6
-rw-r--r--fs/btrfs/ref-verify.c16
-rw-r--r--fs/btrfs/ref-verify.h9
-rw-r--r--fs/btrfs/reflink.c74
-rw-r--r--fs/btrfs/reflink.h4
-rw-r--r--fs/btrfs/relocation.c421
-rw-r--r--fs/btrfs/relocation.h9
-rw-r--r--fs/btrfs/root-tree.c30
-rw-r--r--fs/btrfs/root-tree.h12
-rw-r--r--fs/btrfs/scrub.c148
-rw-r--r--fs/btrfs/scrub.h6
-rw-r--r--fs/btrfs/send.c163
-rw-r--r--fs/btrfs/send.h8
-rw-r--r--fs/btrfs/space-info.c27
-rw-r--r--fs/btrfs/space-info.h9
-rw-r--r--fs/btrfs/subpage.c440
-rw-r--r--fs/btrfs/subpage.h99
-rw-r--r--fs/btrfs/super.c2346
-rw-r--r--fs/btrfs/super.h12
-rw-r--r--fs/btrfs/sysfs.c65
-rw-r--r--fs/btrfs/sysfs.h9
-rw-r--r--fs/btrfs/tests/btrfs-tests.c8
-rw-r--r--fs/btrfs/tests/btrfs-tests.h1
-rw-r--r--fs/btrfs/tests/extent-io-tests.c32
-rw-r--r--fs/btrfs/tests/extent-map-tests.c318
-rw-r--r--fs/btrfs/tests/inode-tests.c100
-rw-r--r--fs/btrfs/transaction.c158
-rw-r--r--fs/btrfs/transaction.h21
-rw-r--r--fs/btrfs/tree-checker.c81
-rw-r--r--fs/btrfs/tree-checker.h5
-rw-r--r--fs/btrfs/tree-log.c204
-rw-r--r--fs/btrfs/tree-log.h49
-rw-r--r--fs/btrfs/tree-mod-log.c15
-rw-r--r--fs/btrfs/tree-mod-log.h8
-rw-r--r--fs/btrfs/ulist.c1
-rw-r--r--fs/btrfs/ulist.h1
-rw-r--r--fs/btrfs/uuid-tree.c3
-rw-r--r--fs/btrfs/uuid-tree.h5
-rw-r--r--fs/btrfs/verity.c1
-rw-r--r--fs/btrfs/verity.h7
-rw-r--r--fs/btrfs/volumes.c1200
-rw-r--r--fs/btrfs/volumes.h112
-rw-r--r--fs/btrfs/xattr.c65
-rw-r--r--fs/btrfs/xattr.h6
-rw-r--r--fs/btrfs/zlib.c185
-rw-r--r--fs/btrfs/zoned.c157
-rw-r--r--fs/btrfs/zoned.h29
-rw-r--r--fs/btrfs/zstd.c158
125 files changed, 9399 insertions, 6871 deletions
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 206cf1612c1d..79026917db19 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -5,7 +5,8 @@
#include <asm/unaligned.h>
#include "messages.h"
-#include "ctree.h"
+#include "extent_io.h"
+#include "fs.h"
#include "accessors.h"
static bool check_setget_bounds(const struct extent_buffer *eb,
@@ -27,7 +28,7 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
{
token->eb = eb;
- token->kaddr = page_address(eb->pages[0]);
+ token->kaddr = folio_address(eb->folios[0]);
token->offset = 0;
}
@@ -50,7 +51,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
* an offset into the extent buffer page array, cast to a specific type. This
* gives us all the type checking.
*
- * The extent buffer pages stored in the array pages do not form a contiguous
+ * The extent buffer pages stored in the array folios may not form a contiguous
* physical range, but the API functions assume the linear offset to the range
* from 0 to metadata node size.
*/
@@ -60,28 +61,30 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_page_index(member_offset); \
- const unsigned long oip = get_eb_offset_in_page(token->eb, \
- member_offset); \
+ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+ const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ member_offset);\
+ const int unit_size = token->eb->folio_size; \
+ const int unit_shift = token->eb->folio_shift; \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
\
ASSERT(token); \
ASSERT(token->kaddr); \
ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
if (token->offset <= member_offset && \
- member_offset + size <= token->offset + PAGE_SIZE) { \
- return get_unaligned_le##bits(token->kaddr + oip); \
+ member_offset + size <= token->offset + unit_size) { \
+ return get_unaligned_le##bits(token->kaddr + oil); \
} \
- token->kaddr = page_address(token->eb->pages[idx]); \
- token->offset = idx << PAGE_SHIFT; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \
- return get_unaligned_le##bits(token->kaddr + oip); \
+ token->kaddr = folio_address(token->eb->folios[idx]); \
+ token->offset = idx << unit_shift; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+ return get_unaligned_le##bits(token->kaddr + oil); \
\
- memcpy(lebytes, token->kaddr + oip, part); \
- token->kaddr = page_address(token->eb->pages[idx + 1]); \
- token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(lebytes, token->kaddr + oil, part); \
+ token->kaddr = folio_address(token->eb->folios[idx + 1]); \
+ token->offset = (idx + 1) << unit_shift; \
memcpy(lebytes + part, token->kaddr, size - part); \
return get_unaligned_le##bits(lebytes); \
} \
@@ -89,19 +92,21 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
- const unsigned long idx = get_eb_page_index(member_offset); \
- char *kaddr = page_address(eb->pages[idx]); \
+ const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+ const unsigned long oil = get_eb_offset_in_folio(eb, \
+ member_offset);\
+ const int unit_size = eb->folio_size; \
+ char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
- return get_unaligned_le##bits(kaddr + oip); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+ return get_unaligned_le##bits(kaddr + oil); \
\
- memcpy(lebytes, kaddr + oip, part); \
- kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(lebytes, kaddr + oil, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
memcpy(lebytes + part, kaddr, size - part); \
return get_unaligned_le##bits(lebytes); \
} \
@@ -110,53 +115,59 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \
u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_page_index(member_offset); \
- const unsigned long oip = get_eb_offset_in_page(token->eb, \
- member_offset); \
+ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+ const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ member_offset);\
+ const int unit_size = token->eb->folio_size; \
+ const int unit_shift = token->eb->folio_shift; \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
\
ASSERT(token); \
ASSERT(token->kaddr); \
ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
if (token->offset <= member_offset && \
- member_offset + size <= token->offset + PAGE_SIZE) { \
- put_unaligned_le##bits(val, token->kaddr + oip); \
+ member_offset + size <= token->offset + unit_size) { \
+ put_unaligned_le##bits(val, token->kaddr + oil); \
return; \
} \
- token->kaddr = page_address(token->eb->pages[idx]); \
- token->offset = idx << PAGE_SHIFT; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
- put_unaligned_le##bits(val, token->kaddr + oip); \
+ token->kaddr = folio_address(token->eb->folios[idx]); \
+ token->offset = idx << unit_shift; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
+ oil + size <= unit_size) { \
+ put_unaligned_le##bits(val, token->kaddr + oil); \
return; \
} \
put_unaligned_le##bits(val, lebytes); \
- memcpy(token->kaddr + oip, lebytes, part); \
- token->kaddr = page_address(token->eb->pages[idx + 1]); \
- token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(token->kaddr + oil, lebytes, part); \
+ token->kaddr = folio_address(token->eb->folios[idx + 1]); \
+ token->offset = (idx + 1) << unit_shift; \
memcpy(token->kaddr, lebytes + part, size - part); \
} \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
- const unsigned long idx = get_eb_page_index(member_offset); \
- char *kaddr = page_address(eb->pages[idx]); \
+ const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+ const unsigned long oil = get_eb_offset_in_folio(eb, \
+ member_offset);\
+ const int unit_size = eb->folio_size; \
+ char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
- put_unaligned_le##bits(val, kaddr + oip); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
+ oil + size <= unit_size) { \
+ put_unaligned_le##bits(val, kaddr + oil); \
return; \
} \
\
put_unaligned_le##bits(val, lebytes); \
- memcpy(kaddr + oip, lebytes, part); \
- kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(kaddr + oil, lebytes, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
memcpy(kaddr, lebytes + part, size - part); \
}
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index aa0844535644..6fce3e8d3dac 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -3,8 +3,17 @@
#ifndef BTRFS_ACCESSORS_H
#define BTRFS_ACCESSORS_H
-#include <linux/stddef.h>
#include <asm/unaligned.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/align.h>
+#include <linux/build_bug.h>
+#include <linux/compiler.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
struct btrfs_map_token {
struct extent_buffer *eb;
@@ -90,14 +99,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
- const type *p = page_address(eb->pages[0]) + \
+ const type *p = folio_address(eb->folios[0]) + \
offset_in_page(eb->start); \
return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+ type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \
put_unaligned_le##bits(val, &p->member); \
}
@@ -844,45 +853,6 @@ static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
}
-static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
- const struct btrfs_disk_balance_args *disk)
-{
- memset(cpu, 0, sizeof(*cpu));
-
- cpu->profiles = le64_to_cpu(disk->profiles);
- cpu->usage = le64_to_cpu(disk->usage);
- cpu->devid = le64_to_cpu(disk->devid);
- cpu->pstart = le64_to_cpu(disk->pstart);
- cpu->pend = le64_to_cpu(disk->pend);
- cpu->vstart = le64_to_cpu(disk->vstart);
- cpu->vend = le64_to_cpu(disk->vend);
- cpu->target = le64_to_cpu(disk->target);
- cpu->flags = le64_to_cpu(disk->flags);
- cpu->limit = le64_to_cpu(disk->limit);
- cpu->stripes_min = le32_to_cpu(disk->stripes_min);
- cpu->stripes_max = le32_to_cpu(disk->stripes_max);
-}
-
-static inline void btrfs_cpu_balance_args_to_disk(
- struct btrfs_disk_balance_args *disk,
- const struct btrfs_balance_args *cpu)
-{
- memset(disk, 0, sizeof(*disk));
-
- disk->profiles = cpu_to_le64(cpu->profiles);
- disk->usage = cpu_to_le64(cpu->usage);
- disk->devid = cpu_to_le64(cpu->devid);
- disk->pstart = cpu_to_le64(cpu->pstart);
- disk->pend = cpu_to_le64(cpu->pend);
- disk->vstart = cpu_to_le64(cpu->vstart);
- disk->vend = cpu_to_le64(cpu->vend);
- disk->target = cpu_to_le64(cpu->target);
- disk->flags = cpu_to_le64(cpu->flags);
- disk->limit = cpu_to_le64(cpu->limit);
- disk->stripes_min = cpu_to_le32(cpu->stripes_min);
- disk->stripes_max = cpu_to_le32(cpu->stripes_max);
-}
-
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 7427449a04a3..e0ba00d64ea0 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -12,7 +12,6 @@
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "ctree.h"
-#include "btrfs_inode.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
index a270e71ec05f..48b9ddae4a46 100644
--- a/fs/btrfs/acl.h
+++ b/fs/btrfs/acl.h
@@ -3,8 +3,15 @@
#ifndef BTRFS_ACL_H
#define BTRFS_ACL_H
+struct posix_acl;
+struct inode;
+struct btrfs_trans_handle;
+
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+struct mnt_idmap;
+struct dentry;
+
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
struct posix_acl *acl, int type);
@@ -13,6 +20,10 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
#else
+#include <linux/errno.h>
+
+struct btrfs_trans_handle;
+
#define btrfs_get_acl NULL
#define btrfs_set_acl NULL
static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 9e261aac671e..361a866c1995 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -11,7 +11,6 @@
#include <linux/freezer.h>
#include <trace/events/btrfs.h>
#include "async-thread.h"
-#include "ctree.h"
enum {
WORK_DONE_BIT,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 62b8a0d57898..04c2f3175828 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -7,11 +7,14 @@
#ifndef BTRFS_ASYNC_THREAD_H
#define BTRFS_ASYNC_THREAD_H
+#include <linux/compiler_types.h>
#include <linux/workqueue.h>
+#include <linux/list.h>
struct btrfs_fs_info;
struct btrfs_workqueue;
struct btrfs_work;
+
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index beed7e459dab..a2de5c05f97c 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -198,10 +198,7 @@ static struct kmem_cache *btrfs_prelim_ref_cache;
int __init btrfs_prelim_ref_init(void)
{
btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
- sizeof(struct prelim_ref),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
+ sizeof(struct prelim_ref), 0, 0, NULL);
if (!btrfs_prelim_ref_cache)
return -ENOMEM;
return 0;
@@ -264,7 +261,7 @@ static void update_share_count(struct share_check *sc, int oldcount,
else if (oldcount < 1 && newcount > 0)
sc->share_count++;
- if (newref->root_id == sc->root->root_key.objectid &&
+ if (newref->root_id == btrfs_root_id(sc->root) &&
newref->wanted_disk_byte == sc->data_bytenr &&
newref->key_for_search.objectid == sc->inum)
sc->self_ref_count += newref->count;
@@ -772,7 +769,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
continue;
}
- if (sc && ref->root_id != sc->root->root_key.objectid) {
+ if (sc && ref->root_id != btrfs_root_id(sc->root)) {
free_pref(ref);
ret = BACKREF_FOUND_SHARED;
goto out;
@@ -922,40 +919,38 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
- struct btrfs_delayed_tree_ref *ref;
struct btrfs_key *key_ptr = NULL;
+ /* The owner of a tree block ref is the level. */
+ int level = btrfs_delayed_ref_owner(node);
if (head->extent_op && head->extent_op->update_key) {
btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
key_ptr = &key;
}
- ref = btrfs_delayed_node_to_tree_ref(node);
- ret = add_indirect_ref(fs_info, preftrees, ref->root,
- key_ptr, ref->level + 1,
- node->bytenr, count, sc,
- GFP_ATOMIC);
+ ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
+ key_ptr, level + 1, node->bytenr,
+ count, sc, GFP_ATOMIC);
break;
}
case BTRFS_SHARED_BLOCK_REF_KEY: {
- /* SHARED DIRECT METADATA backref */
- struct btrfs_delayed_tree_ref *ref;
-
- ref = btrfs_delayed_node_to_tree_ref(node);
+ /*
+ * SHARED DIRECT METADATA backref
+ *
+ * The owner of a tree block ref is the level.
+ */
+ int level = btrfs_delayed_ref_owner(node);
- ret = add_direct_ref(fs_info, preftrees, ref->level + 1,
- ref->parent, node->bytenr, count,
+ ret = add_direct_ref(fs_info, preftrees, level + 1,
+ node->parent, node->bytenr, count,
sc, GFP_ATOMIC);
break;
}
case BTRFS_EXTENT_DATA_REF_KEY: {
/* NORMAL INDIRECT DATA backref */
- struct btrfs_delayed_data_ref *ref;
- ref = btrfs_delayed_node_to_data_ref(node);
-
- key.objectid = ref->objectid;
+ key.objectid = btrfs_delayed_ref_owner(node);
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = ref->offset;
+ key.offset = btrfs_delayed_ref_offset(node);
/*
* If we have a share check context and a reference for
@@ -975,18 +970,14 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
if (sc && count < 0)
sc->have_delayed_delete_refs = true;
- ret = add_indirect_ref(fs_info, preftrees, ref->root,
+ ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
&key, 0, node->bytenr, count, sc,
GFP_ATOMIC);
break;
}
case BTRFS_SHARED_DATA_REF_KEY: {
/* SHARED DIRECT FULL backref */
- struct btrfs_delayed_data_ref *ref;
-
- ref = btrfs_delayed_node_to_data_ref(node);
-
- ret = add_direct_ref(fs_info, preftrees, 0, ref->parent,
+ ret = add_direct_ref(fs_info, preftrees, 0, node->parent,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -1036,8 +1027,6 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
slot = path->slots[0];
item_size = btrfs_item_size(leaf, slot);
- BUG_ON(item_size < sizeof(*ei));
-
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
if (ctx->check_extent_item) {
@@ -1435,8 +1424,10 @@ again:
if (ret < 0)
goto out;
if (ret == 0) {
- /* This shouldn't happen, indicates a bug or fs corruption. */
- ASSERT(ret != 0);
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
ret = -EUCLEAN;
goto out;
}
@@ -2225,6 +2216,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
+ return -EUCLEAN;
+ }
ret = btrfs_previous_extent_item(extent_root, path, 0);
if (ret) {
@@ -2247,7 +2245,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
eb = path->nodes[0];
item_size = btrfs_item_size(eb, path->slots[0]);
- BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
@@ -2626,7 +2623,7 @@ static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
btrfs_debug(fs_root->fs_info,
"following ref at offset %u for inode %llu in tree %llu",
cur, found_key.objectid,
- fs_root->root_key.objectid);
+ btrfs_root_id(fs_root));
ret = inode_to_path(parent, name_len,
(unsigned long)(iref + 1), eb, ipath);
if (ret)
@@ -2773,20 +2770,14 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
size_t alloc_bytes;
alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
- data = kvmalloc(alloc_bytes, GFP_KERNEL);
+ data = kvzalloc(alloc_bytes, GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
- if (total_bytes >= sizeof(*data)) {
+ if (total_bytes >= sizeof(*data))
data->bytes_left = total_bytes - sizeof(*data);
- data->bytes_missing = 0;
- } else {
+ else
data->bytes_missing = sizeof(*data) - total_bytes;
- data->bytes_left = 0;
- }
-
- data->elem_cnt = 0;
- data->elem_missed = 0;
return data;
}
@@ -2850,6 +2841,16 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf
return ret;
}
+static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
+{
+ iter->bytenr = 0;
+ iter->item_ptr = 0;
+ iter->cur_ptr = 0;
+ iter->end_ptr = 0;
+ btrfs_release_path(iter->path);
+ memset(&iter->cur_key, 0, sizeof(iter->cur_key));
+}
+
int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
{
struct btrfs_fs_info *fs_info = iter->fs_info;
@@ -2868,6 +2869,10 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
if (ret < 0)
return ret;
if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
ret = -EUCLEAN;
goto release;
}
@@ -2938,6 +2943,14 @@ release:
return ret;
}
+static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter)
+{
+ if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
+ iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
+ return true;
+ return false;
+}
+
/*
* Go to the next backref item of current bytenr, can be either inlined or
* keyed.
@@ -2950,7 +2963,7 @@ release:
*/
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
{
- struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+ struct extent_buffer *eb = iter->path->nodes[0];
struct btrfs_root *extent_root;
struct btrfs_path *path = iter->path;
struct btrfs_extent_inline_ref *iref;
@@ -3038,6 +3051,19 @@ struct btrfs_backref_node *btrfs_backref_alloc_node(
return node;
}
+void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ if (node) {
+ ASSERT(list_empty(&node->list));
+ ASSERT(list_empty(&node->lower));
+ ASSERT(node->eb == NULL);
+ cache->nr_nodes--;
+ btrfs_put_root(node->root);
+ kfree(node);
+ }
+}
+
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache)
{
@@ -3049,6 +3075,52 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge(
return edge;
}
+void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_edge *edge)
+{
+ if (edge) {
+ cache->nr_edges--;
+ kfree(edge);
+ }
+}
+
+void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node)
+{
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+}
+
+void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node)
+{
+ if (node->eb) {
+ btrfs_backref_unlock_node_buffer(node);
+ free_extent_buffer(node->eb);
+ node->eb = NULL;
+ }
+}
+
+/*
+ * Drop the backref node from cache without cleaning up its children
+ * edges.
+ *
+ * This can only be called on node without parent edges.
+ * The children edges are still kept as is.
+ */
+void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+ struct btrfs_backref_node *node)
+{
+ ASSERT(list_empty(&node->upper));
+
+ btrfs_backref_drop_node_buffer(node);
+ list_del_init(&node->list);
+ list_del_init(&node->lower);
+ if (!RB_EMPTY_NODE(&node->rb_node))
+ rb_erase(&node->rb_node, &tree->rb_root);
+ btrfs_backref_free_node(tree, node);
+}
+
/*
* Drop the backref node from cache, also cleaning up all its
* upper edges and any uncached nodes in the path.
@@ -3120,6 +3192,19 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
ASSERT(!cache->nr_edges);
}
+void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper,
+ int link_which)
+{
+ ASSERT(upper && lower && upper->level == lower->level + 1);
+ edge->node[LOWER] = lower;
+ edge->node[UPPER] = upper;
+ if (link_which & LINK_LOWER)
+ list_add_tail(&edge->list[LOWER], &lower->upper);
+ if (link_which & LINK_UPPER)
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+}
/*
* Handle direct tree backref
*
@@ -3270,7 +3355,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
btrfs_err(fs_info,
"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
- cur->bytenr, level - 1, root->root_key.objectid,
+ cur->bytenr, level - 1, btrfs_root_id(root),
tree_key->objectid, tree_key->type, tree_key->offset);
btrfs_put_root(root);
ret = -ENOENT;
@@ -3428,7 +3513,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
int type;
cond_resched();
- eb = btrfs_backref_get_eb(iter);
+ eb = iter->path->nodes[0];
key.objectid = iter->bytenr;
if (btrfs_backref_iter_is_inline_ref(iter)) {
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ab4ca0eda605..e8c22cccb5c1 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -6,11 +6,23 @@
#ifndef BTRFS_BACKREF_H
#define BTRFS_BACKREF_H
-#include <linux/btrfs.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <uapi/linux/btrfs.h>
+#include <uapi/linux/btrfs_tree.h>
#include "messages.h"
-#include "ulist.h"
+#include "locking.h"
#include "disk-io.h"
#include "extent_io.h"
+#include "ctree.h"
+
+struct extent_inode_elem;
+struct ulist;
+struct btrfs_extent_item;
+struct btrfs_trans_handle;
+struct btrfs_fs_info;
/*
* Used by implementations of iterate_extent_inodes_t (see definition below) to
@@ -271,22 +283,6 @@ struct btrfs_backref_iter {
struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info);
-static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
-{
- if (!iter)
- return;
- btrfs_free_path(iter->path);
- kfree(iter);
-}
-
-static inline struct extent_buffer *btrfs_backref_get_eb(
- struct btrfs_backref_iter *iter)
-{
- if (!iter)
- return NULL;
- return iter->path->nodes[0];
-}
-
/*
* For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
* is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
@@ -306,25 +302,6 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
-static inline bool btrfs_backref_iter_is_inline_ref(
- struct btrfs_backref_iter *iter)
-{
- if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
- iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
- return true;
- return false;
-}
-
-static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
-{
- iter->bytenr = 0;
- iter->item_ptr = 0;
- iter->cur_ptr = 0;
- iter->end_ptr = 0;
- btrfs_release_path(iter->path);
- memset(&iter->cur_key, 0, sizeof(iter->cur_key));
-}
-
/*
* Backref cache related structures
*
@@ -452,83 +429,22 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge(
#define LINK_LOWER (1 << 0)
#define LINK_UPPER (1 << 1)
-static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which)
-{
- ASSERT(upper && lower && upper->level == lower->level + 1);
- edge->node[LOWER] = lower;
- edge->node[UPPER] = upper;
- if (link_which & LINK_LOWER)
- list_add_tail(&edge->list[LOWER], &lower->upper);
- if (link_which & LINK_UPPER)
- list_add_tail(&edge->list[UPPER], &upper->lower);
-}
-
-static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
- struct btrfs_backref_node *node)
-{
- if (node) {
- ASSERT(list_empty(&node->list));
- ASSERT(list_empty(&node->lower));
- ASSERT(node->eb == NULL);
- cache->nr_nodes--;
- btrfs_put_root(node->root);
- kfree(node);
- }
-}
-
-static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
- struct btrfs_backref_edge *edge)
-{
- if (edge) {
- cache->nr_edges--;
- kfree(edge);
- }
-}
-
-static inline void btrfs_backref_unlock_node_buffer(
- struct btrfs_backref_node *node)
-{
- if (node->locked) {
- btrfs_tree_unlock(node->eb);
- node->locked = 0;
- }
-}
-static inline void btrfs_backref_drop_node_buffer(
- struct btrfs_backref_node *node)
-{
- if (node->eb) {
- btrfs_backref_unlock_node_buffer(node);
- free_extent_buffer(node->eb);
- node->eb = NULL;
- }
-}
-
-/*
- * Drop the backref node from cache without cleaning up its children
- * edges.
- *
- * This can only be called on node without parent edges.
- * The children edges are still kept as is.
- */
-static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
- struct btrfs_backref_node *node)
-{
- ASSERT(list_empty(&node->upper));
-
- btrfs_backref_drop_node_buffer(node);
- list_del_init(&node->list);
- list_del_init(&node->lower);
- if (!RB_EMPTY_NODE(&node->rb_node))
- rb_erase(&node->rb_node, &tree->rb_root);
- btrfs_backref_free_node(tree, node);
-}
+void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper,
+ int link_which);
+void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node);
+void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_edge *edge);
+void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node);
+void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node);
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
+void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+ struct btrfs_backref_node *node);
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 4f3b693a16b1..477f350a8bd0 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -11,7 +11,6 @@
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
-#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"
@@ -194,6 +193,12 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
int mirror = repair_bbio->mirror_num;
+ /*
+ * We can only trigger this for data bio, which doesn't support larger
+ * folios yet.
+ */
+ ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
+
if (repair_bbio->bio.bi_status ||
!btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
@@ -215,7 +220,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
repair_bbio->file_offset, fs_info->sectorsize,
repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
- bv->bv_page, bv->bv_offset, mirror);
+ page_folio(bv->bv_page), bv->bv_offset, mirror);
} while (mirror != fbio->bbio->mirror_num);
done:
@@ -503,8 +508,6 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
if (!bioc) {
/* Single mirror read/write fast path. */
btrfs_bio(bio)->mirror_num = mirror_num;
- if (bio_op(bio) != REQ_OP_READ)
- btrfs_bio(bio)->orig_physical = smap->physical;
bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
if (bio_op(bio) != REQ_OP_READ)
btrfs_bio(bio)->orig_physical = smap->physical;
@@ -605,8 +608,20 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
static bool should_async_write(struct btrfs_bio *bbio)
{
+ bool auto_csum_mode = true;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
+ enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
+
+ if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
+ return false;
+
+ auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
+#endif
+
/* Submit synchronously if the checksum implementation is fast. */
- if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
+ if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
return false;
/*
@@ -626,7 +641,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
/*
* Submit bio to an async queue.
*
- * Return true if the work has been succesfuly submitted, else false.
+ * Return true if the work has been successfully submitted, else false.
*/
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
struct btrfs_io_context *bioc,
@@ -767,8 +782,8 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
* freeing the bio.
*/
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+ u64 length, u64 logical, struct folio *folio,
+ unsigned int folio_offset, int mirror_num)
{
struct btrfs_io_stripe smap = { 0 };
struct bio_vec bvec;
@@ -799,7 +814,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- __bio_add_page(&bio, page, length, pg_offset);
+ ret = bio_add_folio(&bio, folio, length, folio_offset);
+ ASSERT(ret);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index ca79decee060..d9dd5276093d 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -7,12 +7,14 @@
#ifndef BTRFS_BIO_H
#define BTRFS_BIO_H
+#include <linux/types.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include "tree-checker.h"
struct btrfs_bio;
struct btrfs_fs_info;
+struct btrfs_inode;
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -105,7 +107,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
+ u64 length, u64 logical, struct folio *folio,
+ unsigned int folio_offset, int mirror_num);
#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6e5dc68ff661..1e09aeea69c2 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -168,7 +168,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
cache);
kfree(cache->free_space_ctl);
- kfree(cache->physical_map);
+ btrfs_free_chunk_map(cache->physical_map);
kfree(cache);
}
}
@@ -418,7 +418,7 @@ struct btrfs_caching_control *btrfs_get_caching_control(
return ctl;
}
-void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
+static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
if (refcount_dec_and_test(&ctl->count))
kfree(ctl);
@@ -1047,7 +1047,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em)
+ struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
@@ -1059,11 +1059,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int index;
int factor;
struct btrfs_caching_control *caching_ctl = NULL;
- bool remove_em;
+ bool remove_map;
bool remove_rsv = false;
- block_group = btrfs_lookup_block_group(fs_info, group_start);
- BUG_ON(!block_group);
+ block_group = btrfs_lookup_block_group(fs_info, map->start);
+ if (!block_group)
+ return -ENOENT;
+
BUG_ON(!block_group->ro);
trace_btrfs_remove_block_group(block_group);
@@ -1252,7 +1254,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* entries because we already removed them all when we called
* btrfs_remove_free_space_cache().
*
- * And we must not remove the extent map from the fs_info->mapping_tree
+ * And we must not remove the chunk map from the fs_info->mapping_tree
* to prevent the same logical address range and physical device space
* ranges from being reused for a new block group. This is needed to
* avoid races with trimming and scrub.
@@ -1268,19 +1270,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* in place until the extents have been discarded completely when
* the transaction commit has completed.
*/
- remove_em = (atomic_read(&block_group->frozen) == 0);
+ remove_map = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
- if (remove_em) {
- struct extent_map_tree *em_tree;
-
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- /* once for the tree */
- free_extent_map(em);
- }
+ if (remove_map)
+ btrfs_remove_chunk_map(fs_info, map);
out:
/* Once for the lookup reference */
@@ -1295,15 +1289,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
struct btrfs_root *root = btrfs_block_group_root(fs_info);
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned int num_items;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
- ASSERT(em && em->start == chunk_offset);
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(map != NULL);
+ ASSERT(map->start == chunk_offset);
/*
* We need to reserve 3 + N units from the metadata space info in order
@@ -1324,9 +1315,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
- map = em->map_lookup;
num_items = 3 + map->num_stripes;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
@@ -1441,7 +1431,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
* group in pinned_extents before we were able to clear the whole block
* group range from pinned_extents. This means that task can lookup for
* the block group after we unpinned it from pinned_extents and removed
- * it, leading to a BUG_ON() at unpin_extent_range().
+ * it, leading to an error at unpin_extent_range().
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans) {
@@ -1467,6 +1457,7 @@ out:
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
+ LIST_HEAD(retry_list);
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
@@ -1488,6 +1479,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
+ u64 used;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1523,22 +1515,69 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->reserved || block_group->pinned ||
- block_group->used || block_group->ro ||
+ if (btrfs_is_block_group_used(block_group) || block_group->ro ||
list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
* the ro check in case balance is currently acting on
* this block group.
+ *
+ * Also bail out if this is the only block group for its
+ * type, because otherwise we would lose profile
+ * information from fs_info->avail_*_alloc_bits and the
+ * next block group of this type would be created with a
+ * "single" profile (even if we're in a raid fs) because
+ * fs_info->avail_*_alloc_bits would be 0.
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
+
+ /*
+ * The block group may be unused but there may be space reserved
+ * that counts on the existence of that block group, that is,
+ * space_info->bytes_may_use was incremented by a task but no
+ * space was yet allocated from the block group by the task.
+ * That space may or may not be allocated, as we are generally
+ * pessimistic about space reservation for metadata as well as
+ * for data when using compression (as we reserve space based on
+ * the worst case, when data can't be compressed, and before
+ * actually attempting compression, before starting writeback).
+ *
+ * So check if the total space of the space_info minus the size
+ * of this block group is less than the used space of the
+ * space_info - if that's the case, then it means we have tasks
+ * that might be relying on the block group in order to allocate
+ * extents, and add back the block group to the unused list when
+ * we finish, so that we retry later in case no tasks ended up
+ * needing to allocate extents from the block group.
+ */
+ used = btrfs_space_info_used(space_info, true);
+ if (space_info->total_bytes - block_group->length < used &&
+ block_group->zone_unusable < block_group->length) {
+ /*
+ * Add a reference for the list, compensate for the ref
+ * drop under the "next" label for the
+ * fs_info->unused_bgs list.
+ */
+ btrfs_get_block_group(block_group);
+ list_add_tail(&block_group->bg_list, &retry_list);
+
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
/* We don't want to force the issue, only flip if it's ok. */
ret = inc_block_group_ro(block_group, 0);
@@ -1662,12 +1701,16 @@ next:
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
@@ -1927,8 +1970,7 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group_item bg;
struct extent_buffer *leaf;
int slot;
@@ -1938,23 +1980,20 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
slot = path->slots[0];
leaf = path->nodes[0];
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
- read_unlock(&em_tree->lock);
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
+ if (!map) {
btrfs_err(fs_info,
"logical %llu len %llu found bg but no related chunk",
key->objectid, key->offset);
return -ENOENT;
}
- if (em->start != key->objectid || em->len != key->offset) {
+ if (map->start != key->objectid || map->chunk_len != key->offset) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
- key->objectid, key->offset, em->start, em->len);
+ key->objectid, key->offset, map->start, map->chunk_len);
ret = -EUCLEAN;
- goto out_free_em;
+ goto out_free_map;
}
read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
@@ -1962,16 +2001,16 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
ret = -EUCLEAN;
}
-out_free_em:
- free_extent_map(em);
+out_free_map:
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2024,8 +2063,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 *buf;
u64 bytenr;
u64 data_stripe_length;
@@ -2033,14 +2071,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
int i, nr = 0;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(map))
return -EIO;
- map = em->map_lookup;
- data_stripe_length = em->orig_block_len;
+ data_stripe_length = map->stripe_size;
io_stripe_size = BTRFS_STRIPE_LEN;
- chunk_start = em->start;
+ chunk_start = map->start;
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -2094,7 +2131,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
*naddrs = nr;
*stripe_len = io_stripe_size;
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2199,49 +2236,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
*/
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct btrfs_block_group *bg;
u64 start = 0;
int ret = 0;
while (1) {
- read_lock(&map_tree->lock);
+ struct btrfs_chunk_map *map;
+ struct btrfs_block_group *bg;
+
/*
- * lookup_extent_mapping will return the first extent map
- * intersecting the range, so setting @len to 1 is enough to
+ * btrfs_find_chunk_map() will return the first chunk map
+ * intersecting the range, so setting @length to 1 is enough to
* get the first chunk.
*/
- em = lookup_extent_mapping(map_tree, start, 1);
- read_unlock(&map_tree->lock);
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, start, 1);
+ if (!map)
break;
- bg = btrfs_lookup_block_group(fs_info, em->start);
+ bg = btrfs_lookup_block_group(fs_info, map->start);
if (!bg) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
- em->start, em->len);
+ map->start, map->chunk_len);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
break;
}
- if (bg->start != em->start || bg->length != em->len ||
+ if (bg->start != map->start || bg->length != map->chunk_len ||
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
- em->start, em->len,
- em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ map->start, map->chunk_len,
+ map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
break;
}
- start = em->start + em->len;
- free_extent_map(em);
+ start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
}
return ret;
@@ -2369,28 +2404,25 @@ error:
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct rb_node *node;
int ret = 0;
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- struct extent_map *em;
- struct map_lookup *map;
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *bg;
- em = rb_entry(node, struct extent_map, rb_node);
- map = em->map_lookup;
- bg = btrfs_create_block_group_cache(fs_info, em->start);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ bg = btrfs_create_block_group_cache(fs_info, map->start);
if (!bg) {
ret = -ENOMEM;
break;
}
/* Fill dummy cache as FULL */
- bg->length = em->len;
+ bg->length = map->chunk_len;
bg->flags = map->type;
bg->cached = BTRFS_CACHE_FINISHED;
- bg->used = em->len;
+ bg->used = map->chunk_len;
bg->flags = map->type;
ret = btrfs_add_block_group_cache(fs_info, bg);
/*
@@ -2618,19 +2650,14 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_device *device;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_offset;
- u64 stripe_size;
int i;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- map = em->map_lookup;
- stripe_size = em->orig_block_len;
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
/*
* Take the device list mutex to prevent races with the final phase of
@@ -2647,13 +2674,13 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
dev_offset = map->stripes[i].physical;
ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
- stripe_size);
+ map->stripe_size);
if (ret)
break;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2712,6 +2739,37 @@ next:
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+
+ /*
+ * If the block group is still unused, add it to the list of
+ * unused block groups. The block group may have been created in
+ * order to satisfy a space reservation, in which case the
+ * extent allocation only happens later. But often we don't
+ * actually need to allocate space that we previously reserved,
+ * so the block group may become unused for a long time. For
+ * example for metadata we generally reserve space for a worst
+ * possible scenario, but then don't end up allocating all that
+ * space or none at all (due to no need to COW, extent buffers
+ * were already COWed in the current transaction and still
+ * unwritten, tree heights lower than the maximum possible
+ * height, etc). For data we generally reserve the exact amount
+ * of space we are going to allocate later, the exception is
+ * when using compression, as we must reserve space based on the
+ * uncompressed data size, because the compression is only done
+ * when writeback is triggered and we don't know how much space we
+ * are actually going to need, so we reserve the uncompressed
+ * size because the data may be uncompressible in the worst case.
+ */
+ if (ret == 0) {
+ bool used;
+
+ spin_lock(&block_group->lock);
+ used = btrfs_is_block_group_used(block_group);
+ spin_unlock(&block_group->lock);
+
+ if (!used)
+ btrfs_mark_bg_unused(block_group);
+ }
}
btrfs_trans_release_chunk_metadata(trans);
}
@@ -2910,7 +2968,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
goto unlock_out;
/*
- * Skip chunk alloction if the bg is SYSTEM, this is to avoid system
+ * Skip chunk allocation if the bg is SYSTEM, this is to avoid system
* chunk allocation storm to exhaust the system chunk array. Otherwise
* we still want to try our best to mark the block group read-only.
*/
@@ -4406,8 +4464,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache)
void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
bool cleanup;
spin_lock(&block_group->lock);
@@ -4416,17 +4472,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_unlock(&block_group->lock);
if (cleanup) {
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, block_group->start,
- 1);
- BUG_ON(!em); /* logic error, can't happen */
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for us and once for the tree */
- free_extent_map(em);
- free_extent_map(em);
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
+ /* Logic error, can't happen. */
+ ASSERT(map);
+
+ btrfs_remove_chunk_map(fs_info, map);
+
+ /* Once for our lookup reference. */
+ btrfs_free_chunk_map(map);
/*
* We may have left one free space entry and other possible
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 2bdbcb834f95..85e2d4cd12dc 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -3,8 +3,23 @@
#ifndef BTRFS_BLOCK_GROUP_H
#define BTRFS_BLOCK_GROUP_H
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/refcount.h>
+#include <linux/wait.h>
+#include <linux/sizes.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/btrfs_tree.h>
#include "free-space-cache.h"
+struct btrfs_chunk_map;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_trans_handle;
+
enum btrfs_disk_cache_state {
BTRFS_DC_WRITTEN,
BTRFS_DC_ERROR,
@@ -243,7 +258,7 @@ struct btrfs_block_group {
u64 zone_unusable;
u64 zone_capacity;
u64 meta_write_pointer;
- struct map_lookup *physical_map;
+ struct btrfs_chunk_map *physical_map;
struct list_head active_bg_list;
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
@@ -255,6 +270,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
return (block_group->start + block_group->length);
}
+static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
+{
+ lockdep_assert_held(&bg->lock);
+
+ return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+}
+
static inline bool btrfs_is_block_group_data_only(
struct btrfs_block_group *block_group)
{
@@ -288,7 +310,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
-void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
@@ -297,7 +318,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em);
+ struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index ceb5f586a2d5..b299b82d676e 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -6,7 +6,6 @@
#include "space-info.h"
#include "transaction.h"
#include "block-group.h"
-#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
@@ -342,9 +341,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
read_lock(&fs_info->global_root_lock);
rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
rb_node) {
- if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
- root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
- root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_FREE_SPACE_TREE_OBJECTID) {
num_bytes += btrfs_root_used(&root->root_item);
min_items++;
}
@@ -407,7 +406,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- switch (root->root_key.objectid) {
+ switch (btrfs_root_id(root)) {
case BTRFS_CSUM_TREE_OBJECTID:
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
@@ -469,8 +468,7 @@ static struct btrfs_block_rsv *get_block_rsv(
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
(root == fs_info->uuid_root) ||
- (trans->adding_csums &&
- root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID))
+ (trans->adding_csums && btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -494,7 +492,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
- if (unlikely(block_rsv->size == 0))
+ if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index b0bd12b8652f..1f53b967d069 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -3,8 +3,15 @@
#ifndef BTRFS_BLOCK_RSV_H
#define BTRFS_BLOCK_RSV_H
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/spinlock.h>
+
struct btrfs_trans_handle;
struct btrfs_root;
+struct btrfs_space_info;
+struct btrfs_block_rsv;
+struct btrfs_fs_info;
enum btrfs_reserve_flush_enum;
/*
@@ -101,4 +108,36 @@ static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
return data_race(rsv->full);
}
+/*
+ * Get the reserved amount of a block reserve in a context where getting a stale
+ * value is acceptable, instead of accessing it directly and trigger data race
+ * warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_reserved(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->reserved;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and trigger data race warning
+ * from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->size;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5572ae52444e..91c994b569f3 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -8,13 +8,32 @@
#include <linux/hash.h>
#include <linux/refcount.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/compiler.h>
#include <linux/fscrypt.h>
+#include <linux/lockdep.h>
+#include <uapi/linux/btrfs_tree.h>
#include <trace/events/btrfs.h>
+#include "block-rsv.h"
+#include "btrfs_inode.h"
#include "extent_map.h"
#include "extent_io.h"
+#include "extent-io-tree.h"
#include "ordered-data.h"
#include "delayed-inode.h"
+struct extent_state;
+struct posix_acl;
+struct iov_iter;
+struct writeback_control;
+struct btrfs_root;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+
/*
* Since we search a directory based on f_pos (struct dir_context::pos) we have
* to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
@@ -41,7 +60,6 @@ enum {
*/
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
- BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
/*
@@ -69,6 +87,8 @@ enum {
BTRFS_INODE_VERITY_IN_PROGRESS,
/* Set when this inode is a free space inode. */
BTRFS_INODE_FREE_SPACE_INODE,
+ /* Set when there are no capabilities in xattrs for the inode. */
+ BTRFS_INODE_NO_CAP_XATTR,
};
/* in memory btrfs inode */
@@ -107,9 +127,11 @@ struct btrfs_inode {
/*
* Keep track of where the inode has extent items mapped in order to
- * make sure the i_size adjustments are accurate
+ * make sure the i_size adjustments are accurate. Not required when the
+ * filesystem is NO_HOLES, the status can't be set while mounted as
+ * it's a mkfs-time feature.
*/
- struct extent_io_tree file_extent_tree;
+ struct extent_io_tree *file_extent_tree;
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
@@ -359,9 +381,11 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
}
/*
- * Should be called while holding the inode's VFS lock in exclusive mode or in a
- * context where no one else can access the inode concurrently (during inode
- * creation or when loading an inode from disk).
+ * Should be called while holding the inode's VFS lock in exclusive mode, or
+ * while holding the inode's mmap lock (struct btrfs_inode::i_mmap_lock) in
+ * either shared or exclusive mode, or in a context where no one else can access
+ * the inode concurrently (during inode creation or when loading an inode from
+ * disk).
*/
static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
{
@@ -424,7 +448,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes, bool nowait, bool strict);
-void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode);
+void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -474,7 +498,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
-vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
void btrfs_evict_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
@@ -486,8 +509,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
struct btrfs_root *root, struct btrfs_path *path);
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 end);
+ struct page *page, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
@@ -523,6 +545,7 @@ ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
size_t done_before);
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
size_t done_before);
+struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino);
extern const struct dentry_operations btrfs_dentry_operations;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 19b22b4653c8..6441e47d8a5e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -20,12 +20,11 @@
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
+#include <linux/shrinker.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "fs.h"
-#include "disk-io.h"
-#include "transaction.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "ordered-data.h"
@@ -33,8 +32,7 @@
#include "extent_io.h"
#include "extent_map.h"
#include "subpage.h"
-#include "zoned.h"
-#include "file-item.h"
+#include "messages.h"
#include "super.h"
static struct bio_set btrfs_compressed_bioset;
@@ -92,20 +90,20 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
}
static int compression_compress_pages(int type, struct list_head *ws,
- struct address_space *mapping, u64 start, struct page **pages,
- unsigned long *out_pages, unsigned long *total_in,
- unsigned long *total_out)
+ struct address_space *mapping, u64 start,
+ struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB:
- return zlib_compress_pages(ws, mapping, start, pages,
- out_pages, total_in, total_out);
+ return zlib_compress_folios(ws, mapping, start, folios,
+ out_folios, total_in, total_out);
case BTRFS_COMPRESS_LZO:
- return lzo_compress_pages(ws, mapping, start, pages,
- out_pages, total_in, total_out);
+ return lzo_compress_folios(ws, mapping, start, folios,
+ out_folios, total_in, total_out);
case BTRFS_COMPRESS_ZSTD:
- return zstd_compress_pages(ws, mapping, start, pages,
- out_pages, total_in, total_out);
+ return zstd_compress_folios(ws, mapping, start, folios,
+ out_folios, total_in, total_out);
case BTRFS_COMPRESS_NONE:
default:
/*
@@ -117,7 +115,7 @@ static int compression_compress_pages(int type, struct list_head *ws,
* Not a big deal, just need to inform caller that we
* haven't allocated any pages yet.
*/
- *out_pages = 0;
+ *out_folios = 0;
return -E2BIG;
}
}
@@ -140,16 +138,16 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
- const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ const u8 *data_in, struct page *dest_page,
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_NONE:
default:
/*
@@ -160,16 +158,110 @@ static int compression_decompress(int type, struct list_head *ws,
}
}
-static void btrfs_free_compressed_pages(struct compressed_bio *cb)
+static void btrfs_free_compressed_folios(struct compressed_bio *cb)
{
- for (unsigned int i = 0; i < cb->nr_pages; i++)
- put_page(cb->compressed_pages[i]);
- kfree(cb->compressed_pages);
+ for (unsigned int i = 0; i < cb->nr_folios; i++)
+ btrfs_free_compr_folio(cb->compressed_folios[i]);
+ kfree(cb->compressed_folios);
}
static int btrfs_decompress_bio(struct compressed_bio *cb);
-static void end_compressed_bio_read(struct btrfs_bio *bbio)
+/*
+ * Global cache of last unused pages for compression/decompression.
+ */
+static struct btrfs_compr_pool {
+ struct shrinker *shrinker;
+ spinlock_t lock;
+ struct list_head list;
+ int count;
+ int thresh;
+} compr_pool;
+
+static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc)
+{
+ int ret;
+
+ /*
+ * Read both values exactly once into 'ret'. If the expression were
+ * expanded (re-evaluated) in the return statement, a concurrent change
+ * could yield a negative result even after the check found it positive.
+ */
+ ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh);
+
+ return ret > 0 ? ret : 0;
+}
+
+static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
+{
+ struct list_head remove;
+ struct list_head *tmp, *next;
+ int freed;
+
+ if (compr_pool.count == 0)
+ return SHRINK_STOP;
+
+ INIT_LIST_HEAD(&remove);
+
+ /* For now, just simply drain the whole list. */
+ spin_lock(&compr_pool.lock);
+ list_splice_init(&compr_pool.list, &remove);
+ freed = compr_pool.count;
+ compr_pool.count = 0;
+ spin_unlock(&compr_pool.lock);
+
+ list_for_each_safe(tmp, next, &remove) {
+ struct page *page = list_entry(tmp, struct page, lru);
+
+ ASSERT(page_ref_count(page) == 1);
+ put_page(page);
+ }
+
+ return freed;
+}
+
+/*
+ * Common wrappers for folio allocation and freeing, used by the compression code
+ */
+struct folio *btrfs_alloc_compr_folio(void)
+{
+ struct folio *folio = NULL;
+
+ spin_lock(&compr_pool.lock);
+ if (compr_pool.count > 0) {
+ folio = list_first_entry(&compr_pool.list, struct folio, lru);
+ list_del_init(&folio->lru);
+ compr_pool.count--;
+ }
+ spin_unlock(&compr_pool.lock);
+
+ if (folio)
+ return folio;
+
+ return folio_alloc(GFP_NOFS, 0);
+}
+
+void btrfs_free_compr_folio(struct folio *folio)
+{
+ bool do_free = false;
+
+ spin_lock(&compr_pool.lock);
+ if (compr_pool.count > compr_pool.thresh) {
+ do_free = true;
+ } else {
+ list_add(&folio->lru, &compr_pool.list);
+ compr_pool.count++;
+ }
+ spin_unlock(&compr_pool.lock);
+
+ if (!do_free)
+ return;
+
+ ASSERT(folio_ref_count(folio) == 1);
+ folio_put(folio);
+}
+
+static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
blk_status_t status = bbio->bio.bi_status;
@@ -177,7 +269,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
if (!status)
status = errno_to_blk_status(btrfs_decompress_bio(cb));
- btrfs_free_compressed_pages(cb);
+ btrfs_free_compressed_folios(cb);
btrfs_bio_end_io(cb->orig_bbio, status);
bio_put(&bbio->bio);
}
@@ -189,7 +281,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
static noinline void end_compressed_writeback(const struct compressed_bio *cb)
{
struct inode *inode = &cb->bbio.inode->vfs_inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
@@ -211,8 +303,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
for (i = 0; i < ret; i++) {
struct folio *folio = fbatch.folios[i];
- btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
- cb->start, cb->len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio,
+ cb->start, cb->len);
}
folio_batch_release(&fbatch);
}
@@ -231,7 +323,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
end_compressed_writeback(cb);
/* Note, our inode could be gone now */
- btrfs_free_compressed_pages(cb);
+ btrfs_free_compressed_folios(cb);
bio_put(&cb->bbio.bio);
}
@@ -242,7 +334,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct btrfs_bio *bbio)
+static void end_bbio_comprssed_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
@@ -250,17 +342,19 @@ static void end_compressed_bio_write(struct btrfs_bio *bbio)
queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
}
-static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
+static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
{
struct bio *bio = &cb->bbio.bio;
u32 offset = 0;
while (offset < cb->compressed_len) {
+ int ret;
u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
/* Maximum compressed extent is smaller than bio size limit. */
- __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT],
- len, 0);
+ ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT],
+ len, 0);
+ ASSERT(ret);
offset += len;
}
}
@@ -275,8 +369,8 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
* the end io hooks.
*/
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
- struct page **compressed_pages,
- unsigned int nr_pages,
+ struct folio **compressed_folios,
+ unsigned int nr_folios,
blk_opf_t write_flags,
bool writeback)
{
@@ -289,17 +383,17 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb = alloc_compressed_bio(inode, ordered->file_offset,
REQ_OP_WRITE | write_flags,
- end_compressed_bio_write);
+ end_bbio_comprssed_write);
cb->start = ordered->file_offset;
cb->len = ordered->num_bytes;
- cb->compressed_pages = compressed_pages;
+ cb->compressed_folios = compressed_folios;
cb->compressed_len = ordered->disk_num_bytes;
cb->writeback = writeback;
INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
- cb->nr_pages = nr_pages;
+ cb->nr_folios = nr_folios;
cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
cb->bbio.ordered = ordered;
- btrfs_add_compressed_bio_pages(cb);
+ btrfs_add_compressed_bio_folios(cb);
btrfs_submit_bio(&cb->bbio, 0);
}
@@ -320,7 +414,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
struct compressed_bio *cb,
int *memstall, unsigned long *pflags)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
unsigned long end_index;
struct bio *orig_bio = &cb->orig_bbio->bio;
u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
@@ -346,7 +440,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* This makes readahead less effective, so here disable readahead for
* subpage for now, until full compressed write is supported.
*/
- if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
+ if (fs_info->sectorsize < PAGE_SIZE)
return 0;
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
@@ -446,7 +540,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* subpage::readers and to unlock the page.
*/
if (fs_info->sectorsize < PAGE_SIZE)
- btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ btrfs_subpage_start_reader(fs_info, page_folio(page),
+ cur, add_size);
put_page(page);
cur += add_size;
}
@@ -489,11 +584,11 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
goto out;
}
- ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
+ ASSERT(extent_map_is_compressed(em));
compressed_len = em->block_len;
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
- end_compressed_bio_read);
+ end_bbio_comprssed_read);
cb->start = em->orig_start;
em_len = em->len;
@@ -501,19 +596,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
cb->len = bbio->bio.bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = em->compress_type;
+ cb->compress_type = extent_map_compression(em);
cb->orig_bbio = bbio;
free_extent_map(em);
- cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!cb->compressed_pages) {
+ cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS);
+ if (!cb->compressed_folios) {
ret = BLK_STS_RESOURCE;
goto out_free_bio;
}
- ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios, 0);
if (ret2) {
ret = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
@@ -525,7 +620,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
/* include any pages we added in add_ra-bio_pages */
cb->len = bbio->bio.bi_iter.bi_size;
cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
- btrfs_add_compressed_bio_pages(cb);
+ btrfs_add_compressed_bio_folios(cb);
if (memstall)
psi_memstall_leave(&pflags);
@@ -534,7 +629,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
return;
out_free_compressed_pages:
- kfree(cb->compressed_pages);
+ kfree(cb->compressed_folios);
out_free_bio:
bio_put(&cb->bbio.bio);
out:
@@ -881,6 +976,29 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level)
return level;
}
+/* Wrapper around filemap_get_folio(), with extra error message. */
+int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
+ struct folio **in_folio_ret)
+{
+ struct folio *in_folio;
+
+ /*
+ * The compressed write path should have the folio locked already, thus
+ * we only need to grab one reference.
+ */
+ in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT);
+ if (IS_ERR(in_folio)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_crit(inode->root->fs_info,
+ "failed to get page cache, root %lld ino %llu file offset %llu",
+ btrfs_root_id(inode->root), btrfs_ino(inode), start);
+ return -ENOENT;
+ }
+ *in_folio_ret = in_folio;
+ return 0;
+}
+
/*
* Given an address space and start and length, compress the bytes into @pages
* that are allocated on demand.
@@ -901,11 +1019,9 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level)
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
-int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
- u64 start, struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
int type = btrfs_compress_type(type_level);
int level = btrfs_compress_level(type_level);
@@ -914,8 +1030,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
level = btrfs_compress_set_level(type, level);
workspace = get_workspace(type, level);
- ret = compression_compress_pages(type, workspace, mapping, start, pages,
- out_pages, total_in, total_out);
+ ret = compression_compress_pages(type, workspace, mapping, start, folios,
+ out_folios, total_in, total_out);
put_workspace(type, workspace);
return ret;
}
@@ -941,14 +1057,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
+ struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page);
struct list_head *workspace;
+ const u32 sectorsize = fs_info->sectorsize;
int ret;
+ /*
+ * The full destination page range should not exceed the page size.
+ * And the @destlen should not exceed sectorsize, as this is only called for
+ * inline file extents, which should not exceed sectorsize.
+ */
+ ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+
workspace = get_workspace(type, 0);
ret = compression_decompress(type, workspace, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
put_workspace(type, workspace);
return ret;
@@ -960,15 +1085,36 @@ int __init btrfs_init_compress(void)
offsetof(struct compressed_bio, bbio.bio),
BIOSET_NEED_BVECS))
return -ENOMEM;
+
+ compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages");
+ if (!compr_pool.shrinker)
+ return -ENOMEM;
+
btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_init_workspace_manager();
+
+ spin_lock_init(&compr_pool.lock);
+ INIT_LIST_HEAD(&compr_pool.list);
+ compr_pool.count = 0;
+ /* 128K / 4K = 32, for 8 threads is 256 pages. */
+ compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8;
+ compr_pool.shrinker->count_objects = btrfs_compr_pool_count;
+ compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan;
+ compr_pool.shrinker->batch = 32;
+ compr_pool.shrinker->seeks = DEFAULT_SEEKS;
+ shrinker_register(compr_pool.shrinker);
+
return 0;
}
void __cold btrfs_exit_compress(void)
{
+ /* For now scan drains all pages and does not touch the parameters. */
+ btrfs_compr_pool_scan(NULL, NULL);
+ shrinker_free(compr_pool.shrinker);
+
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
@@ -1353,11 +1499,6 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
/*
* Compression heuristic.
*
- * For now is's a naive and optimistic 'return true', we'll extend the logic to
- * quickly (compared to direct compression) detect data characteristics
- * (compressible/incompressible) to avoid wasting CPU time on incompressible
- * data.
- *
* The following types of analysis can be performed:
* - detect mostly zero data
* - detect data with low "byte set" size (text, etc)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 03bb9d143fa7..c20c1a1b09d5 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -7,10 +7,18 @@
#define BTRFS_COMPRESSION_H
#include <linux/sizes.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/wait.h>
#include "bio.h"
+struct address_space;
+struct page;
+struct inode;
struct btrfs_inode;
struct btrfs_ordered_extent;
+struct btrfs_bio;
/*
* We want to make sure that amount of RAM required to uncompress an extent is
@@ -33,11 +41,11 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* Number of compressed pages in the array */
- unsigned int nr_pages;
+ /* Number of compressed folios in the array. */
+ unsigned int nr_folios;
- /* the pages with the compressed data on them */
- struct page **compressed_pages;
+ /* The folios with the compressed data on them. */
+ struct folio **compressed_folios;
/* starting offset in the inode for our pages */
u64 start;
@@ -77,25 +85,25 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level)
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
-int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
- u64 start, struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out);
+int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
- struct page **compressed_pages,
- unsigned int nr_pages,
- blk_opf_t write_flags,
- bool writeback);
+ struct folio **compressed_folios,
+ unsigned int nr_folios, blk_opf_t write_flags,
+ bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
+struct folio *btrfs_alloc_compr_folio(void);
+void btrfs_free_compr_folio(struct folio *folio);
+
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
BTRFS_COMPRESS_ZLIB = 1,
@@ -138,33 +146,36 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
+int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
+ struct folio **in_folio_ret);
+
+int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
struct list_head *zlib_get_workspace(unsigned int level);
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
+int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
+int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
void zstd_init_workspace_manager(void);
void zstd_cleanup_workspace_manager(void);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2a9344a3fcee..1a49b9232990 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -291,7 +291,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
/* Want the extent tree to be the last on the list */
- if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
list_move_tail(&root->dirty_list,
&fs_info->dirty_cowonly_roots);
else
@@ -370,33 +370,41 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
/*
* check if the tree block can be shared by multiple trees
*/
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf)
{
+ const u64 buf_gen = btrfs_header_generation(buf);
+
/*
* Tree blocks not in shareable trees and tree roots are never shared.
* If a block was allocated after the last snapshot and the block was
* not allocated by tree relocation, we know the block is not shared.
*/
- if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- buf != root->node &&
- (btrfs_header_generation(buf) <=
- btrfs_root_last_snapshot(&root->root_item) ||
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
- if (buf != root->commit_root)
- return 1;
- /*
- * An extent buffer that used to be the commit root may still be
- * shared because the tree height may have increased and it
- * became a child of a higher level root. This can happen when
- * snapshotting a subvolume created in the current transaction.
- */
- if (btrfs_header_generation(buf) == trans->transid)
- return 1;
- }
- return 0;
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ return false;
+
+ if (buf == root->node)
+ return false;
+
+ if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+ return false;
+
+ if (buf != root->commit_root)
+ return true;
+
+ /*
+ * An extent buffer that used to be the commit root may still be shared
+ * because the tree height may have increased and it became a child of a
+ * higher level root. This can happen when snapshotting a subvolume
+ * created in the current transaction.
+ */
+ if (buf_gen == trans->transid)
+ return true;
+
+ return false;
}
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
@@ -432,7 +440,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(trans, root, buf)) {
ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
btrfs_header_level(buf), 1,
- &refs, &flags);
+ &refs, &flags, NULL);
if (ret)
return ret;
if (unlikely(refs == 0)) {
@@ -446,7 +454,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
}
} else {
refs = 1;
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
else
@@ -458,15 +466,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
if (refs > 1) {
- if ((owner == root->root_key.objectid ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
+ if ((owner == btrfs_root_id(root) ||
+ btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
ret = btrfs_inc_ref(trans, root, buf, 1);
if (ret)
return ret;
- if (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_dec_ref(trans, root, buf, 0);
if (ret)
return ret;
@@ -477,8 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
} else {
- if (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
@@ -492,8 +498,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
}
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- if (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
@@ -555,13 +560,13 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
if (parent)
parent_start = parent->start;
reloc_src_root = btrfs_header_owner(buf);
}
cow = btrfs_alloc_tree_block(trans, root, parent_start,
- root->root_key.objectid, &disk_key, level,
+ btrfs_root_id(root), &disk_key, level,
search_start, empty_size, reloc_src_root, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -574,10 +579,10 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
BTRFS_HEADER_FLAG_RELOC);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
else
- btrfs_set_header_owner(cow, root->root_key.objectid);
+ btrfs_set_header_owner(cow, btrfs_root_id(root));
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
@@ -601,7 +606,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
if (buf == root->node) {
WARN_ON(parent && parent != buf);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
parent_start = buf->start;
@@ -677,7 +682,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
*/
if (btrfs_header_generation(buf) == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
- !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
!test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
return 0;
@@ -812,7 +817,8 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
}
while (low < high) {
- unsigned long oip;
+ const int unit_size = eb->folio_size;
+ unsigned long oil;
unsigned long offset;
struct btrfs_disk_key *tmp;
struct btrfs_disk_key unaligned;
@@ -820,14 +826,14 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
mid = (low + high) / 2;
offset = p + mid * item_size;
- oip = offset_in_page(offset);
+ oil = get_eb_offset_in_folio(eb, offset);
- if (oip + key_size <= PAGE_SIZE) {
- const unsigned long idx = get_eb_page_index(offset);
- char *kaddr = page_address(eb->pages[idx]);
+ if (oil + key_size <= unit_size) {
+ const unsigned long idx = get_eb_folio_index(eb, offset);
+ char *kaddr = folio_address(eb->folios[idx]);
- oip = get_eb_offset_in_page(eb, offset);
- tmp = (struct btrfs_disk_key *)(kaddr + oip);
+ oil = get_eb_offset_in_folio(eb, offset);
+ tmp = (struct btrfs_disk_key *)(kaddr + oil);
} else {
read_extent_buffer(eb, &unaligned, offset, key_size);
tmp = &unaligned;
@@ -994,7 +1000,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
- __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+ btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left,
BTRFS_NESTING_LEFT_COW);
@@ -1012,7 +1018,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
- __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+ btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right,
BTRFS_NESTING_RIGHT_COW);
@@ -1196,7 +1202,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (IS_ERR(left))
return PTR_ERR(left);
- __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+ btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -1256,7 +1262,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (IS_ERR(right))
return PTR_ERR(right);
- __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+ btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -1502,7 +1508,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
check.has_first_key = true;
check.level = parent_level - 1;
check.transid = gen;
- check.owner_root = root->root_key.objectid;
+ check.owner_root = btrfs_root_id(root);
/*
* If we need to read an extent buffer from disk and we are holding locks
@@ -1547,7 +1553,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_release_path(p);
return -EIO;
}
- if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
+ if (btrfs_check_eb_owner(tmp, btrfs_root_id(root))) {
free_extent_buffer(tmp);
btrfs_release_path(p);
return -EUCLEAN;
@@ -2856,7 +2862,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
&lower_key, level, root->node->start, 0,
0, BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
@@ -3000,7 +3006,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
&disk_key, level, c->start, 0,
0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
@@ -3258,7 +3264,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(right))
return PTR_ERR(right);
- __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+ btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
@@ -3474,7 +3480,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(left))
return PTR_ERR(left);
- __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+ btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
@@ -3752,7 +3758,7 @@ again:
* BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
* use BTRFS_NESTING_NEW_ROOT.
*/
- right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
&disk_key, 0, l->start, 0, 0,
num_doubles ? BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
@@ -4271,6 +4277,10 @@ void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
+ *
+ * Returns: 0 on success
+ * -EEXIST if the first key already exists
+ * < 0 on other errors
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -5073,9 +5083,7 @@ int btrfs_previous_extent_item(struct btrfs_root *root,
int __init btrfs_ctree_init(void)
{
- btrfs_path_cachep = kmem_cache_create("btrfs_path",
- sizeof(struct btrfs_path), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);
if (!btrfs_path_cachep)
return -ENOMEM;
return 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 196c005c31f6..c03c58246033 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -7,25 +7,24 @@
#define BTRFS_CTREE_H
#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/xarray.h>
+#include <linux/refcount.h>
+#include <uapi/linux/btrfs_tree.h>
#include "locking.h"
#include "fs.h"
#include "accessors.h"
+#include "extent-io-tree.h"
+struct extent_buffer;
+struct btrfs_block_rsv;
struct btrfs_trans_handle;
-struct btrfs_transaction;
-struct btrfs_pending_snapshot;
-struct btrfs_delayed_ref_root;
-struct btrfs_space_info;
struct btrfs_block_group;
-struct btrfs_ordered_sum;
-struct btrfs_ref;
-struct btrfs_bio;
-struct btrfs_ioctl_encoded_io_args;
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_balance_control;
-struct btrfs_delayed_root;
-struct reloc_control;
/* Read ahead values for struct btrfs_path.reada */
enum {
@@ -212,8 +211,6 @@ struct btrfs_root {
u64 last_trans;
- u32 type;
-
u64 free_objectid;
struct btrfs_key defrag_progress;
@@ -224,18 +221,15 @@ struct btrfs_root {
struct list_head root_list;
- spinlock_t log_extents_lock[2];
- struct list_head logged_list[2];
-
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
struct rb_root inode_tree;
/*
- * radix tree that keeps track of delayed nodes of every inode,
- * protected by inode_lock
+ * Xarray that keeps track of delayed nodes of every inode, protected
+ * by @inode_lock.
*/
- struct radix_tree_root delayed_nodes_tree;
+ struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -483,8 +477,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
-int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
- u64 start, u64 end);
+void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
@@ -561,9 +554,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 5244561e2016..407ccec3e57e 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -6,7 +6,6 @@
#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
-#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
@@ -148,7 +147,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
- defrag->root = root->root_key.objectid;
+ defrag->root = btrfs_root_id(root);
defrag->extent_thresh = extent_thresh;
spin_lock(&fs_info->defrag_inodes_lock);
@@ -521,7 +520,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
* keep_locks set and lowest_level is 1, regardless of the value of
* path->slots[1].
*/
- BUG_ON(path->locks[1] == 0);
+ ASSERT(path->locks[1] != 0);
ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0,
&last_ret,
@@ -775,7 +774,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* this em, as either we don't care about the generation, or the
* merged extent map will be rejected anyway.
*/
- if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ if (em && (em->flags & EXTENT_FLAG_MERGED) &&
newer_than && em->generation >= newer_than) {
free_extent_map(em);
em = NULL;
@@ -802,7 +801,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
const struct extent_map *em)
{
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ if (extent_map_is_compressed(em))
return BTRFS_MAX_COMPRESSED;
return fs_info->max_extent_size;
}
@@ -810,7 +809,7 @@ static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
u32 extent_thresh, u64 newer_than, bool locked)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_map *next;
bool ret = false;
@@ -828,7 +827,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
/* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
goto out;
- if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ if (next->flags & EXTENT_FLAG_PREALLOC)
goto out;
/*
* If the next extent is at its max capacity, defragging current extent
@@ -861,20 +860,21 @@ out:
* NOTE: Caller should also wait for page writeback after the cluster is
* prepared, here we don't do writeback wait for each page.
*/
-static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
+static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
{
struct address_space *mapping = inode->vfs_inode.i_mapping;
gfp_t mask = btrfs_alloc_write_mask(mapping);
u64 page_start = (u64)index << PAGE_SHIFT;
u64 page_end = page_start + PAGE_SIZE - 1;
struct extent_state *cached_state = NULL;
- struct page *page;
+ struct folio *folio;
int ret;
again:
- page = find_or_create_page(mapping, index, mask);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+ if (IS_ERR(folio))
+ return folio;
/*
* Since we can defragment files opened read-only, we can encounter
@@ -884,16 +884,16 @@ again:
* executables that explicitly enable them, so this isn't very
* restrictive.
*/
- if (PageCompound(page)) {
- unlock_page(page);
- put_page(page);
+ if (folio_test_large(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(-ETXTBSY);
}
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(ret);
}
@@ -908,17 +908,17 @@ again:
if (!ordered)
break;
- unlock_page(page);
+ folio_unlock(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
- lock_page(page);
+ folio_lock(folio);
/*
- * We unlocked the page above, so we need check if it was
+ * We unlocked the folio above, so we need check if it was
* released or not.
*/
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
+ if (folio->mapping != mapping || !folio->private) {
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
}
@@ -927,21 +927,21 @@ again:
* Now the page range has no ordered extent any more. Read the page to
* make it uptodate.
*/
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping || !folio->private) {
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(-EIO);
}
}
- return page;
+ return folio;
}
struct defrag_target_range {
@@ -996,10 +996,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
em->len <= inode->root->fs_info->max_inline)
goto next;
- /* Skip hole/delalloc/preallocated extents */
+ /* Skip holes and preallocated extents. */
if (em->block_start == EXTENT_MAP_HOLE ||
- em->block_start == EXTENT_MAP_DELALLOC ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ (em->flags & EXTENT_FLAG_PREALLOC))
goto next;
/* Skip older extent */
@@ -1047,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
goto add;
/* Skip too large extent */
- if (range_len >= extent_thresh)
+ if (em->len >= extent_thresh)
goto next;
/*
@@ -1163,7 +1162,7 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
*/
static int defrag_one_locked_target(struct btrfs_inode *inode,
struct defrag_target_range *target,
- struct page **pages, int nr_pages,
+ struct folio **folios, int nr_pages,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1172,7 +1171,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
const u64 len = target->len;
unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
unsigned long start_index = start >> PAGE_SHIFT;
- unsigned long first_index = page_index(pages[0]);
+ unsigned long first_index = folios[0]->index;
int ret = 0;
int i;
@@ -1189,8 +1188,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
/* Update the page status */
for (i = start_index - first_index; i <= last_index - first_index; i++) {
- ClearPageChecked(pages[i]);
- btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ folio_clear_checked(folios[i]);
+ btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
}
btrfs_delalloc_release_extents(inode, len);
extent_changeset_free(data_reserved);
@@ -1206,7 +1205,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
struct defrag_target_range *entry;
struct defrag_target_range *tmp;
LIST_HEAD(target_list);
- struct page **pages;
+ struct folio **folios;
const u32 sectorsize = inode->root->fs_info->sectorsize;
u64 last_index = (start + len - 1) >> PAGE_SHIFT;
u64 start_index = start >> PAGE_SHIFT;
@@ -1217,21 +1216,21 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages)
+ folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
+ if (!folios)
return -ENOMEM;
/* Prepare all pages */
for (i = 0; i < nr_pages; i++) {
- pages[i] = defrag_prepare_one_page(inode, start_index + i);
- if (IS_ERR(pages[i])) {
- ret = PTR_ERR(pages[i]);
- pages[i] = NULL;
- goto free_pages;
+ folios[i] = defrag_prepare_one_folio(inode, start_index + i);
+ if (IS_ERR(folios[i])) {
+ ret = PTR_ERR(folios[i]);
+ nr_pages = i;
+ goto free_folios;
}
}
for (i = 0; i < nr_pages; i++)
- wait_on_page_writeback(pages[i]);
+ folio_wait_writeback(folios[i]);
/* Lock the pages range */
lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
@@ -1251,7 +1250,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
goto unlock_extent;
list_for_each_entry(entry, &target_list, list) {
- ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
&cached_state);
if (ret < 0)
break;
@@ -1265,14 +1264,12 @@ unlock_extent:
unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
(last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
&cached_state);
-free_pages:
+free_folios:
for (i = 0; i < nr_pages; i++) {
- if (pages[i]) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ folio_unlock(folios[i]);
+ folio_put(folios[i]);
}
- kfree(pages);
+ kfree(folios);
return ret;
}
@@ -1367,7 +1364,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
unsigned long sectors_defragged = 0;
u64 isize = i_size_read(inode);
u64 cur;
@@ -1513,9 +1510,7 @@ void __cold btrfs_auto_defrag_exit(void)
int __init btrfs_auto_defrag_init(void)
{
btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
- sizeof(struct inode_defrag), 0,
- SLAB_MEM_SPREAD,
- NULL);
+ sizeof(struct inode_defrag), 0, 0, NULL);
if (!btrfs_inode_defrag_cachep)
return -ENOMEM;
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 5a62763528d1..878528e086fb 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -3,6 +3,16 @@
#ifndef BTRFS_DEFRAG_H
#define BTRFS_DEFRAG_H
+#include <linux/types.h>
+#include <linux/compiler_types.h>
+
+struct inode;
+struct file_ra_state;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_trans_handle;
+struct btrfs_ioctl_defrag_range_args;
+
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 51453d4928fa..b3527efd0b4b 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -6,9 +6,7 @@
#include "block-rsv.h"
#include "btrfs_inode.h"
#include "space-info.h"
-#include "transaction.h"
#include "qgroup.h"
-#include "block-group.h"
#include "fs.h"
/*
@@ -199,7 +197,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
start = round_down(start, fs_info->sectorsize);
btrfs_free_reserved_data_space_noquota(fs_info, len);
- btrfs_qgroup_free_data(inode, reserved, start, len);
+ btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
}
/*
@@ -245,7 +243,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 reserve_size = 0;
u64 qgroup_rsv_size = 0;
- u64 csum_leaves;
unsigned outstanding_extents;
lockdep_assert_held(&inode->lock);
@@ -260,10 +257,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
outstanding_extents);
reserve_size += btrfs_calc_metadata_size(fs_info, 1);
}
- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
- inode->csum_bytes);
- reserve_size += btrfs_calc_insert_metadata_size(fs_info,
- csum_leaves);
+ if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
+ u64 csum_leaves;
+
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
+ reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
+ }
/*
* For qgroup rsv, the calculation is very simple:
* account one nodesize for each outstanding extent
@@ -278,14 +277,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
}
-static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+static void calc_inode_reservations(struct btrfs_inode *inode,
u64 num_bytes, u64 disk_num_bytes,
u64 *meta_reserve, u64 *qgroup_reserve)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 nr_extents = count_max_extents(fs_info, num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+ u64 csum_leaves;
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ csum_leaves = 0;
+ else
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
nr_extents + csum_leaves);
@@ -337,7 +342,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ calc_inode_reservations(inode, num_bytes, disk_num_bytes,
&meta_reserve, &qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
noflush);
@@ -359,7 +364,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
nr_extents = count_max_extents(fs_info, num_bytes);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += disk_num_bytes;
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
+ inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -393,7 +399,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&inode->lock);
- inode->csum_bytes -= num_bytes;
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
+ inode->csum_bytes -= num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index c5d573f2366e..ce4f889e4f17 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -3,7 +3,11 @@
#ifndef BTRFS_DELALLOC_SPACE_H
#define BTRFS_DELALLOC_SPACE_H
+#include <linux/types.h>
+
struct extent_changeset;
+struct btrfs_inode;
+struct btrfs_fs_info;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 7381241334e8..95a0497fa866 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -28,11 +28,7 @@ static struct kmem_cache *delayed_node_cache;
int __init btrfs_delayed_inode_init(void)
{
- delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
- sizeof(struct btrfs_delayed_node),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
+ delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0);
if (!delayed_node_cache)
return -ENOMEM;
return 0;
@@ -43,6 +39,17 @@ void __cold btrfs_delayed_inode_exit(void)
kmem_cache_destroy(delayed_node_cache);
}
+void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root)
+{
+ atomic_set(&delayed_root->items, 0);
+ atomic_set(&delayed_root->items_seq, 0);
+ delayed_root->nodes = 0;
+ spin_lock_init(&delayed_root->lock);
+ init_waitqueue_head(&delayed_root->wait);
+ INIT_LIST_HEAD(&delayed_root->node_list);
+ INIT_LIST_HEAD(&delayed_root->prepare_list);
+}
+
static inline void btrfs_init_delayed_node(
struct btrfs_delayed_node *delayed_node,
struct btrfs_root *root, u64 inode_id)
@@ -71,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
- node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@@ -83,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
- * this node from the radix tree. In this case, the refcount
+ * this node from the xarray. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the radix at all; our release
+ * NULL like it was never in the xarray at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -93,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the radix, we want to bump the
+ * If this node is properly in the xarray, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -120,6 +127,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
struct btrfs_root *root = btrfs_inode->root;
u64 ino = btrfs_ino(btrfs_inode);
int ret;
+ void *ptr;
again:
node = btrfs_get_delayed_node(btrfs_inode);
@@ -131,26 +139,30 @@ again:
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
- /* cached in the btrfs inode and can be accessed */
+ /* Cached in the inode and can be accessed. */
refcount_set(&node->refs, 2);
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
+ /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
+ ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
+ if (ret == -ENOMEM) {
kmem_cache_free(delayed_node_cache, node);
- return ERR_PTR(ret);
+ return ERR_PTR(-ENOMEM);
}
-
spin_lock(&root->inode_lock);
- ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
- if (ret == -EEXIST) {
+ ptr = xa_load(&root->delayed_nodes, ino);
+ if (ptr) {
+ /* Somebody inserted it, go back and read it. */
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, node);
- radix_tree_preload_end();
+ node = NULL;
goto again;
}
+ ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+ ASSERT(xa_err(ptr) != -EINVAL);
+ ASSERT(xa_err(ptr) != -ENOMEM);
+ ASSERT(ptr == NULL);
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
- radix_tree_preload_end();
return node;
}
@@ -269,8 +281,7 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- radix_tree_delete(&root->delayed_nodes_tree,
- delayed_node->inode_id);
+ xa_erase(&root->delayed_nodes, delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@@ -426,8 +437,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
delayed_root = delayed_node->root->fs_info->delayed_root;
- BUG_ON(!delayed_root);
-
if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
else
@@ -976,7 +985,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
if (delayed_node &&
test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- BUG_ON(!delayed_node->root);
+ ASSERT(delayed_node->root);
clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count--;
@@ -1036,14 +1045,33 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
goto out;
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(leaf))
- goto search;
-again:
+ /*
+ * Now we're going to delete the INODE_REF/EXTREF, which should be the
+ * only one ref left. Check if the next item is an INODE_REF/EXTREF.
+ *
+ * But if we're the last item already, release and search for the last
+ * INODE_REF/EXTREF.
+ */
+ if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) {
+ key.objectid = node->inode_id;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = (u64)-1;
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto err_out;
+ ASSERT(ret > 0);
+ ASSERT(path->slots[0] > 0);
+ ret = 0;
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ } else {
+ path->slots[0]++;
+ }
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != node->inode_id)
goto out;
-
if (key.type != BTRFS_INODE_REF_KEY &&
key.type != BTRFS_INODE_EXTREF_KEY)
goto out;
@@ -1070,22 +1098,6 @@ err_out:
btrfs_abort_transaction(trans, ret);
return ret;
-
-search:
- btrfs_release_path(path);
-
- key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = -1;
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto err_out;
- ASSERT(ret);
-
- ret = 0;
- leaf = path->nodes[0];
- path->slots[0]--;
- goto again;
}
static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -1121,6 +1133,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
if (ret)
return ret;
+ ret = btrfs_record_root_in_trans(trans, node->root);
+ if (ret)
+ return ret;
ret = btrfs_update_delayed_inode(trans, node->root, path, node);
return ret;
}
@@ -1636,7 +1651,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, node->root->root_key.objectid,
+ index, btrfs_root_id(node->root),
node->inode_id, ret);
btrfs_delayed_item_release_metadata(dir->root, item);
btrfs_release_delayed_item(item);
@@ -2035,34 +2050,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- u64 inode_id = 0;
+ unsigned long index = 0;
struct btrfs_delayed_node *delayed_nodes[8];
- int i, n;
while (1) {
+ struct btrfs_delayed_node *node;
+ int count;
+
spin_lock(&root->inode_lock);
- n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
- (void **)delayed_nodes, inode_id,
- ARRAY_SIZE(delayed_nodes));
- if (!n) {
+ if (xa_empty(&root->delayed_nodes)) {
spin_unlock(&root->inode_lock);
- break;
+ return;
}
- inode_id = delayed_nodes[n - 1]->inode_id + 1;
- for (i = 0; i < n; i++) {
+ count = 0;
+ xa_for_each_start(&root->delayed_nodes, index, node, index) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
- delayed_nodes[i] = NULL;
+ if (refcount_inc_not_zero(&node->refs)) {
+ delayed_nodes[count] = node;
+ count++;
+ }
+ if (count >= ARRAY_SIZE(delayed_nodes))
+ break;
}
spin_unlock(&root->inode_lock);
+ index++;
- for (i = 0; i < n; i++) {
- if (!delayed_nodes[i])
- continue;
+ for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 5cceb31bbd16..64e115d97499 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -7,15 +7,23 @@
#ifndef BTRFS_DELAYED_INODE_H
#define BTRFS_DELAYED_INODE_H
+#include <linux/types.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/wait.h>
+#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include "ctree.h"
+struct btrfs_disk_key;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_root;
+struct btrfs_trans_handle;
+
enum btrfs_delayed_item_type {
BTRFS_DELAYED_INSERTION_ITEM,
BTRFS_DELAYED_DELETION_ITEM
@@ -98,18 +106,7 @@ struct btrfs_delayed_item {
char data[] __counted_by(data_len);
};
-static inline void btrfs_init_delayed_root(
- struct btrfs_delayed_root *delayed_root)
-{
- atomic_set(&delayed_root->items, 0);
- atomic_set(&delayed_root->items_seq, 0);
- delayed_root->nodes = 0;
- spin_lock_init(&delayed_root->lock);
- init_waitqueue_head(&delayed_root->wait);
- INIT_LIST_HEAD(&delayed_root->node_list);
- INIT_LIST_HEAD(&delayed_root->prepare_list);
-}
-
+void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root);
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9223934d95f4..6cc80fb10da2 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -16,8 +16,7 @@
#include "fs.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
-struct kmem_cache *btrfs_delayed_tree_ref_cachep;
-struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_ref_node_cachep;
struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
* delayed back reference update tracking. For subvolume trees
@@ -305,50 +304,19 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
}
/*
- * compare two delayed tree backrefs with same bytenr and type
- */
-static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
- struct btrfs_delayed_tree_ref *ref2)
-{
- if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
- if (ref1->root < ref2->root)
- return -1;
- if (ref1->root > ref2->root)
- return 1;
- } else {
- if (ref1->parent < ref2->parent)
- return -1;
- if (ref1->parent > ref2->parent)
- return 1;
- }
- return 0;
-}
-
-/*
* compare two delayed data backrefs with same bytenr and type
*/
-static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
- struct btrfs_delayed_data_ref *ref2)
+static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
+ struct btrfs_delayed_ref_node *ref2)
{
- if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
- if (ref1->root < ref2->root)
- return -1;
- if (ref1->root > ref2->root)
- return 1;
- if (ref1->objectid < ref2->objectid)
- return -1;
- if (ref1->objectid > ref2->objectid)
- return 1;
- if (ref1->offset < ref2->offset)
- return -1;
- if (ref1->offset > ref2->offset)
- return 1;
- } else {
- if (ref1->parent < ref2->parent)
- return -1;
- if (ref1->parent > ref2->parent)
- return 1;
- }
+ if (ref1->data_ref.objectid < ref2->data_ref.objectid)
+ return -1;
+ if (ref1->data_ref.objectid > ref2->data_ref.objectid)
+ return 1;
+ if (ref1->data_ref.offset < ref2->data_ref.offset)
+ return -1;
+ if (ref1->data_ref.offset > ref2->data_ref.offset)
+ return 1;
return 0;
}
@@ -362,13 +330,20 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
return -1;
if (ref1->type > ref2->type)
return 1;
- if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
- ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
- btrfs_delayed_node_to_tree_ref(ref2));
- else
- ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
- btrfs_delayed_node_to_data_ref(ref2));
+ if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY ||
+ ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ } else {
+ if (ref1->ref_root < ref2->ref_root)
+ return -1;
+ if (ref1->ref_root > ref2->ref_root)
+ return 1;
+ if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY)
+ ret = comp_data_refs(ref1, ref2);
+ }
if (ret)
return ret;
if (check_seq) {
@@ -828,18 +803,20 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
}
static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_ref *generic_ref,
struct btrfs_qgroup_extent_record *qrecord,
- u64 bytenr, u64 num_bytes, u64 ref_root,
- u64 reserved, int action, bool is_data,
- bool is_system, u64 owning_root)
+ u64 reserved)
{
int count_mod = 1;
bool must_insert_reserved = false;
/* If reserved is provided, it must be a data extent. */
- BUG_ON(!is_data && reserved);
+ BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved);
- switch (action) {
+ switch (generic_ref->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ /* count_mod is already set to 1. */
+ break;
case BTRFS_UPDATE_DELAYED_HEAD:
count_mod = 0;
break;
@@ -868,14 +845,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
}
refcount_set(&head_ref->refs, 1);
- head_ref->bytenr = bytenr;
- head_ref->num_bytes = num_bytes;
+ head_ref->bytenr = generic_ref->bytenr;
+ head_ref->num_bytes = generic_ref->num_bytes;
head_ref->ref_mod = count_mod;
head_ref->reserved_bytes = reserved;
head_ref->must_insert_reserved = must_insert_reserved;
- head_ref->owning_root = owning_root;
- head_ref->is_data = is_data;
- head_ref->is_system = is_system;
+ head_ref->owning_root = generic_ref->owning_root;
+ head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA);
+ head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);
head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
RB_CLEAR_NODE(&head_ref->href_node);
@@ -885,12 +862,12 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
mutex_init(&head_ref->mutex);
if (qrecord) {
- if (ref_root && reserved) {
+ if (generic_ref->ref_root && reserved) {
qrecord->data_rsv = reserved;
- qrecord->data_rsv_refroot = ref_root;
+ qrecord->data_rsv_refroot = generic_ref->ref_root;
}
- qrecord->bytenr = bytenr;
- qrecord->num_bytes = num_bytes;
+ qrecord->bytenr = generic_ref->bytenr;
+ qrecord->num_bytes = generic_ref->num_bytes;
qrecord->old_roots = NULL;
}
}
@@ -982,89 +959,104 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
*/
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *ref,
- u64 bytenr, u64 num_bytes, u64 ref_root,
- int action, u8 ref_type)
+ struct btrfs_ref *generic_ref)
{
+ int action = generic_ref->action;
u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
- if (is_fstree(ref_root))
+ if (is_fstree(generic_ref->ref_root))
seq = atomic64_read(&fs_info->tree_mod_seq);
refcount_set(&ref->refs, 1);
- ref->bytenr = bytenr;
- ref->num_bytes = num_bytes;
+ ref->bytenr = generic_ref->bytenr;
+ ref->num_bytes = generic_ref->num_bytes;
ref->ref_mod = 1;
ref->action = action;
ref->seq = seq;
- ref->type = ref_type;
+ ref->type = btrfs_ref_type(generic_ref);
+ ref->ref_root = generic_ref->ref_root;
+ ref->parent = generic_ref->parent;
RB_CLEAR_NODE(&ref->ref_node);
INIT_LIST_HEAD(&ref->add_list);
+
+ if (generic_ref->type == BTRFS_REF_DATA)
+ ref->data_ref = generic_ref->data_ref;
+ else
+ ref->tree_ref = generic_ref->tree_ref;
}
-/*
- * add a delayed tree ref. This does all of the accounting required
- * to make sure the delayed ref is eventually processed before this
- * transaction commits.
- */
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
- struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op)
+void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
+ bool skip_qgroup)
+{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* If @mod_root is not set, fall back to generic_ref->ref_root */
+ generic_ref->real_root = mod_root ?: generic_ref->ref_root;
+#endif
+ generic_ref->tree_ref.level = level;
+ generic_ref->type = BTRFS_REF_METADATA;
+ if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+
+}
+
+void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
+ u64 mod_root, bool skip_qgroup)
+{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* If @mod_root is not set, fall back to generic_ref->ref_root */
+ generic_ref->real_root = mod_root ?: generic_ref->ref_root;
+#endif
+ generic_ref->data_ref.objectid = ino;
+ generic_ref->data_ref.offset = offset;
+ generic_ref->type = BTRFS_REF_DATA;
+ if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+}
+
+static int add_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ struct btrfs_delayed_extent_op *extent_op,
+ u64 reserved)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_delayed_ref_node *node;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
bool qrecord_inserted;
- bool is_system;
- bool merged;
int action = generic_ref->action;
- int level = generic_ref->tree_ref.level;
- u64 bytenr = generic_ref->bytenr;
- u64 num_bytes = generic_ref->len;
- u64 parent = generic_ref->parent;
- u8 ref_type;
-
- is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+ bool merged;
- ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
- ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
- if (!ref)
+ node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS);
+ if (!node)
return -ENOMEM;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
return -ENOMEM;
}
- if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
return -ENOMEM;
}
}
- if (parent)
- ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
- else
- ref_type = BTRFS_TREE_BLOCK_REF_KEY;
-
- init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.ref_root, action,
- ref_type);
- ref->root = generic_ref->tree_ref.ref_root;
- ref->parent = parent;
- ref->level = level;
-
- init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.ref_root, 0, action,
- false, is_system, generic_ref->owning_root);
+ init_delayed_ref_common(fs_info, node, generic_ref);
+ init_delayed_ref_head(head_ref, generic_ref, record, reserved);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -1077,7 +1069,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- merged = insert_delayed_ref(trans, head_ref, &ref->node);
+ merged = insert_delayed_ref(trans, head_ref, node);
spin_unlock(&delayed_refs->lock);
/*
@@ -1086,107 +1078,39 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
*/
btrfs_update_delayed_refs_rsv(trans);
- trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
- action == BTRFS_ADD_DELAYED_EXTENT ?
- BTRFS_ADD_DELAYED_REF : action);
+ if (generic_ref->type == BTRFS_REF_DATA)
+ trace_add_delayed_data_ref(trans->fs_info, node);
+ else
+ trace_add_delayed_tree_ref(trans->fs_info, node);
if (merged)
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
if (qrecord_inserted)
- btrfs_qgroup_trace_extent_post(trans, record);
-
+ return btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
/*
+ * Add a delayed tree ref. This does all of the accounting required to make sure
+ * the delayed ref is eventually processed before this transaction commits.
+ */
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
+ return add_delayed_ref(trans, generic_ref, extent_op, 0);
+}
+
+/*
* add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
*/
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
u64 reserved)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_data_ref *ref;
- struct btrfs_delayed_ref_head *head_ref;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_qgroup_extent_record *record = NULL;
- bool qrecord_inserted;
- int action = generic_ref->action;
- bool merged;
- u64 bytenr = generic_ref->bytenr;
- u64 num_bytes = generic_ref->len;
- u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.ref_root;
- u64 owner = generic_ref->data_ref.ino;
- u64 offset = generic_ref->data_ref.offset;
- u8 ref_type;
-
- ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
- ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- if (parent)
- ref_type = BTRFS_SHARED_DATA_REF_KEY;
- else
- ref_type = BTRFS_EXTENT_DATA_REF_KEY;
- init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- ref_root, action, ref_type);
- ref->root = ref_root;
- ref->parent = parent;
- ref->objectid = owner;
- ref->offset = offset;
-
-
- head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
- if (!head_ref) {
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
- return -ENOMEM;
- }
-
- if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
- record = kzalloc(sizeof(*record), GFP_NOFS);
- if (!record) {
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
- kmem_cache_free(btrfs_delayed_ref_head_cachep,
- head_ref);
- return -ENOMEM;
- }
- }
-
- init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
- reserved, action, true, false, generic_ref->owning_root);
- head_ref->extent_op = NULL;
-
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
-
- /*
- * insert both the head node and the new ref without dropping
- * the spin lock
- */
- head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted);
-
- merged = insert_delayed_ref(trans, head_ref, &ref->node);
- spin_unlock(&delayed_refs->lock);
-
- /*
- * Need to update the delayed_refs_rsv with any changes we may have
- * made.
- */
- btrfs_update_delayed_refs_rsv(trans);
-
- trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
- action == BTRFS_ADD_DELAYED_EXTENT ?
- BTRFS_ADD_DELAYED_REF : action);
- if (merged)
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
-
-
- if (qrecord_inserted)
- return btrfs_qgroup_trace_extent_post(trans, record);
- return 0;
+ ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action);
+ return add_delayed_ref(trans, generic_ref, NULL, reserved);
}
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
@@ -1195,13 +1119,18 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_ref generic_ref = {
+ .type = BTRFS_REF_METADATA,
+ .action = BTRFS_UPDATE_DELAYED_HEAD,
+ .bytenr = bytenr,
+ .num_bytes = num_bytes,
+ };
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
return -ENOMEM;
- init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, false, false, 0);
+ init_delayed_ref_head(head_ref, &generic_ref, NULL, 0);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -1220,6 +1149,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return 0;
}
+void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+ if (refcount_dec_and_test(&ref->refs)) {
+ WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, ref);
+ }
+}
+
/*
* This does a simple search for the head node for a given extent. Returns the
* head node if found, or NULL if not.
@@ -1235,38 +1172,21 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt
void __cold btrfs_delayed_ref_exit(void)
{
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
- kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
- kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+ kmem_cache_destroy(btrfs_delayed_ref_node_cachep);
kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}
int __init btrfs_delayed_ref_init(void)
{
- btrfs_delayed_ref_head_cachep = kmem_cache_create(
- "btrfs_delayed_ref_head",
- sizeof(struct btrfs_delayed_ref_head), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0);
if (!btrfs_delayed_ref_head_cachep)
goto fail;
- btrfs_delayed_tree_ref_cachep = kmem_cache_create(
- "btrfs_delayed_tree_ref",
- sizeof(struct btrfs_delayed_tree_ref), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_delayed_tree_ref_cachep)
- goto fail;
-
- btrfs_delayed_data_ref_cachep = kmem_cache_create(
- "btrfs_delayed_data_ref",
- sizeof(struct btrfs_delayed_data_ref), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_delayed_data_ref_cachep)
+ btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0);
+ if (!btrfs_delayed_ref_node_cachep)
goto fail;
- btrfs_delayed_extent_op_cachep = kmem_cache_create(
- "btrfs_delayed_extent_op",
- sizeof(struct btrfs_delayed_extent_op), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0);
if (!btrfs_delayed_extent_op_cachep)
goto fail;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 62d679d40f4f..04b180ebe1fe 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -6,7 +6,17 @@
#ifndef BTRFS_DELAYED_REF_H
#define BTRFS_DELAYED_REF_H
+#include <linux/types.h>
#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <uapi/linux/btrfs_tree.h>
+
+struct btrfs_trans_handle;
+struct btrfs_fs_info;
/* these are the possible values of struct btrfs_delayed_ref_node->action */
enum btrfs_delayed_ref_action {
@@ -20,6 +30,32 @@ enum btrfs_delayed_ref_action {
BTRFS_UPDATE_DELAYED_HEAD,
} __packed;
+struct btrfs_data_ref {
+ /* For EXTENT_DATA_REF */
+
+ /* Inode which refers to this data extent */
+ u64 objectid;
+
+ /*
+ * file_offset - extent_offset
+ *
+ * file_offset is the key.offset of the EXTENT_DATA key.
+ * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
+ */
+ u64 offset;
+};
+
+struct btrfs_tree_ref {
+ /*
+ * Level of this tree block.
+ *
+ * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
+ */
+ int level;
+
+ /* For non-skinny metadata, no special member needed */
+};
+
struct btrfs_delayed_ref_node {
struct rb_node ref_node;
/*
@@ -38,6 +74,15 @@ struct btrfs_delayed_ref_node {
/* seq number to keep track of insertion order */
u64 seq;
+ /* The ref_root for this ref */
+ u64 ref_root;
+
+ /*
+ * The parent for this ref, if this isn't set the ref_root is the
+ * reference owner.
+ */
+ u64 parent;
+
/* ref count on this data structure */
refcount_t refs;
@@ -54,6 +99,11 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
+
+ union {
+ struct btrfs_tree_ref tree_ref;
+ struct btrfs_data_ref data_ref;
+ };
};
struct btrfs_delayed_extent_op {
@@ -141,21 +191,6 @@ struct btrfs_delayed_ref_head {
bool processing;
};
-struct btrfs_delayed_tree_ref {
- struct btrfs_delayed_ref_node node;
- u64 root;
- u64 parent;
- int level;
-};
-
-struct btrfs_delayed_data_ref {
- struct btrfs_delayed_ref_node node;
- u64 root;
- u64 parent;
- u64 objectid;
- u64 offset;
-};
-
enum btrfs_delayed_ref_flags {
/* Indicate that we are flushing delayed refs for the commit */
BTRFS_DELAYED_REFS_FLUSHING,
@@ -204,42 +239,6 @@ enum btrfs_ref_type {
BTRFS_REF_LAST,
} __packed;
-struct btrfs_data_ref {
- /* For EXTENT_DATA_REF */
-
- /* Root which owns this data reference. */
- u64 ref_root;
-
- /* Inode which refers to this data extent */
- u64 ino;
-
- /*
- * file_offset - extent_offset
- *
- * file_offset is the key.offset of the EXTENT_DATA key.
- * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
- */
- u64 offset;
-};
-
-struct btrfs_tree_ref {
- /*
- * Level of this tree block
- *
- * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
- */
- int level;
-
- /*
- * Root which owns this tree block reference.
- *
- * For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
- */
- u64 ref_root;
-
- /* For non-skinny metadata, no special member needed */
-};
-
struct btrfs_ref {
enum btrfs_ref_type type;
enum btrfs_delayed_ref_action action;
@@ -257,9 +256,15 @@ struct btrfs_ref {
u64 real_root;
#endif
u64 bytenr;
- u64 len;
+ u64 num_bytes;
u64 owning_root;
+ /*
+ * The root that owns the reference for this reference, this will be set
+ * or ->parent will be set, depending on what type of reference this is.
+ */
+ u64 ref_root;
+
/* Bytenr of the parent tree block */
u64 parent;
union {
@@ -269,8 +274,7 @@ struct btrfs_ref {
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
-extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
-extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_ref_node_cachep;
extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
int __init btrfs_delayed_ref_init(void);
@@ -308,53 +312,10 @@ static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *
return btrfs_calc_metadata_size(fs_info, num_csum_items);
}
-static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
- int action, u64 bytenr, u64 len,
- u64 parent, u64 owning_root)
-{
- generic_ref->action = action;
- generic_ref->bytenr = bytenr;
- generic_ref->len = len;
- generic_ref->parent = parent;
- generic_ref->owning_root = owning_root;
-}
-
-static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level,
- u64 root, u64 mod_root, bool skip_qgroup)
-{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* If @real_root not set, use @root as fallback */
- generic_ref->real_root = mod_root ?: root;
-#endif
- generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.ref_root = root;
- generic_ref->type = BTRFS_REF_METADATA;
- if (skip_qgroup || !(is_fstree(root) &&
- (!mod_root || is_fstree(mod_root))))
- generic_ref->skip_qgroup = true;
- else
- generic_ref->skip_qgroup = false;
-
-}
-
-static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
- u64 ref_root, u64 ino, u64 offset, u64 mod_root,
- bool skip_qgroup)
-{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* If @real_root not set, use @root as fallback */
- generic_ref->real_root = mod_root ?: ref_root;
-#endif
- generic_ref->data_ref.ref_root = ref_root;
- generic_ref->data_ref.ino = ino;
- generic_ref->data_ref.offset = offset;
- generic_ref->type = BTRFS_REF_DATA;
- if (skip_qgroup || !(is_fstree(ref_root) &&
- (!mod_root || is_fstree(mod_root))))
- generic_ref->skip_qgroup = true;
- else
- generic_ref->skip_qgroup = false;
-}
+void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
+ bool skip_qgroup);
+void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
+ u64 mod_root, bool skip_qgroup);
static inline struct btrfs_delayed_extent_op *
btrfs_alloc_delayed_extent_op(void)
@@ -369,24 +330,7 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
}
-static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
-{
- if (refcount_dec_and_test(&ref->refs)) {
- WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
- switch (ref->type) {
- case BTRFS_TREE_BLOCK_REF_KEY:
- case BTRFS_SHARED_BLOCK_REF_KEY:
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
- break;
- case BTRFS_EXTENT_DATA_REF_KEY:
- case BTRFS_SHARED_DATA_REF_KEY:
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
- break;
- default:
- BUG();
- }
- }
-}
+void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref);
static inline u64 btrfs_ref_head_to_space_flags(
struct btrfs_delayed_ref_head *head_ref)
@@ -446,19 +390,39 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
u64 num_bytes);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
-/*
- * helper functions to cast a node into its container
- */
-static inline struct btrfs_delayed_tree_ref *
-btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
+{
+ if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY)
+ return node->data_ref.objectid;
+ return node->tree_ref.level;
+}
+
+static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
{
- return container_of(node, struct btrfs_delayed_tree_ref, node);
+ if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY)
+ return node->data_ref.offset;
+ return 0;
}
-static inline struct btrfs_delayed_data_ref *
-btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+static inline u8 btrfs_ref_type(struct btrfs_ref *ref)
{
- return container_of(node, struct btrfs_delayed_data_ref, node);
+ ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA);
+
+ if (ref->type == BTRFS_REF_DATA) {
+ if (ref->parent)
+ return BTRFS_SHARED_DATA_REF_KEY;
+ else
+ return BTRFS_EXTENT_DATA_REF_KEY;
+ } else {
+ if (ref->parent)
+ return BTRFS_SHARED_BLOCK_REF_KEY;
+ else
+ return BTRFS_TREE_BLOCK_REF_KEY;
+ }
+
+ return 0;
}
#endif
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f9544fda38e9..7696beec4c21 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -11,10 +11,8 @@
#include <linux/math64.h>
#include "misc.h"
#include "ctree.h"
-#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
-#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "dev-replace.h"
@@ -246,7 +244,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -257,13 +255,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return -EINVAL;
}
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev_handle);
+ return PTR_ERR(bdev_file);
}
- bdev = bdev_handle->bdev;
+ bdev = file_bdev(bdev_file);
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
btrfs_err(fs_info,
@@ -314,7 +312,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
- device->bdev_handle = bdev_handle;
+ device->bdev_file = bdev_file;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->dev_stats_valid = 1;
@@ -335,7 +333,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return ret;
}
@@ -550,8 +548,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
u64 physical)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 chunk_offset = cache->start;
int num_extents, cur_extent;
int i;
@@ -567,9 +564,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
}
spin_unlock(&cache->lock);
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- ASSERT(!IS_ERR(em));
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(!IS_ERR(map));
num_extents = 0;
cur_extent = 0;
@@ -583,7 +579,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
cur_extent = i;
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
if (num_extents > 1 && cur_extent < num_extents - 1) {
/*
@@ -727,6 +723,23 @@ leave:
return ret;
}
+static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
+{
+ if (args->start.srcdevid == 0) {
+ if (memchr(args->start.srcdev_name, 0,
+ sizeof(args->start.srcdev_name)) == NULL)
+ return -ENAMETOOLONG;
+ } else {
+ args->start.srcdev_name[0] = 0;
+ }
+
+ if (memchr(args->start.tgtdev_name, 0,
+ sizeof(args->start.tgtdev_name)) == NULL)
+ return -ENAMETOOLONG;
+
+ return 0;
+}
+
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
@@ -739,10 +752,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
default:
return -EINVAL;
}
-
- if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
- args->start.tgtdev_name[0] == '\0')
- return -EINVAL;
+ ret = btrfs_check_replace_dev_names(args);
+ if (ret < 0)
+ return ret;
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
args->start.srcdevid,
@@ -812,25 +824,23 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
u64 start = 0;
int i;
- write_lock(&em_tree->lock);
+ write_lock(&fs_info->mapping_tree_lock);
do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX);
+ if (!map)
break;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
+ start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
} while (start);
- write_unlock(&em_tree->lock);
+ write_unlock(&fs_info->mapping_tree_lock);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -988,8 +998,7 @@ error:
btrfs_sysfs_remove_device(src_device);
btrfs_sysfs_update_devid(tgt_device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
- btrfs_scratch_superblocks(fs_info, src_device->bdev,
- src_device->name->str);
+ btrfs_scratch_superblocks(fs_info, src_device);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 675082ccec89..23e480efe5e6 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -6,11 +6,15 @@
#ifndef BTRFS_DEV_REPLACE_H
#define BTRFS_DEV_REPLACE_H
+#include <linux/types.h>
+#include <linux/compiler_types.h>
+
struct btrfs_ioctl_dev_replace_args;
struct btrfs_fs_info;
struct btrfs_trans_handle;
struct btrfs_dev_replace;
struct btrfs_block_group;
+struct btrfs_device;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index e40a226373d7..00b3d83d7569 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -3,9 +3,15 @@
#ifndef BTRFS_DIR_ITEM_H
#define BTRFS_DIR_ITEM_H
+#include <linux/types.h>
#include <linux/crc32c.h>
struct fscrypt_str;
+struct btrfs_fs_info;
+struct btrfs_key;
+struct btrfs_path;
+struct btrfs_root;
+struct btrfs_trans_handle;
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const struct fscrypt_str *name);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 401ea09ae4b8..a91a8056758a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,7 +29,6 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
@@ -74,20 +73,37 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
- const int num_pages = num_extent_pages(buf);
- const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ int num_pages;
+ u32 first_page_part;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
+
+ if (buf->addr) {
+ /* Pages are contiguous, handle them as a big one. */
+ kaddr = buf->addr;
+ first_page_part = fs_info->nodesize;
+ num_pages = 1;
+ } else {
+ kaddr = folio_address(buf->folios[0]);
+ first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ num_pages = num_extent_pages(buf);
+ }
+
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
first_page_part - BTRFS_CSUM_SIZE);
+ /*
+ * Multiple single-page folios case would reach here.
+ *
+ * nodesize <= PAGE_SIZE and large folio all handled by above
+ * crypto_shash_update() already.
+ */
for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
- kaddr = page_address(buf->pages[i]);
+ kaddr = folio_address(buf->folios[i]);
crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
@@ -166,20 +182,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages = num_extent_pages(eb);
+ int num_folios = num_extent_folios(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
- u64 start = max_t(u64, eb->start, page_offset(p));
- u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ u64 start = max_t(u64, eb->start, folio_pos(folio));
+ u64 end = min_t(u64, eb->start + eb->len,
+ folio_pos(folio) + eb->folio_size);
u32 len = end - start;
ret = btrfs_repair_io_failure(fs_info, 0, start, len,
- start, p, offset_in_page(start), mirror_num);
+ start, folio, offset_in_folio(folio, start),
+ mirror_num);
if (ret)
break;
}
@@ -254,15 +272,20 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
return BLK_STS_IOERR;
- if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
- WARN_ON_ONCE(found_start != 0);
+ /*
+ * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
+ * checksum it but zero-out its content. This is done to preserve
+ * ordering of I/O without unnecessarily writing out data.
+ */
+ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
+ memzero_extent_buffer(eb, 0, eb->len);
return BLK_STS_OK;
}
if (WARN_ON_ONCE(found_start != eb->start))
return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start,
- eb->len)))
+ if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
+ eb->start, eb->len)))
return BLK_STS_IOERR;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
@@ -371,8 +394,8 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
}
csum_tree_block(eb, result);
- header_csum = page_address(eb->pages[0]) +
- get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
+ header_csum = folio_address(eb->folios[0]) +
+ get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
@@ -474,15 +497,15 @@ static int btree_migrate_folio(struct address_space *mapping,
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct btrfs_fs_info *fs_info;
int ret;
if (wbc->sync_mode == WB_SYNC_NONE) {
+ struct btrfs_fs_info *fs_info;
if (wbc->for_kupdate)
return 0;
- fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ fs_info = inode_to_fs_info(mapping->host);
/* this is a bit racy, but that's ok */
ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
BTRFS_DIRTY_METADATA_THRESH,
@@ -505,11 +528,12 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
struct extent_io_tree *tree;
- tree = &BTRFS_I(folio->mapping->host)->io_tree;
+
+ tree = &folio_to_inode(folio)->io_tree;
extent_invalidate_folio(tree, folio, offset);
btree_release_folio(folio, GFP_NOFS);
if (folio_get_private(folio)) {
- btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
+ btrfs_warn(folio_to_fs_info(folio),
"folio private not zero on folio %llu",
(unsigned long long)folio_pos(folio));
folio_detach_private(folio);
@@ -520,7 +544,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset,
static bool btree_dirty_folio(struct address_space *mapping,
struct folio *folio)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
struct btrfs_subpage_info *spi = fs_info->subpage_info;
struct btrfs_subpage *subpage;
struct extent_buffer *eb;
@@ -622,7 +646,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
- bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+ bool dummy = btrfs_is_testing(fs_info);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
@@ -639,7 +663,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+ xa_init(&root->delayed_nodes);
btrfs_init_root_block_rsv(root);
@@ -650,14 +674,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->reloc_dirty_list);
- INIT_LIST_HEAD(&root->logged_list[0]);
- INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
- spin_lock_init(&root->log_extents_lock[0]);
- spin_lock_init(&root->log_extents_lock[1]);
spin_lock_init(&root->qgroup_meta_rsv_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
@@ -755,7 +775,7 @@ int btrfs_global_root_insert(struct btrfs_root *root)
if (tmp) {
ret = -EEXIST;
btrfs_warn(fs_info, "global root %llu %llu already exists",
- root->root_key.objectid, root->root_key.offset);
+ btrfs_root_id(root), root->root_key.offset);
}
return ret;
}
@@ -991,7 +1011,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
}
log_root->last_trans = trans->transid;
- log_root->root_key.offset = root->root_key.objectid;
+ log_root->root_key.offset = btrfs_root_id(root);
inode_item = &log_root->root_item.inode;
btrfs_set_stack_inode_generation(inode_item, 1);
@@ -1055,15 +1075,15 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
* For real fs, and not log/reloc trees, root owner must
* match its root node owner
*/
- if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
- root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
- root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
- root->root_key.objectid != btrfs_header_owner(root->node)) {
+ if (!btrfs_is_testing(fs_info) &&
+ btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
+ btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_root_id(root) != btrfs_header_owner(root->node)) {
btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
- root->root_key.objectid, root->node->start,
+ btrfs_root_id(root), root->node->start,
btrfs_header_owner(root->node),
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -EUCLEAN;
goto fail;
}
@@ -1100,9 +1120,9 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
btrfs_drew_lock_init(&root->snapshot_lock);
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
!btrfs_is_data_reloc_root(root) &&
- is_fstree(root->root_key.objectid)) {
+ is_fstree(btrfs_root_id(root))) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1111,7 +1131,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
*/
- if (is_fstree(root->root_key.objectid) &&
+ if (is_fstree(btrfs_root_id(root)) &&
btrfs_root_refs(&root->root_item) > 0) {
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
@@ -1198,7 +1218,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_insert(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
root);
if (ret == 0) {
btrfs_grab_root(root);
@@ -1223,6 +1243,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "leaked root %s refcount %d",
btrfs_root_name(&root->root_key, buf),
refcount_read(&root->refs));
+ WARN_ON_ONCE(1);
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
btrfs_put_root(root);
@@ -1244,9 +1265,14 @@ static void free_global_roots(struct btrfs_fs_info *fs_info)
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
+ struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
+
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->ordered_bytes);
+ if (percpu_counter_initialized(em_counter))
+ ASSERT(percpu_counter_sum_positive(em_counter) == 0);
+ percpu_counter_destroy(em_counter);
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
btrfs_free_csum_hash(fs_info);
btrfs_free_stripe_hash_table(fs_info);
@@ -1286,12 +1312,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
*
* @objectid: root id
* @anon_dev: preallocated anonymous block device number for new roots,
- * pass 0 for new allocation.
+ * pass NULL for a new allocation.
* @check_ref: whether to check root item references. If true, return -ENOENT
* for orphan roots
*/
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev,
+ u64 objectid, dev_t *anon_dev,
bool check_ref)
{
struct btrfs_root *root;
@@ -1315,8 +1341,17 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
- /* Shouldn't get preallocated anon_dev for cached roots */
- ASSERT(!anon_dev);
+ /*
+ * Some other caller may have read out the newly inserted
+ * subvolume already (for things like backref walk etc). Not
+ * that common but still possible. In that case, we just need
+ * to free the anon_dev.
+ */
+ if (unlikely(anon_dev && *anon_dev)) {
+ free_anon_bdev(*anon_dev);
+ *anon_dev = 0;
+ }
+
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
btrfs_put_root(root);
return ERR_PTR(-ENOENT);
@@ -1336,7 +1371,7 @@ again:
goto fail;
}
- ret = btrfs_init_fs_root(root, anon_dev);
+ ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
if (ret)
goto fail;
@@ -1372,7 +1407,7 @@ fail:
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
* and once again by our caller.
*/
- if (anon_dev)
+ if (anon_dev && *anon_dev)
root->anon_dev = 0;
btrfs_put_root(root);
return ERR_PTR(ret);
@@ -1388,7 +1423,7 @@ fail:
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref)
{
- return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+ return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
}
/*
@@ -1396,11 +1431,11 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
* the anonymous block device id
*
* @objectid: tree objectid
- * @anon_dev: if zero, allocate a new anonymous block device or use the
- * parameter value
+ * @anon_dev: if NULL or pointing to 0, allocate a new anonymous block
+ * device, otherwise use the device number it points to
*/
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev)
+ u64 objectid, dev_t *anon_dev)
{
return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
@@ -2209,7 +2244,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
struct btrfs_key location;
int ret;
- BUG_ON(!fs_info->tree_root);
+ ASSERT(fs_info->tree_root);
ret = load_global_roots(tree_root);
if (ret)
@@ -2553,7 +2588,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev
struct btrfs_tree_parent_check check = {
.level = level,
.transid = gen,
- .owner_root = root->root_key.objectid
+ .owner_root = btrfs_root_id(root)
};
int ret = 0;
@@ -2618,9 +2653,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
*/
btrfs_set_super_log_root(sb, 0);
- /* We can't trust the free space cache either */
- btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
btrfs_warn(fs_info, "try to load backup roots slot %d", i);
ret = read_backup_root(fs_info, i);
backup_index = ret;
@@ -2724,7 +2756,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->allocated_ebs);
spin_lock_init(&fs_info->eb_leak_lock);
#endif
- extent_map_tree_init(&fs_info->mapping_tree);
+ fs_info->mapping_tree = RB_ROOT_CACHED;
+ rwlock_init(&fs_info->mapping_tree_lock);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2794,6 +2827,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
+ /* Default compress algorithm when user does -o compress */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2808,6 +2844,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
int ret;
fs_info->sb = sb;
+ /* Temporary fixed values for block size until we read the superblock. */
sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
@@ -2815,6 +2852,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
+ ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -2897,7 +2938,7 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->fs_roots_radix_lock);
break;
}
- root_objectid = gang[ret - 1]->root_key.objectid + 1;
+ root_objectid = btrfs_root_id(gang[ret - 1]) + 1;
for (i = 0; i < ret; i++) {
/* Avoid grabbing roots in dead_roots. */
@@ -2913,7 +2954,7 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++) {
if (!gang[i])
continue;
- root_objectid = gang[i]->root_key.objectid;
+ root_objectid = btrfs_root_id(gang[i]);
err = btrfs_orphan_cleanup(gang[i]);
if (err)
goto out;
@@ -2931,17 +2972,6 @@ out:
}
/*
- * Some options only have meaning at mount time and shouldn't persist across
- * remounts, or be displayed. Clear these at the end of mount and remount
- * code paths.
- */
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
-{
- btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
- btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
-}
-
-/*
* Mounting logic specific to read-write file systems. Shared by open_ctree
* and btrfs_remount when remounting from read-only to read-write.
*/
@@ -2953,7 +2983,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- rebuild_free_space_tree = true;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ btrfs_warn(fs_info,
+ "'clear_cache' option is ignored with extent tree v2");
+ else
+ rebuild_free_space_tree = true;
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
btrfs_warn(fs_info, "free space tree is invalid");
@@ -3213,6 +3247,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
+ btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
/*
* Verify the type first, if that or the checksum value are
* corrupted, we'll find out
@@ -3275,13 +3310,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
- /*
- * In the long term, we'll store the compression type in the super
- * block, and it'll be used for per file compression control.
- */
- fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
-
-
/* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
@@ -3295,28 +3323,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
- ret = btrfs_parse_options(fs_info, options, sb->s_flags);
- if (ret)
+ /*
+ * Handle the space caching options appropriately now that we have the
+ * super block loaded and validated.
+ */
+ btrfs_set_free_space_cache_settings(fs_info);
+
+ if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
+ ret = -EINVAL;
goto fail_alloc;
+ }
ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
if (ret < 0)
goto fail_alloc;
+ /*
+ * At this point our mount options are validated. If ->max_inline was set
+ * to something non-standard, make sure we truncate it to sectorsize.
+ */
+ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
- /*
- * V1 space cache has some hardcoded PAGE_SIZE usage, and is
- * going to be deprecated.
- *
- * Force to use v2 cache for subpage case.
- */
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
- "forcing free space tree for sector size %u with page size %lu",
- sectorsize, PAGE_SIZE);
-
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
@@ -3336,6 +3366,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
+ /* Update the values for the current filesystem. */
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -3493,29 +3524,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_cleaner;
}
- if (!btrfs_test_opt(fs_info, NOSSD) &&
- !fs_info->fs_devices->rotating) {
- btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
- }
-
- /*
- * For devices supporting discard turn on discard=async automatically,
- * unless it's already set or disabled. This could be turned off by
- * nodiscard for the same mount.
- *
- * The zoned mode piggy backs on the discard functionality for
- * resetting a zone. There is no reason to delay the zone reset as it is
- * fast enough. So, do not enable async discard for zoned mode.
- */
- if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
- btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
- btrfs_test_opt(fs_info, NODISCARD)) &&
- fs_info->fs_devices->discardable &&
- !btrfs_is_zoned(fs_info)) {
- btrfs_set_and_info(fs_info, DISCARD_ASYNC,
- "auto enabling async discard");
- }
-
ret = btrfs_read_qgroup_config(fs_info);
if (ret)
goto fail_trans_kthread;
@@ -3541,7 +3549,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
if (sb_rdonly(sb))
- goto clear_oneshot;
+ return 0;
ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
@@ -3569,8 +3577,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
-clear_oneshot:
- btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
@@ -3607,7 +3613,7 @@ fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
fail_alloc:
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
iput(fs_info->btree_inode);
fail:
@@ -3620,28 +3626,25 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
static void btrfs_end_super_write(struct bio *bio)
{
struct btrfs_device *device = bio->bi_private;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
- struct page *page;
-
- bio_for_each_segment_all(bvec, bio, iter_all) {
- page = bvec->bv_page;
+ struct folio_iter fi;
+ bio_for_each_folio_all(fi, bio) {
if (bio->bi_status) {
btrfs_warn_rl_in_rcu(device->fs_info,
- "lost page write due to IO error on %s (%d)",
+ "lost super block write due to IO error on %s (%d)",
btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
- ClearPageUptodate(page);
- SetPageError(page);
btrfs_dev_stat_inc_and_print(device,
BTRFS_DEV_STAT_WRITE_ERRS);
- } else {
- SetPageUptodate(page);
+ /* Ensure failure if the primary sb fails. */
+ if (bio->bi_opf & REQ_FUA)
+ atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR,
+ &device->sb_write_errors);
+ else
+ atomic_inc(&device->sb_write_errors);
}
-
- put_page(page);
- unlock_page(page);
+ folio_unlock(fi.folio);
+ folio_put(fi.folio);
}
bio_put(bio);
@@ -3728,13 +3731,13 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
- * pages we use for writing are locked.
+ * folios we use for writing are locked.
*
* Write @max_mirrors copies of the superblock, where 0 means default that fit
* the expected device size at commit time. Note that max_mirrors must be
* same for write and wait phases.
*
- * Return number of errors when page is not found or submission fails.
+ * Return number of errors when folio is not found or submission fails.
*/
static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
@@ -3743,19 +3746,21 @@ static int write_dev_supers(struct btrfs_device *device,
struct address_space *mapping = device->bdev->bd_inode->i_mapping;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
int i;
- int errors = 0;
int ret;
u64 bytenr, bytenr_orig;
+ atomic_set(&device->sb_write_errors, 0);
+
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
shash->tfm = fs_info->csum_shash;
for (i = 0; i < max_mirrors; i++) {
- struct page *page;
+ struct folio *folio;
struct bio *bio;
struct btrfs_super_block *disk_super;
+ size_t offset;
bytenr_orig = btrfs_sb_offset(i);
ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
@@ -3765,7 +3770,7 @@ static int write_dev_supers(struct btrfs_device *device,
btrfs_err(device->fs_info,
"couldn't get super block location for mirror %d",
i);
- errors++;
+ atomic_inc(&device->sb_write_errors);
continue;
}
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
@@ -3778,20 +3783,20 @@ static int write_dev_supers(struct btrfs_device *device,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
sb->csum);
- page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
- GFP_NOFS);
- if (!page) {
+ folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ GFP_NOFS);
+ if (IS_ERR(folio)) {
btrfs_err(device->fs_info,
"couldn't get super block page for bytenr %llu",
bytenr);
- errors++;
+ atomic_inc(&device->sb_write_errors);
continue;
}
+ ASSERT(folio_order(folio) == 0);
- /* Bump the refcount for wait_dev_supers() */
- get_page(page);
-
- disk_super = page_address(page);
+ offset = offset_in_folio(folio, bytenr);
+ disk_super = folio_address(folio) + offset;
memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
/*
@@ -3805,8 +3810,7 @@ static int write_dev_supers(struct btrfs_device *device,
bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
bio->bi_private = device;
bio->bi_end_io = btrfs_end_super_write;
- __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
- offset_in_page(bytenr));
+ bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset);
/*
* We FUA only the first super block. The others we allow to
@@ -3818,17 +3822,17 @@ static int write_dev_supers(struct btrfs_device *device,
submit_bio(bio);
if (btrfs_advance_sb_log(device, i))
- errors++;
+ atomic_inc(&device->sb_write_errors);
}
- return errors < i ? 0 : -1;
+ return atomic_read(&device->sb_write_errors) < i ? 0 : -1;
}
/*
* Wait for write completion of superblocks done by write_dev_supers,
* @max_mirrors same for write and wait phases.
*
- * Return number of errors when page is not found or not marked up to
- * date.
+ * Return -1 if primary super block write failed or when there were no super block
+ * copies written. Otherwise 0.
*/
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
@@ -3842,7 +3846,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
for (i = 0; i < max_mirrors; i++) {
- struct page *page;
+ struct folio *folio;
ret = btrfs_sb_log_location(device, i, READ, &bytenr);
if (ret == -ENOENT) {
@@ -3857,30 +3861,21 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
device->commit_total_bytes)
break;
- page = find_get_page(device->bdev->bd_inode->i_mapping,
- bytenr >> PAGE_SHIFT);
- if (!page) {
- errors++;
- if (i == 0)
- primary_failed = true;
+ folio = filemap_get_folio(device->bdev->bd_inode->i_mapping,
+ bytenr >> PAGE_SHIFT);
+ /* If the folio has been removed, then we know it completed. */
+ if (IS_ERR(folio))
continue;
- }
- /* Page is submitted locked and unlocked once the IO completes */
- wait_on_page_locked(page);
- if (PageError(page)) {
- errors++;
- if (i == 0)
- primary_failed = true;
- }
-
- /* Drop our reference */
- put_page(page);
+ ASSERT(folio_order(folio) == 0);
- /* Drop the reference from the writing run */
- put_page(page);
+ /* Folio will be unlocked once the write completes. */
+ folio_wait_locked(folio);
+ folio_put(folio);
}
- /* log error, force error return */
+ errors += atomic_read(&device->sb_write_errors);
+ if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR)
+ primary_failed = true;
if (primary_failed) {
btrfs_err(device->fs_info, "error writing primary super block to device %llu",
device->devid);
@@ -4141,7 +4136,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid);
+ (unsigned long)btrfs_root_id(root));
if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -4184,9 +4179,6 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
struct btrfs_transaction *tmp;
bool found = false;
- if (list_empty(&fs_info->trans_list))
- return;
-
/*
* This function is only called at the very end of close_ctree(),
* thus no other running transaction, no need to take trans_lock.
@@ -4390,7 +4382,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
btrfs_close_devices(fs_info->fs_devices);
}
@@ -4486,7 +4478,7 @@ static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++) {
if (!gang[i])
continue;
- root_objectid = gang[i]->root_key.objectid;
+ root_objectid = btrfs_root_id(gang[i]);
btrfs_free_log(NULL, gang[i]);
btrfs_put_root(gang[i]);
}
@@ -4631,7 +4623,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
struct inode *inode = NULL;
btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
delalloc_inodes);
- __btrfs_del_delalloc_inode(root, btrfs_inode);
+ btrfs_del_delalloc_inode(btrfs_inode);
spin_unlock(&root->delalloc_lock);
/*
@@ -4798,6 +4790,32 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
}
}
+static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *gang[8];
+ int i;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ struct btrfs_root *root = gang[i];
+
+ btrfs_qgroup_free_meta_all_pertrans(root);
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)btrfs_root_id(root),
+ BTRFS_ROOT_TRANS_TAG);
+ }
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+}
+
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_fs_info *fs_info)
{
@@ -4820,8 +4838,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&fs_info->transaction_wait);
- btrfs_destroy_delayed_inodes(fs_info);
-
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
@@ -4878,6 +4894,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
btrfs_assert_delayed_root_empty(fs_info);
btrfs_destroy_all_delalloc_inodes(fs_info);
btrfs_drop_all_logs(fs_info);
+ btrfs_free_all_qgroup_pertrans(fs_info);
mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
@@ -4902,7 +4919,14 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist a root
+ * with such id, but this is out of valid range.
+ */
+ ret = -EUCLEAN;
+ goto error;
+ }
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
l = path->nodes[0];
@@ -4926,7 +4950,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
btrfs_warn(root->fs_info,
"the objectid of root %llu reaches its highest value",
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -ENOSPC;
goto out;
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 50dab8f639dc..76eb53fe7a11 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -6,6 +6,22 @@
#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H
+#include <linux/sizes.h>
+#include <linux/compiler_types.h>
+#include "ctree.h"
+#include "fs.h"
+
+struct block_device;
+struct super_block;
+struct extent_buffer;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_fs_info;
+struct btrfs_super_block;
+struct btrfs_trans_handle;
+struct btrfs_tree_parent_check;
+struct btrfs_transaction;
+
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
@@ -25,10 +41,6 @@ static inline u64 btrfs_sb_offset(int mirror)
return BTRFS_SUPER_INFO_OFFSET;
}
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_tree_parent_check;
-
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
@@ -37,9 +49,6 @@ struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root,
int level);
-void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
- struct extent_buffer *buf);
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
@@ -64,7 +73,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev);
+ u64 objectid, dev_t *anon_dev);
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
u64 objectid);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 744a02b7fd67..9e81f89e76d8 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -5,7 +5,6 @@
#include "ctree.h"
#include "disk-io.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "export.h"
#include "accessors.h"
#include "super.h"
@@ -35,7 +34,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
type = FILEID_BTRFS_WITHOUT_PARENT;
fid->objectid = btrfs_ino(BTRFS_I(inode));
- fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid;
+ fid->root_objectid = btrfs_root_id(BTRFS_I(inode)->root);
fid->gen = inode->i_generation;
if (parent) {
@@ -43,7 +42,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
fid->parent_objectid = BTRFS_I(parent)->location.objectid;
fid->parent_gen = parent->i_generation;
- parent_root_id = BTRFS_I(parent)->root->root_key.objectid;
+ parent_root_id = btrfs_root_id(BTRFS_I(parent)->root);
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
@@ -161,7 +160,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
return ERR_PTR(-ENOMEM);
if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) {
- key.objectid = root->root_key.objectid;
+ key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
@@ -174,8 +173,15 @@ struct dentry *btrfs_get_parent(struct dentry *child)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto fail;
+ if (ret == 0) {
+ /*
+ * Key with offset of -1 found, there would have to exist an
+ * inode with such number or a root with such id.
+ */
+ ret = -EUCLEAN;
+ goto fail;
+ }
- BUG_ON(ret == 0); /* Key with offset of -1 found */
if (path->slots[0] == 0) {
ret = -ENOENT;
goto fail;
@@ -215,7 +221,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
{
struct inode *inode = d_inode(child);
struct inode *dir = d_inode(parent);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_inode_ref *iref;
@@ -237,7 +243,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
return -ENOMEM;
if (ino == BTRFS_FIRST_FREE_OBJECTID) {
- key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index eba6bc4f5a61..464582273af9 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -4,6 +4,10 @@
#define BTRFS_EXPORT_H
#include <linux/exportfs.h>
+#include <linux/types.h>
+
+struct dentry;
+struct super_block;
extern const struct export_operations btrfs_export_ops;
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ea149be28dff..ed2cfc3d5d8a 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -6,7 +6,6 @@
#include "ctree.h"
#include "extent-io-tree.h"
#include "btrfs_inode.h"
-#include "misc.h"
static struct kmem_cache *extent_state_cache;
@@ -48,6 +47,7 @@ static inline void btrfs_extent_state_leak_debug_check(void)
extent_state_in_tree(state),
refcount_read(&state->refs));
list_del(&state->leak_list);
+ WARN_ON_ONCE(1);
kmem_cache_free(extent_state_cache, state);
}
}
@@ -58,12 +58,13 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree,
u64 start, u64 end)
{
- struct btrfs_inode *inode = tree->inode;
+ const struct btrfs_inode *inode;
u64 isize;
- if (!inode)
+ if (tree->owner != IO_TREE_INODE_IO)
return;
+ inode = extent_io_tree_to_inode_const(tree);
isize = i_size_read(&inode->vfs_inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
btrfs_debug_rl(inode->root->fs_info,
@@ -78,31 +79,46 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
+
/*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc. These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
+ * The only tree allowed to set the inode is IO_TREE_INODE_IO.
*/
-static struct lock_class_key file_extent_tree_class;
+static bool is_inode_io_tree(const struct extent_io_tree *tree)
+{
+ return tree->owner == IO_TREE_INODE_IO;
+}
-struct tree_entry {
- u64 start;
- u64 end;
- struct rb_node rb_node;
-};
+/* Return the inode if it's valid for the given tree, otherwise NULL. */
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode;
+ return NULL;
+}
+
+/* Read-only access to the inode. */
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode;
+ return NULL;
+}
+
+/* For read-only access to fs_info. */
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode->root->fs_info;
+ return tree->fs_info;
+}
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner)
{
- tree->fs_info = fs_info;
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
- tree->inode = NULL;
+ tree->fs_info = fs_info;
tree->owner = owner;
- if (owner == IO_TREE_INODE_FILE_EXTENT)
- lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
/*
@@ -329,10 +345,14 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+static void extent_io_tree_panic(const struct extent_io_tree *tree,
+ const struct extent_state *state,
+ const char *opname,
+ int err)
{
- btrfs_panic(tree->fs_info, err,
- "locking error: extent tree was modified by another thread while locked");
+ btrfs_panic(extent_io_tree_to_fs_info(tree), err,
+ "extent io tree error on %s state start %llu end %llu",
+ opname, state->start, state->end);
}
static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
@@ -341,8 +361,9 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s
prev = prev_state(state);
if (prev && prev->end == state->start - 1 && prev->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, prev);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+ state, prev);
state->start = prev->start;
rb_erase(&prev->rb_node, &tree->state);
RB_CLEAR_NODE(&prev->rb_node);
@@ -356,8 +377,9 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s
next = next_state(state);
if (next && next->start == state->end + 1 && next->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, next);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+ state, next);
state->end = next->end;
rb_erase(&next->rb_node, &tree->state);
RB_CLEAR_NODE(&next->rb_node);
@@ -390,8 +412,8 @@ static void set_state_bits(struct extent_io_tree *tree,
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->inode)
- btrfs_set_delalloc_extent(tree->inode, state, bits);
+ if (is_inode_io_tree(tree))
+ btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0);
@@ -436,9 +458,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
if (state->end < entry->start) {
if (try_merge && end == entry->start &&
state->state == entry->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode,
- state, entry);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(
+ extent_io_tree_to_inode(tree),
+ state, entry);
entry->start = state->start;
merge_prev_state(tree, entry);
state->state = 0;
@@ -448,9 +471,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
} else if (state->end > entry->end) {
if (try_merge && entry->end == start &&
state->state == entry->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode,
- state, entry);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(
+ extent_io_tree_to_inode(tree),
+ state, entry);
entry->end = state->end;
merge_next_state(tree, entry);
state->state = 0;
@@ -458,9 +482,6 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
}
node = &(*node)->rb_right;
} else {
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- entry->start, entry->end, state->start, state->end);
return ERR_PTR(-EEXIST);
}
}
@@ -505,8 +526,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct rb_node *parent = NULL;
struct rb_node **node;
- if (tree->inode)
- btrfs_split_delalloc_extent(tree->inode, orig, split);
+ if (is_inode_io_tree(tree))
+ btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
+ split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -553,8 +575,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->inode)
- btrfs_clear_delalloc_extent(tree->inode, state, bits);
+ if (is_inode_io_tree(tree))
+ btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
+ bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
@@ -695,7 +718,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
@@ -717,7 +740,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
if (wake)
wake_up(&state->wq);
@@ -939,6 +962,8 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state *state;
int ret = 1;
+ ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));
+
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
if (state) {
@@ -1034,7 +1059,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *prealloc = NULL;
struct rb_node **p = NULL;
struct rb_node *parent = NULL;
- int err = 0;
+ int ret = 0;
u64 last_start;
u64 last_end;
u32 exclusive_bits = (bits & EXTENT_LOCKED);
@@ -1097,7 +1122,7 @@ hit_next:
if (state->state & exclusive_bits) {
*failed_start = state->start;
cache_state(state, failed_state);
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
@@ -1133,7 +1158,7 @@ hit_next:
if (state->state & exclusive_bits) {
*failed_start = start;
cache_state(state, failed_state);
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
@@ -1150,12 +1175,12 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
+ ret = split_state(tree, state, prealloc, start);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
prealloc = NULL;
- if (err)
+ if (ret)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, bits, changeset);
@@ -1199,8 +1224,8 @@ hit_next:
prealloc->end = this_end;
inserted_state = insert_state(tree, prealloc, bits, changeset);
if (IS_ERR(inserted_state)) {
- err = PTR_ERR(inserted_state);
- extent_io_tree_panic(tree, err);
+ ret = PTR_ERR(inserted_state);
+ extent_io_tree_panic(tree, prealloc, "insert", ret);
}
cache_state(inserted_state, cached_state);
@@ -1219,16 +1244,16 @@ hit_next:
if (state->state & exclusive_bits) {
*failed_start = start;
cache_state(state, failed_state);
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
+ ret = split_state(tree, state, prealloc, end + 1);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
@@ -1250,7 +1275,7 @@ out:
if (prealloc)
free_extent_state(prealloc);
- return err;
+ return ret;
}
@@ -1287,7 +1312,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *prealloc = NULL;
struct rb_node **p = NULL;
struct rb_node *parent = NULL;
- int err = 0;
+ int ret = 0;
u64 last_start;
u64 last_end;
bool first_iteration = true;
@@ -1326,7 +1351,7 @@ again:
if (!state) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
prealloc->start = start;
@@ -1377,14 +1402,14 @@ hit_next:
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
+ ret = split_state(tree, state, prealloc, start);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
prealloc = NULL;
- if (err)
+ if (ret)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, bits, NULL);
@@ -1417,7 +1442,7 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -1429,8 +1454,8 @@ hit_next:
prealloc->end = this_end;
inserted_state = insert_state(tree, prealloc, bits, NULL);
if (IS_ERR(inserted_state)) {
- err = PTR_ERR(inserted_state);
- extent_io_tree_panic(tree, err);
+ ret = PTR_ERR(inserted_state);
+ extent_io_tree_panic(tree, prealloc, "insert", ret);
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
@@ -1447,13 +1472,13 @@ hit_next:
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
+ ret = split_state(tree, state, prealloc, end + 1);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
@@ -1475,7 +1500,7 @@ out:
if (prealloc)
free_extent_state(prealloc);
- return err;
+ return ret;
}
/*
@@ -1858,8 +1883,8 @@ void __cold extent_state_free_cachep(void)
int __init extent_state_init_cachep(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
- sizeof(struct extent_state), 0,
- SLAB_MEM_SPREAD, NULL);
+ sizeof(struct extent_state), 0, 0,
+ NULL);
if (!extent_state_cache)
return -ENOMEM;
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 5602b0137fcd..9d3a52d8f59a 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -3,9 +3,16 @@
#ifndef BTRFS_EXTENT_IO_TREE_H
#define BTRFS_EXTENT_IO_TREE_H
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/wait.h>
#include "misc.h"
struct extent_changeset;
+struct btrfs_fs_info;
+struct btrfs_inode;
/* Bits for the extent state */
enum {
@@ -87,9 +94,17 @@ enum {
struct extent_io_tree {
struct rb_root state;
- struct btrfs_fs_info *fs_info;
- /* Inode associated with this tree, or NULL. */
- struct btrfs_inode *inode;
+ /*
+ * The fs_info is needed for trace points, a tree attached to an inode
+ * needs the inode.
+ *
+ * owner == IO_TREE_INODE_IO - then inode is valid and fs_info can be
+ * accessed as inode->root->fs_info
+ */
+ union {
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_inode *inode;
+ };
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -112,6 +127,10 @@ struct extent_state {
#endif
};
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree);
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree);
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
+
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner);
void extent_io_tree_release(struct extent_io_tree *tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c8e5b4715b49..47d48233b592 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -18,7 +18,7 @@
#include <linux/crc32c.h>
#include "ctree.h"
#include "extent-tree.h"
-#include "tree-log.h"
+#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
@@ -26,14 +26,11 @@
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
-#include "delalloc-space.h"
#include "discard.h"
-#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"
#include "fs.h"
@@ -49,9 +46,7 @@
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node, u64 parent,
- u64 root_objectid, u64 owner_objectid,
- u64 owner_offset,
+ struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
@@ -102,7 +97,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
*/
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags)
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owning_root)
{
struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
@@ -114,6 +110,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u32 item_size;
u64 num_refs;
u64 extent_flags;
+ u64 owner = 0;
int ret;
/*
@@ -167,6 +164,8 @@ search_again:
struct btrfs_extent_item);
num_refs = btrfs_extent_refs(leaf, ei);
extent_flags = btrfs_extent_flags(leaf, ei);
+ owner = btrfs_get_extent_owner_root(fs_info, leaf,
+ path->slots[0]);
} else {
ret = -EUCLEAN;
btrfs_err(fs_info,
@@ -226,6 +225,8 @@ out:
*refs = num_refs;
if (flags)
*flags = extent_flags;
+ if (owning_root)
+ *owning_root = owner;
out_free:
btrfs_free_path(path);
return ret;
@@ -445,9 +446,8 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_extent_data_ref *ref;
struct extent_buffer *leaf;
u32 nritems;
- int ret;
int recow;
- int err = -ENOENT;
+ int ret;
key.objectid = bytenr;
if (parent) {
@@ -461,26 +461,26 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
again:
recow = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0) {
- err = ret;
- goto fail;
- }
+ if (ret < 0)
+ return ret;
if (parent) {
- if (!ret)
- return 0;
- goto fail;
+ if (ret)
+ return -ENOENT;
+ return 0;
}
+ ret = -ENOENT;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- err = ret;
- if (ret)
- goto fail;
+ if (ret) {
+ if (ret > 0)
+ return -ENOENT;
+ return ret;
+ }
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -501,37 +501,37 @@ again:
btrfs_release_path(path);
goto again;
}
- err = 0;
+ ret = 0;
break;
}
path->slots[0]++;
}
fail:
- return err;
+ return ret;
}
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- u64 bytenr, u64 parent,
- u64 root_objectid, u64 owner,
- u64 offset, int refs_to_add)
+ struct btrfs_delayed_ref_node *node,
+ u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
struct extent_buffer *leaf;
+ u64 owner = btrfs_delayed_ref_owner(node);
+ u64 offset = btrfs_delayed_ref_offset(node);
u32 size;
u32 num_refs;
int ret;
key.objectid = bytenr;
- if (parent) {
+ if (node->parent) {
key.type = BTRFS_SHARED_DATA_REF_KEY;
- key.offset = parent;
+ key.offset = node->parent;
size = sizeof(struct btrfs_shared_data_ref);
} else {
key.type = BTRFS_EXTENT_DATA_REF_KEY;
- key.offset = hash_extent_data_ref(root_objectid,
- owner, offset);
+ key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
size = sizeof(struct btrfs_extent_data_ref);
}
@@ -540,15 +540,15 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
goto fail;
leaf = path->nodes[0];
- if (parent) {
+ if (node->parent) {
struct btrfs_shared_data_ref *ref;
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_shared_data_ref);
if (ret == 0) {
- btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
+ btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
} else {
num_refs = btrfs_shared_data_ref_count(leaf, ref);
- num_refs += refs_to_add;
+ num_refs += node->ref_mod;
btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
}
} else {
@@ -556,7 +556,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
while (ret == -EEXIST) {
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
- if (match_extent_data_ref(leaf, ref, root_objectid,
+ if (match_extent_data_ref(leaf, ref, node->ref_root,
owner, offset))
break;
btrfs_release_path(path);
@@ -571,14 +571,13 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
if (ret == 0) {
- btrfs_set_extent_data_ref_root(leaf, ref,
- root_objectid);
+ btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
btrfs_set_extent_data_ref_offset(leaf, ref, offset);
- btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
+ btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
} else {
num_refs = btrfs_extent_data_ref_count(leaf, ref);
- num_refs += refs_to_add;
+ num_refs += node->ref_mod;
btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
}
}
@@ -702,20 +701,20 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- u64 bytenr, u64 parent,
- u64 root_objectid)
+ struct btrfs_delayed_ref_node *node,
+ u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
int ret;
key.objectid = bytenr;
- if (parent) {
+ if (node->parent) {
key.type = BTRFS_SHARED_BLOCK_REF_KEY;
- key.offset = parent;
+ key.offset = node->parent;
} else {
key.type = BTRFS_TREE_BLOCK_REF_KEY;
- key.offset = root_objectid;
+ key.offset = node->ref_root;
}
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -1254,7 +1253,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
u64 bytes_left, end;
u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
- if (WARN_ON(start != aligned_start)) {
+ /* Adjust the range to be aligned to 512B sectors if necessary. */
+ if (start != aligned_start) {
len -= aligned_start - start;
len = round_down(len, 1 << SECTOR_SHIFT);
start = aligned_start;
@@ -1435,7 +1435,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -1458,34 +1458,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
* @node: The delayed ref node used to get the bytenr/length for
* extent whose references are incremented.
*
- * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
- * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
- * bytenr of the parent block. Since new extents are always
- * created with indirect references, this will only be the case
- * when relocating a shared extent. In that case, root_objectid
- * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
- * be 0
- *
- * @root_objectid: The id of the root where this modification has originated,
- * this can be either one of the well-known metadata trees or
- * the subvolume id which references this extent.
- *
- * @owner: For data extents it is the inode number of the owning file.
- * For metadata extents this parameter holds the level in the
- * tree of the extent.
- *
- * @offset: For metadata extents the offset is ignored and is currently
- * always passed as 0. For data extents it is the fileoffset
- * this extent belongs to.
- *
* @extent_op Pointer to a structure, holding information necessary when
* updating a tree block's flags
*
*/
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
- u64 parent, u64 root_objectid,
- u64 owner, u64 offset,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_path *path;
@@ -1494,6 +1472,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_key key;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
+ u64 owner = btrfs_delayed_ref_owner(node);
+ u64 offset = btrfs_delayed_ref_offset(node);
u64 refs;
int refs_to_add = node->ref_mod;
int ret;
@@ -1504,7 +1484,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
- parent, root_objectid, owner,
+ node->parent, node->ref_root, owner,
offset, refs_to_add, extent_op);
if ((ret < 0 && ret != -EAGAIN) || !ret)
goto out;
@@ -1527,12 +1507,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/* now insert the actual backref */
if (owner < BTRFS_FIRST_FREE_OBJECTID)
- ret = insert_tree_block_ref(trans, path, bytenr, parent,
- root_objectid);
+ ret = insert_tree_block_ref(trans, path, node, bytenr);
else
- ret = insert_extent_data_ref(trans, path, bytenr, parent,
- root_objectid, owner, offset,
- refs_to_add);
+ ret = insert_extent_data_ref(trans, path, node, bytenr);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -1541,6 +1518,23 @@ out:
return ret;
}
+static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_head *href)
+{
+ u64 root = href->owning_root;
+
+ /*
+ * Don't check must_insert_reserved, as this is called from contexts
+ * where it has already been unset.
+ */
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
+ !href->is_data || !is_fstree(root))
+ return;
+
+ btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
+ BTRFS_QGROUP_RSV_DATA);
+}
+
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
@@ -1548,26 +1542,25 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
bool insert_reserved)
{
int ret = 0;
- struct btrfs_delayed_data_ref *ref;
u64 parent = 0;
u64 flags = 0;
- ref = btrfs_delayed_node_to_data_ref(node);
- trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
+ trace_run_delayed_data_ref(trans->fs_info, node);
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
- parent = ref->parent;
+ parent = node->parent;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
struct btrfs_key key;
struct btrfs_squota_delta delta = {
.root = href->owning_root,
.num_bytes = node->num_bytes,
- .rsv_bytes = href->reserved_bytes,
.is_data = true,
.is_inc = true,
.generation = trans->transid,
};
+ u64 owner = btrfs_delayed_ref_owner(node);
+ u64 offset = btrfs_delayed_ref_offset(node);
if (extent_op)
flags |= extent_op->flags_to_set;
@@ -1576,23 +1569,17 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = node->num_bytes;
- ret = alloc_reserved_file_extent(trans, parent, ref->root,
- flags, ref->objectid,
- ref->offset, &key,
- node->ref_mod, href->owning_root);
+ ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
+ flags, owner, offset, &key,
+ node->ref_mod,
+ href->owning_root);
+ free_head_ref_squota_rsv(trans->fs_info, href);
if (!ret)
ret = btrfs_record_squota_delta(trans->fs_info, &delta);
- else
- btrfs_qgroup_free_refroot(trans->fs_info, delta.root,
- delta.rsv_bytes, BTRFS_QGROUP_RSV_DATA);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root,
- ref->objectid, ref->offset,
- extent_op);
+ ret = __btrfs_inc_extent_ref(trans, node, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, href, node, parent,
- ref->root, ref->objectid,
- ref->offset, extent_op);
+ ret = __btrfs_free_extent(trans, href, node, extent_op);
} else {
BUG();
}
@@ -1714,16 +1701,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
{
int ret = 0;
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_tree_ref *ref;
u64 parent = 0;
u64 ref_root = 0;
- ref = btrfs_delayed_node_to_tree_ref(node);
- trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
+ trace_run_delayed_tree_ref(trans->fs_info, node);
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
- parent = ref->parent;
- ref_root = ref->root;
+ parent = node->parent;
+ ref_root = node->ref_root;
if (unlikely(node->ref_mod != 1)) {
btrfs_err(trans->fs_info,
@@ -1736,7 +1721,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_squota_delta delta = {
.root = href->owning_root,
.num_bytes = fs_info->nodesize,
- .rsv_bytes = 0,
.is_data = false,
.is_inc = true,
.generation = trans->transid,
@@ -1747,11 +1731,9 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
if (!ret)
btrfs_record_squota_delta(fs_info, &delta);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
- ref->level, 0, extent_op);
+ ret = __btrfs_inc_extent_ref(trans, node, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, href, node, parent, ref_root,
- ref->level, 0, extent_op);
+ ret = __btrfs_free_extent(trans, href, node, extent_op);
} else {
BUG();
}
@@ -1768,8 +1750,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
int ret = 0;
if (TRANS_ABORTED(trans)) {
- if (insert_reserved)
+ if (insert_reserved) {
btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ free_head_ref_squota_rsv(trans->fs_info, href);
+ }
return 0;
}
@@ -1865,6 +1849,8 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
+ u64 ret = 0;
+
/*
* We had csum deletions accounted for in our delayed refs rsv, we need
* to drop the csum leaves for this update from our delayed_refs_rsv.
@@ -1879,14 +1865,13 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
- return btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
+ ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
}
- if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
- head->must_insert_reserved && head->is_data)
- btrfs_qgroup_free_refroot(fs_info, head->owning_root,
- head->reserved_bytes, BTRFS_QGROUP_RSV_DATA);
+ /* must_insert_reserved can be set only if we didn't run the head ref. */
+ if (head->must_insert_reserved)
+ free_head_ref_squota_rsv(fs_info, head);
- return 0;
+ return ret;
}
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
@@ -2027,6 +2012,12 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
* spin lock.
*/
must_insert_reserved = locked_ref->must_insert_reserved;
+ /*
+ * Unsetting this on the head ref relinquishes ownership of
+ * the rsv_bytes, so it is critical that every possible code
+ * path from here forward frees all reserves including qgroup
+ * reserve.
+ */
locked_ref->must_insert_reserved = false;
extent_op = locked_ref->extent_op;
@@ -2266,7 +2257,6 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_data_ref *data_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_transaction *cur_trans;
struct rb_node *node;
@@ -2320,6 +2310,9 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
*/
for (node = rb_first_cached(&head->ref_tree); node;
node = rb_next(node)) {
+ u64 ref_owner;
+ u64 ref_offset;
+
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -2327,15 +2320,15 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
break;
}
- data_ref = btrfs_delayed_node_to_data_ref(ref);
+ ref_owner = btrfs_delayed_ref_owner(ref);
+ ref_offset = btrfs_delayed_ref_offset(ref);
/*
* If our ref doesn't match the one we're currently looking at
* then we have a cross reference.
*/
- if (data_ref->root != root->root_key.objectid ||
- data_ref->objectid != objectid ||
- data_ref->offset != offset) {
+ if (ref->ref_root != btrfs_root_id(root) ||
+ ref_owner != objectid || ref_offset != offset) {
ret = 1;
break;
}
@@ -2370,7 +2363,14 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = -ENOENT;
if (path->slots[0] == 0)
@@ -2421,8 +2421,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (btrfs_extent_refs(leaf, ei) !=
btrfs_extent_data_ref_count(leaf, ref) ||
- btrfs_extent_data_ref_root(leaf, ref) !=
- root->root_key.objectid ||
+ btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
goto out;
@@ -2459,14 +2458,11 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int full_backref, int inc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 bytenr;
- u64 num_bytes;
u64 parent;
u64 ref_root;
u32 nritems;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
- struct btrfs_ref generic_ref = { 0 };
bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
int i;
int action;
@@ -2493,6 +2489,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
action = BTRFS_DROP_DELAYED_REF;
for (i = 0; i < nritems; i++) {
+ struct btrfs_ref ref = {
+ .action = action,
+ .parent = parent,
+ .ref_root = ref_root,
+ };
+
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
@@ -2502,35 +2504,33 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
if (btrfs_file_extent_type(buf, fi) ==
BTRFS_FILE_EXTENT_INLINE)
continue;
- bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
- if (bytenr == 0)
+ ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (ref.bytenr == 0)
continue;
- num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+ ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+ ref.owning_root = ref_root;
+
key.offset -= btrfs_file_extent_offset(buf, fi);
- btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent, ref_root);
- btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
- key.offset, root->root_key.objectid,
- for_reloc);
+ btrfs_init_data_ref(&ref, key.objectid, key.offset,
+ btrfs_root_id(root), for_reloc);
if (inc)
- ret = btrfs_inc_extent_ref(trans, &generic_ref);
+ ret = btrfs_inc_extent_ref(trans, &ref);
else
- ret = btrfs_free_extent(trans, &generic_ref);
+ ret = btrfs_free_extent(trans, &ref);
if (ret)
goto fail;
} else {
- bytenr = btrfs_node_blockptr(buf, i);
- num_bytes = fs_info->nodesize;
- /* We don't know the owning_root, use 0. */
- btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent, 0);
- btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
- root->root_key.objectid, for_reloc);
+ /* We don't know the owning_root, leave as 0. */
+ ref.bytenr = btrfs_node_blockptr(buf, i);
+ ref.num_bytes = fs_info->nodesize;
+
+ btrfs_init_tree_ref(&ref, level - 1,
+ btrfs_root_id(root), for_reloc);
if (inc)
- ret = btrfs_inc_extent_ref(trans, &generic_ref);
+ ret = btrfs_inc_extent_ref(trans, &ref);
else
- ret = btrfs_free_extent(trans, &generic_ref);
+ ret = btrfs_free_extent(trans, &ref);
if (ret)
goto fail;
}
@@ -2751,6 +2751,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 total_unpinned = 0;
u64 empty_cluster = 0;
bool readonly;
+ int ret = 0;
while (start <= end) {
readonly = false;
@@ -2760,7 +2761,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(cache);
total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
- BUG_ON(!cache); /* Logic error */
+ if (cache == NULL) {
+ /* Logic error, something removed the block group. */
+ ret = -EUCLEAN;
+ goto out;
+ }
cluster = fetch_cluster_info(fs_info,
cache->space_info,
@@ -2829,7 +2834,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
if (cache)
btrfs_put_block_group(cache);
- return 0;
+out:
+ return ret;
}
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
@@ -2859,7 +2865,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end, &cached_state);
- unpin_extent_range(fs_info, start, end, true);
+ ret = unpin_extent_range(fs_info, start, end, true);
+ BUG_ON(ret);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
free_extent_state(cached_state);
cond_resched();
@@ -3059,9 +3066,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node, u64 parent,
- u64 root_objectid, u64 owner_objectid,
- u64 owner_offset,
+ struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -3081,6 +3086,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
+ u64 owner_objectid = btrfs_delayed_ref_owner(node);
+ u64 owner_offset = btrfs_delayed_ref_offset(node);
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
u64 delayed_ref_root = href->owning_root;
@@ -3106,7 +3113,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
skinny_metadata = false;
ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
- parent, root_objectid, owner_objectid,
+ node->parent, node->ref_root, owner_objectid,
owner_offset);
if (ret == 0) {
/*
@@ -3208,7 +3215,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
} else if (WARN_ON(ret == -ENOENT)) {
abort_and_dump(trans, path,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
- bytenr, parent, root_objectid, owner_objectid,
+ bytenr, node->parent, node->ref_root, owner_objectid,
owner_offset, path->slots[0]);
goto out;
} else {
@@ -3286,7 +3293,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_squota_delta delta = {
.root = delayed_ref_root,
.num_bytes = num_bytes,
- .rsv_bytes = 0,
.is_data = is_data,
.is_inc = false,
.generation = btrfs_extent_generation(leaf, ei),
@@ -3419,81 +3425,91 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 parent, int last_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_ref generic_ref = { 0 };
+ struct btrfs_block_group *bg;
int ret;
- btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
- buf->start, buf->len, parent, btrfs_header_owner(buf));
- btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
- root_id, 0, false);
-
if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_ref generic_ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = buf->start,
+ .num_bytes = buf->len,
+ .parent = parent,
+ .owning_root = btrfs_header_owner(buf),
+ .ref_root = root_id,
+ };
+
+ /*
+ * Assert that the extent buffer is not cleared due to
+ * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer to
+ * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
+ * details.
+ */
+ ASSERT(btrfs_header_bytenr(buf) != 0);
+
+ btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
BUG_ON(ret); /* -ENOMEM */
}
- if (last_ref && btrfs_header_generation(buf) == trans->transid) {
- struct btrfs_block_group *cache;
- bool must_pin = false;
+ if (!last_ref)
+ return;
- if (root_id != BTRFS_TREE_LOG_OBJECTID) {
- ret = check_ref_cleanup(trans, buf->start);
- if (!ret) {
- btrfs_redirty_list_add(trans->transaction, buf);
- goto out;
- }
- }
-
- cache = btrfs_lookup_block_group(fs_info, buf->start);
+ if (btrfs_header_generation(buf) != trans->transid)
+ goto out;
- if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(trans, cache, buf->start, buf->len, 1);
- btrfs_put_block_group(cache);
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+ ret = check_ref_cleanup(trans, buf->start);
+ if (!ret)
goto out;
- }
+ }
- /*
- * If there are tree mod log users we may have recorded mod log
- * operations for this node. If we re-allocate this node we
- * could replay operations on this node that happened when it
- * existed in a completely different root. For example if it
- * was part of root A, then was reallocated to root B, and we
- * are doing a btrfs_old_search_slot(root b), we could replay
- * operations that happened when the block was part of root A,
- * giving us an inconsistent view of the btree.
- *
- * We are safe from races here because at this point no other
- * node or root points to this extent buffer, so if after this
- * check a new tree mod log user joins we will not have an
- * existing log of operations on this node that we have to
- * contend with.
- */
- if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
- must_pin = true;
+ bg = btrfs_lookup_block_group(fs_info, buf->start);
- if (must_pin || btrfs_is_zoned(fs_info)) {
- btrfs_redirty_list_add(trans->transaction, buf);
- pin_down_extent(trans, cache, buf->start, buf->len, 1);
- btrfs_put_block_group(cache);
- goto out;
- }
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ btrfs_put_block_group(bg);
+ goto out;
+ }
- WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+ /*
+ * If there are tree mod log users we may have recorded mod log
+ * operations for this node. If we re-allocate this node we
+ * could replay operations on this node that happened when it
+ * existed in a completely different root. For example if it
+ * was part of root A, then was reallocated to root B, and we
+ * are doing a btrfs_old_search_slot(root b), we could replay
+ * operations that happened when the block was part of root A,
+ * giving us an inconsistent view of the btree.
+ *
+ * We are safe from races here because at this point no other
+ * node or root points to this extent buffer, so if after this
+ * check a new tree mod log user joins we will not have an
+ * existing log of operations on this node that we have to
+ * contend with.
+ */
- btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_free_reserved_bytes(cache, buf->len, 0);
- btrfs_put_block_group(cache);
- trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
+ || btrfs_is_zoned(fs_info)) {
+ pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ btrfs_put_block_group(bg);
+ goto out;
}
+
+ WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+ btrfs_add_free_space(bg, buf->start, buf->len);
+ btrfs_free_reserved_bytes(bg, buf->len, 0);
+ btrfs_put_block_group(bg);
+ trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+
out:
- if (last_ref) {
- /*
- * Deleting the buffer, clear the corrupt flag since it doesn't
- * matter anymore.
- */
- clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
- }
+
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't
+ * matter anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
}
/* Can return -ENOMEM */
@@ -3509,11 +3525,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree log blocks never actually go into the extent allocation
* tree, just update pinning info and exit early.
*/
- if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
- (ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
- btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
+ if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
+ btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
@@ -3521,10 +3534,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
ret = btrfs_add_delayed_data_ref(trans, ref, 0);
}
- if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
- (ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+ if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID)
btrfs_ref_tree_mod(fs_info, ref);
return ret;
@@ -4273,6 +4283,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
return 0;
}
+static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ } else if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ struct btrfs_block_group *block_group;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+ /*
+ * No lock is OK here because avail is monotonically
+ * decreasing, and this is just a hint.
+ */
+ u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+
+ if (block_group_bits(block_group, ffe_ctl->flags) &&
+ avail >= ffe_ctl->num_bytes) {
+ ffe_ctl->hint_byte = block_group->start;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
+ return 0;
+}
+
static int prepare_allocation(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4283,19 +4329,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
- if (ffe_ctl->for_treelog) {
- spin_lock(&fs_info->treelog_bg_lock);
- if (fs_info->treelog_bg)
- ffe_ctl->hint_byte = fs_info->treelog_bg;
- spin_unlock(&fs_info->treelog_bg_lock);
- }
- if (ffe_ctl->for_data_reloc) {
- spin_lock(&fs_info->relocation_bg_lock);
- if (fs_info->data_reloc_bg)
- ffe_ctl->hint_byte = fs_info->data_reloc_bg;
- spin_unlock(&fs_info->relocation_bg_lock);
- }
- return 0;
+ return prepare_allocation_zoned(fs_info, ffe_ctl);
default:
BUG();
}
@@ -4635,7 +4669,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
- bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data);
@@ -4829,16 +4863,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
- struct btrfs_delayed_tree_ref *ref;
u32 size = sizeof(*extent_item) + sizeof(*iref);
u64 flags = extent_op->flags_to_set;
+ /* The owner of a tree block is the level. */
+ int level = btrfs_delayed_ref_owner(node);
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
- ref = btrfs_delayed_node_to_tree_ref(node);
-
extent_key.objectid = node->bytenr;
if (skinny_metadata) {
- extent_key.offset = ref->level;
+ /* The owner of a tree block is the level. */
+ extent_key.offset = level;
extent_key.type = BTRFS_METADATA_ITEM_KEY;
} else {
extent_key.offset = node->num_bytes;
@@ -4871,18 +4905,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
- btrfs_set_tree_block_level(leaf, block_info, ref->level);
+ btrfs_set_tree_block_level(leaf, block_info, level);
iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
}
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_SHARED_BLOCK_REF_KEY);
- btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
} else {
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_TREE_BLOCK_REF_KEY);
- btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
}
btrfs_mark_buffer_dirty(trans, leaf);
@@ -4896,19 +4930,20 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins)
{
- struct btrfs_ref generic_ref = { 0 };
- u64 root_objectid = root->root_key.objectid;
- u64 owning_root = root_objectid;
+ struct btrfs_ref generic_ref = {
+ .action = BTRFS_ADD_DELAYED_EXTENT,
+ .bytenr = ins->objectid,
+ .num_bytes = ins->offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
- BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
- owning_root = root->relocation_src_root;
+ generic_ref.owning_root = root->relocation_src_root;
- btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins->objectid, ins->offset, 0, owning_root);
- btrfs_init_data_ref(&generic_ref, root_objectid, owner,
- offset, 0, false);
+ btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
@@ -4931,7 +4966,6 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
.root = root_objectid,
.num_bytes = ins->offset,
.generation = trans->transid,
- .rsv_bytes = 0,
.is_data = true,
.is_inc = true,
};
@@ -5032,10 +5066,10 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
*/
btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
- __btrfs_tree_lock(buf, nest);
+ btrfs_tree_lock_nested(buf, nest);
btrfs_clear_buffer_dirty(trans, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
- clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
+ clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
set_extent_buffer_uptodate(buf);
@@ -5047,7 +5081,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_owner(buf, owner);
write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
- if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
@@ -5088,7 +5122,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
struct btrfs_delayed_extent_op *extent_op;
- struct btrfs_ref generic_ref = { 0 };
u64 flags = 0;
int ret;
u32 blocksize = fs_info->nodesize;
@@ -5131,6 +5164,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(parent > 0);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_ref generic_ref = {
+ .action = BTRFS_ADD_DELAYED_EXTENT,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .parent = parent,
+ .owning_root = owning_root,
+ .ref_root = root_objectid,
+ };
extent_op = btrfs_alloc_delayed_extent_op();
if (!extent_op) {
ret = -ENOMEM;
@@ -5145,10 +5186,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->update_flags = true;
extent_op->level = level;
- btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins.objectid, ins.offset, parent, owning_root);
- btrfs_init_tree_ref(&generic_ref, level, root_objectid,
- root->root_key.objectid, false);
+ btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
@@ -5234,7 +5272,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't lock the tree block, it's OK to be racy here */
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
wc->level - 1, 1, &refs,
- &flags);
+ &flags, NULL);
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
@@ -5286,8 +5324,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
int ret;
- if (wc->stage == UPDATE_BACKREF &&
- btrfs_header_owner(eb) != root->root_key.objectid)
+ if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
return 1;
/*
@@ -5301,7 +5338,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
BUG_ON(ret == -ENOMEM);
if (ret)
return ret;
@@ -5360,7 +5398,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
ret = lookup_extent_backref(trans, path, &iref, bytenr,
root->fs_info->nodesize, parent,
- root->root_key.objectid, level, 0);
+ btrfs_root_id(root), level, 0);
btrfs_free_path(path);
if (ret == -ENOENT)
return 0;
@@ -5390,10 +5428,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 generation;
- u64 parent;
+ u64 owner_root = 0;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_key key;
- struct btrfs_ref ref = { 0 };
struct extent_buffer *next;
int level = wc->level;
int reada = 0;
@@ -5417,7 +5454,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
check.level = level - 1;
check.transid = generation;
- check.owner_root = root->root_key.objectid;
+ check.owner_root = btrfs_root_id(root);
check.has_first_key = true;
btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
path->slots[level]);
@@ -5425,7 +5462,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = find_extent_buffer(fs_info, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(fs_info, bytenr,
- root->root_key.objectid, level - 1);
+ btrfs_root_id(root), level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
reada = 1;
@@ -5434,7 +5471,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
- &wc->flags[level - 1]);
+ &wc->flags[level - 1],
+ &owner_root);
if (ret < 0)
goto out_unlock;
@@ -5509,19 +5547,25 @@ skip:
wc->refs[level - 1] = 0;
wc->flags[level - 1] = 0;
if (wc->stage == DROP_REFERENCE) {
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = bytenr,
+ .num_bytes = fs_info->nodesize,
+ .owning_root = owner_root,
+ .ref_root = btrfs_root_id(root),
+ };
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- parent = path->nodes[level]->start;
+ ref.parent = path->nodes[level]->start;
} else {
- ASSERT(root->root_key.objectid ==
+ ASSERT(btrfs_root_id(root) ==
btrfs_header_owner(path->nodes[level]));
- if (root->root_key.objectid !=
+ if (btrfs_root_id(root) !=
btrfs_header_owner(path->nodes[level])) {
btrfs_err(root->fs_info,
"mismatched block owner");
ret = -EIO;
goto out_unlock;
}
- parent = 0;
}
/*
@@ -5531,7 +5575,7 @@ skip:
* ->restarted flag.
*/
if (wc->restarted) {
- ret = check_ref_exists(trans, root, bytenr, parent,
+ ret = check_ref_exists(trans, root, bytenr, ref.parent,
level - 1);
if (ret < 0)
goto out_unlock;
@@ -5546,8 +5590,7 @@ skip:
* already accounted them at merge time (replace_path),
* thus we could skip expensive subtree trace here.
*/
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
- need_account) {
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && need_account) {
ret = btrfs_qgroup_trace_subtree(trans, next,
generation, level - 1);
if (ret) {
@@ -5566,11 +5609,7 @@ skip:
wc->drop_level = level;
find_next_key(path, level, &wc->drop_progress);
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- fs_info->nodesize, parent,
- btrfs_header_owner(next));
- btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
- 0, false);
+ btrfs_init_tree_ref(&ref, level - 1, 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret)
goto out_unlock;
@@ -5635,7 +5674,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
@@ -5660,7 +5700,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
else
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- if (is_fstree(root->root_key.objectid)) {
+ if (is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
btrfs_err_rl(fs_info,
@@ -5680,12 +5720,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else if (root->root_key.objectid != btrfs_header_owner(eb))
+ else if (btrfs_root_id(root) != btrfs_header_owner(eb))
goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else if (root->root_key.objectid !=
+ else if (btrfs_root_id(root) !=
btrfs_header_owner(path->nodes[level + 1]))
goto owner_mismatch;
}
@@ -5699,7 +5739,7 @@ out:
owner_mismatch:
btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
- btrfs_header_owner(eb), root->root_key.objectid);
+ btrfs_header_owner(eb), btrfs_root_id(root));
return -EUCLEAN;
}
@@ -5785,8 +5825,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*/
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
- const bool is_reloc_root = (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID);
+ const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5800,7 +5839,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
bool root_dropped = false;
bool unfinished_drop = false;
- btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
+ btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));
path = btrfs_alloc_path();
if (!path) {
@@ -5880,7 +5919,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level], NULL);
if (ret < 0) {
err = ret;
goto out_end_trans;
@@ -5998,8 +6037,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
*
* The most common failure here is just -ENOENT.
*/
- btrfs_del_orphan_item(trans, tree_root,
- root->root_key.objectid);
+ btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
}
}
@@ -6061,9 +6099,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
int level;
int parent_level;
int ret = 0;
- int wret;
- BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
path = btrfs_alloc_path();
if (!path)
@@ -6097,17 +6134,16 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
while (1) {
- wret = walk_down_tree(trans, root, path, wc);
- if (wret < 0) {
- ret = wret;
+ ret = walk_down_tree(trans, root, path, wc);
+ if (ret < 0)
break;
- }
- wret = walk_up_tree(trans, root, path, wc, parent_level);
- if (wret < 0)
- ret = wret;
- if (wret != 0)
+ ret = walk_up_tree(trans, root, path, wc, parent_level);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
break;
+ }
}
kfree(wc);
@@ -6115,10 +6151,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
- u64 start, u64 end)
+/*
+ * Unpin the extent range in an error context and don't add the space back.
+ * Errors are not propagated further.
+ */
+void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
{
- return unpin_extent_range(fs_info, start, end, false);
+ unpin_extent_range(fs_info, start, end, false);
}
/*
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 0716f65d9753..af9f8800d5ac 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -3,11 +3,21 @@
#ifndef BTRFS_EXTENT_TREE_H
#define BTRFS_EXTENT_TREE_H
+#include <linux/types.h>
#include "misc.h"
#include "block-group.h"
+#include "locking.h"
+struct extent_buffer;
struct btrfs_free_cluster;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_ref;
+struct btrfs_disk_key;
struct btrfs_delayed_ref_head;
+struct btrfs_delayed_ref_root;
+struct btrfs_extent_inline_ref;
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
@@ -99,7 +109,8 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags);
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owner_root);
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 03cef28d9e37..597387e9f040 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -14,7 +14,6 @@
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/fsverity.h>
-#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
@@ -22,7 +21,6 @@
#include "btrfs_inode.h"
#include "bio.h"
#include "locking.h"
-#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
@@ -78,10 +76,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
eb = list_first_entry(&fs_info->allocated_ebs,
struct extent_buffer, leak_list);
pr_err(
- "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
+ "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n",
eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
+ WARN_ON_ONCE(1);
kmem_cache_free(extent_buffer_cache, eb);
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
@@ -147,8 +146,8 @@ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
int __init extent_buffer_init_cachep(void)
{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
- sizeof(struct extent_buffer), 0,
- SLAB_MEM_SPREAD, NULL);
+ sizeof(struct extent_buffer), 0, 0,
+ NULL);
if (!extent_buffer_cache)
return -ENOMEM;
@@ -184,29 +183,30 @@ static void process_one_page(struct btrfs_fs_info *fs_info,
struct page *page, struct page *locked_page,
unsigned long page_ops, u64 start, u64 end)
{
+ struct folio *folio = page_folio(page);
u32 len;
ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
len = end + 1 - start;
if (page_ops & PAGE_SET_ORDERED)
- btrfs_page_clamp_set_ordered(fs_info, page, start, len);
+ btrfs_folio_clamp_set_ordered(fs_info, folio, start, len);
if (page_ops & PAGE_START_WRITEBACK) {
- btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
- btrfs_page_clamp_set_writeback(fs_info, page, start, len);
+ btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
}
if (page_ops & PAGE_END_WRITEBACK)
- btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
if (page != locked_page && (page_ops & PAGE_UNLOCK))
- btrfs_page_end_writer_lock(fs_info, page, start, len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
static void __process_pages_contig(struct address_space *mapping,
struct page *locked_page, u64 start, u64 end,
unsigned long page_ops)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
pgoff_t start_index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
pgoff_t index = start_index;
@@ -250,7 +250,7 @@ static noinline int lock_delalloc_pages(struct inode *inode,
u64 start,
u64 end)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct address_space *mapping = inode->i_mapping;
pgoff_t start_index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
@@ -271,19 +271,20 @@ static noinline int lock_delalloc_pages(struct inode *inode,
goto out;
for (i = 0; i < found_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
+ struct page *page = folio_page(folio, 0);
u32 len = end + 1 - start;
if (page == locked_page)
continue;
- if (btrfs_page_start_writer_lock(fs_info, page, start,
- len))
+ if (btrfs_folio_start_writer_lock(fs_info, folio, start,
+ len))
goto out;
if (!PageDirty(page) || page->mapping != mapping) {
- btrfs_page_end_writer_lock(fs_info, page, start,
- len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start,
+ len);
goto out;
}
@@ -321,7 +322,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
u64 *end)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
const u64 orig_start = *start;
const u64 orig_end = *end;
@@ -395,15 +396,14 @@ again:
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, cached_state);
+
+ unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
- unlock_extent(tree, delalloc_start, delalloc_end,
- &cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
goto again;
}
- free_extent_state(cached_state);
*start = delalloc_start;
*end = delalloc_end;
out_failed:
@@ -412,9 +412,10 @@ out_failed:
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
+ struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start, end, page_ops);
@@ -431,61 +432,65 @@ static bool btrfs_verify_page(struct page *page, u64 start)
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = page_to_fs_info(page);
+ struct folio *folio = page_folio(page);
ASSERT(page_offset(page) <= start &&
start + len <= page_offset(page) + PAGE_SIZE);
if (uptodate && btrfs_verify_page(page, start))
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, page->mapping))
unlock_page(page);
else
- btrfs_subpage_end_reader(fs_info, page, start, len);
+ btrfs_subpage_end_reader(fs_info, folio, start, len);
}
/*
- * after a writepage IO is done, we need to:
- * clear the uptodate bits on error
- * clear the writeback bits in the extent tree for this IO
- * end_page_writeback if the page has no more pending IO
+ * After a write IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - clear the writeback bits in the extent tree for the range
+ * - folio_end_writeback() if there is no more pending io for the folio
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct btrfs_bio *bbio)
+static void end_bbio_data_write(struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
+ const u32 sectorsize = fs_info->sectorsize;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- const u32 sectorsize = fs_info->sectorsize;
- u64 start = page_offset(page) + bvec->bv_offset;
- u32 len = bvec->bv_len;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ u64 start = folio_pos(folio) + fi.offset;
+ u32 len = fi.length;
+
+ /* Only order 0 (single page) folios are allowed for data. */
+ ASSERT(folio_order(folio) == 0);
/* Our read/write should always be sector aligned. */
- if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
- "partial page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+ "partial page write in btrfs with offset %zu and length %zu",
+ fi.offset, fi.length);
+ else if (!IS_ALIGNED(fi.length, sectorsize))
btrfs_info(fs_info,
- "incomplete page write with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ "incomplete page write with offset %zu and length %zu",
+ fi.offset, fi.length);
- btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
+ btrfs_finish_ordered_extent(bbio->ordered,
+ folio_page(folio, 0), start, len, !error);
if (error)
- mapping_set_error(page->mapping, error);
- btrfs_page_clear_writeback(fs_info, page, start, len);
+ mapping_set_error(folio->mapping, error);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
bio_put(bio);
@@ -562,104 +567,99 @@ update:
static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
- ASSERT(PageLocked(page));
- if (!btrfs_is_subpage(fs_info, page))
+ struct folio *folio = page_folio(page);
+
+ ASSERT(folio_test_locked(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page));
- btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+ ASSERT(folio_test_private(folio));
+ btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE);
}
/*
- * after a readpage IO is done, we need to:
- * clear the uptodate bits on error
- * set the uptodate bits if things worked
- * set the page up to date if all extents in the tree are uptodate
- * clear the lock bit in the extent tree
- * unlock the page if there are no other extents locked for it
+ * After a data read IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - set the uptodate bits if things worked
+ * - set the folio up to date if all extents in the tree are uptodate
+ * - clear the lock bit in the extent tree
+ * - unlock the folio if there are no other extents locked for it
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct btrfs_bio *bbio)
+static void end_bbio_data_read(struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct bio *bio = &bbio->bio;
- struct bio_vec *bvec;
struct processed_extent processed = { 0 };
- /*
- * The offset to the beginning of a bio, since one bio can never be
- * larger than UINT_MAX, u32 here is enough.
- */
- u32 bio_offset = 0;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
+ const u32 sectorsize = fs_info->sectorsize;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
bool uptodate = !bio->bi_status;
- struct page *page = bvec->bv_page;
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- const u32 sectorsize = fs_info->sectorsize;
+ struct folio *folio = fi.folio;
+ struct inode *inode = folio->mapping->host;
u64 start;
u64 end;
u32 len;
+ /* For now only order 0 folios are supported for data. */
+ ASSERT(folio_order(folio) == 0);
btrfs_debug(fs_info,
- "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- bio->bi_iter.bi_sector, bio->bi_status,
+ "%s: bi_sector=%llu, err=%d, mirror=%u",
+ __func__, bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
/*
* We always issue full-sector reads, but if some block in a
- * page fails to read, blk_update_request() will advance
+ * folio fails to read, blk_update_request() will advance
* bv_offset and adjust bv_len to compensate. Print a warning
* for unaligned offsets, and an error if they don't add up to
* a full sector.
*/
- if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
- "partial page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
- sectorsize))
+ "partial page read in btrfs with offset %zu and length %zu",
+ fi.offset, fi.length);
+ else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize))
btrfs_info(fs_info,
- "incomplete page read with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ "incomplete page read with offset %zu and length %zu",
+ fi.offset, fi.length);
- start = page_offset(page) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
- len = bvec->bv_len;
+ start = folio_pos(folio) + fi.offset;
+ end = start + fi.length - 1;
+ len = fi.length;
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
+ pgoff_t end_index = i_size >> folio_shift(folio);
/*
* Zero out the remaining part if this range straddles
* i_size.
*
- * Here we should only zero the range inside the bvec,
+ * Here we should only zero the range inside the folio,
* not touch anything else.
*
* NOTE: i_size is exclusive while end is inclusive.
*/
- if (page->index == end_index && i_size <= end) {
- u32 zero_start = max(offset_in_page(i_size),
- offset_in_page(start));
+ if (folio_index(folio) == end_index && i_size <= end) {
+ u32 zero_start = max(offset_in_folio(folio, i_size),
+ offset_in_folio(folio, start));
+ u32 zero_len = offset_in_folio(folio, end) + 1 -
+ zero_start;
- zero_user_segment(page, zero_start,
- offset_in_page(end) + 1);
+ folio_zero_range(folio, zero_start, zero_len);
}
}
/* Update page status and unlock. */
- end_page_read(page, uptodate, start, len);
+ end_page_read(folio_page(folio, 0), uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
start, end, uptodate);
-
- ASSERT(bio_offset + len > bio_offset);
- bio_offset += len;
-
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -667,38 +667,89 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
}
/*
+ * Populate every free slot in a provided array with folios.
+ *
+ * @nr_folios: number of folios to allocate
+ * @folio_array: the array to fill with folios; any existing non-NULL entries in
+ * the array will be skipped
+ * @extra_gfp: the extra GFP flags for the allocation
+ *
+ * Return: 0 if all folios were able to be allocated;
+ * -ENOMEM otherwise, the partially allocated folios would be freed and
+ * the array slots zeroed
+ */
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
+			    gfp_t extra_gfp)
+{
+	for (unsigned int i = 0; i < nr_folios; i++) {
+		/* Slots the caller already populated are left untouched. */
+		if (folio_array[i])
+			continue;
+		folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0);
+		if (!folio_array[i])
+			goto error;
+	}
+	return 0;
+error:
+	/*
+	 * Drop every folio held in the array and reset its slot to NULL, so a
+	 * caller that cleans up the array after -ENOMEM cannot put the same
+	 * folio a second time through a stale pointer.
+	 */
+	for (unsigned int i = 0; i < nr_folios; i++) {
+		if (folio_array[i]) {
+			folio_put(folio_array[i]);
+			folio_array[i] = NULL;
+		}
+	}
+	return -ENOMEM;
+}
+
+/*
* Populate every free slot in a provided array with pages.
*
* @nr_pages: number of pages to allocate
* @page_array: the array to fill with pages; any existing non-null entries in
* the array will be skipped
+ * @extra_gfp: the extra GFP flags for the allocation.
*
* Return: 0 if all pages were able to be allocated;
- * -ENOMEM otherwise, and the caller is responsible for freeing all
- * non-null page pointers in the array.
+ * -ENOMEM otherwise, the partially allocated pages would be freed and
+ * the array slots zeroed
*/
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+ gfp_t extra_gfp)
{
+ const gfp_t gfp = GFP_NOFS | extra_gfp;
unsigned int allocated;
for (allocated = 0; allocated < nr_pages;) {
unsigned int last = allocated;
- allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+ allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array);
+ if (unlikely(allocated == last)) {
+ /* No progress, fail and do cleanup. */
+ for (int i = 0; i < allocated; i++) {
+ __free_page(page_array[i]);
+ page_array[i] = NULL;
+ }
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
- if (allocated == nr_pages)
- return 0;
+/*
+ * Populate needed folios for the extent buffer.
+ *
+ * For now, the folios populated are always in order 0 (aka, single page).
+ */
+static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp)
+{
+ struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
+ int num_pages = num_extent_pages(eb);
+ int ret;
- /*
- * During this iteration, no page could be allocated, even
- * though alloc_pages_bulk_array() falls back to alloc_page()
- * if it could not bulk-allocate. So we must be out of memory.
- */
- if (allocated == last)
- return -ENOMEM;
+ ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp);
+ if (ret < 0)
+ return ret;
- memalloc_retry_wait(GFP_NOFS);
- }
+ for (int i = 0; i < num_pages; i++)
+ eb->folios[i] = page_folio(page_array[i]);
+ eb->folio_size = PAGE_SIZE;
+ eb->folio_shift = PAGE_SHIFT;
return 0;
}
@@ -788,7 +839,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
u64 disk_bytenr, struct page *page,
size_t size, unsigned long pg_offset)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_inode *inode = page_to_inode(page);
ASSERT(pg_offset + size <= PAGE_SIZE);
ASSERT(bio_ctrl->end_io_func);
@@ -856,9 +907,9 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
} while (size);
}
-static int attach_extent_buffer_page(struct extent_buffer *eb,
- struct page *page,
- struct btrfs_subpage *prealloc)
+static int attach_extent_buffer_folio(struct extent_buffer *eb,
+ struct folio *folio,
+ struct btrfs_subpage *prealloc)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
@@ -869,74 +920,80 @@ static int attach_extent_buffer_page(struct extent_buffer *eb,
* For cloned or dummy extent buffers, their pages are not mapped and
* will not race with any other ebs.
*/
- if (page->mapping)
- lockdep_assert_held(&page->mapping->private_lock);
+ if (folio->mapping)
+ lockdep_assert_held(&folio->mapping->i_private_lock);
if (fs_info->nodesize >= PAGE_SIZE) {
- if (!PagePrivate(page))
- attach_page_private(page, eb);
+ if (!folio_test_private(folio))
+ folio_attach_private(folio, eb);
else
- WARN_ON(page->private != (unsigned long)eb);
+ WARN_ON(folio_get_private(folio) != eb);
return 0;
}
/* Already mapped, just free prealloc */
- if (PagePrivate(page)) {
+ if (folio_test_private(folio)) {
btrfs_free_subpage(prealloc);
return 0;
}
if (prealloc)
/* Has preallocated memory for subpage */
- attach_page_private(page, prealloc);
+ folio_attach_private(folio, prealloc);
else
/* Do new allocation to attach subpage */
- ret = btrfs_attach_subpage(fs_info, page,
- BTRFS_SUBPAGE_METADATA);
+ ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return ret;
}
int set_page_extent_mapped(struct page *page)
{
+ return set_folio_extent_mapped(page_folio(page));
+}
+
+int set_folio_extent_mapped(struct folio *folio)
+{
struct btrfs_fs_info *fs_info;
- ASSERT(page->mapping);
+ ASSERT(folio->mapping);
- if (PagePrivate(page))
+ if (folio_test_private(folio))
return 0;
- fs_info = btrfs_sb(page->mapping->host->i_sb);
+ fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, page))
- return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
+ if (btrfs_is_subpage(fs_info, folio->mapping))
+ return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
- attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
return 0;
}
void clear_page_extent_mapped(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct btrfs_fs_info *fs_info;
ASSERT(page->mapping);
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return;
- fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (btrfs_is_subpage(fs_info, page))
- return btrfs_detach_subpage(fs_info, page);
+ fs_info = page_to_fs_info(page);
+ if (btrfs_is_subpage(fs_info, page->mapping))
+ return btrfs_detach_subpage(fs_info, folio);
- detach_page_private(page);
+ folio_detach_private(folio);
}
-static struct extent_map *
-__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
+static struct extent_map *__get_extent_map(struct inode *inode, struct page *page,
u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
- if (em_cached && *em_cached) {
+ ASSERT(em_cached);
+
+ if (*em_cached) {
em = *em_cached;
if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
@@ -948,8 +1005,8 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
- if (em_cached && !IS_ERR(em)) {
+ em = btrfs_get_extent(BTRFS_I(inode), page, start, len);
+ if (!IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
@@ -967,7 +1024,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
u64 start = page_offset(page);
const u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
@@ -978,7 +1035,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
int ret = 0;
size_t pg_offset = 0;
size_t iosize;
- size_t blocksize = inode->i_sb->s_blocksize;
+ size_t blocksize = fs_info->sectorsize;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
ret = set_page_extent_mapped(page);
@@ -996,7 +1053,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
memzero_page(page, zero_offset, iosize);
}
}
- bio_ctrl->end_io_func = end_bio_extent_readpage;
+ bio_ctrl->end_io_func = end_bbio_data_read;
begin_page_read(fs_info, page);
while (cur <= end) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
@@ -1011,8 +1068,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
end_page_read(page, true, cur, iosize);
break;
}
- em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, em_cached);
+ em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached);
if (IS_ERR(em)) {
unlock_extent(tree, cur, end, NULL);
end_page_read(page, false, cur, end + 1 - cur);
@@ -1022,8 +1078,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- compress_type = em->compress_type;
+ compress_type = extent_map_compression(em);
iosize = min(extent_map_end(em) - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
@@ -1032,7 +1087,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
else
disk_bytenr = em->block_start + extent_offset;
block_start = em->block_start;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
/*
@@ -1069,7 +1124,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* is a corner case so we prioritize correctness over
* non-optimal behavior (submitting 2 bios for the same extent).
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+ if (compress_type != BTRFS_COMPRESS_NONE &&
prev_em_start && *prev_em_start != (u64)-1 &&
*prev_em_start != em->start)
force_bio_submit = true;
@@ -1118,15 +1173,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
int btrfs_read_folio(struct file *file, struct folio *folio)
{
struct page *page = &folio->page;
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_inode *inode = page_to_inode(page);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct extent_map *em_cached = NULL;
int ret;
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
- ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL);
+ ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL);
+ free_extent_map(em_cached);
+
/*
* If btrfs_do_readpage() failed we will want to submit the assembled
* bio to do the cleanup.
@@ -1141,9 +1199,11 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
struct btrfs_bio_ctrl *bio_ctrl,
u64 *prev_em_start)
{
- struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
+ struct btrfs_inode *inode = page_to_inode(pages[0]);
int index;
+ ASSERT(em_cached);
+
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
@@ -1235,7 +1295,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct folio *folio = page_folio(page);
+ struct btrfs_subpage *subpage = folio_get_private(folio);
struct btrfs_subpage_info *spi = fs_info->subpage_info;
u64 orig_start = *start;
/* Declare as unsigned long so we can use bitmap ops */
@@ -1247,7 +1308,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
* For regular sector size == page size case, since one page only
* contains one sector, we return the page offset directly.
*/
- if (!btrfs_is_subpage(fs_info, page)) {
+ if (!btrfs_is_subpage(fs_info, page->mapping)) {
*start = page_offset(page);
*end = page_offset(page) + PAGE_SIZE;
return;
@@ -1300,7 +1361,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
return 1;
}
- bio_ctrl->end_io_func = end_bio_extent_writepage;
+ bio_ctrl->end_io_func = end_bbio_data_write;
while (cur <= end) {
u32 len = end - cur + 1;
u64 disk_bytenr;
@@ -1320,7 +1381,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
- btrfs_page_clear_dirty(fs_info, page, cur, len);
+ btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len);
break;
}
@@ -1331,7 +1392,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
continue;
}
- em = btrfs_get_extent(inode, NULL, 0, cur, len);
+ em = btrfs_get_extent(inode, NULL, cur, len);
if (IS_ERR(em)) {
ret = PTR_ERR_OR_ZERO(em);
goto out_error;
@@ -1347,7 +1408,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
block_start = em->block_start;
disk_bytenr = em->block_start + extent_offset;
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(!extent_map_is_compressed(em));
ASSERT(block_start != EXTENT_MAP_HOLE);
ASSERT(block_start != EXTENT_MAP_INLINE);
@@ -1372,7 +1433,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* So clear subpage dirty bit here so next time we won't submit
* page for range already written to disk.
*/
- btrfs_page_clear_dirty(fs_info, page, cur, iosize);
+ btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);
submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
cur - page_offset(page));
@@ -1380,7 +1441,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
nr++;
}
- btrfs_page_assert_not_dirty(fs_info, page);
+ btrfs_folio_assert_not_dirty(fs_info, page_folio(page));
*nr_ret = nr;
return 0;
@@ -1541,7 +1602,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
* can be no longer dirty nor marked anymore for writeback (if a
* subsequent modification to the extent buffer didn't happen before the
* transaction commit), which makes filemap_fdata[write|wait]_range not
- * able to find the pages tagged with SetPageError at transaction
+ * able to find the pages which contain errors at transaction
* commit time. So if this happens we must abort the transaction,
* otherwise we commit a super block with btree roots that point to
* btree nodes/leafs whose content on disk is invalid - either garbage
@@ -1602,24 +1663,23 @@ static struct extent_buffer *find_extent_buffer_nolock(
return NULL;
}
-static void extent_buffer_write_end_io(struct btrfs_bio *bbio)
+static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
u32 bio_offset = 0;
if (!uptodate)
set_btree_ioerr(eb);
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
u64 start = eb->start + bio_offset;
- struct page *page = bvec->bv_page;
- u32 len = bvec->bv_len;
+ struct folio *folio = fi.folio;
+ u32 len = fi.length;
- btrfs_page_clear_writeback(fs_info, page, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
bio_offset += len;
}
@@ -1668,36 +1728,44 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
- eb->fs_info, extent_buffer_write_end_io, eb);
+ eb->fs_info, end_bbio_meta_write, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(wbc, &bbio->bio);
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
if (fs_info->nodesize < PAGE_SIZE) {
- struct page *p = eb->pages[0];
+ struct folio *folio = eb->folios[0];
+ bool ret;
- lock_page(p);
- btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len);
- if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start,
+ folio_lock(folio);
+ btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
+ if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
eb->len)) {
- clear_page_dirty_for_io(p);
+ folio_clear_dirty_for_io(folio);
wbc->nr_to_write--;
}
- __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p));
- wbc_account_cgroup_owner(wbc, p, eb->len);
- unlock_page(p);
+ ret = bio_add_folio(&bbio->bio, folio, eb->len,
+ eb->start - folio_pos(folio));
+ ASSERT(ret);
+ wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
+ folio_unlock(folio);
} else {
- for (int i = 0; i < num_extent_pages(eb); i++) {
- struct page *p = eb->pages[i];
-
- lock_page(p);
- clear_page_dirty_for_io(p);
- set_page_writeback(p);
- __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0);
- wbc_account_cgroup_owner(wbc, p, PAGE_SIZE);
- wbc->nr_to_write--;
- unlock_page(p);
+ int num_folios = num_extent_folios(eb);
+
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ bool ret;
+
+ folio_lock(folio);
+ folio_clear_dirty_for_io(folio);
+ folio_start_writeback(folio);
+ ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
+ ASSERT(ret);
+ wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
+ eb->folio_size);
+ wbc->nr_to_write -= folio_nr_pages(folio);
+ folio_unlock(folio);
}
}
btrfs_submit_bio(bbio, 0);
@@ -1719,7 +1787,8 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
*/
static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = page_to_fs_info(page);
+ struct folio *folio = page_folio(page);
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
@@ -1727,7 +1796,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
/* Lock and write each dirty extent buffers in the range */
while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
struct extent_buffer *eb;
unsigned long flags;
u64 start;
@@ -1736,16 +1805,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
* Take private lock to ensure the subpage won't be detached
* in the meantime.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&page->mapping->private_lock);
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&page->mapping->i_private_lock);
break;
}
spin_lock_irqsave(&subpage->lock, flags);
if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
bit_start++;
continue;
}
@@ -1759,7 +1828,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
*/
eb = find_extent_buffer_nolock(fs_info, start);
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
/*
* The eb has already reached 0 refs thus find_extent_buffer()
@@ -1802,38 +1871,39 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
{
struct writeback_control *wbc = ctx->wbc;
struct address_space *mapping = page->mapping;
+ struct folio *folio = page_folio(page);
struct extent_buffer *eb;
int ret;
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return 0;
- if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
+ if (page_to_fs_info(page)->nodesize < PAGE_SIZE)
return submit_eb_subpage(page, wbc);
- spin_lock(&mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
/*
* Shouldn't happen and normally this would be a BUG_ON but no point
* crashing the machine for something we can survive anyway.
*/
if (WARN_ON(!eb)) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
if (eb == ctx->eb) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
if (!ret)
return 0;
@@ -1866,7 +1936,7 @@ int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct btrfs_eb_write_context ctx = { .wbc = wbc };
- struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
@@ -2154,7 +2224,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
bool found_error = false;
int ret = 0;
struct address_space *mapping = inode->i_mapping;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
const u32 sectorsize = fs_info->sectorsize;
loff_t i_size = i_size_read(inode);
u64 cur = start;
@@ -2196,7 +2266,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
cur, cur_len, !ret);
mapping_set_error(page->mapping, ret);
}
- btrfs_page_unlock_writer(fs_info, page, cur, cur_len);
+ btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len);
if (ret < 0)
found_error = true;
next_page:
@@ -2207,8 +2277,7 @@ next_page:
submit_write_bio(&bio_ctrl, found_error ? ret : 0);
}
-int extent_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
int ret = 0;
@@ -2228,7 +2297,7 @@ int extent_writepages(struct address_space *mapping,
return ret;
}
-void extent_readahead(struct readahead_control *rac)
+void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
struct page *pagepool[16];
@@ -2260,7 +2329,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
struct extent_state *cached_state = NULL;
u64 start = folio_pos(folio);
u64 end = start + folio_size(folio) - 1;
- size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
+ size_t blocksize = folio_to_fs_info(folio)->sectorsize;
/* This function is only called for the btree inode */
ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
@@ -2286,18 +2355,20 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
-static int try_release_extent_state(struct extent_io_tree *tree,
+static bool try_release_extent_state(struct extent_io_tree *tree,
struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- int ret = 1;
+ bool ret;
if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
- ret = 0;
+ ret = false;
} else {
u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
- EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
+ EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
+ EXTENT_QGROUP_RESERVED);
+ int ret2;
/*
* At this point we can safely clear everything except the
@@ -2305,15 +2376,15 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* The delalloc new bit will be cleared by ordered extent
* completion.
*/
- ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
+ ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
*/
- if (ret < 0)
- ret = 0;
+ if (ret2 < 0)
+ ret = false;
else
- ret = 1;
+ ret = true;
}
return ret;
}
@@ -2323,92 +2394,141 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
-int try_release_extent_mapping(struct page *page, gfp_t mask)
+bool try_release_extent_mapping(struct page *page, gfp_t mask)
{
- struct extent_map *em;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
- struct extent_io_tree *tree = &btrfs_inode->io_tree;
- struct extent_map_tree *map = &btrfs_inode->extent_tree;
-
- if (gfpflags_allow_blocking(mask) &&
- page->mapping->host->i_size > SZ_16M) {
- u64 len;
- while (start <= end) {
- struct btrfs_fs_info *fs_info;
- u64 cur_gen;
-
- len = end - start + 1;
- write_lock(&map->lock);
- em = lookup_extent_mapping(map, start, len);
- if (!em) {
- write_unlock(&map->lock);
- break;
- }
- if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
- em->start != start) {
- write_unlock(&map->lock);
- free_extent_map(em);
- break;
- }
- if (test_range_bit_exists(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED))
- goto next;
- /*
- * If it's not in the list of modified extents, used
- * by a fast fsync, we can remove it. If it's being
- * logged we can safely remove it since fsync took an
- * extra reference on the em.
- */
- if (list_empty(&em->list) ||
- test_bit(EXTENT_FLAG_LOGGING, &em->flags))
- goto remove_em;
- /*
- * If it's in the list of modified extents, remove it
- * only if its generation is older then the current one,
- * in which case we don't need it for a fast fsync.
- * Otherwise don't remove it, we could be racing with an
- * ongoing fast fsync that could miss the new extent.
- */
- fs_info = btrfs_inode->root->fs_info;
- spin_lock(&fs_info->trans_lock);
- cur_gen = fs_info->generation;
- spin_unlock(&fs_info->trans_lock);
- if (em->generation >= cur_gen)
- goto next;
-remove_em:
- /*
- * We only remove extent maps that are not in the list of
- * modified extents or that are in the list but with a
- * generation lower then the current generation, so there
- * is no need to set the full fsync flag on the inode (it
- * hurts the fsync performance for workloads with a data
- * size that exceeds or is close to the system's memory).
- */
- remove_extent_mapping(map, em);
- /* once for the rb tree */
+ struct btrfs_inode *inode = page_to_inode(page);
+ struct extent_io_tree *io_tree = &inode->io_tree;
+
+ while (start <= end) {
+ const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ const u64 len = end - start + 1;
+ struct extent_map_tree *extent_tree = &inode->extent_tree;
+ struct extent_map *em;
+
+ write_lock(&extent_tree->lock);
+ em = lookup_extent_mapping(extent_tree, start, len);
+ if (!em) {
+ write_unlock(&extent_tree->lock);
+ break;
+ }
+ if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
+ write_unlock(&extent_tree->lock);
free_extent_map(em);
+ break;
+ }
+ if (test_range_bit_exists(io_tree, em->start,
+ extent_map_end(em) - 1, EXTENT_LOCKED))
+ goto next;
+ /*
+ * If it's not in the list of modified extents, used by a fast
+ * fsync, we can remove it. If it's being logged we can safely
+ * remove it since fsync took an extra reference on the em.
+ */
+ if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING))
+ goto remove_em;
+ /*
+ * If it's in the list of modified extents, remove it only if
+ * its generation is older than the current one, in which case
+ * we don't need it for a fast fsync. Otherwise don't remove it,
+ * we could be racing with an ongoing fast fsync that could miss
+ * the new extent.
+ */
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
+ /*
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower than the current generation, so there is no
+ * need to set the full fsync flag on the inode (it hurts the
+ * fsync performance for workloads with a data size that exceeds
+ * or is close to the system's memory).
+ */
+ remove_extent_mapping(inode, em);
+ /* Once for the inode's extent map tree. */
+ free_extent_map(em);
next:
- start = extent_map_end(em);
- write_unlock(&map->lock);
+ start = extent_map_end(em);
+ write_unlock(&extent_tree->lock);
- /* once for us */
- free_extent_map(em);
+ /* Once for us, for the lookup_extent_mapping() reference. */
+ free_extent_map(em);
- cond_resched(); /* Allow large-extent preemption. */
+ if (need_resched()) {
+ /*
+ * If we need to resched but we can't block just exit
+ * and leave any remaining extent maps.
+ */
+ if (!gfpflags_allow_blocking(mask))
+ break;
+
+ cond_resched();
}
}
- return try_release_extent_state(tree, page, mask);
+ return try_release_extent_state(io_tree, page, mask);
}
+struct btrfs_fiemap_entry {
+ u64 offset;
+ u64 phys;
+ u64 len;
+ u32 flags;
+};
+
+/*
+ * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the file
+ * range from the inode's io tree, unlock the subvolume tree search path, flush
+ * the fiemap cache and relock the file range and re-search the subvolume tree.
+ * The value here is something negative that can't be confused with a valid
+ * errno value and different from 1 because that's also a return value from
+ * fiemap_fill_next_extent() and also it's often used to mean some btree search
+ * did not find a key, so make it some distinct negative value.
+ */
+#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
+
/*
- * To cache previous fiemap extent
+ * Used to:
+ *
+ * - Cache the next entry to be emitted to the fiemap buffer, so that we can
+ * merge extents that are contiguous and can be grouped as a single one;
*
- * Will be used for merging fiemap extent
+ * - Store extents ready to be written to the fiemap buffer in an intermediary
+ * buffer. This intermediary buffer is to ensure that in case the fiemap
+ * buffer is memory mapped to the fiemap target file, we don't deadlock
+ * during btrfs_page_mkwrite(). This is because during fiemap we are locking
+ * an extent range in order to prevent races with delalloc flushing and
+ * ordered extent completion, which is needed in order to reliably detect
+ * delalloc in holes and prealloc extents. And this can lead to a deadlock
+ * if the fiemap buffer is memory mapped to the file we are running fiemap
+ * against (a silly, useless in practice scenario, but possible) because
+ * btrfs_page_mkwrite() will try to lock the same extent range.
*/
struct fiemap_cache {
+ /* An array of ready fiemap entries. */
+ struct btrfs_fiemap_entry *entries;
+ /* Number of entries in the entries array. */
+ int entries_size;
+ /* Index of the next entry in the entries array to write to. */
+ int entries_pos;
+ /*
+ * Once the entries array is full, this indicates what's the offset for
+ * the next file extent item we must search for in the inode's subvolume
+ * tree after unlocking the extent range in the inode's io tree and
+ * releasing the search path.
+ */
+ u64 next_search_offset;
+ /*
+ * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
+ * to count ourselves emitted extents and stop instead of relying on
+ * fiemap_fill_next_extent() because we buffer ready fiemap entries at
+ * the @entries array, and we want to stop as soon as we hit the max
+ * amount of extents to map, not just to save time but also to make the
+ * logic at extent_fiemap() simpler.
+ */
+ unsigned int extents_mapped;
+ /* Fields for the cached extent (unsubmitted, not ready, extent). */
u64 offset;
u64 phys;
u64 len;
@@ -2416,6 +2536,28 @@ struct fiemap_cache {
bool cached;
};
+static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
+{
+ for (int i = 0; i < cache->entries_pos; i++) {
+ struct btrfs_fiemap_entry *entry = &cache->entries[i];
+ int ret;
+
+ ret = fiemap_fill_next_extent(fieinfo, entry->offset,
+ entry->phys, entry->len,
+ entry->flags);
+ /*
+ * Ignore 1 (reached max entries) because we keep track of that
+ * ourselves in emit_fiemap_extent().
+ */
+ if (ret < 0)
+ return ret;
+ }
+ cache->entries_pos = 0;
+
+ return 0;
+}
+
/*
* Helper to submit fiemap extent.
*
@@ -2430,7 +2572,8 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
struct fiemap_cache *cache,
u64 offset, u64 phys, u64 len, u32 flags)
{
- int ret = 0;
+ struct btrfs_fiemap_entry *entry;
+ u64 cache_end;
/* Set at the end of extent_fiemap(). */
ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
@@ -2439,15 +2582,104 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
goto assign;
/*
- * Sanity check, extent_fiemap() should have ensured that new
- * fiemap extent won't overlap with cached one.
- * Not recoverable.
+ * When iterating the extents of the inode, at extent_fiemap(), we may
+ * find an extent that starts at an offset behind the end offset of the
+ * previous extent we processed. This happens if fiemap is called
+ * without FIEMAP_FLAG_SYNC and there are ordered extents completing
+ * after we had to unlock the file range, release the search path, emit
+ * the fiemap extents stored in the buffer (cache->entries array) and
+ * then lock the remainder of the range and re-search the btree.
*
- * NOTE: Physical address can overlap, due to compression
+ * For example we are in leaf X processing its last item, which is the
+ * file extent item for file range [512K, 1M[, and after
+ * btrfs_next_leaf() releases the path, there's an ordered extent that
+ * completes for the file range [768K, 2M[, and that results in trimming
+ * the file extent item so that it now corresponds to the file range
+ * [512K, 768K[ and a new file extent item is inserted for the file
+ * range [768K, 2M[, which may end up as the last item of leaf X or as
+ * the first item of the next leaf - in either case btrfs_next_leaf()
+ * will leave us with a path pointing to the new extent item, for the
+ * file range [768K, 2M[, since that's the first key that follows the
+ * last one we processed. So in order not to report overlapping extents
+ * to user space, we trim the length of the previously cached extent and
+ * emit it.
+ *
+ * Upon calling btrfs_next_leaf() we may also find an extent with an
+ * offset smaller than or equal to cache->offset, and this happens
+ * when we had a hole or prealloc extent with several delalloc ranges in
+ * it, but after btrfs_next_leaf() released the path, delalloc was
+ * flushed and the resulting ordered extents were completed, so we can
+ * now have found a file extent item for an offset that is smaller than
+ * or equal to what we have in cache->offset. We deal with this as
+ * described below.
*/
- if (cache->offset + cache->len > offset) {
- WARN_ON(1);
- return -EINVAL;
+ cache_end = cache->offset + cache->len;
+ if (cache_end > offset) {
+ if (offset == cache->offset) {
+ /*
+ * We cached a delalloc range (found in the io tree) for
+ * a hole or prealloc extent and we have now found a
+ * file extent item for the same offset. What we have
+ * now is more recent and up to date, so discard what
+ * we had in the cache and use what we have just found.
+ */
+ goto assign;
+ } else if (offset > cache->offset) {
+ /*
+ * The extent range we previously found ends after the
+ * offset of the file extent item we found and that
+ * offset falls somewhere in the middle of that previous
+ * extent range. So adjust the range we previously found
+ * to end at the offset of the file extent item we have
+ * just found, since this extent is more up to date.
+ * Emit that adjusted range and cache the file extent
+ * item we have just found. This corresponds to the case
+ * where a previously found file extent item was split
+ * due to an ordered extent completing.
+ */
+ cache->len = offset - cache->offset;
+ goto emit;
+ } else {
+ const u64 range_end = offset + len;
+
+ /*
+ * The offset of the file extent item we have just found
+ * is behind the cached offset. This means we were
+ * processing a hole or prealloc extent for which we
+ * have found delalloc ranges (in the io tree), so what
+ * we have in the cache is the last delalloc range we
+ * found while the file extent item we found can be
+ * either for a whole delalloc range we previously
+ * emitted or only a part of that range.
+ *
+ * We have two cases here:
+ *
+ * 1) The file extent item's range ends at or behind the
+ * cached extent's end. In this case just ignore the
+ * current file extent item because we don't want to
+ * overlap with previous ranges that may have been
+ * emitted already;
+ *
+ * 2) The file extent item starts behind the currently
+ * cached extent but its end offset goes beyond the
+ * end offset of the cached extent. We don't want to
+ * overlap with a previous range that may have been
+ * emitted already, so we emit the currently cached
+ * extent and then partially store the current file
+ * extent item's range in the cache, for the subrange
+ * going from the cached extent's end to the end of the
+ * file extent item.
+ */
+ if (range_end <= cache_end)
+ return 0;
+
+ if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
+ phys += cache_end - offset;
+
+ offset = cache_end;
+ len = range_end - cache_end;
+ goto emit;
+ }
}
/*
@@ -2467,12 +2699,37 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
return 0;
}
+emit:
/* Not mergeable, need to submit cached one */
- ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
- cache->len, cache->flags);
- cache->cached = false;
- if (ret)
- return ret;
+
+ if (cache->entries_pos == cache->entries_size) {
+ /*
+ * We will need to re-search from the end offset of the last
+ * stored extent and not from the current offset, because after
+ * unlocking the range and releasing the path, if there's a hole
+ * between that end offset and this current offset, a new extent
+ * may have been inserted due to a new write, so we don't want
+ * to miss it.
+ */
+ entry = &cache->entries[cache->entries_size - 1];
+ cache->next_search_offset = entry->offset + entry->len;
+ cache->cached = false;
+
+ return BTRFS_FIEMAP_FLUSH_CACHE;
+ }
+
+ entry = &cache->entries[cache->entries_pos];
+ entry->offset = cache->offset;
+ entry->phys = cache->phys;
+ entry->len = cache->len;
+ entry->flags = cache->flags;
+ cache->entries_pos++;
+ cache->extents_mapped++;
+
+ if (cache->extents_mapped == fieinfo->fi_extents_max) {
+ cache->cached = false;
+ return 1;
+ }
assign:
cache->cached = true;
cache->offset = offset;
@@ -2512,7 +2769,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct extent_buffer *clone;
+ struct extent_buffer *clone = path->nodes[0];
struct btrfs_key key;
int slot;
int ret;
@@ -2521,29 +2778,51 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p
if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
return 0;
+ /*
+ * Add a temporary extra ref to an already cloned extent buffer to
+ * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
+ * the cost of allocating a new one.
+ */
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
+ atomic_inc(&clone->refs);
+
ret = btrfs_next_leaf(inode->root, path);
if (ret != 0)
- return ret;
+ goto out;
/*
* Don't bother with cloning if there are no more file extent items for
* our inode.
*/
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
- return 1;
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
+ ret = 1;
+ goto out;
+ }
+ /*
+ * Important to preserve the start field, for the optimizations when
+ * checking if extents are shared (see extent_fiemap()).
+ *
+ * We must set ->start before calling copy_extent_buffer_full(). If we
+ * are on sub-pagesize blocksize, we use ->start to determine the offset
+ * into the folio where our eb exists, and if we update ->start after
+ * the fact then any subsequent reads of the eb may read from a
+ * different offset in the folio than where we originally copied into.
+ */
+ clone->start = path->nodes[0]->start;
/* See the comment at fiemap_search_slot() about why we clone. */
- clone = btrfs_clone_extent_buffer(path->nodes[0]);
- if (!clone)
- return -ENOMEM;
+ copy_extent_buffer_full(clone, path->nodes[0]);
slot = path->slots[0];
btrfs_release_path(path);
path->nodes[0] = clone;
path->slots[0] = slot;
+out:
+ if (ret)
+ free_extent_buffer(clone);
- return 0;
+ return ret;
}
/*
@@ -2598,8 +2877,8 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path
* neighbour leaf).
* We also need the private clone because holding a read lock on an
* extent buffer of the subvolume's b+tree will make lockdep unhappy
- * when we call fiemap_fill_next_extent(), because that may cause a page
- * fault when filling the user space buffer with fiemap data.
+ * when we check if extents are shared, as backref walking may need to
+ * lock the same leaf we are processing.
*/
clone = btrfs_clone_extent_buffer(path->nodes[0]);
if (!clone)
@@ -2823,24 +3102,29 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end;
u64 prev_extent_end;
- u64 lockstart;
- u64 lockend;
+ u64 range_start;
+ u64 range_end;
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
bool stopped = false;
int ret;
+ cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
+ cache.entries = kmalloc_array(cache.entries_size,
+ sizeof(struct btrfs_fiemap_entry),
+ GFP_KERNEL);
backref_ctx = btrfs_alloc_backref_share_check_ctx();
path = btrfs_alloc_path();
- if (!backref_ctx || !path) {
+ if (!cache.entries || !backref_ctx || !path) {
ret = -ENOMEM;
goto out;
}
- lockstart = round_down(start, inode->root->fs_info->sectorsize);
- lockend = round_up(start + len, inode->root->fs_info->sectorsize);
- prev_extent_end = lockstart;
+restart:
+ range_start = round_down(start, sectorsize);
+ range_end = round_up(start + len, sectorsize);
+ prev_extent_end = range_start;
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
if (ret < 0)
@@ -2848,7 +3132,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
btrfs_release_path(path);
path->reada = READA_FORWARD;
- ret = fiemap_search_slot(inode, path, lockstart);
+ ret = fiemap_search_slot(inode, path, range_start);
if (ret < 0) {
goto out_unlock;
} else if (ret > 0) {
@@ -2860,7 +3144,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto check_eof_delalloc;
}
- while (prev_extent_end < lockend) {
+ while (prev_extent_end < range_end) {
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_file_extent_item *ei;
struct btrfs_key key;
@@ -2883,19 +3167,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
* The first iteration can leave us at an extent item that ends
* before our range's start. Move to the next item.
*/
- if (extent_end <= lockstart)
+ if (extent_end <= range_start)
goto next_item;
backref_ctx->curr_leaf_bytenr = leaf->start;
/* We have in implicit hole (NO_HOLES feature enabled). */
if (prev_extent_end < key.offset) {
- const u64 range_end = min(key.offset, lockend) - 1;
+ const u64 hole_end = min(key.offset, range_end) - 1;
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state,
backref_ctx, 0, 0, 0,
- prev_extent_end, range_end);
+ prev_extent_end, hole_end);
if (ret < 0) {
goto out_unlock;
} else if (ret > 0) {
@@ -2905,7 +3189,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
/* We've reached the end of the fiemap range, stop. */
- if (key.offset >= lockend) {
+ if (key.offset >= range_end) {
stopped = true;
break;
}
@@ -2966,7 +3250,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
if (ret < 0) {
goto out_unlock;
} else if (ret > 0) {
- /* fiemap_fill_next_extent() told us to stop. */
+ /* emit_fiemap_extent() told us to stop. */
stopped = true;
break;
}
@@ -2989,23 +3273,13 @@ next_item:
}
check_eof_delalloc:
- /*
- * Release (and free) the path before emitting any final entries to
- * fiemap_fill_next_extent() to keep lockdep happy. This is because
- * once we find no more file extent items exist, we may have a
- * non-cloned leaf, and fiemap_fill_next_extent() can trigger page
- * faults when copying data to the user space buffer.
- */
- btrfs_free_path(path);
- path = NULL;
-
- if (!stopped && prev_extent_end < lockend) {
+ if (!stopped && prev_extent_end < range_end) {
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state, backref_ctx,
- 0, 0, 0, prev_extent_end, lockend - 1);
+ 0, 0, 0, prev_extent_end, range_end - 1);
if (ret < 0)
goto out_unlock;
- prev_extent_end = lockend;
+ prev_extent_end = range_end;
}
if (cache.cached && cache.offset + cache.len >= last_extent_end) {
@@ -3029,13 +3303,39 @@ check_eof_delalloc:
}
}
- ret = emit_last_fiemap_cache(fieinfo, &cache);
-
out_unlock:
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+
+ if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
+ btrfs_release_path(path);
+ ret = flush_fiemap_cache(fieinfo, &cache);
+ if (ret)
+ goto out;
+ len -= cache.next_search_offset - start;
+ start = cache.next_search_offset;
+ goto restart;
+ } else if (ret < 0) {
+ goto out;
+ }
+
+ /*
+ * Must free the path before emitting to the fiemap buffer because we
+ * may have a non-cloned leaf and if the fiemap buffer is memory mapped
+ * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
+ * waiting for an ordered extent that in order to complete needs to
+ * modify that leaf, therefore leading to a deadlock.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
+ ret = flush_fiemap_cache(fieinfo, &cache);
+ if (ret)
+ goto out;
+
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
free_extent_state(delalloc_cached_state);
+ kfree(cache.entries);
btrfs_free_backref_share_ctx(backref_ctx);
btrfs_free_path(path);
return ret;
@@ -3052,14 +3352,14 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
+static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- lockdep_assert_held(&page->mapping->private_lock);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- if (PagePrivate(page)) {
- subpage = (struct btrfs_subpage *)page->private;
+ if (folio_test_private(folio)) {
+ subpage = folio_get_private(folio);
if (atomic_read(&subpage->eb_refs))
return true;
/*
@@ -3072,21 +3372,21 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
return false;
}
-static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
/*
- * For mapped eb, we're going to change the page private, which should
- * be done under the private_lock.
+ * For mapped eb, we're going to change the folio private, which should
+ * be done under the i_private_lock.
*/
if (mapped)
- spin_lock(&page->mapping->private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
- if (!PagePrivate(page)) {
+ if (!folio_test_private(folio)) {
if (mapped)
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return;
}
@@ -3095,66 +3395,58 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
* and have this page now attached to the new eb. So
- * only clear page_private if it's still connected to
+ * only clear folio private if it's still connected to
* this eb.
*/
- if (PagePrivate(page) &&
- page->private == (unsigned long)eb) {
+ if (folio_test_private(folio) && folio_get_private(folio) == eb) {
BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(PageDirty(page));
- BUG_ON(PageWriteback(page));
- /*
- * We need to make sure we haven't be attached
- * to a new eb.
- */
- detach_page_private(page);
+ BUG_ON(folio_test_dirty(folio));
+ BUG_ON(folio_test_writeback(folio));
+ /* We need to make sure we haven't been attached to a new eb. */
+ folio_detach_private(folio);
}
if (mapped)
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return;
}
/*
- * For subpage, we can have dummy eb with page private. In this case,
- * we can directly detach the private as such page is only attached to
- * one dummy eb, no sharing.
+ * For subpage, we can have dummy eb with folio private attached. In
+ * this case, we can directly detach the private as such folio is only
+ * attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, page);
+ btrfs_detach_subpage(fs_info, folio);
return;
}
- btrfs_page_dec_eb_refs(fs_info, page);
+ btrfs_folio_dec_eb_refs(fs_info, folio);
/*
- * We can only detach the page private if there are no other ebs in the
+ * We can only detach the folio private if there are no other ebs in the
* page range and no unfinished IO.
*/
- if (!page_range_has_eb(fs_info, page))
- btrfs_detach_subpage(fs_info, page);
+ if (!folio_range_has_eb(fs_info, folio))
+ btrfs_detach_subpage(fs_info, folio);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
}
/* Release all pages attached to the extent buffer */
static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
{
- int i;
- int num_pages;
-
ASSERT(!extent_buffer_under_io(eb));
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *page = eb->pages[i];
+ for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) {
+ struct folio *folio = eb->folios[i];
- if (!page)
+ if (!folio)
continue;
- detach_extent_buffer_page(eb, page);
+ detach_extent_buffer_folio(eb, folio);
- /* One for when we allocated the page */
- put_page(page);
+ /* One for when we allocated the folio. */
+ folio_put(folio);
}
}
@@ -3192,9 +3484,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
- int i;
struct extent_buffer *new;
- int num_pages = num_extent_pages(src);
+ int num_folios = num_extent_folios(src);
int ret;
new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
@@ -3208,22 +3499,22 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
- ret = btrfs_alloc_page_array(num_pages, new->pages);
+ ret = alloc_eb_folio_array(new, 0);
if (ret) {
btrfs_release_extent_buffer(new);
return NULL;
}
- for (i = 0; i < num_pages; i++) {
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = new->folios[i];
int ret;
- struct page *p = new->pages[i];
- ret = attach_extent_buffer_page(new, p, NULL);
+ ret = attach_extent_buffer_folio(new, folio, NULL);
if (ret < 0) {
btrfs_release_extent_buffer(new);
return NULL;
}
- WARN_ON(PageDirty(p));
+ WARN_ON(folio_test_dirty(folio));
}
copy_extent_buffer_full(new, src);
set_extent_buffer_uptodate(new);
@@ -3235,23 +3526,20 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len)
{
struct extent_buffer *eb;
- int num_pages;
- int i;
+ int num_folios = 0;
int ret;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
- num_pages = num_extent_pages(eb);
- ret = btrfs_alloc_page_array(num_pages, eb->pages);
+ ret = alloc_eb_folio_array(eb, 0);
if (ret)
goto err;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- ret = attach_extent_buffer_page(eb, p, NULL);
+ num_folios = num_extent_folios(eb);
+ for (int i = 0; i < num_folios; i++) {
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
goto err;
}
@@ -3262,10 +3550,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
err:
- for (i = 0; i < num_pages; i++) {
- if (eb->pages[i]) {
- detach_extent_buffer_page(eb, eb->pages[i]);
- __free_page(eb->pages[i]);
+ for (int i = 0; i < num_folios; i++) {
+ if (eb->folios[i]) {
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ __folio_put(eb->folios[i]);
}
}
__free_extent_buffer(eb);
@@ -3314,20 +3602,14 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
-static void mark_extent_buffer_accessed(struct extent_buffer *eb,
- struct page *accessed)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
- int num_pages, i;
+ int num_folios= num_extent_folios(eb);
check_buffer_tree_ref(eb);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- if (p != accessed)
- mark_page_accessed(p);
- }
+ for (int i = 0; i < num_folios; i++)
+ folio_mark_accessed(eb->folios[i]);
}
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -3355,7 +3637,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
spin_lock(&eb->refs_lock);
spin_unlock(&eb->refs_lock);
}
- mark_extent_buffer_accessed(eb, NULL);
+ mark_extent_buffer_accessed(eb);
return eb;
}
@@ -3404,6 +3686,7 @@ free_eb:
static struct extent_buffer *grab_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page)
{
+ struct folio *folio = page_folio(page);
struct extent_buffer *exists;
/*
@@ -3415,21 +3698,21 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
/* Page not yet attached to an extent buffer */
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return NULL;
/*
* We could have already allocated an eb for this page and attached one
* so lets see if we can get a ref on the existing eb, and if we can we
* know it's good and we can just return that one, else we know we can
- * just overwrite page->private.
+ * just overwrite folio private.
*/
- exists = (struct extent_buffer *)page->private;
+ exists = folio_get_private(folio);
if (atomic_inc_not_zero(&exists->refs))
return exists;
WARN_ON(PageDirty(page));
- detach_page_private(page);
+ folio_detach_private(folio);
return NULL;
}
@@ -3463,19 +3746,88 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
return 0;
}
+
+/*
+ * Return 0 if eb->folios[i] is attached to btree inode successfully.
+ * Return >0 if there is already another extent buffer for the range,
+ * and @found_eb_ret would be updated.
+ * Return -EAGAIN if the filemap has an existing folio but with different size
+ * than @eb.
+ * The caller needs to free the existing folios and retry using the same order.
+ */
+static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+ struct extent_buffer **found_eb_ret)
+{
+
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ const unsigned long index = eb->start >> PAGE_SHIFT;
+ struct folio *existing_folio;
+ int ret;
+
+ ASSERT(found_eb_ret);
+
+ /* Caller should ensure the folio exists. */
+ ASSERT(eb->folios[i]);
+
+retry:
+ ret = filemap_add_folio(mapping, eb->folios[i], index + i,
+ GFP_NOFS | __GFP_NOFAIL);
+ if (!ret)
+ return 0;
+
+ existing_folio = filemap_lock_folio(mapping, index + i);
+ /* The page cache only exists for a very short time, just retry. */
+ if (IS_ERR(existing_folio))
+ goto retry;
+
+ /* For now, we should only have single-page folios for btree inode. */
+ ASSERT(folio_nr_pages(existing_folio) == 1);
+
+ if (folio_size(existing_folio) != eb->folio_size) {
+ folio_unlock(existing_folio);
+ folio_put(existing_folio);
+ return -EAGAIN;
+ }
+
+ if (fs_info->nodesize < PAGE_SIZE) {
+ /*
+ * We're going to reuse the existing page, can drop our page
+ * and subpage structure now.
+ */
+ __free_page(folio_page(eb->folios[i], 0));
+ eb->folios[i] = existing_folio;
+ } else {
+ struct extent_buffer *existing_eb;
+
+ existing_eb = grab_extent_buffer(fs_info,
+ folio_page(existing_folio, 0));
+ if (existing_eb) {
+ /* The extent buffer still exists, we can use it directly. */
+ *found_eb_ret = existing_eb;
+ folio_unlock(existing_folio);
+ folio_put(existing_folio);
+ return 1;
+ }
+ /* The extent buffer no longer exists, we can reuse the folio. */
+ __free_page(folio_page(eb->folios[i], 0));
+ eb->folios[i] = existing_folio;
+ }
+ return 0;
+}
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
- int num_pages;
- int i;
- unsigned long index = start >> PAGE_SHIFT;
+ int num_folios;
+ int attached = 0;
struct extent_buffer *eb;
- struct extent_buffer *exists = NULL;
- struct page *p;
+ struct extent_buffer *existing_eb = NULL;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
+ bool page_contig = true;
int uptodate = 1;
int ret;
@@ -3510,11 +3862,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
- num_pages = num_extent_pages(eb);
-
/*
- * Preallocate page->private for subpage case, so that we won't
- * allocate memory with private_lock nor page lock hold.
+ * Preallocate folio private for subpage case, so that we won't
+ * allocate memory with i_private_lock nor page lock hold.
*
* The memory will be freed by attach_extent_buffer_page() or freed
* manually if we exit earlier.
@@ -3522,47 +3872,91 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (fs_info->nodesize < PAGE_SIZE) {
prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
- exists = ERR_CAST(prealloc);
- goto free_eb;
+ ret = PTR_ERR(prealloc);
+ goto out;
}
}
- for (i = 0; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
- if (!p) {
- exists = ERR_PTR(-ENOMEM);
- btrfs_free_subpage(prealloc);
- goto free_eb;
+reallocate:
+ /* Allocate all pages first. */
+ ret = alloc_eb_folio_array(eb, __GFP_NOFAIL);
+ if (ret < 0) {
+ btrfs_free_subpage(prealloc);
+ goto out;
+ }
+
+ num_folios = num_extent_folios(eb);
+ /* Attach all pages to the filemap. */
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio;
+
+ ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
+ if (ret > 0) {
+ ASSERT(existing_eb);
+ goto out;
}
- spin_lock(&mapping->private_lock);
- exists = grab_extent_buffer(fs_info, p);
- if (exists) {
- spin_unlock(&mapping->private_lock);
- unlock_page(p);
- put_page(p);
- mark_extent_buffer_accessed(exists, p);
- btrfs_free_subpage(prealloc);
- goto free_eb;
+ /*
+ * TODO: Special handling for a corner case where the order of
+ * folios mismatch between the new eb and filemap.
+ *
+ * This happens when:
+ *
+ * - the new eb is using higher order folio
+ *
+ * - the filemap is still using 0-order folios for the range
+ * This can happen at the previous eb allocation, and we don't
+ * have higher order folio for the call.
+ *
+ * - the existing eb has already been freed
+ *
+ * In this case, we have to free the existing folios first, and
+ * re-allocate using the same order.
+ * Thankfully this is not going to happen yet, as we're still
+ * using 0-order folios.
+ */
+ if (unlikely(ret == -EAGAIN)) {
+ ASSERT(0);
+ goto reallocate;
}
+ attached++;
+
+ /*
+ * Only after attach_eb_folio_to_filemap(), eb->folios[] is
+ * reliable, as we may choose to reuse the existing page cache
+ * and free the allocated page.
+ */
+ folio = eb->folios[i];
+ eb->folio_size = folio_size(folio);
+ eb->folio_shift = folio_shift(folio);
+ spin_lock(&mapping->i_private_lock);
/* Should not fail, as we have preallocated the memory */
- ret = attach_extent_buffer_page(eb, p, prealloc);
+ ret = attach_extent_buffer_folio(eb, folio, prealloc);
ASSERT(!ret);
/*
* To inform we have extra eb under allocation, so that
- * detach_extent_buffer_page() won't release the page private
+ * detach_extent_buffer_page() won't release the folio private
* when the eb hasn't yet been inserted into radix tree.
*
* The ref will be decreased when the eb released the page, in
* detach_extent_buffer_page().
* Thus needs no special handling in error path.
*/
- btrfs_page_inc_eb_refs(fs_info, p);
- spin_unlock(&mapping->private_lock);
+ btrfs_folio_inc_eb_refs(fs_info, folio);
+ spin_unlock(&mapping->i_private_lock);
+
+ WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
+
+ /*
+ * Check if the current page is physically contiguous with previous eb
+ * page.
+ * At this stage, either we allocated a large folio, thus @i
+ * would only be 0, or we fall back to per-page allocation.
+ */
+ if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
+ page_contig = false;
- WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
- eb->pages[i] = p;
- if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len))
+ if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
uptodate = 0;
/*
@@ -3575,12 +3969,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ /* All pages are physically contiguous, can skip cross page handling. */
+ if (page_contig)
+ eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
- }
+ if (ret)
+ goto out;
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
@@ -3588,9 +3983,10 @@ again:
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
+ ret = 0;
+ existing_eb = find_extent_buffer(fs_info, start);
+ if (existing_eb)
+ goto out;
else
goto again;
}
@@ -3603,19 +3999,46 @@ again:
* btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
- for (i = 0; i < num_pages; i++)
- unlock_page(eb->pages[i]);
+ for (int i = 0; i < num_folios; i++)
+ unlock_page(folio_page(eb->folios[i], 0));
return eb;
-free_eb:
+out:
WARN_ON(!atomic_dec_and_test(&eb->refs));
- for (i = 0; i < num_pages; i++) {
- if (eb->pages[i])
- unlock_page(eb->pages[i]);
+
+ /*
+ * Any attached folios need to be detached before we unlock them. This
+ * is because we insert our new folios into the mapping and then attach
+ * our eb to each folio. If inserting a folio fails, we look up the
+ * folio at that index and grab the eb attached to it. We do not want
+ * that lookup to grab this eb, as we're getting ready to free it. So we
+ * have to detach it first and then unlock it.
+ *
+ * We have to drop our reference and NULL it out here because in the
+ * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
+ * Below when we call btrfs_release_extent_buffer() we will call
+ * detach_extent_buffer_folio() on our remaining pages in the !subpage
+ * case. If we left eb->folios[i] populated in the subpage case we'd
+ * double put our reference and be super sad.
+ */
+ for (int i = 0; i < attached; i++) {
+ ASSERT(eb->folios[i]);
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ unlock_page(folio_page(eb->folios[i], 0));
+ folio_put(eb->folios[i]);
+ eb->folios[i] = NULL;
}
+ /*
+ * Now all pages of that extent buffer are unmapped, set UNMAPPED flag,
+ * so it can be cleaned up without utilizing page->mapping.
+ */
+ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
btrfs_release_extent_buffer(eb);
- return exists;
+ if (ret < 0)
+ return ERR_PTR(ret);
+ ASSERT(existing_eb);
+ return existing_eb;
}
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
@@ -3707,31 +4130,30 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
-static void btree_clear_page_dirty(struct page *page)
+static void btree_clear_folio_dirty(struct folio *folio)
{
- ASSERT(PageDirty(page));
- ASSERT(PageLocked(page));
- clear_page_dirty_for_io(page);
- xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page))
- __xa_clear_mark(&page->mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(&page->mapping->i_pages);
+ ASSERT(folio_test_dirty(folio));
+ ASSERT(folio_test_locked(folio));
+ folio_clear_dirty_for_io(folio);
+ xa_lock_irq(&folio->mapping->i_pages);
+ if (!folio_test_dirty(folio))
+ __xa_clear_mark(&folio->mapping->i_pages,
+ folio_index(folio), PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(&folio->mapping->i_pages);
}
static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page = eb->pages[0];
+ struct folio *folio = eb->folios[0];
bool last;
- /* btree_clear_page_dirty() needs page locked */
- lock_page(page);
- last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
- eb->len);
+ /* btree_clear_folio_dirty() needs the folio locked. */
+ folio_lock(folio);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
if (last)
- btree_clear_page_dirty(page);
- unlock_page(page);
+ btree_clear_folio_dirty(folio);
+ folio_unlock(folio);
WARN_ON(atomic_read(&eb->refs) == 0);
}
@@ -3739,15 +4161,27 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i;
- int num_pages;
- struct page *page;
+ int num_folios;
btrfs_assert_tree_write_locked(eb);
if (trans && btrfs_header_generation(eb) != trans->transid)
return;
+ /*
+ * Instead of clearing the dirty flag off of the buffer, mark it as
+ * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve
+ * write-ordering in zoned mode, without the need to later re-dirty
+ * the extent_buffer.
+ *
+ * The actual zeroout of the buffer will happen later in
+ * btree_csum_one_bio.
+ */
+ if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags);
+ return;
+ }
+
if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
return;
@@ -3757,32 +4191,32 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
if (eb->fs_info->nodesize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
- num_pages = num_extent_pages(eb);
+ num_folios = num_extent_folios(eb);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!PageDirty(page))
+ if (!folio_test_dirty(folio))
continue;
- lock_page(page);
- btree_clear_page_dirty(page);
- unlock_page(page);
+ folio_lock(folio);
+ btree_clear_folio_dirty(folio);
+ folio_unlock(folio);
}
WARN_ON(atomic_read(&eb->refs) == 0);
}
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
- int i;
- int num_pages;
+ int num_folios;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- num_pages = num_extent_pages(eb);
+ num_folios = num_extent_folios(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+ WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
if (!was_dirty) {
bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
@@ -3799,34 +4233,32 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
* the above race.
*/
if (subpage)
- lock_page(eb->pages[0]);
- for (i = 0; i < num_pages; i++)
- btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
- eb->start, eb->len);
+ lock_page(folio_page(eb->folios[0], 0));
+ for (int i = 0; i < num_folios; i++)
+ btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
+ eb->start, eb->len);
if (subpage)
- unlock_page(eb->pages[0]);
+ unlock_page(folio_page(eb->folios[0], 0));
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
eb->len,
eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
- for (i = 0; i < num_pages; i++)
- ASSERT(PageDirty(eb->pages[i]));
+ for (int i = 0; i < num_folios; i++)
+ ASSERT(folio_test_dirty(eb->folios[i]));
#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page;
- int num_pages;
- int i;
+ int num_folios = num_extent_folios(eb);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!page)
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+
+ if (!folio)
continue;
/*
@@ -3834,46 +4266,56 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb)
* btrfs_is_subpage() can not handle cloned/dummy metadata.
*/
if (fs_info->nodesize >= PAGE_SIZE)
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
else
- btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
- eb->len);
+ btrfs_subpage_clear_uptodate(fs_info, folio,
+ eb->start, eb->len);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page;
- int num_pages;
- int i;
+ int num_folios = num_extent_folios(eb);
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
/*
* This is special handling for metadata subpage, as regular
* btrfs_is_subpage() can not handle cloned/dummy metadata.
*/
if (fs_info->nodesize >= PAGE_SIZE)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
else
- btrfs_subpage_set_uptodate(fs_info, page, eb->start,
- eb->len);
+ btrfs_subpage_set_uptodate(fs_info, folio,
+ eb->start, eb->len);
}
}
-static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
+static void clear_extent_buffer_reading(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+}
+
+static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
u32 bio_offset = 0;
+ /*
+ * If the extent buffer is marked UPTODATE before the read operation
+ * completes, other calls to read_extent_buffer_pages() will return
+ * early without waiting for the read to finish, causing data races.
+ */
+ WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags));
+
eb->read_mirror = bbio->mirror_num;
if (uptodate &&
@@ -3887,22 +4329,20 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
}
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
+ struct folio *folio = fi.folio;
u64 start = eb->start + bio_offset;
- struct page *page = bvec->bv_page;
- u32 len = bvec->bv_len;
+ u32 len = fi.length;
if (uptodate)
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_folio_clear_uptodate(fs_info, folio, start, len);
bio_offset += len;
}
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
bio_put(&bbio->bio);
@@ -3911,8 +4351,8 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
struct btrfs_tree_parent_check *check)
{
- int num_pages = num_extent_pages(eb), i;
struct btrfs_bio *bbio;
+ bool ret;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -3929,6 +4369,17 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags))
goto done;
+ /*
+ * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above
+ * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have
+ * started and finished reading the same eb. In this case, UPTODATE
+ * will now be set, and we shouldn't read it in again.
+ */
+ if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) {
+ clear_extent_buffer_reading(eb);
+ return 0;
+ }
+
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
check_buffer_tree_ref(eb);
@@ -3936,17 +4387,24 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_READ | REQ_META, eb->fs_info,
- extent_buffer_read_end_io, eb);
+ end_bbio_meta_read, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
if (eb->fs_info->nodesize < PAGE_SIZE) {
- __bio_add_page(&bbio->bio, eb->pages[0], eb->len,
- eb->start - page_offset(eb->pages[0]));
+ ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
+ eb->start - folio_pos(eb->folios[0]));
+ ASSERT(ret);
} else {
- for (i = 0; i < num_pages; i++)
- __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
+ int num_folios = num_extent_folios(eb);
+
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+
+ ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
+ ASSERT(ret);
+ }
}
btrfs_submit_bio(bbio, mirror_num);
@@ -3964,7 +4422,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
unsigned long len)
{
btrfs_warn(eb->fs_info,
- "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ "access to eb bytenr %llu len %u out of range start %lu len %lu",
eb->start, eb->len, start, len);
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
@@ -3993,29 +4451,33 @@ static inline int check_eb_range(const struct extent_buffer *eb,
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
- struct page *page;
- char *kaddr;
char *dst = (char *)dstv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
if (check_eb_range(eb, start, len)) {
/*
* Invalid range hit, reset the memory, so callers won't get
- * some random garbage for their uninitialzed memory.
+ * some random garbage for their uninitialized memory.
*/
memset(dstv, 0, len);
return;
}
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ memcpy(dstv, eb->addr + start, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
+ char *kaddr;
- cur = min(len, (PAGE_SIZE - offset));
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
memcpy(dst, kaddr + offset, cur);
dst += cur;
@@ -4029,24 +4491,29 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dstv,
unsigned long start, unsigned long len)
{
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
- struct page *page;
- char *kaddr;
char __user *dst = (char __user *)dstv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ if (copy_to_user_nofault(dstv, eb->addr + start, len))
+ ret = -EFAULT;
+ return ret;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
+ char *kaddr;
- cur = min(len, (PAGE_SIZE - offset));
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
@@ -4064,25 +4531,25 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len)
{
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
int ret = 0;
if (check_eb_range(eb, start, len))
return -EINVAL;
- offset = get_eb_offset_in_page(eb, start);
-
- while (len > 0) {
- page = eb->pages[i];
+ if (eb->addr)
+ return memcmp(ptrv, eb->addr + start, len);
- cur = min(len, (PAGE_SIZE - offset));
+ offset = get_eb_offset_in_folio(eb, start);
- kaddr = page_address(page);
+ while (len > 0) {
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
ret = memcmp(ptr, kaddr + offset, cur);
if (ret)
break;
@@ -4101,10 +4568,12 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
* For regular sector size == PAGE_SIZE case, check if @page is uptodate.
* For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
*/
-static void assert_eb_page_uptodate(const struct extent_buffer *eb,
- struct page *page)
+static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct folio *folio = eb->folios[i];
+
+ ASSERT(folio);
/*
* If we are using the commit root we could potentially clear a page
@@ -4118,11 +4587,14 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
return;
if (fs_info->nodesize < PAGE_SIZE) {
- if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page,
+ struct folio *folio = eb->folios[0];
+
+ ASSERT(i == 0);
+ if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
eb->start, eb->len)))
- btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len);
+ btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len);
} else {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!folio_test_uptodate(folio));
}
}
@@ -4130,29 +4602,34 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
const void *srcv, unsigned long start,
unsigned long len, bool use_memmove)
{
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
char *src = (char *)srcv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
/* For unmapped (dummy) ebs, no need to check their uptodate status. */
const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
- WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
-
if (check_eb_range(eb, start, len))
return;
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ if (use_memmove)
+ memmove(eb->addr + start, srcv, len);
+ else
+ memcpy(eb->addr + start, srcv, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
if (check_uptodate)
- assert_eb_page_uptodate(eb, page);
+ assert_eb_folio_uptodate(eb, i);
- cur = min(len, PAGE_SIZE - offset);
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
if (use_memmove)
memmove(kaddr + offset, src, cur);
else
@@ -4174,16 +4651,21 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
static void memset_extent_buffer(const struct extent_buffer *eb, int c,
unsigned long start, unsigned long len)
{
+ const int unit_size = eb->folio_size;
unsigned long cur = start;
+ if (eb->addr) {
+ memset(eb->addr + start, c, len);
+ return;
+ }
+
while (cur < start + len) {
- unsigned long index = get_eb_page_index(cur);
- unsigned int offset = get_eb_offset_in_page(eb, cur);
- unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset);
- struct page *page = eb->pages[index];
+ unsigned long index = get_eb_folio_index(eb, cur);
+ unsigned int offset = get_eb_offset_in_folio(eb, cur);
+ unsigned int cur_len = min(start + len - cur, unit_size - offset);
- assert_eb_page_uptodate(eb, page);
- memset(page_address(page) + offset, c, cur_len);
+ assert_eb_folio_uptodate(eb, index);
+ memset(folio_address(eb->folios[index]) + offset, c, cur_len);
cur += cur_len;
}
@@ -4200,15 +4682,16 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src)
{
+ const int unit_size = src->folio_size;
unsigned long cur = 0;
ASSERT(dst->len == src->len);
while (cur < src->len) {
- unsigned long index = get_eb_page_index(cur);
- unsigned long offset = get_eb_offset_in_page(src, cur);
- unsigned long cur_len = min(src->len, PAGE_SIZE - offset);
- void *addr = page_address(src->pages[index]) + offset;
+ unsigned long index = get_eb_folio_index(src, cur);
+ unsigned long offset = get_eb_offset_in_folio(src, cur);
+ unsigned long cur_len = min(src->len, unit_size - offset);
+ void *addr = folio_address(src->folios[index]) + offset;
write_extent_buffer(dst, addr, cur, cur_len);
@@ -4221,12 +4704,12 @@ void copy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
+ const int unit_size = dst->folio_size;
u64 dst_len = dst->len;
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
- unsigned long i = get_eb_page_index(dst_offset);
+ unsigned long i = get_eb_folio_index(dst, dst_offset);
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(src, src_offset, len))
@@ -4234,15 +4717,14 @@ void copy_extent_buffer(const struct extent_buffer *dst,
WARN_ON(src->len != dst_len);
- offset = get_eb_offset_in_page(dst, dst_offset);
+ offset = get_eb_offset_in_folio(dst, dst_offset);
while (len > 0) {
- page = dst->pages[i];
- assert_eb_page_uptodate(dst, page);
+ assert_eb_folio_uptodate(dst, i);
- cur = min(len, (unsigned long)(PAGE_SIZE - offset));
+ cur = min(len, (unsigned long)(unit_size - offset));
- kaddr = page_address(page);
+ kaddr = folio_address(dst->folios[i]);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
src_offset += cur;
@@ -4253,22 +4735,22 @@ void copy_extent_buffer(const struct extent_buffer *dst,
}
/*
- * Calculate the page and offset of the byte containing the given bit number.
+ * Calculate the folio and offset of the byte containing the given bit number.
*
* @eb: the extent buffer
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number
- * @page_index: return index of the page in the extent buffer that contains
+ * @folio_index: return index of the folio in the extent buffer that contains
* the given bit number
- * @page_offset: return offset into the page given by page_index
+ * @folio_offset: return offset into the folio given by folio_index
*
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
*/
static inline void eb_bitmap_offset(const struct extent_buffer *eb,
unsigned long start, unsigned long nr,
- unsigned long *page_index,
- size_t *page_offset)
+ unsigned long *folio_index,
+ size_t *folio_offset)
{
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -4278,10 +4760,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start + offset_in_page(eb->start) + byte_offset;
+ offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset;
- *page_index = offset >> PAGE_SHIFT;
- *page_offset = offset_in_page(offset);
+ *folio_index = offset >> eb->folio_shift;
+ *folio_offset = offset_in_eb_folio(eb, offset);
}
/*
@@ -4294,25 +4776,23 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
- u8 *kaddr;
- struct page *page;
unsigned long i;
size_t offset;
+ u8 *kaddr;
eb_bitmap_offset(eb, start, nr, &i, &offset);
- page = eb->pages[i];
- assert_eb_page_uptodate(eb, page);
- kaddr = page_address(page);
+ assert_eb_folio_uptodate(eb, i);
+ kaddr = folio_address(eb->folios[i]);
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr)
{
- unsigned long index = get_eb_page_index(bytenr);
+ unsigned long index = get_eb_folio_index(eb, bytenr);
if (check_eb_range(eb, bytenr, 1))
return NULL;
- return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr);
+ return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr);
}
/*
@@ -4397,19 +4877,30 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
+ const int unit_size = dst->folio_size;
unsigned long cur_off = 0;
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(dst, src_offset, len))
return;
+ if (dst->addr) {
+ const bool use_memmove = areas_overlap(src_offset, dst_offset, len);
+
+ if (use_memmove)
+ memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+ else
+ memcpy(dst->addr + dst_offset, dst->addr + src_offset, len);
+ return;
+ }
+
while (cur_off < len) {
unsigned long cur_src = cur_off + src_offset;
- unsigned long pg_index = get_eb_page_index(cur_src);
- unsigned long pg_off = get_eb_offset_in_page(dst, cur_src);
+ unsigned long folio_index = get_eb_folio_index(dst, cur_src);
+ unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src);
unsigned long cur_len = min(src_offset + len - cur_src,
- PAGE_SIZE - pg_off);
- void *src_addr = page_address(dst->pages[pg_index]) + pg_off;
+ unit_size - folio_off);
+ void *src_addr = folio_address(dst->folios[folio_index]) + folio_off;
const bool use_memmove = areas_overlap(src_offset + cur_off,
dst_offset + cur_off, cur_len);
@@ -4435,24 +4926,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
return;
}
+ if (dst->addr) {
+ memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+ return;
+ }
+
while (len > 0) {
unsigned long src_i;
size_t cur;
- size_t dst_off_in_page;
- size_t src_off_in_page;
+ size_t dst_off_in_folio;
+ size_t src_off_in_folio;
void *src_addr;
bool use_memmove;
- src_i = get_eb_page_index(src_end);
+ src_i = get_eb_folio_index(dst, src_end);
- dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
- src_off_in_page = get_eb_offset_in_page(dst, src_end);
+ dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end);
+ src_off_in_folio = get_eb_offset_in_folio(dst, src_end);
- cur = min_t(unsigned long, len, src_off_in_page + 1);
- cur = min(cur, dst_off_in_page + 1);
+ cur = min_t(unsigned long, len, src_off_in_folio + 1);
+ cur = min(cur, dst_off_in_folio + 1);
- src_addr = page_address(dst->pages[src_i]) + src_off_in_page -
- cur + 1;
+ src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio -
+ cur + 1;
use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1,
cur);
@@ -4505,7 +5001,7 @@ out:
static int try_release_subpage_extent_buffer(struct page *page)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = page_to_fs_info(page);
u64 cur = page_offset(page);
const u64 end = page_offset(page) + PAGE_SIZE;
int ret;
@@ -4514,7 +5010,7 @@ static int try_release_subpage_extent_buffer(struct page *page)
struct extent_buffer *eb = NULL;
/*
- * Unlike try_release_extent_buffer() which uses page->private
+ * Unlike try_release_extent_buffer() which uses folio private
* to grab buffer, for subpage case we rely on radix tree, thus
* we need to ensure radix tree consistency.
*
@@ -4554,43 +5050,44 @@ static int try_release_subpage_extent_buffer(struct page *page)
/*
* Here we don't care about the return value, we will always
- * check the page private at the end. And
+ * check the folio private at the end. And
* release_extent_buffer() will release the refs_lock.
*/
release_extent_buffer(eb);
}
/*
- * Finally to check if we have cleared page private, as if we have
- * released all ebs in the page, the page private should be cleared now.
+ * Finally, check whether folio private has been cleared: if we have
+ * released all ebs in the page, folio private should be cleared by now.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page))
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(page_folio(page)))
ret = 1;
else
ret = 0;
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
return ret;
}
int try_release_extent_buffer(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct extent_buffer *eb;
- if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
+ if (page_to_fs_info(page)->nodesize < PAGE_SIZE)
return try_release_subpage_extent_buffer(page);
/*
- * We need to make sure nobody is changing page->private, as we rely on
- * page->private as the pointer to extent buffer.
+ * We need to make sure nobody is changing folio private, as we rely on
+ * folio private as the pointer to extent buffer.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&page->mapping->private_lock);
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&page->mapping->i_private_lock);
return 1;
}
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
BUG_ON(!eb);
/*
@@ -4601,10 +5098,10 @@ int try_release_extent_buffer(struct page *page)
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
return 0;
}
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a real ref,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2171057a4477..dca6b12769ec 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -7,11 +7,33 @@
#include <linux/refcount.h>
#include <linux/fiemap.h>
#include <linux/btrfs_tree.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/slab.h>
#include "compression.h"
+#include "messages.h"
#include "ulist.h"
#include "misc.h"
+struct page;
+struct file;
+struct folio;
+struct inode;
+struct fiemap_extent_info;
+struct readahead_control;
+struct address_space;
+struct writeback_control;
+struct extent_io_tree;
+struct extent_map_tree;
+struct extent_state;
+struct btrfs_block_group;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_root;
struct btrfs_trans_handle;
+struct btrfs_tree_parent_check;
enum {
EXTENT_BUFFER_UPTODATE,
@@ -28,7 +50,8 @@ enum {
EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
- EXTENT_BUFFER_NO_CHECK,
+ /* Indicate the extent buffer is written zeroed out (for zoned) */
+ EXTENT_BUFFER_ZONED_ZEROOUT,
/* Indicate that extent buffer pages are being read */
EXTENT_BUFFER_READING,
};
@@ -43,10 +66,10 @@ enum {
};
/*
- * page->private values. Every page that is controlled by the extent
- * map has page->private set to one.
+ * Folio private values. Every page that is controlled by the extent map has
+ * folio private set to this value.
*/
-#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_FOLIO_PRIVATE 1
/*
* The extent buffer bitmap operations are done with byte granularity instead of
@@ -62,11 +85,6 @@ enum {
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
-struct btrfs_root;
-struct btrfs_inode;
-struct btrfs_fs_info;
-struct extent_io_tree;
-struct btrfs_tree_parent_check;
int __init extent_buffer_init_cachep(void);
void __cold extent_buffer_free_cachep(void);
@@ -74,19 +92,33 @@ void __cold extent_buffer_free_cachep(void);
#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
struct extent_buffer {
u64 start;
- unsigned long len;
+ u32 len;
+ u32 folio_size;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * The address where the eb can be accessed without any cross-page handling.
+ * This can be NULL if not possible.
+ */
+ void *addr;
+
spinlock_t refs_lock;
atomic_t refs;
int read_mirror;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
+ u8 folio_shift;
struct rcu_head rcu_head;
struct rw_semaphore lock;
- struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+ /*
+ * Pointers to all the folios of the extent buffer.
+ *
+ * For now the folio is always order 0 (aka, a single page).
+ */
+ struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
pid_t lock_owner;
@@ -100,6 +132,13 @@ struct btrfs_eb_write_context {
struct btrfs_block_group *zoned_bg;
};
+static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb,
+ u64 start)
+{
+ ASSERT(eb->folio_size);
+ return start & (eb->folio_size - 1);
+}
+
/*
* Get the correct offset inside the page of extent buffer.
*
@@ -108,29 +147,43 @@ struct btrfs_eb_write_context {
*
* Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
*/
-static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
- unsigned long offset)
+static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb,
+ unsigned long offset)
{
/*
- * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
- * to PAGE_SIZE, thus adding it won't cause any difference.
+ * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+ * 1.1) One large folio covering the whole eb
+ * The eb->start is aligned to folio size, thus adding it
+ * won't cause any difference.
+ * 1.2) Several page sized folios
+ * The eb->start is aligned to folio (page) size, thus
+ * adding it won't cause any difference.
*
- * For sectorsize < PAGE_SIZE, we must only read the data that belongs
- * to the eb, thus we have to take the eb->start into consideration.
+ * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+ * In this case there would only be one page sized folio, and there
+ * may be several different extent buffers in the page/folio.
+ * We need to add eb->start to properly access the offset inside
+ * that eb.
*/
- return offset_in_page(offset + eb->start);
+ return offset_in_folio(eb->folios[0], offset + eb->start);
}
-static inline unsigned long get_eb_page_index(unsigned long offset)
+static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb,
+ unsigned long offset)
{
/*
- * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+ * 1.1) One large folio covering the whole eb.
+ * the folio_shift would be large enough to always make us
+ * return 0 as index.
+ * 1.2) Several page sized folios
+ * The folio_shift would be PAGE_SHIFT, giving us the correct
+ * index.
*
- * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
- * and have ensured that all tree blocks are contained in one page,
- * thus we always get index == 0.
+ * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+ * The folio would only be page sized, and always give us 0 as index.
*/
- return offset >> PAGE_SHIFT;
+ return offset >> eb->folio_shift;
}
/*
@@ -178,22 +231,20 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
kfree(changeset);
}
-struct extent_map_tree;
-
-int try_release_extent_mapping(struct page *page, gfp_t mask);
+bool try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int btrfs_read_folio(struct file *file, struct folio *folio);
void extent_write_locked_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty);
-int extent_writepages(struct address_space *mapping,
- struct writeback_control *wbc);
+int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
-void extent_readahead(struct readahead_control *rac);
+void btrfs_readahead(struct readahead_control *rac);
int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
+int set_folio_extent_mapped(struct folio *folio);
int set_page_extent_mapped(struct page *page);
void clear_page_extent_mapped(struct page *page);
@@ -230,6 +281,20 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
return (eb->len >> PAGE_SHIFT) ?: 1;
}
+/*
+ * The folio count can only be determined at runtime by checking eb::folios[0].
+ *
+ * An eb is backed either by one large folio covering the whole eb
+ * (either nodesize <= PAGE_SIZE, or high order folio), or by multiple
+ * single-paged folios.
+ */
+static inline int num_extent_folios(const struct extent_buffer *eb)
+{
+ if (folio_order(eb->folios[0]))
+ return 1;
+ return num_extent_pages(eb);
+}
+
static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -288,13 +353,17 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
+ struct extent_state **cached,
u32 bits_to_clear, unsigned long page_ops);
int extent_invalidate_folio(struct extent_io_tree *tree,
struct folio *folio, size_t offset);
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+ gfp_t extra_gfp);
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
+ gfp_t extra_gfp);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a6d8368ed0ed..744e8952abb0 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -5,10 +5,10 @@
#include <linux/spinlock.h>
#include "messages.h"
#include "ctree.h"
-#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
#include "btrfs_inode.h"
+#include "disk-io.h"
static struct kmem_cache *extent_map_cache;
@@ -16,8 +16,7 @@ static struct kmem_cache *extent_map_cache;
int __init extent_map_init(void)
{
extent_map_cache = kmem_cache_create("btrfs_extent_map",
- sizeof(struct extent_map), 0,
- SLAB_MEM_SPREAD, NULL);
+ sizeof(struct extent_map), 0, 0, NULL);
if (!extent_map_cache)
return -ENOMEM;
return 0;
@@ -50,7 +49,6 @@ struct extent_map *alloc_extent_map(void)
if (!em)
return NULL;
RB_CLEAR_NODE(&em->rb_node);
- em->compress_type = BTRFS_COMPRESS_NONE;
refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
@@ -67,8 +65,6 @@ void free_extent_map(struct extent_map *em)
if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- kfree(em->map_lookup);
kmem_cache_free(extent_map_cache, em);
}
}
@@ -81,6 +77,14 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
+static void dec_evictable_extent_maps(struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
+ percpu_counter_dec(&fs_info->evictable_extent_maps);
+}
+
static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
struct rb_node **p = &root->rb_root.rb_node;
@@ -182,54 +186,55 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return NULL;
}
-/* Check to see if two extent_map structs are adjacent and safe to merge. */
-static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+static inline u64 extent_map_block_end(const struct extent_map *em)
{
- if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
- return 0;
+ if (em->block_start + em->block_len < em->block_start)
+ return (u64)-1;
+ return em->block_start + em->block_len;
+}
- /*
- * don't merge compressed extents, we need to know their
- * actual size
- */
- if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
- return 0;
+static bool can_merge_extent_map(const struct extent_map *em)
+{
+ if (em->flags & EXTENT_FLAG_PINNED)
+ return false;
- if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
- test_bit(EXTENT_FLAG_LOGGING, &next->flags))
- return 0;
+ /* Don't merge compressed extents, we need to know their actual size. */
+ if (extent_map_is_compressed(em))
+ return false;
+
+ if (em->flags & EXTENT_FLAG_LOGGING)
+ return false;
/*
* We don't want to merge stuff that hasn't been written to the log yet
* since it may not reflect exactly what is on disk, and that would be
* bad.
*/
- if (!list_empty(&prev->list) || !list_empty(&next->list))
- return 0;
+ if (!list_empty(&em->list))
+ return false;
- ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
- prev->block_start != EXTENT_MAP_DELALLOC);
-
- if (prev->map_lookup || next->map_lookup)
- ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
- test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
-
- if (extent_map_end(prev) == next->start &&
- prev->flags == next->flags &&
- prev->map_lookup == next->map_lookup &&
- ((next->block_start == EXTENT_MAP_HOLE &&
- prev->block_start == EXTENT_MAP_HOLE) ||
- (next->block_start == EXTENT_MAP_INLINE &&
- prev->block_start == EXTENT_MAP_INLINE) ||
- (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
- next->block_start == extent_map_block_end(prev)))) {
- return 1;
- }
- return 0;
+ return true;
}
-static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
+/* Check to see if two extent_map structs are adjacent and safe to merge. */
+static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
+{
+ if (extent_map_end(prev) != next->start)
+ return false;
+
+ if (prev->flags != next->flags)
+ return false;
+
+ if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
+ return next->block_start == extent_map_block_end(prev);
+
+ /* HOLES and INLINE extents. */
+ return next->block_start == prev->block_start;
+}
+
+static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
struct extent_map *merge = NULL;
struct rb_node *rb;
@@ -244,46 +249,48 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
if (refcount_read(&em->refs) > 2)
return;
+ if (!can_merge_extent_map(em))
+ return;
+
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(merge, em)) {
+ if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
em->start = merge->start;
em->orig_start = merge->orig_start;
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
- em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
- em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
- set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ em->flags |= EXTENT_FLAG_MERGED;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
+ dec_evictable_extent_maps(inode);
}
}
rb = rb_next(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(em, merge)) {
+ if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
em->block_len += merge->block_len;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
- em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
- set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ em->flags |= EXTENT_FLAG_MERGED;
free_extent_map(merge);
+ dec_evictable_extent_maps(inode);
}
}
/*
* Unpin an extent from the cache.
*
- * @tree: tree to unpin the extent in
+ * @inode: the inode from which we are unpinning an extent range
* @start: logical offset in the file
* @len: length of the extent
* @gen: generation that this extent has been modified in
@@ -291,133 +298,107 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
* Called after an extent has been written to disk properly. Set the generation
* to the generation that actually added the file item to the inode so we know
* we need to sync this extent when we call fsync().
+ *
+ * Returns: 0 on success
+ * -ENOENT when the extent is not found in the tree
+ * -EUCLEAN if the found extent does not match the expected start
*/
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
- u64 gen)
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *tree = &inode->extent_tree;
int ret = 0;
struct extent_map *em;
- bool prealloc = false;
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
- WARN_ON(!em || em->start != start);
-
- if (!em)
+ if (WARN_ON(!em)) {
+ btrfs_warn(fs_info,
+"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
+ btrfs_ino(inode), btrfs_root_id(inode->root),
+ start, start + len, gen);
+ ret = -ENOENT;
goto out;
+ }
- em->generation = gen;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- em->mod_start = em->start;
- em->mod_len = em->len;
-
- if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
- prealloc = true;
- clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+ if (WARN_ON(em->start != start)) {
+ btrfs_warn(fs_info,
+"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
+ btrfs_ino(inode), btrfs_root_id(inode->root),
+ em->start, start, start + len, gen);
+ ret = -EUCLEAN;
+ goto out;
}
- try_merge_map(tree, em);
+ em->generation = gen;
+ em->flags &= ~EXTENT_FLAG_PINNED;
- if (prealloc) {
- em->mod_start = em->start;
- em->mod_len = em->len;
- }
+ try_merge_map(inode, em);
- free_extent_map(em);
out:
write_unlock(&tree->lock);
+ free_extent_map(em);
return ret;
}
-void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
{
- lockdep_assert_held_write(&tree->lock);
+ lockdep_assert_held_write(&inode->extent_tree.lock);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags &= ~EXTENT_FLAG_LOGGING;
if (extent_map_in_tree(em))
- try_merge_map(tree, em);
+ try_merge_map(inode, em);
}
-static inline void setup_extent_mapping(struct extent_map_tree *tree,
+static inline void setup_extent_mapping(struct btrfs_inode *inode,
struct extent_map *em,
int modified)
{
refcount_inc(&em->refs);
- em->mod_start = em->start;
- em->mod_len = em->len;
+
+ ASSERT(list_empty(&em->list));
if (modified)
- list_move(&em->list, &tree->modified_extents);
+ list_add(&em->list, &inode->extent_tree.modified_extents);
else
- try_merge_map(tree, em);
-}
-
-static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
-{
- struct map_lookup *map = em->map_lookup;
- u64 stripe_size = em->orig_block_len;
- int i;
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_io_stripe *stripe = &map->stripes[i];
- struct btrfs_device *device = stripe->dev;
-
- set_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1,
- bits | EXTENT_NOWAIT, NULL);
- }
-}
-
-static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
-{
- struct map_lookup *map = em->map_lookup;
- u64 stripe_size = em->orig_block_len;
- int i;
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_io_stripe *stripe = &map->stripes[i];
- struct btrfs_device *device = stripe->dev;
-
- __clear_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1,
- bits | EXTENT_NOWAIT,
- NULL, NULL);
- }
+ try_merge_map(inode, em);
}
/*
- * Add new extent map to the extent tree
+ * Add a new extent map to an inode's extent map tree.
*
- * @tree: tree to insert new map in
+ * @inode: the target inode
* @em: map to insert
* @modified: indicate whether the given @em should be added to the
* modified list, which indicates the extent needs to be logged
*
- * Insert @em into @tree or perform a simple forward/backward merge with
- * existing mappings. The extent_map struct passed in will be inserted
- * into the tree directly, with an additional reference taken, or a
- * reference dropped if the merge attempt was successful.
+ * Insert @em into the @inode's extent map tree or perform a simple
+ * forward/backward merge with existing mappings. The extent_map struct passed
+ * in will be inserted into the tree directly, with an additional reference
+ * taken, or a reference dropped if the merge attempt was successful.
*/
-int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em, int modified)
+static int add_extent_mapping(struct btrfs_inode *inode,
+ struct extent_map *em, int modified)
{
- int ret = 0;
+ struct extent_map_tree *tree = &inode->extent_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
lockdep_assert_held_write(&tree->lock);
ret = tree_insert(&tree->map, em);
if (ret)
- goto out;
+ return ret;
- setup_extent_mapping(tree, em, modified);
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
- extent_map_device_set_bits(em, CHUNK_ALLOCATED);
- extent_map_device_clear_bits(em, CHUNK_TRIMMED);
- }
-out:
- return ret;
+ setup_extent_mapping(inode, em, modified);
+
+ if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
+ percpu_counter_inc(&fs_info->evictable_extent_maps);
+
+ return 0;
}
static struct extent_map *
@@ -483,42 +464,46 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
}
/*
- * Remove an extent_map from the extent tree.
+ * Remove an extent_map from its inode's extent tree.
*
- * @tree: extent tree to remove from
+ * @inode: the inode the extent map belongs to
* @em: extent map being removed
*
- * Remove @em from @tree. No reference counts are dropped, and no checks
- * are done to see if the range is in use.
+ * Remove @em from the extent tree of @inode. No reference counts are dropped,
+ * and no checks are done to see if the range is in use.
*/
-void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+
lockdep_assert_held_write(&tree->lock);
- WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ WARN_ON(em->flags & EXTENT_FLAG_PINNED);
rb_erase_cached(&em->rb_node, &tree->map);
- if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
RB_CLEAR_NODE(&em->rb_node);
+
+ dec_evictable_extent_maps(inode);
}
-static void replace_extent_mapping(struct extent_map_tree *tree,
+static void replace_extent_mapping(struct btrfs_inode *inode,
struct extent_map *cur,
struct extent_map *new,
int modified)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+
lockdep_assert_held_write(&tree->lock);
- WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
ASSERT(extent_map_in_tree(cur));
- if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ if (!(cur->flags & EXTENT_FLAG_LOGGING))
list_del_init(&cur->list);
rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
- setup_extent_mapping(tree, new, modified);
+ setup_extent_mapping(inode, new, modified);
}
static struct extent_map *next_extent_map(const struct extent_map *em)
@@ -547,7 +532,7 @@ static struct extent_map *prev_extent_map(struct extent_map *em)
* and an extent that you want to insert, deal with overlap and insert
* the best fitted new extent into the tree.
*/
-static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
+static noinline int merge_extent_mapping(struct btrfs_inode *inode,
struct extent_map *existing,
struct extent_map *em,
u64 map_start)
@@ -558,7 +543,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
u64 end;
u64 start_diff;
- BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+ if (map_start < em->start || map_start >= extent_map_end(em))
+ return -EINVAL;
if (existing->start > map_start) {
next = existing;
@@ -576,18 +562,17 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
em->start = start;
em->len = end - start;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ !extent_map_is_compressed(em)) {
em->block_start += start_diff;
em->block_len = em->len;
}
- return add_extent_mapping(em_tree, em, 0);
+ return add_extent_mapping(inode, em, 0);
}
/*
- * Add extent mapping into em_tree.
+ * Add extent mapping into an inode's extent map tree.
*
- * @fs_info: the filesystem
- * @em_tree: extent tree into which we want to insert the extent mapping
+ * @inode: target inode
* @em_in: extent we are inserting
* @start: start of the logical range btrfs_get_extent() is requesting
* @len: length of the logical range btrfs_get_extent() is requesting
@@ -595,8 +580,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Note that @em_in's range may be different from [start, start+len),
* but they must be overlapped.
*
- * Insert @em_in into @em_tree. In case there is an overlapping range, handle
- * the -EEXIST by either:
+ * Insert @em_in into the inode's extent map tree. In case there is an
+ * overlapping range, handle the -EEXIST by either:
* a) Returning the existing extent in @em_in if @start is within the
* existing em.
* b) Merge the existing extent with @em_in passed in.
@@ -604,12 +589,12 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Return 0 on success, otherwise -EEXIST.
*
*/
-int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_inode *inode,
struct extent_map **em_in, u64 start, u64 len)
{
int ret;
struct extent_map *em = *em_in;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
/*
* Tree-checker should have rejected any inline extent with non-zero
@@ -618,7 +603,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
if (em->block_start == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = add_extent_mapping(inode, em, 0);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
@@ -626,9 +611,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
if (ret == -EEXIST) {
struct extent_map *existing;
- ret = 0;
-
- existing = search_extent_mapping(em_tree, start, len);
+ existing = search_extent_mapping(&inode->extent_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -649,15 +632,14 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
* The existing extent map is the one nearest to
* the [start, start + len) range which overlaps
*/
- ret = merge_extent_mapping(em_tree, existing,
- em, start);
- if (ret) {
+ ret = merge_extent_mapping(inode, existing, em, start);
+ if (WARN_ON(ret)) {
free_extent_map(em);
*em_in = NULL;
- WARN_ONCE(ret,
-"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
- ret, existing->start, existing->len,
- orig_start, orig_len);
+ btrfs_warn(fs_info,
+"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
+ existing->start, extent_map_end(existing),
+ orig_start, orig_start + orig_len, start);
}
free_extent_map(existing);
}
@@ -672,8 +654,10 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
* if needed. This avoids searching the tree, from the root down to the first
* extent map, before each deletion.
*/
-static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
+static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+
write_lock(&tree->lock);
while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
struct extent_map *em;
@@ -681,9 +665,8 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
node = rb_first_cached(&tree->map);
em = rb_entry(node, struct extent_map, rb_node);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
- remove_extent_mapping(tree, em);
+ em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
+ remove_extent_mapping(inode, em);
free_extent_map(em);
cond_resched_rwlock_write(&tree->lock);
}
@@ -716,7 +699,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
WARN_ON(end < start);
if (end == (u64)-1) {
if (start == 0 && !skip_pinned) {
- drop_all_extent_maps_fast(em_tree);
+ drop_all_extent_maps_fast(inode);
return;
}
len = (u64)-1;
@@ -758,19 +741,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
}
- if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
start = em_end;
goto next;
}
flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
/*
* In case we split the extent map, we want to preserve the
* EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
* it on the new extent maps.
*/
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
modified = !list_empty(&em->list);
/*
@@ -781,7 +763,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
goto remove_em;
gen = em->generation;
- compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ compressed = extent_map_is_compressed(em);
if (em->start < start) {
if (!split) {
@@ -814,8 +796,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->generation = gen;
split->flags = flags;
- split->compress_type = em->compress_type;
- replace_extent_mapping(em_tree, em, split, modified);
+ replace_extent_mapping(inode, em, split, modified);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -831,7 +812,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->len = em_end - end;
split->block_start = em->block_start;
split->flags = flags;
- split->compress_type = em->compress_type;
split->generation = gen;
if (em->block_start < EXTENT_MAP_LAST_BYTE) {
@@ -843,7 +823,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->block_len = em->block_len;
split->orig_start = em->orig_start;
} else {
- const u64 diff = start + len - em->start;
+ const u64 diff = end - em->start;
split->block_len = split->len;
split->block_start += diff;
@@ -857,13 +837,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
if (extent_map_in_tree(em)) {
- replace_extent_mapping(em_tree, em, split,
- modified);
+ replace_extent_mapping(inode, em, split, modified);
} else {
int ret;
- ret = add_extent_mapping(em_tree, split,
- modified);
+ ret = add_extent_mapping(inode, split, modified);
/* Logic error, shouldn't happen. */
ASSERT(ret == 0);
if (WARN_ON(ret != 0) && modified)
@@ -898,7 +876,7 @@ remove_em:
ASSERT(!split);
btrfs_set_inode_full_sync(inode);
}
- remove_extent_mapping(em_tree, em);
+ remove_extent_mapping(inode, em);
}
/*
@@ -953,7 +931,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
do {
btrfs_drop_extent_map_range(inode, new_em->start, end, false);
write_lock(&tree->lock);
- ret = add_extent_mapping(tree, new_em, modified);
+ ret = add_extent_mapping(inode, new_em, modified);
write_unlock(&tree->lock);
} while (ret == -EEXIST);
@@ -997,14 +975,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
}
ASSERT(em->len == len);
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(!extent_map_is_compressed(em));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
- ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(em->flags & EXTENT_FLAG_PINNED);
+ ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
ASSERT(!list_empty(&em->list));
flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags &= ~EXTENT_FLAG_PINNED;
/* First, replace the em with a new extent_map starting from em->start */
split_pre->start = em->start;
@@ -1015,10 +993,9 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_pre->orig_block_len = split_pre->block_len;
split_pre->ram_bytes = split_pre->len;
split_pre->flags = flags;
- split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
- replace_extent_mapping(em_tree, em, split_pre, 1);
+ replace_extent_mapping(inode, em, split_pre, 1);
/*
* Now we only have an extent_map at:
@@ -1034,9 +1011,8 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_mid->orig_block_len = split_mid->block_len;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
- split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, 1);
+ add_extent_mapping(inode, split_mid, 1);
/* Once for us */
free_extent_map(em);
@@ -1051,3 +1027,175 @@ out_free_pre:
free_extent_map(split_pre);
return ret;
}
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+{
+ const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ struct extent_map_tree *tree = &inode->extent_tree;
+ long nr_dropped = 0;
+ struct rb_node *node;
+
+ /*
+ * Take the mmap lock so that we serialize with the inode logging phase
+ * of fsync because we may need to set the full sync flag on the inode,
+ * in case we have to remove extent maps in the tree's list of modified
+ * extents. If we set the full sync flag in the inode while an fsync is
+ * in progress, we may risk missing new extents because before the flag
+ * is set, fsync decides to only wait for writeback to complete and then
+ * during inode logging it sees the flag set and uses the subvolume tree
+ * to find new extents, which may not be there yet because ordered
+ * extents haven't completed yet.
+ *
+ * We also do a try lock because otherwise we could deadlock. This is
+ * because the shrinker for this filesystem may be invoked while we are
+ * in a path that is holding the mmap lock in write mode. For example in
+ * a reflink operation while COWing an extent buffer, when allocating
+ * pages for a new extent buffer and under memory pressure, the shrinker
+ * may be invoked, and therefore we would deadlock by attempting to read
+ * lock the mmap lock while we are already holding a write lock on it.
+ */
+ if (!down_read_trylock(&inode->i_mmap_lock))
+ return 0;
+
+ write_lock(&tree->lock);
+ node = rb_first_cached(&tree->map);
+ while (node) {
+ struct extent_map *em;
+
+ em = rb_entry(node, struct extent_map, rb_node);
+ node = rb_next(node);
+ (*scanned)++;
+
+ if (em->flags & EXTENT_FLAG_PINNED)
+ goto next;
+
+ /*
+ * If the extent map is in the list of modified extents (new) and
+ * its generation is the same (or is greater than) the current fs
+ * generation, it means it was not yet persisted so we have to
+ * set the full sync flag so that the next fsync will not miss
+ * it.
+ */
+ if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
+ btrfs_set_inode_full_sync(inode);
+
+ remove_extent_mapping(inode, em);
+ trace_btrfs_extent_map_shrinker_remove_em(inode, em);
+ /* Drop the reference for the tree. */
+ free_extent_map(em);
+ nr_dropped++;
+next:
+ if (*scanned >= nr_to_scan)
+ break;
+
+ /*
+ * Restart if we had to reschedule, and any extent maps that were
+ * pinned before may have become unpinned after we released the
+ * lock and took it again.
+ */
+ if (cond_resched_rwlock_write(&tree->lock))
+ node = rb_first_cached(&tree->map);
+ }
+ write_unlock(&tree->lock);
+ up_read(&inode->i_mmap_lock);
+
+ return nr_dropped;
+}
+
+static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_inode *inode;
+ long nr_dropped = 0;
+ u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+
+ inode = btrfs_find_first_inode(root, min_ino);
+ while (inode) {
+ nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+
+ min_ino = btrfs_ino(inode) + 1;
+ fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
+ iput(&inode->vfs_inode);
+
+ if (*scanned >= nr_to_scan)
+ break;
+
+ cond_resched();
+ inode = btrfs_find_first_inode(root, min_ino);
+ }
+
+ if (inode) {
+ /*
+ * There are still inodes in this root or we happened to process
+ * the last one and reached the scan limit. In either case set
+ * the current root to this one, so we'll resume from the next
+ * inode if there is one or we will find out this was the last
+ * one and move to the next root.
+ */
+ fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+ } else {
+ /*
+ * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
+ * that when processing the next root we start from its first inode.
+ */
+ fs_info->extent_map_shrinker_last_ino = 0;
+ fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+ }
+
+ return nr_dropped;
+}
+
+long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+ const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
+ u64 next_root_id = start_root_id;
+ bool cycled = false;
+ long nr_dropped = 0;
+ long scanned = 0;
+
+ if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
+ s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+ }
+
+ while (scanned < nr_to_scan) {
+ struct btrfs_root *root;
+ unsigned long count;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)&root,
+ (unsigned long)next_root_id, 1);
+ if (count == 0) {
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ if (start_root_id > 0 && !cycled) {
+ next_root_id = 0;
+ fs_info->extent_map_shrinker_last_root = 0;
+ fs_info->extent_map_shrinker_last_ino = 0;
+ cycled = true;
+ continue;
+ }
+ break;
+ }
+ next_root_id = btrfs_root_id(root) + 1;
+ root = btrfs_grab_root(root);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (!root)
+ continue;
+
+ if (is_fstree(btrfs_root_id(root)))
+ nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+
+ btrfs_put_root(root);
+ }
+
+ if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
+ s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+ }
+
+ return nr_dropped;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 35d27c756e08..6d587111f73a 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -3,44 +3,104 @@
#ifndef BTRFS_EXTENT_MAP_H
#define BTRFS_EXTENT_MAP_H
+#include <linux/compiler_types.h>
+#include <linux/rwlock_types.h>
#include <linux/rbtree.h>
+#include <linux/list.h>
#include <linux/refcount.h>
+#include "misc.h"
+#include "extent_map.h"
+#include "compression.h"
+
+struct btrfs_inode;
+struct btrfs_fs_info;
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
-/* used only during fiemap calls */
-#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the extent_map::flags field */
enum {
/* this entry not yet on disk, don't free it */
- EXTENT_FLAG_PINNED,
- EXTENT_FLAG_COMPRESSED,
+ ENUM_BIT(EXTENT_FLAG_PINNED),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD),
/* pre-allocated extent */
- EXTENT_FLAG_PREALLOC,
+ ENUM_BIT(EXTENT_FLAG_PREALLOC),
/* Logging this extent */
- EXTENT_FLAG_LOGGING,
- /* Filling in a preallocated extent */
- EXTENT_FLAG_FILLING,
- /* filesystem extent mapping type */
- EXTENT_FLAG_FS_MAPPING,
+ ENUM_BIT(EXTENT_FLAG_LOGGING),
/* This em is merged from two or more physically adjacent ems */
- EXTENT_FLAG_MERGED,
+ ENUM_BIT(EXTENT_FLAG_MERGED),
};
+/*
+ * This structure represents file extents and holes.
+ *
+ * Unlike on-disk file extent items, extent maps can be merged to save memory.
+ * This means members only match file extent items before any merging.
+ *
+ * Keep this structure as compact as possible, as we can have really large
+ * amounts of allocated extent maps at any time.
+ */
struct extent_map {
struct rb_node rb_node;
- /* all of these are in bytes */
+ /* All of these are in bytes. */
+
+ /* File offset matching the offset of a BTRFS_EXTENT_DATA_KEY key. */
u64 start;
+
+ /*
+ * Length of the file extent.
+ *
+ * For non-inlined file extents it's btrfs_file_extent_item::num_bytes.
+ * For inline extents it's sectorsize, since inline data starts at
+ * offsetof(struct btrfs_file_extent_item, disk_bytenr) thus
+ * btrfs_file_extent_item::num_bytes is not valid.
+ */
u64 len;
- u64 mod_start;
- u64 mod_len;
+
+ /*
+ * The file offset of the original file extent before splitting.
+ *
+ * This is an in-memory only member, matching
+ * extent_map::start - btrfs_file_extent_item::offset for
+ * regular/preallocated extents. EXTENT_MAP_HOLE otherwise.
+ */
u64 orig_start;
+
+ /*
+ * The full on-disk extent length, matching
+ * btrfs_file_extent_item::disk_num_bytes.
+ */
u64 orig_block_len;
+
+ /*
+ * The decompressed size of the whole on-disk extent, matching
+ * btrfs_file_extent_item::ram_bytes.
+ */
u64 ram_bytes;
+
+ /*
+ * The on-disk logical bytenr for the file extent.
+ *
+ * For compressed extents it matches btrfs_file_extent_item::disk_bytenr.
+ * For uncompressed extents it matches
+ * btrfs_file_extent_item::disk_bytenr + btrfs_file_extent_item::offset
+ *
+ * For holes it is EXTENT_MAP_HOLE and for inline extents it is
+ * EXTENT_MAP_INLINE.
+ */
u64 block_start;
+
+ /*
+ * The on-disk length for the file extent.
+ *
+ * For compressed extents it matches btrfs_file_extent_item::disk_num_bytes.
+ * For uncompressed extents it matches extent_map::len.
+ * For holes and inline extents it's -1 and shouldn't be used.
+ */
u64 block_len;
/*
@@ -49,11 +109,8 @@ struct extent_map {
* For non-merged extents, it's from btrfs_file_extent_item::generation.
*/
u64 generation;
- unsigned long flags;
- /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
- struct map_lookup *map_lookup;
+ u32 flags;
refcount_t refs;
- unsigned int compress_type;
struct list_head list;
};
@@ -65,31 +122,58 @@ struct extent_map_tree {
struct btrfs_inode;
+static inline void extent_map_set_compression(struct extent_map *em,
+ enum btrfs_compression_type type)
+{
+ if (type == BTRFS_COMPRESS_ZLIB)
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
+ else if (type == BTRFS_COMPRESS_LZO)
+ em->flags |= EXTENT_FLAG_COMPRESS_LZO;
+ else if (type == BTRFS_COMPRESS_ZSTD)
+ em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
+}
+
+static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
+{
+ if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
+ return BTRFS_COMPRESS_ZLIB;
+
+ if (em->flags & EXTENT_FLAG_COMPRESS_LZO)
+ return BTRFS_COMPRESS_LZO;
+
+ if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD)
+ return BTRFS_COMPRESS_ZSTD;
+
+ return BTRFS_COMPRESS_NONE;
+}
+
+/*
+ * More efficient way to determine if extent is compressed, instead of using
+ * 'extent_map_compression() != BTRFS_COMPRESS_NONE'.
+ */
+static inline bool extent_map_is_compressed(const struct extent_map *em)
+{
+ return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
+ EXTENT_FLAG_COMPRESS_LZO |
+ EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
+}
+
static inline int extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
}
-static inline u64 extent_map_end(struct extent_map *em)
+static inline u64 extent_map_end(const struct extent_map *em)
{
if (em->start + em->len < em->start)
return (u64)-1;
return em->start + em->len;
}
-static inline u64 extent_map_block_end(struct extent_map *em)
-{
- if (em->block_start + em->block_len < em->block_start)
- return (u64)-1;
- return em->block_start + em->block_len;
-}
-
void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em, int modified);
-void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
u64 new_logical);
@@ -97,12 +181,11 @@ struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void __cold extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
-void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_inode *inode,
struct extent_map **em_in, u64 start, u64 len);
void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
u64 start, u64 end,
@@ -110,5 +193,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map *new_em,
bool modified);
+long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 45cae356e89b..bce95f871750 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -10,17 +10,14 @@
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "messages.h"
-#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "bio.h"
-#include "print-tree.h"
#include "compression.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
-#include "super.h"
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
@@ -59,7 +56,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
goto out_unlock;
}
- ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
+ ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
&end, EXTENT_DIRTY);
if (!ret && start == 0)
i_size = min(i_size, end + 1);
@@ -94,7 +91,7 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
- return set_extent_bit(&inode->file_extent_tree, start, start + len - 1,
+ return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
EXTENT_DIRTY, NULL);
}
@@ -123,7 +120,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
- return clear_extent_bit(&inode->file_extent_tree, start,
+ return clear_extent_bit(inode->file_extent_tree, start,
start + len - 1, EXTENT_DIRTY, NULL);
}
@@ -179,7 +176,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
sizeof(*item));
if (ret < 0)
goto out;
- BUG_ON(ret); /* Can't happen */
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -434,8 +430,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
memset(csum_dst, 0, csum_size);
count = 1;
- if (inode->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) {
u64 file_offset = bbio->file_offset + bio_offset;
set_extent_bit(&inode->io_tree, file_offset,
@@ -454,9 +449,22 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
return ret;
}
+/*
+ * Search for checksums for a given logical range.
+ *
+ * @root: The root where to look for checksums.
+ * @start: Logical address of target checksum range.
+ * @end: End offset (inclusive) of the target checksum range.
+ * @list: List for adding each checksum that was found.
+ * Can be NULL in case the caller only wants to check if
+ * there are any checksums for the range.
+ * @nowait: Indicate if the search must be non-blocking or not.
+ *
+ * Return < 0 on error, 0 if no checksums were found, or 1 if checksums were
+ * found.
+ */
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit,
- bool nowait)
+ struct list_head *list, bool nowait)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
@@ -464,8 +472,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
struct extent_buffer *leaf;
struct btrfs_ordered_sum *sums;
struct btrfs_csum_item *item;
- LIST_HEAD(tmplist);
int ret;
+ bool found_csums = false;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
@@ -475,11 +483,6 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
return -ENOMEM;
path->nowait = nowait;
- if (search_commit) {
- path->skip_locking = 1;
- path->reada = READA_FORWARD;
- path->search_commit_root = 1;
- }
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.offset = start;
@@ -487,7 +490,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto fail;
+ goto out;
if (ret > 0 && path->slots[0] > 0) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
@@ -522,7 +525,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto fail;
+ goto out;
if (ret > 0)
break;
leaf = path->nodes[0];
@@ -544,6 +547,10 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
continue;
}
+ found_csums = true;
+ if (!list)
+ goto out;
+
csum_end = min(csum_end, end + 1);
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
@@ -557,7 +564,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
GFP_NOFS);
if (!sums) {
ret = -ENOMEM;
- goto fail;
+ goto out;
}
sums->logical = start;
@@ -571,21 +578,24 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
bytes_to_csum_size(fs_info, size));
start += size;
- list_add_tail(&sums->list, &tmplist);
+ list_add_tail(&sums->list, list);
}
path->slots[0]++;
}
- ret = 0;
-fail:
- while (ret < 0 && !list_empty(&tmplist)) {
- sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
- list_del(&sums->list);
- kfree(sums);
+out:
+ btrfs_free_path(path);
+ if (ret < 0) {
+ if (list) {
+ struct btrfs_ordered_sum *tmp_sums;
+
+ list_for_each_entry_safe(sums, tmp_sums, list, list)
+ kfree(sums);
+ }
+
+ return ret;
}
- list_splice_tail(&tmplist, list);
- btrfs_free_path(path);
- return ret;
+ return found_csums ? 1 : 0;
}
/*
@@ -874,8 +884,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
- ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
- root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
path = btrfs_alloc_path();
if (!path)
@@ -1175,7 +1185,7 @@ extend_csum:
* search, etc, because log trees are temporary anyway and it
* would only save a few bytes of leaf space.
*/
- if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
if (path->slots[0] + 1 >=
btrfs_header_nritems(path->nodes[0])) {
ret = find_next_csum_offset(root, path, &next_offset);
@@ -1229,8 +1239,6 @@ insert:
ins_size);
if (ret < 0)
goto out;
- if (WARN_ON(ret != 0))
- goto out;
leaf = path->nodes[0];
csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
@@ -1271,20 +1279,19 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
struct extent_buffer *leaf = path->nodes[0];
const int slot = path->slots[0];
struct btrfs_key key;
- u64 extent_start, extent_end;
+ u64 extent_start;
u64 bytenr;
u8 type = btrfs_file_extent_type(leaf, fi);
int compress_type = btrfs_file_extent_compression(leaf, fi);
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
- extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
em->generation = btrfs_file_extent_generation(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
em->start = extent_start;
- em->len = extent_end - extent_start;
+ em->len = btrfs_file_extent_end(path) - extent_start;
em->orig_start = extent_start -
btrfs_file_extent_offset(leaf, fi);
em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
@@ -1294,8 +1301,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
return;
}
if (compress_type != BTRFS_COMPRESS_NONE) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
+ extent_map_set_compression(em, compress_type);
em->block_start = bytenr;
em->block_len = em->orig_block_len;
} else {
@@ -1303,26 +1309,27 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->block_start = bytenr;
em->block_len = em->len;
if (type == BTRFS_FILE_EXTENT_PREALLOC)
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ /* Tree-checker has ensured this. */
+ ASSERT(extent_start == 0);
+
em->block_start = EXTENT_MAP_INLINE;
- em->start = extent_start;
- em->len = extent_end - extent_start;
+ em->start = 0;
+ em->len = fs_info->sectorsize;
/*
* Initialize orig_start and block_len with the same values
* as in inode.c:btrfs_get_extent().
*/
em->orig_start = EXTENT_MAP_HOLE;
em->block_len = (u64)-1;
- em->compress_type = compress_type;
- if (compress_type != BTRFS_COMPRESS_NONE)
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ extent_map_set_compression(em, compress_type);
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
"root %llu", type, btrfs_ino(inode), extent_start,
- root->root_key.objectid);
+ btrfs_root_id(root));
}
}
@@ -1343,12 +1350,10 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path)
ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
- end = btrfs_file_extent_ram_bytes(leaf, fi);
- end = ALIGN(key.offset + end, leaf->fs_info->sectorsize);
- } else {
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE)
+ end = leaf->fs_info->sectorsize;
+ else
end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- }
return end;
}
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 04bd2d34efb1..557dc43d7142 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -3,8 +3,21 @@
#ifndef BTRFS_FILE_ITEM_H
#define BTRFS_FILE_ITEM_H
+#include <linux/list.h>
+#include <uapi/linux/btrfs_tree.h>
#include "accessors.h"
+struct extent_map;
+struct btrfs_file_extent_item;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_bio;
+struct btrfs_trans_handle;
+struct btrfs_root;
+struct btrfs_ordered_sum;
+struct btrfs_path;
+struct btrfs_inode;
+
#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
(offsetof(struct btrfs_file_extent_item, disk_bytenr))
@@ -55,8 +68,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit,
- bool nowait);
+ struct list_head *list, bool nowait);
int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path,
u64 start, u64 end, u8 *csum_buf,
unsigned long *csum_bitmap);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f47731c45bb5..e764ac3f22e2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -22,10 +22,8 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
-#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
@@ -111,8 +109,8 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
- block_len);
+ btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
+ block_start, block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
@@ -130,7 +128,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
struct extent_state **cached, bool noreserve)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int err = 0;
+ int ret = 0;
int i;
u64 num_bytes;
u64 start_pos;
@@ -160,17 +158,20 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
cached);
- err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+ ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
- if (err)
- return err;
+ if (ret)
+ return ret;
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
- btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
- btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
- btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
+ btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
+ start_pos, num_bytes);
+ btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
+ start_pos, num_bytes);
+ btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
+ start_pos, num_bytes);
}
/*
@@ -205,7 +206,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
- struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
u64 ino = btrfs_ino(inode);
@@ -245,7 +245,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
- update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
+ update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -372,15 +372,17 @@ next_slot:
btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0) {
- btrfs_init_generic_ref(&ref,
- BTRFS_ADD_DELAYED_REF,
- disk_bytenr, num_bytes, 0,
- root->root_key.objectid);
- btrfs_init_data_ref(&ref,
- root->root_key.objectid,
- new_key.objectid,
- args->start - extent_offset,
- 0, false);
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = disk_bytenr,
+ .num_bytes = num_bytes,
+ .parent = 0,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+ btrfs_init_data_ref(&ref, new_key.objectid,
+ args->start - extent_offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -463,15 +465,17 @@ delete_extent_item:
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
- btrfs_init_generic_ref(&ref,
- BTRFS_DROP_DELAYED_REF,
- disk_bytenr, num_bytes, 0,
- root->root_key.objectid);
- btrfs_init_data_ref(&ref,
- root->root_key.objectid,
- key.objectid,
- key.offset - extent_offset, 0,
- false);
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = disk_bytenr,
+ .num_bytes = num_bytes,
+ .parent = 0,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+ btrfs_init_data_ref(&ref, key.objectid,
+ key.offset - extent_offset,
+ 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -747,10 +751,13 @@ again:
extent_end - split);
btrfs_mark_buffer_dirty(trans, leaf);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
- num_bytes, 0, root->root_key.objectid);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
- orig_offset, 0, false);
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = 0;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_root_id(root);
+ btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -773,10 +780,14 @@ again:
other_start = end;
other_end = 0;
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, 0, root->root_key.objectid);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
- 0, false);
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = 0;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_root_id(root);
+ btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
@@ -869,9 +880,9 @@ static int prepare_uptodate_page(struct inode *inode,
* released.
*
* The private flag check is essential for subpage as we need
- * to store extra bitmap using page->private.
+ * to store extra bitmap using folio private.
*/
- if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
+ if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
unlock_page(page);
return -EAGAIN;
}
@@ -914,7 +925,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
- int err = 0;
+ int ret = 0;
int faili;
for (i = 0; i < num_pages; i++) {
@@ -924,28 +935,28 @@ again:
if (!pages[i]) {
faili = i - 1;
if (nowait)
- err = -EAGAIN;
+ ret = -EAGAIN;
else
- err = -ENOMEM;
+ ret = -ENOMEM;
goto fail;
}
- err = set_page_extent_mapped(pages[i]);
- if (err < 0) {
+ ret = set_page_extent_mapped(pages[i]);
+ if (ret < 0) {
faili = i;
goto fail;
}
if (i == 0)
- err = prepare_uptodate_page(inode, pages[i], pos,
+ ret = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
- if (!err && i == num_pages - 1)
- err = prepare_uptodate_page(inode, pages[i],
+ if (!ret && i == num_pages - 1)
+ ret = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
- if (err) {
+ if (ret) {
put_page(pages[i]);
- if (!nowait && err == -EAGAIN) {
- err = 0;
+ if (!nowait && ret == -EAGAIN) {
+ ret = 0;
goto again;
}
faili = i - 1;
@@ -961,7 +972,7 @@ fail:
put_page(pages[faili]);
faili--;
}
- return err;
+ return ret;
}
@@ -1134,7 +1145,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
loff_t pos = iocb->ki_pos;
int ret;
loff_t oldsize;
@@ -1182,7 +1193,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct file *file = iocb->ki_filp;
loff_t pos;
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
@@ -1458,13 +1469,13 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
loff_t pos;
ssize_t written = 0;
ssize_t written_buffered;
size_t prev_left = 0;
loff_t endbyte;
- ssize_t err;
+ ssize_t ret;
unsigned int ilock_flags = 0;
struct iomap_dio *dio;
@@ -1481,9 +1492,9 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
- err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
- if (err < 0)
- return err;
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
+ if (ret < 0)
+ return ret;
/* Shared lock cannot be used with security bits set. */
if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
@@ -1492,14 +1503,14 @@ relock:
goto relock;
}
- err = generic_write_checks(iocb, from);
- if (err <= 0) {
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0) {
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- return err;
+ return ret;
}
- err = btrfs_write_check(iocb, from, err);
- if (err < 0) {
+ ret = btrfs_write_check(iocb, from, ret);
+ if (ret < 0) {
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto out;
}
@@ -1551,15 +1562,15 @@ relock:
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
if (IS_ERR_OR_NULL(dio))
- err = PTR_ERR_OR_ZERO(dio);
+ ret = PTR_ERR_OR_ZERO(dio);
else
- err = iomap_dio_complete(dio);
+ ret = iomap_dio_complete(dio);
/* No increment (+=) because iomap returns a cumulative value. */
- if (err > 0)
- written = err;
+ if (ret > 0)
+ written = ret;
- if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+ if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
const size_t left = iov_iter_count(from);
/*
* We have more data left to write. Try to fault in as many as
@@ -1576,7 +1587,7 @@ relock:
* to buffered IO in case we haven't made any progress.
*/
if (left == prev_left) {
- err = -ENOTBLK;
+ ret = -ENOTBLK;
} else {
fault_in_iov_iter_readable(from, left);
prev_left = left;
@@ -1585,10 +1596,10 @@ relock:
}
/*
- * If 'err' is -ENOTBLK or we have not written all data, then it means
+ * If 'ret' is -ENOTBLK or we have not written all data, then it means
* we must fallback to buffered IO.
*/
- if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
+ if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
goto out;
buffered:
@@ -1599,14 +1610,14 @@ buffered:
* below, we will block when flushing and waiting for the IO.
*/
if (iocb->ki_flags & IOCB_NOWAIT) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
pos = iocb->ki_pos;
written_buffered = btrfs_buffered_write(iocb, from);
if (written_buffered < 0) {
- err = written_buffered;
+ ret = written_buffered;
goto out;
}
/*
@@ -1614,18 +1625,18 @@ buffered:
* able to read what was just written.
*/
endbyte = pos + written_buffered - 1;
- err = btrfs_fdatawrite_range(inode, pos, endbyte);
- if (err)
+ ret = btrfs_fdatawrite_range(inode, pos, endbyte);
+ if (ret)
goto out;
- err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
- if (err)
+ ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+ if (ret)
goto out;
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
out:
- return err < 0 ? err : written;
+ return ret < 0 ? ret : written;
}
static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
@@ -1784,7 +1795,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
@@ -1909,6 +1920,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out_release_extents;
}
+ btrfs_init_log_ctx_scratch_eb(&ctx);
+
/*
* We use start here because we will need to wait on the IO to complete
* in btrfs_sync_log, which could require joining a transaction (for
@@ -1928,6 +1941,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans->in_fsync = true;
ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ /*
+ * The scratch eb is no longer needed. Release it before syncing the log
+ * or committing the transaction, to avoid holding on to memory
+ * unnecessarily during such long operations.
+ */
+ if (ctx.scratch_eb) {
+ free_extent_buffer(ctx.scratch_eb);
+ ctx.scratch_eb = NULL;
+ }
btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
@@ -2003,6 +2025,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = btrfs_commit_transaction(trans);
out:
+ free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.list));
ASSERT(list_empty(&ctx.conflict_inodes));
err = file_check_and_advance_wb_err(file);
@@ -2016,6 +2039,172 @@ out_release_extents:
goto out;
}
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF. Because
+ * truncate_setsize() writes the inode size before removing pages, once we have
+ * the page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ unsigned long zero_start;
+ loff_t size;
+ vm_fault_t ret;
+ int ret2;
+ int reserved = 0;
+ u64 reserved_space;
+ u64 page_start;
+ u64 page_end;
+ u64 end;
+
+ ASSERT(folio_order(folio) == 0);
+
+ reserved_space = PAGE_SIZE;
+
+ sb_start_pagefault(inode->i_sb);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_SIZE - 1;
+ end = page_end;
+
+ /*
+ * Reserving delalloc space after obtaining the page lock can lead to
+ * deadlock. For example, if a dirty page is locked by this function
+ * and the call to btrfs_delalloc_reserve_space() ends up triggering
+ * dirty page write out, then the btrfs_writepages() function could
+ * end up waiting indefinitely to get a lock on the page currently
+ * being processed by btrfs_page_mkwrite() function.
+ */
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ page_start, reserved_space);
+ if (!ret2) {
+ ret2 = file_update_time(vmf->vma->vm_file);
+ reserved = 1;
+ }
+ if (ret2) {
+ ret = vmf_error(ret2);
+ if (reserved)
+ goto out;
+ goto out_noreserve;
+ }
+
+ /* Make the VM retry the fault. */
+ ret = VM_FAULT_NOPAGE;
+again:
+ down_read(&BTRFS_I(inode)->i_mmap_lock);
+ lock_page(page);
+ size = i_size_read(inode);
+
+ if ((page->mapping != inode->i_mapping) ||
+ (page_start >= size)) {
+ /* Page got truncated out from underneath us. */
+ goto out_unlock;
+ }
+ wait_on_page_writeback(page);
+
+ lock_extent(io_tree, page_start, page_end, &cached_state);
+ ret2 = set_page_extent_mapped(page);
+ if (ret2 < 0) {
+ ret = vmf_error(ret2);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ goto out_unlock;
+ }
+
+ /*
+ * We can't set the delalloc bits if there are pending ordered
+ * extents. Drop our locks and wait for them to finish.
+ */
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_start_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ if (page->index == ((size - 1) >> PAGE_SHIFT)) {
+ reserved_space = round_up(size - page_start, fs_info->sectorsize);
+ if (reserved_space < PAGE_SIZE) {
+ end = page_start + reserved_space - 1;
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, page_start,
+ PAGE_SIZE - reserved_space, true);
+ }
+ }
+
+ /*
+ * page_mkwrite gets called when the page is first dirtied after it's
+ * faulted in, but write(2) could also have dirtied the page and set
+ * delalloc bits. For space accounting reasons we still need to clear
+ * any delalloc bits within this page range, since we had to reserve
+ * data and metadata space before lock_page() (see the comment above).
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
+
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
+ &cached_state);
+ if (ret2) {
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ ret = VM_FAULT_SIGBUS;
+ goto out_unlock;
+ }
+
+ /* Page is wholly or partially inside EOF. */
+ if (page_start + PAGE_SIZE > size)
+ zero_start = offset_in_page(size);
+ else
+ zero_start = PAGE_SIZE;
+
+ if (zero_start != PAGE_SIZE)
+ memzero_page(page, zero_start, PAGE_SIZE - zero_start);
+
+ btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+ btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
+
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return VM_FAULT_LOCKED;
+
+out_unlock:
+ unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+out:
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
+ reserved_space, (ret != 0));
+out_noreserve:
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return ret;
+}
+
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
@@ -2150,7 +2339,6 @@ out:
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -2174,7 +2362,7 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
struct extent_map *em;
int ret = 0;
- em = btrfs_get_extent(inode, NULL, 0,
+ em = btrfs_get_extent(inode, NULL,
round_down(*start, fs_info->sectorsize),
round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
@@ -2246,7 +2434,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
int slot;
- struct btrfs_ref ref = { 0 };
int ret;
if (replace_len == 0)
@@ -2302,15 +2489,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
extent_info->qgroup_reserved,
&key);
} else {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = extent_info->disk_offset,
+ .num_bytes = extent_info->disk_len,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
u64 ref_offset;
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- extent_info->disk_offset,
- extent_info->disk_len, 0,
- root->root_key.objectid);
ref_offset = extent_info->file_offset - extent_info->data_offset;
- btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(inode), ref_offset, 0, false);
+ btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
}
@@ -2591,7 +2780,7 @@ out:
static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
@@ -2833,13 +3022,13 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(inode, NULL, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
if (em->block_start == EXTENT_MAP_HOLE)
ret = RANGE_BOUNDARY_HOLE;
- else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ else if (em->flags & EXTENT_FLAG_PREALLOC)
ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
else
ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
@@ -2864,7 +3053,7 @@ static int btrfs_zero_range(struct inode *inode,
u64 bytes_to_reserve = 0;
bool space_reserved = false;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
alloc_end - alloc_start);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -2879,8 +3068,7 @@ static int btrfs_zero_range(struct inode *inode,
* extents and holes, we drop all the existing extents and allocate a
* new prealloc extent, so that we get a larger contiguous disk extent.
*/
- if (em->start <= alloc_start &&
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
const u64 em_end = em->start + em->len;
if (em_end >= offset + len) {
@@ -2908,14 +3096,13 @@ static int btrfs_zero_range(struct inode *inode,
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
- sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
}
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (em->flags & EXTENT_FLAG_PREALLOC) {
free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
@@ -3004,7 +3191,7 @@ reserve_space:
}
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
- i_blocksize(inode),
+ fs_info->sectorsize,
offset + len, &alloc_hint);
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
@@ -3048,7 +3235,7 @@ static long btrfs_fallocate(struct file *file, int mode,
int ret;
/* Do not allow fallocate in ZONED mode */
- if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
+ if (btrfs_is_zoned(inode_to_fs_info(inode)))
return -EOPNOTSUPP;
alloc_start = round_down(offset, blocksize);
@@ -3125,7 +3312,7 @@ static long btrfs_fallocate(struct file *file, int mode,
/* First, check if we exceed the qgroup limit */
while (cur_offset < alloc_end) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
alloc_end - cur_offset);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -3136,7 +3323,7 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = ALIGN(last_byte, blocksize);
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ !(em->flags & EXTENT_FLAG_PREALLOC))) {
const u64 range_len = last_byte - cur_offset;
ret = add_falloc_range(&reserve_list, cur_offset, range_len);
@@ -3176,7 +3363,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (!ret) {
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
- range->len, i_blocksize(inode),
+ range->len, blocksize,
offset + len, &alloc_hint);
/*
* btrfs_prealloc_file_range() releases space even
@@ -3192,7 +3379,7 @@ static long btrfs_fallocate(struct file *file, int mode,
qgroup_reserved -= range->len;
} else if (qgroup_reserved > 0) {
btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
- range->start, range->len);
+ range->start, range->len, NULL);
qgroup_reserved -= range->len;
}
list_del(&range->list);
@@ -3709,8 +3896,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
- FMODE_CAN_ODIRECT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
ret = fsverity_file_open(inode, filp);
if (ret)
@@ -3753,7 +3939,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
if (fsverity_active(inode))
return 0;
- if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
+ if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
return 0;
btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
@@ -3840,6 +4026,7 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_compat_ioctl,
#endif
.remap_file_range = btrfs_remap_file_range,
+ .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index 82b34fbb295f..77aaca208c7b 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -3,6 +3,21 @@
#ifndef BTRFS_FILE_H
#define BTRFS_FILE_H
+#include <linux/types.h>
+
+struct file;
+struct extent_state;
+struct kiocb;
+struct iov_iter;
+struct page;
+struct btrfs_ioctl_encoded_io_args;
+struct btrfs_drop_extents_args;
+struct btrfs_inode;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_replace_extent_info;
+struct btrfs_trans_handle;
+
extern const struct file_operations btrfs_file_operations;
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6f93c9a2c3e3..3ab8dea5036b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -19,9 +19,7 @@
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
-#include "volumes.h"
#include "space-info.h"
-#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
@@ -399,7 +397,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
return -ENOMEM;
io_ctl->num_pages = num_pages;
- io_ctl->fs_info = btrfs_sb(inode->i_sb);
+ io_ctl->fs_info = inode_to_fs_info(inode);
io_ctl->inode = inode;
return 0;
@@ -439,8 +437,8 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- btrfs_page_clear_checked(io_ctl->fs_info,
- io_ctl->pages[i],
+ btrfs_folio_clear_checked(io_ctl->fs_info,
+ page_folio(io_ctl->pages[i]),
page_offset(io_ctl->pages[i]),
PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
@@ -1913,9 +1911,9 @@ static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
ctl->free_space -= bytes;
}
-static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info, u64 offset,
- u64 bytes)
+static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
{
unsigned long start, count, end;
int extent_delta = 1;
@@ -2251,7 +2249,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
bytes_to_set = min(end - offset, bytes);
- bitmap_set_bits(ctl, info, offset, bytes_to_set);
+ btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set);
return bytes_to_set;
@@ -2621,7 +2619,7 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
}
}
-int __btrfs_add_free_space(struct btrfs_block_group *block_group,
+static int __btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 offset, u64 bytes,
enum btrfs_trim_state trim_state)
{
@@ -4156,15 +4154,13 @@ out:
int __init btrfs_free_space_init(void)
{
- btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
- sizeof(struct btrfs_free_space), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0);
if (!btrfs_free_space_cachep)
return -ENOMEM;
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
PAGE_SIZE, PAGE_SIZE,
- SLAB_MEM_SPREAD, NULL);
+ 0, NULL);
if (!btrfs_free_space_bitmap_cachep) {
kmem_cache_destroy(btrfs_free_space_cachep);
return -ENOMEM;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 33b4da3271b1..83774bfd7b3b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -6,6 +6,19 @@
#ifndef BTRFS_FREE_SPACE_CACHE_H
#define BTRFS_FREE_SPACE_CACHE_H
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include "fs.h"
+
+struct inode;
+struct page;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_trans_handle;
+struct btrfs_trim_block_group;
+
/*
* This is the trim state of an extent or bitmap.
*
@@ -114,8 +127,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl);
-int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr,
- u64 size, enum btrfs_trim_state trim_state);
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 7b598b070700..90f2938bd743 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1176,12 +1176,16 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
BTRFS_FREE_SPACE_TREE_OBJECTID);
if (IS_ERR(free_space_root)) {
ret = PTR_ERR(free_space_root);
- goto abort;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out_clear;
}
ret = btrfs_global_root_insert(free_space_root);
if (ret) {
btrfs_put_root(free_space_root);
- goto abort;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out_clear;
}
node = rb_first_cached(&fs_info->block_group_cache_tree);
@@ -1189,8 +1193,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out_clear;
+ }
node = rb_next(node);
}
@@ -1206,11 +1213,9 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
return ret;
-abort:
+out_clear:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
return ret;
}
@@ -1273,12 +1278,18 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
ret = btrfs_del_root(trans, &free_space_root->root_key);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
btrfs_global_root_delete(free_space_root);
@@ -1295,11 +1306,6 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_put_root(free_space_root);
return btrfs_commit_transaction(trans);
-
-abort:
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
}
int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
@@ -1322,8 +1328,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
@@ -1332,8 +1341,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
node = rb_next(node);
}
@@ -1344,10 +1356,6 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
ret = btrfs_commit_transaction(trans);
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
return ret;
-abort:
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
}
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 6d5551d0ced8..e6c6d6f4f221 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -6,7 +6,13 @@
#ifndef BTRFS_FREE_SPACE_TREE_H
#define BTRFS_FREE_SPACE_TREE_H
+#include <linux/bits.h>
+
struct btrfs_caching_control;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_block_group;
+struct btrfs_trans_handle;
/*
* The default size for new free space bitmap items. The last bitmap in a block
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 318df6f9d9cb..89f0650631cd 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -4,13 +4,49 @@
#define BTRFS_FS_H
#include <linux/blkdev.h>
-#include <linux/fs.h>
-#include <linux/btrfs_tree.h>
#include <linux/sizes.h>
+#include <linux/time64.h>
+#include <linux/compiler.h>
+#include <linux/math.h>
+#include <linux/atomic.h>
+#include <linux/percpu_counter.h>
+#include <linux/completion.h>
+#include <linux/lockdep.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rwlock_types.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
+#include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/workqueue.h>
+#include <linux/wait.h>
+#include <linux/wait_bit.h>
+#include <linux/sched.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/btrfs.h>
+#include <uapi/linux/btrfs_tree.h>
#include "extent-io-tree.h"
-#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
+#include "fs.h"
+
+struct inode;
+struct super_block;
+struct kobject;
+struct reloc_control;
+struct crypto_shash;
+struct ulist;
+struct btrfs_device;
+struct btrfs_block_group;
+struct btrfs_root;
+struct btrfs_fs_devices;
+struct btrfs_transaction;
+struct btrfs_delayed_root;
+struct btrfs_balance_control;
+struct btrfs_subpage_info;
+struct btrfs_stripe_hash_table;
+struct btrfs_space_info;
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
@@ -188,6 +224,7 @@ enum {
BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27),
BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28),
BTRFS_MOUNT_NODISCARD = (1UL << 29),
+ BTRFS_MOUNT_NOSPACECACHE = (1UL << 30),
};
/*
@@ -398,7 +435,8 @@ struct btrfs_fs_info {
struct extent_io_tree excluded_extents;
/* logical->physical extent mapping */
- struct extent_map_tree mapping_tree;
+ struct rb_root_cached mapping_tree;
+ rwlock_t mapping_tree_lock;
/*
* Block reservation for extent, checksum, root tree and delayed dir
@@ -591,6 +629,10 @@ struct btrfs_fs_info {
s32 dirty_metadata_batch;
s32 delalloc_batch;
+ struct percpu_counter evictable_extent_maps;
+ u64 extent_map_shrinker_last_root;
+ u64 extent_map_shrinker_last_ino;
+
/* Protected by 'trans_lock'. */
struct list_head dirty_cowonly_roots;
@@ -730,10 +772,13 @@ struct btrfs_fs_info {
/* Reclaim partially filled block groups in the background */
struct work_struct reclaim_bgs_work;
+ /* Protected by unused_bgs_lock. */
struct list_head reclaim_bgs;
int bg_reclaim_threshold;
+ /* Protects the lists unused_bgs and reclaim_bgs. */
spinlock_t unused_bgs_lock;
+ /* Protected by unused_bgs_lock. */
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
/* Protect block groups that are going to be deleted */
@@ -827,6 +872,17 @@ struct btrfs_fs_info {
#endif
};
+#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \
+ struct page *: (_page))->mapping->host))
+#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \
+ struct folio *: (_folio))->mapping->host))
+/* fs_info lookups built on the helpers above (->mapping->host passed to BTRFS_I()). */
+#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info)
+#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info)
+/* Same for a VFS inode; the single-association _Generic only compiles for 'struct inode *'. */
+#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \
+ struct inode *: (_inode)))->root->fs_info)
+
static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
{
return READ_ONCE(fs_info->generation);
@@ -920,6 +976,8 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation op);
+int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args);
+
/* Compatibility and incompatibility defines */
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name);
@@ -960,20 +1018,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
-#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (!btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_set_opt(fs_info->mount_opt, opt); \
-} while (0)
-
-#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_clear_opt(fs_info->mount_opt, opt); \
-} while (0)
-
static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
{
/* Do it this way so we only ever do one test_bit in the normal case. */
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 7d734830e514..84a94d19b22c 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -9,7 +9,6 @@
#include "inode-item.h"
#include "disk-io.h"
#include "transaction.h"
-#include "print-tree.h"
#include "space-info.h"
#include "accessors.h"
#include "extent-tree.h"
@@ -671,16 +670,18 @@ delete:
}
if (del_item && extent_start != 0 && !control->skip_ref_updates) {
- struct btrfs_ref ref = { 0 };
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = extent_start,
+ .num_bytes = extent_num_bytes,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_header_owner(leaf),
+ };
bytes_deleted += extent_num_bytes;
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
- extent_start, extent_num_bytes, 0,
- root->root_key.objectid);
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- control->ino, extent_offset,
- root->root_key.objectid, false);
+ btrfs_init_data_ref(&ref, control->ino, extent_offset,
+ btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index 4337bb26f419..c4aded82709b 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -6,14 +6,15 @@
#include <linux/types.h>
#include <linux/crc32c.h>
+struct fscrypt_str;
+struct extent_buffer;
struct btrfs_trans_handle;
struct btrfs_root;
struct btrfs_path;
struct btrfs_key;
struct btrfs_inode_extref;
struct btrfs_inode;
-struct extent_buffer;
-struct fscrypt_str;
+struct btrfs_truncate_control;
/*
* Return this if we need to call truncate_block for the last bit of the
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5e3fccddde0c..753db965f7c0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,14 +39,12 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "bio.h"
#include "compression.h"
#include "locking.h"
-#include "free-space-cache.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
@@ -114,6 +112,15 @@ struct data_reloc_warn {
int mirror_num;
};
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we look up and
+ * update the disk_i_size, but lockdep will complain because, for our io_tree,
+ * we hold the tree lock and get the inode lock when setting delalloc. These two
+ * things are unrelated, so make a class for the file_extent_tree so we don't
+ * get the two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -247,7 +254,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- inode->root->root_key.objectid, btrfs_ino(inode), file_off,
+ btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
@@ -257,7 +264,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
logical += file_off;
btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
btrfs_ino(inode), file_off, logical,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -324,15 +331,15 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
const u32 csum_size = root->fs_info->csum_size;
/* For data reloc tree, it's better to do a backref lookup instead. */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
/* Output without objectid, which is more meaningful */
- if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
+ if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(root), btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -340,7 +347,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
} else {
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(root), btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -447,8 +454,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
- btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
- offset, bytes);
+ btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
+ page_folio(page), offset, bytes);
put_page(page);
}
@@ -505,12 +512,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, bool extent_inserted,
size_t size, size_t compressed_size,
int compress_type,
- struct page **compressed_pages,
+ struct folio *compressed_folio,
bool update_i_size)
{
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct page *page = NULL;
+ const u32 sectorsize = trans->fs_info->sectorsize;
char *kaddr;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
@@ -518,10 +526,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t cur_size = size;
u64 i_size;
- ASSERT((compressed_size > 0 && compressed_pages) ||
- (compressed_size == 0 && !compressed_pages));
+ /*
+ * The decompressed size must still be no larger than a sector. Under
+ * heavy race, we can have size == 0 passed in, but that shouldn't be a
+ * big deal and we can continue the insertion.
+ */
+ ASSERT(size <= sectorsize);
- if (compressed_size && compressed_pages)
+ /*
+ * The compressed size also needs to be no larger than a sector.
+ * That's also why we only need one folio as the parameter.
+ */
+ if (compressed_folio)
+ ASSERT(compressed_size <= sectorsize);
+ else
+ ASSERT(compressed_size == 0);
+
+ if (compressed_size && compressed_folio)
cur_size = compressed_size;
if (!extent_inserted) {
@@ -549,21 +570,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
ptr = btrfs_file_extent_inline_start(ei);
if (compress_type != BTRFS_COMPRESS_NONE) {
- struct page *cpage;
- int i = 0;
- while (compressed_size > 0) {
- cpage = compressed_pages[i];
- cur_size = min_t(unsigned long, compressed_size,
- PAGE_SIZE);
-
- kaddr = kmap_local_page(cpage);
- write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_local(kaddr);
+ kaddr = kmap_local_folio(compressed_folio, 0);
+ write_extent_buffer(leaf, kaddr, ptr, compressed_size);
+ kunmap_local(kaddr);
- i++;
- ptr += cur_size;
- compressed_size -= cur_size;
- }
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
@@ -604,17 +614,62 @@ fail:
return ret;
}
+static bool can_cow_file_range_inline(struct btrfs_inode *inode,
+ u64 offset, u64 size,
+ size_t compressed_size)
+{ /* Returns true only when every inline-extent precondition below holds. */
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 data_len = (compressed_size ?: size); /* compressed_size if non-zero, else size */
+
+ /* Inline extents must start at offset 0. */
+ if (offset != 0)
+ return false;
+
+ /*
+ * Due to the page size limit, for subpage we can only trigger the
+ * writeback for the dirty sectors of the page, which means data
+ * writeback ends up doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we could trigger an inline extent even if we didn't want
+ * to, so here we skip inline extent creation completely.
+ */
+ if (fs_info->sectorsize != PAGE_SIZE)
+ return false;
+
+ /* Inline extents are limited to sectorsize (checked on the uncompressed size). */
+ if (size > fs_info->sectorsize)
+ return false;
+
+ /* We cannot exceed the maximum inline data size. */
+ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ return false;
+
+ /* We cannot exceed the user specified max_inline size. */
+ if (data_len > fs_info->max_inline)
+ return false;
+
+ /* Inline extents must be the entirety of the file: size may not be below i_size. */
+ if (size < i_size_read(&inode->vfs_inode))
+ return false;
+
+ return true;
+}
/*
* conditionally insert an inline extent into the file. This
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
+ *
+ * If being used directly, you must have already checked we're allowed to cow
+ * the range by getting true from can_cow_file_range_inline().
*/
-static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
- size_t compressed_size,
- int compress_type,
- struct page **compressed_pages,
- bool update_i_size)
+static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
+ u64 size, size_t compressed_size,
+ int compress_type,
+ struct folio *compressed_folio,
+ bool update_i_size)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
@@ -624,18 +679,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
int ret;
struct btrfs_path *path;
- /*
- * We can create an inline extent if it ends at or beyond the current
- * i_size, is no larger than a sector (decompressed), and the (possibly
- * compressed) data fits in a leaf and the configured maximum inline
- * size.
- */
- if (size < i_size_read(&inode->vfs_inode) ||
- size > fs_info->sectorsize ||
- data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
- data_len > fs_info->max_inline)
- return 1;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -661,7 +704,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
- compressed_pages, update_i_size);
+ compressed_folio, update_i_size);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -688,18 +731,50 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
}
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
+ u64 end,
+ size_t compressed_size,
+ int compress_type,
+ struct folio *compressed_folio,
+ bool update_i_size)
+{ /* Try to store [offset, end] as an inline extent; takes and drops the extent lock itself. */
+ struct extent_state *cached = NULL;
+ unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
+ u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); /* clamp the length to i_size */
+ int ret;
+
+ if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
+ return 1; /* > 0: not inlined and nothing locked; callers in this file carry on with their normal path */
+ /* The range qualifies: lock it only now, for the duration of the insertion attempt. */
+ lock_extent(&inode->io_tree, offset, end, &cached);
+ ret = __cow_file_range_inline(inode, offset, size, compressed_size,
+ compress_type, compressed_folio,
+ update_i_size);
+ if (ret > 0) { /* not inlined after all: just drop the lock, leave the delalloc bits alone */
+ unlock_extent(&inode->io_tree, offset, end, &cached);
+ return ret;
+ }
+ /* ret <= 0 (success or error): clear clear_flags (incl. EXTENT_LOCKED) and apply the unlock/writeback page ops. */
+ extent_clear_unlock_delalloc(inode, offset, end, NULL, &cached,
+ clear_flags,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ return ret;
+}
+
struct async_extent {
u64 start;
u64 ram_size;
u64 compressed_size;
- struct page **pages;
- unsigned long nr_pages;
+ struct folio **folios;
+ unsigned long nr_folios;
int compress_type;
struct list_head list;
};
@@ -724,19 +799,20 @@ struct async_cow {
static noinline int add_async_extent(struct async_chunk *cow,
u64 start, u64 ram_size,
u64 compressed_size,
- struct page **pages,
- unsigned long nr_pages,
+ struct folio **folios,
+ unsigned long nr_folios,
int compress_type)
{
struct async_extent *async_extent;
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
- BUG_ON(!async_extent); /* -ENOMEM */
+ if (!async_extent)
+ return -ENOMEM;
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
- async_extent->pages = pages;
- async_extent->nr_pages = nr_pages;
+ async_extent->folios = folios;
+ async_extent->nr_folios = nr_folios;
async_extent->compress_type = compress_type;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
@@ -840,8 +916,8 @@ static void compress_file_range(struct btrfs_work *work)
u64 actual_end;
u64 i_size;
int ret = 0;
- struct page **pages;
- unsigned long nr_pages;
+ struct folio **folios;
+ unsigned long nr_folios;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
unsigned int poff;
@@ -871,9 +947,9 @@ static void compress_file_range(struct btrfs_work *work)
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
- pages = NULL;
- nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
+ folios = NULL;
+ nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
/*
* we don't want to send crud past the end of i_size through
@@ -922,8 +998,8 @@ again:
if (!inode_need_compress(inode, start, end))
goto cleanup_and_bail_uncompressed;
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages) {
+ folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
+ if (!folios) {
/*
* Memory allocation failure is not a fatal error, we can fall
* back to uncompressed code.
@@ -937,9 +1013,9 @@ again:
compress_type = inode->prop_compress;
/* Compression level is applied here. */
- ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
- mapping, start, pages, &nr_pages, &total_in,
- &total_compressed);
+ ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
+ mapping, start, folios, &nr_folios, &total_in,
+ &total_compressed);
if (ret)
goto mark_incompressible;
@@ -949,7 +1025,7 @@ again:
*/
poff = offset_in_page(total_compressed);
if (poff)
- memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
+ folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
/*
* Try to create an inline extent.
@@ -960,43 +1036,16 @@ again:
* Check cow_file_range() for why we don't even try to create inline
* extent for the subpage case.
*/
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
- if (total_in < actual_end) {
- ret = cow_file_range_inline(inode, actual_end, 0,
- BTRFS_COMPRESS_NONE, NULL,
- false);
- } else {
- ret = cow_file_range_inline(inode, actual_end,
- total_compressed,
- compress_type, pages,
- false);
- }
- if (ret <= 0) {
- unsigned long clear_flags = EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING;
-
- if (ret < 0)
- mapping_set_error(mapping, -EIO);
-
- /*
- * inline extent creation worked or returned error,
- * we don't need to create any more async work items.
- * Unlock and free up our temp pages.
- *
- * We use DO_ACCOUNTING here because we need the
- * delalloc_release_metadata to be done _after_ we drop
- * our outstanding extent for clearing delalloc for this
- * range.
- */
- extent_clear_unlock_delalloc(inode, start, end,
- NULL,
- clear_flags,
- PAGE_UNLOCK |
- PAGE_START_WRITEBACK |
- PAGE_END_WRITEBACK);
- goto free_pages;
- }
+ if (total_in < actual_end)
+ ret = cow_file_range_inline(inode, start, end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
+ else
+ ret = cow_file_range_inline(inode, start, end, total_compressed,
+ compress_type, folios[0], false);
+ if (ret <= 0) {
+ if (ret < 0)
+ mapping_set_error(mapping, -EIO);
+ goto free_pages;
}
/*
@@ -1018,8 +1067,9 @@ again:
* The async work queues will take care of doing actual allocation on
* disk for these compressed pages, and will submit the bios.
*/
- add_async_extent(async_chunk, start, total_in, total_compressed, pages,
- nr_pages, compress_type);
+ ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
+ nr_folios, compress_type);
+ BUG_ON(ret);
if (start + total_in < end) {
start += total_in;
cond_resched();
@@ -1031,15 +1081,16 @@ mark_incompressible:
if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
inode->flags |= BTRFS_INODE_NOCOMPRESS;
cleanup_and_bail_uncompressed:
- add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
- BTRFS_COMPRESS_NONE);
+ ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
+ BTRFS_COMPRESS_NONE);
+ BUG_ON(ret);
free_pages:
- if (pages) {
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ if (folios) {
+ for (i = 0; i < nr_folios; i++) {
+ WARN_ON(folios[i]->mapping);
+ btrfs_free_compr_folio(folios[i]);
}
- kfree(pages);
+ kfree(folios);
}
}
@@ -1047,16 +1098,16 @@ static void free_async_extent_pages(struct async_extent *async_extent)
{
int i;
- if (!async_extent->pages)
+ if (!async_extent->folios)
return;
- for (i = 0; i < async_extent->nr_pages; i++) {
- WARN_ON(async_extent->pages[i]->mapping);
- put_page(async_extent->pages[i]);
+ for (i = 0; i < async_extent->nr_folios; i++) {
+ WARN_ON(async_extent->folios[i]->mapping);
+ btrfs_free_compr_folio(async_extent->folios[i]);
}
- kfree(async_extent->pages);
- async_extent->nr_pages = 0;
- async_extent->pages = NULL;
+ kfree(async_extent->folios);
+ async_extent->nr_folios = 0;
+ async_extent->folios = NULL;
}
static void submit_uncompressed_range(struct btrfs_inode *inode,
@@ -1103,6 +1154,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct btrfs_ordered_extent *ordered;
struct btrfs_key ins;
struct page *locked_page = NULL;
+ struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
u64 start = async_extent->start;
@@ -1122,7 +1174,6 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
if (!(start >= locked_page_end || end <= locked_page_start))
locked_page = async_chunk->locked_page;
}
- lock_extent(io_tree, start, end, NULL);
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
submit_uncompressed_range(inode, async_extent, locked_page);
@@ -1135,15 +1186,17 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
0, *alloc_hint, &ins, 1, 1);
if (ret) {
/*
- * Here we used to try again by going back to non-compressed
- * path for ENOSPC. But we can't reserve space even for
- * compressed size, how could it work for uncompressed size
- * which requires larger size? So here we directly go error
- * path.
+ * We can't reserve contiguous space for the compressed size.
+ * Unlikely, but it's possible that we could have enough
+ * non-contiguous space for the uncompressed size instead. So
+ * fall back to uncompressed.
*/
- goto out_free;
+ submit_uncompressed_range(inode, async_extent, locked_page);
+ goto done;
}
+ lock_extent(io_tree, start, end, &cached);
+
/* Here we're doing allocation and writeback of the compressed pages */
em = create_io_em(inode, start,
async_extent->ram_size, /* len */
@@ -1177,11 +1230,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
/* Clear dirty, set writeback and unlock the pages. */
extent_clear_unlock_delalloc(inode, start, end,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_START_WRITEBACK);
btrfs_submit_compressed_write(ordered,
- async_extent->pages, /* compressed_pages */
- async_extent->nr_pages,
+ async_extent->folios, /* compressed_folios */
+ async_extent->nr_folios,
async_chunk->write_flags, true);
*alloc_hint = ins.objectid + ins.offset;
done:
@@ -1193,10 +1246,10 @@ done:
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
-out_free:
mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ NULL, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
@@ -1206,7 +1259,7 @@ out_free:
kthread_associate_blkcg(NULL);
btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
- root->root_key.objectid, btrfs_ino(inode), start,
+ btrfs_root_id(root), btrfs_ino(inode), start,
async_extent->ram_size, ret);
kfree(async_extent);
}
@@ -1278,6 +1331,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_state *cached = NULL;
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
@@ -1303,53 +1357,21 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- /*
- * Due to the page size limit, for subpage we can only trigger the
- * writeback for the dirty sectors of page, that means data writeback
- * is doing more writeback than what we want.
- *
- * This is especially unexpected for some call sites like fallocate,
- * where we only increase i_size after everything is done.
- * This means we can trigger inline extent even if we didn't want to.
- * So here we skip inline extent creation completely.
- */
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
- u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
- end + 1);
-
+ if (!no_inline) {
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, actual_end, 0,
+ ret = cow_file_range_inline(inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
- if (ret == 0) {
- /*
- * We use DO_ACCOUNTING here because we need the
- * delalloc_release_metadata to be run _after_ we drop
- * our outstanding extent for clearing delalloc for this
- * range.
- */
- extent_clear_unlock_delalloc(inode, start, end,
- locked_page,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
+ if (ret <= 0) {
/*
- * locked_page is locked by the caller of
- * writepage_delalloc(), not locked by
- * __process_pages_contig().
+ * We succeeded, return 1 so the caller knows we're done
+ * with this page and already handled the IO.
*
- * We can't let __process_pages_contig() to unlock it,
- * as it doesn't have any subpage::writers recorded.
- *
- * Here we manually unlock the page, since the caller
- * can't determine if it's an inline extent or a
- * compressed extent.
+ * If there was an error then cow_file_range_inline() has
+ * already done the cleanup.
*/
- unlock_page(locked_page);
- ret = 1;
+ if (ret == 0)
+ ret = 1;
goto done;
- } else if (ret < 0) {
- goto out_unlock;
}
}
@@ -1409,6 +1431,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
extent_reserved = true;
ram_size = ins.offset;
+
+ lock_extent(&inode->io_tree, start, start + ram_size - 1,
+ &cached);
+
em = create_io_em(inode, start, ins.offset, /* len */
start, /* orig_start */
ins.objectid, /* block_start */
@@ -1418,6 +1444,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
BTRFS_COMPRESS_NONE, /* compress_type */
BTRFS_ORDERED_REGULAR /* type */);
if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, start,
+ start + ram_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
@@ -1428,6 +1456,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
0, 1 << BTRFS_ORDERED_REGULAR,
BTRFS_COMPRESS_NONE);
if (IS_ERR(ordered)) {
+ unlock_extent(&inode->io_tree, start,
+ start + ram_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1467,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
- locked_page,
+ locked_page, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
if (num_bytes < cur_alloc_size)
@@ -1526,10 +1556,17 @@ out_unlock:
if (!locked_page)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
- locked_page, 0, page_ops);
+ locked_page, NULL, 0, page_ops);
}
/*
+ * At this point we're unlocked, we want to make sure we're only
+ * clearing these flags under the extent lock, so lock the rest of the
+ * range and clear everything up.
+ */
+ lock_extent(&inode->io_tree, start, end, NULL);
+
+ /*
* For the range (2). If we reserved an extent for our delalloc range
* (or a subrange) and failed to create the respective ordered extent,
* then it means that when we reserved the extent we decremented the
@@ -1542,7 +1579,7 @@ out_unlock:
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
- locked_page,
+ locked_page, &cached,
clear_bits,
page_ops);
start += cur_alloc_size;
@@ -1557,7 +1594,7 @@ out_unlock:
if (start < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
extent_clear_unlock_delalloc(inode, start, end, locked_page,
- clear_bits, page_ops);
+ &cached, clear_bits, page_ops);
}
return ret;
}
@@ -1630,7 +1667,6 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
if (!ctx)
return false;
- unlock_extent(&inode->io_tree, start, end, NULL);
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
async_chunk = ctx->chunks;
@@ -1724,29 +1760,6 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
return 1;
}
-static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num_bytes, bool nowait)
-{
- struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
- struct btrfs_ordered_sum *sums;
- int ret;
- LIST_HEAD(list);
-
- ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
- &list, 0, nowait);
- if (ret == 0 && list_empty(&list))
- return 0;
-
- while (!list_empty(&list)) {
- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
- list_del(&sums->list);
- kfree(sums);
- }
- if (ret < 0)
- return ret;
- return 1;
-}
-
static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const u64 start, const u64 end)
{
@@ -1754,6 +1767,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_state *cached_state = NULL;
u64 range_start = start;
u64 count;
int ret;
@@ -1790,6 +1804,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
* group that contains that extent to RO mode and therefore force COW
* when starting writeback.
*/
+ lock_extent(io_tree, start, end, &cached_state);
count = count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0, NULL);
if (count > 0 || is_space_ino || is_reloc_ino) {
@@ -1808,6 +1823,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
NULL);
}
+ unlock_extent(io_tree, start, end, &cached_state);
/*
* Don't try to create inline extents, as a mix of inline extent that
@@ -1861,6 +1877,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *fi;
+ struct btrfs_root *csum_root;
u64 extent_end;
u8 extent_type;
int can_nocow = 0;
@@ -1921,7 +1938,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
if (args->free_path) {
/*
* We don't need the path anymore, plus through the
- * csum_exist_in_range() call below we will end up allocating
+ * btrfs_lookup_csums_list() call below we will end up allocating
* another path. So free the path to avoid unnecessary extra
* memory usage.
*/
@@ -1942,8 +1959,11 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* Force COW if csums exist in the range. This ensures that csums for a
* given extent are either valid or do not exist.
*/
- ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
- nowait);
+
+ csum_root = btrfs_csum_root(root->fs_info, args->disk_bytenr);
+ ret = btrfs_lookup_csums_list(csum_root, args->disk_bytenr,
+ args->disk_bytenr + args->num_bytes - 1,
+ NULL, nowait);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -1993,12 +2013,13 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
nocow_args.end = end;
nocow_args.writeback_path = true;
- while (1) {
+ while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
+ struct extent_state *cached_state = NULL;
u64 extent_end;
u64 ram_bytes;
u64 nocow_end;
@@ -2136,6 +2157,8 @@ must_cow:
}
nocow_end = cur_offset + nocow_args.num_bytes - 1;
+ lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
+
is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
if (is_prealloc) {
u64 orig_start = found_key.offset - nocow_args.extent_offset;
@@ -2149,6 +2172,8 @@ must_cow:
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, cur_offset,
+ nocow_end, &cached_state);
btrfs_dec_nocow_writers(nocow_bg);
ret = PTR_ERR(em);
goto error;
@@ -2169,6 +2194,8 @@ must_cow:
btrfs_drop_extent_map_range(inode, cur_offset,
nocow_end, false);
}
+ unlock_extent(&inode->io_tree, cur_offset,
+ nocow_end, &cached_state);
ret = PTR_ERR(ordered);
goto error;
}
@@ -2183,8 +2210,8 @@ must_cow:
btrfs_put_ordered_extent(ordered);
extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC |
+ locked_page, &cached_state,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_ORDERED);
@@ -2197,8 +2224,6 @@ must_cow:
*/
if (ret)
goto error;
- if (cur_offset > end)
- break;
}
btrfs_release_path(path);
@@ -2224,13 +2249,23 @@ error:
*/
if (cow_start != (u64)-1)
cur_offset = cow_start;
- if (cur_offset < end)
+
+ /*
+ * We need to lock the extent here because we're clearing DELALLOC and
+ * we're not locked at this point.
+ */
+ if (cur_offset < end) {
+ struct extent_state *cached = NULL;
+
+ lock_extent(&inode->io_tree, cur_offset, end, &cached);
extent_clear_unlock_delalloc(inode, cur_offset, end,
- locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_DEFRAG |
+ locked_page, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
+ }
btrfs_free_path(path);
return ret;
}
@@ -2293,6 +2328,8 @@ void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 size;
+ lockdep_assert_held(&inode->io_tree.lock);
+
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return;
@@ -2331,6 +2368,8 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
u64 new_size, old_size;
u32 num_extents;
+ lockdep_assert_held(&inode->io_tree.lock);
+
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
return;
@@ -2378,55 +2417,50 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
spin_unlock(&inode->lock);
}
-static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
- struct btrfs_inode *inode)
+static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&root->delalloc_lock);
- if (list_empty(&inode->delalloc_inodes)) {
- list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
- set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
- root->nr_delalloc_inodes++;
- if (root->nr_delalloc_inodes == 1) {
- spin_lock(&fs_info->delalloc_root_lock);
- BUG_ON(!list_empty(&root->delalloc_root));
- list_add_tail(&root->delalloc_root,
- &fs_info->delalloc_roots);
- spin_unlock(&fs_info->delalloc_root_lock);
- }
+ ASSERT(list_empty(&inode->delalloc_inodes));
+ list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
+ root->nr_delalloc_inodes++;
+ if (root->nr_delalloc_inodes == 1) {
+ spin_lock(&fs_info->delalloc_root_lock);
+ ASSERT(list_empty(&root->delalloc_root));
+ list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
}
spin_unlock(&root->delalloc_lock);
}
-void __btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode)
+void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ lockdep_assert_held(&root->delalloc_lock);
+
+ /*
+ * We may be called after the inode was already deleted from the list,
+ * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
+ * and then later through btrfs_clear_delalloc_extent() while the inode
+ * still has ->delalloc_bytes > 0.
+ */
if (!list_empty(&inode->delalloc_inodes)) {
list_del_init(&inode->delalloc_inodes);
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags);
root->nr_delalloc_inodes--;
if (!root->nr_delalloc_inodes) {
ASSERT(list_empty(&root->delalloc_inodes));
spin_lock(&fs_info->delalloc_root_lock);
- BUG_ON(list_empty(&root->delalloc_root));
+ ASSERT(!list_empty(&root->delalloc_root));
list_del_init(&root->delalloc_root);
spin_unlock(&fs_info->delalloc_root_lock);
}
}
}
-static void btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode)
-{
- spin_lock(&root->delalloc_lock);
- __btrfs_del_delalloc_inode(root, inode);
- spin_unlock(&root->delalloc_lock);
-}
-
/*
* Properly track delayed allocation bytes in the inode and to maintain the
* list of inodes that have pending delalloc work to be done.
@@ -2436,6 +2470,8 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ lockdep_assert_held(&inode->io_tree.lock);
+
if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
/*
@@ -2444,10 +2480,9 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
* bit, which is only set or cleared with irqs on
*/
if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
- struct btrfs_root *root = inode->root;
u64 len = state->end + 1 - state->start;
+ u64 prev_delalloc_bytes;
u32 num_extents = count_max_extents(fs_info, len);
- bool do_list = !btrfs_is_free_space_inode(inode);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, num_extents);
@@ -2460,13 +2495,20 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
fs_info->delalloc_batch);
spin_lock(&inode->lock);
+ prev_delalloc_bytes = inode->delalloc_bytes;
inode->delalloc_bytes += len;
if (bits & EXTENT_DEFRAG)
inode->defrag_bytes += len;
- if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags))
- btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&inode->lock);
+
+ /*
+ * We don't need to be under the protection of the inode's lock,
+ * because we are called while holding the inode's io_tree lock
+ * and are therefore protected against concurrent calls of this
+ * function and btrfs_clear_delalloc_extent().
+ */
+ if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
+ btrfs_add_delalloc_inode(inode);
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
@@ -2488,6 +2530,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(fs_info, len);
+ lockdep_assert_held(&inode->io_tree.lock);
+
if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
spin_lock(&inode->lock);
inode->defrag_bytes -= len;
@@ -2501,7 +2545,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
*/
if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = inode->root;
- bool do_list = !btrfs_is_free_space_inode(inode);
+ u64 new_delalloc_bytes;
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, -num_extents);
@@ -2514,14 +2558,15 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
*/
if (bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(inode, len, false);
+ btrfs_delalloc_release_metadata(inode, len, true);
/* For sanity tests. */
if (btrfs_is_testing(fs_info))
return;
if (!btrfs_is_data_reloc_root(root) &&
- do_list && !(state->state & EXTENT_NORESERVE) &&
+ !btrfs_is_free_space_inode(inode) &&
+ !(state->state & EXTENT_NORESERVE) &&
(bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
@@ -2529,11 +2574,20 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
fs_info->delalloc_batch);
spin_lock(&inode->lock);
inode->delalloc_bytes -= len;
- if (do_list && inode->delalloc_bytes == 0 &&
- test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags))
- btrfs_del_delalloc_inode(root, inode);
+ new_delalloc_bytes = inode->delalloc_bytes;
spin_unlock(&inode->lock);
+
+ /*
+ * We don't need to be under the protection of the inode's lock,
+ * because we are called while holding the inode's io_tree lock
+ * and are therefore protected against concurrent calls of this
+ * function and btrfs_set_delalloc_extent().
+ */
+ if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
+ spin_lock(&root->delalloc_lock);
+ btrfs_del_delalloc_inode(inode);
+ spin_unlock(&root->delalloc_lock);
+ }
}
if ((state->state & EXTENT_DELALLOC_NEW) &&
@@ -2623,7 +2677,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
u64 em_len;
int ret = 0;
- em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+ em = btrfs_get_extent(inode, NULL, search_start, search_len);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -2793,7 +2847,7 @@ out_page:
PAGE_SIZE, !ret);
clear_page_dirty_for_io(page);
}
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2820,7 +2874,7 @@ out_page:
int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_writepage_fixup *fixup;
/* This page has ordered extent covering it already */
@@ -2848,7 +2902,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
* page->mapping outside of the page lock.
*/
ihold(inode);
- btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+ btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->page = page;
@@ -3118,8 +3172,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
- ordered_extent->num_bytes, trans->transid);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = unpin_extent_cache(inode, ordered_extent->file_offset,
+ ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3148,7 +3207,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_abort_transaction(trans, ret);
goto out;
}
- ret = 0;
out:
clear_extent_bit(&inode->io_tree, start, end, clear_bits,
&cached_state);
@@ -3167,16 +3225,30 @@ out:
* set the mapping error, so we need to set it if we're the ones
* marking this ordered extent as failed.
*/
- if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
- &ordered_extent->flags))
- mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+ if (ret)
+ btrfs_mark_ordered_extent_error(ordered_extent);
if (truncated)
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop extent maps for the part of the extent we didn't write. */
- btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+ /*
+ * Drop extent maps for the part of the extent we didn't write.
+ *
+ * We have an exception here for the free_space_inode, this is
+ * because when we do btrfs_get_extent() on the free space inode
+ * we will search the commit root. If this is a new block group
+ * we won't find anything, and we will trip over the assert in
+ * writepage where we do ASSERT(em->block_start !=
+ * EXTENT_MAP_HOLE).
+ *
+ * Theoretically we could also skip this for any NOCOW extent as
+ * we don't mess with the extent map tree in the NOCOW case, but
+ * for now simply skip this if we are the free space inode.
+ */
+ if (!btrfs_is_free_space_inode(inode))
+ btrfs_drop_extent_map_range(inode, unwritten_start,
+ end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -3208,7 +3280,7 @@ out:
* Actually free the qgroup rsv which was released when
* the ordered extent was created.
*/
- btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
ordered_extent->qgroup_rsv,
BTRFS_QGROUP_RSV_DATA);
}
@@ -3230,7 +3302,7 @@ out:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
- if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
+ if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) &&
!test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
list_empty(&ordered->bioc_list))
btrfs_finish_ordered_zoned(ordered);
@@ -3715,7 +3787,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
static int btrfs_read_locked_inode(struct inode *inode,
struct btrfs_path *in_path)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_path *path = in_path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
@@ -3796,7 +3868,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in delayed_nodes_tree.
+ * in the delayed_nodes xarray.
*/
if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -3875,7 +3947,7 @@ cache_acl:
btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
btrfs_ino(BTRFS_I(inode)),
- root->root_key.objectid, ret);
+ btrfs_root_id(root), ret);
}
if (path != in_path)
btrfs_free_path(path);
@@ -4234,7 +4306,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
/* This needs to handle no-key deletions later on */
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
- objectid = inode->root->root_key.objectid;
+ objectid = btrfs_root_id(inode->root);
} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
objectid = inode->location.objectid;
} else {
@@ -4292,7 +4364,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
} else {
ret = btrfs_del_root_ref(trans, objectid,
- root->root_key.objectid, dir_ino,
+ btrfs_root_id(root), dir_ino,
&index, &fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -4342,7 +4414,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
dir_id, &name, 0);
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.objectid == root->root_key.objectid) {
+ if (key.objectid == btrfs_root_id(root)) {
ret = -EPERM;
btrfs_err(fs_info,
"deleting default subvolume %llu is not allowed",
@@ -4352,21 +4424,27 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
btrfs_release_path(path);
}
- key.objectid = root->root_key.objectid;
+ key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_REF_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist a root
+ * with such id, but this is out of valid range.
+ */
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = 0;
if (path->slots[0] > 0) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == root->root_key.objectid &&
- key.type == BTRFS_ROOT_REF_KEY)
+ if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
out:
@@ -4378,77 +4456,42 @@ out:
static void btrfs_prune_dentries(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct rb_node *node;
- struct rb_node *prev;
- struct btrfs_inode *entry;
- struct inode *inode;
- u64 objectid = 0;
+ struct btrfs_inode *inode;
+ u64 min_ino = 0;
if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- entry = rb_entry(node, struct btrfs_inode, rb_node);
-
- if (objectid < btrfs_ino(entry))
- node = node->rb_left;
- else if (objectid > btrfs_ino(entry))
- node = node->rb_right;
- else
- break;
- }
- if (!node) {
- while (prev) {
- entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= btrfs_ino(entry)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
- while (node) {
- entry = rb_entry(node, struct btrfs_inode, rb_node);
- objectid = btrfs_ino(entry) + 1;
- inode = igrab(&entry->vfs_inode);
- if (inode) {
- spin_unlock(&root->inode_lock);
- if (atomic_read(&inode->i_count) > 1)
- d_prune_aliases(inode);
- /*
- * btrfs_drop_inode will have it removed from the inode
- * cache when its usage count hits zero.
- */
- iput(inode);
- cond_resched();
- spin_lock(&root->inode_lock);
- goto again;
- }
-
- if (cond_resched_lock(&root->inode_lock))
- goto again;
+ inode = btrfs_find_first_inode(root, min_ino);
+ while (inode) {
+ if (atomic_read(&inode->vfs_inode.i_count) > 1)
+ d_prune_aliases(&inode->vfs_inode);
- node = rb_next(node);
+ min_ino = btrfs_ino(inode) + 1;
+ /*
+ * btrfs_drop_inode() will have it removed from the inode
+ * cache when its usage count hits zero.
+ */
+ iput(&inode->vfs_inode);
+ cond_resched();
+ inode = btrfs_find_first_inode(root, min_ino);
}
- spin_unlock(&root->inode_lock);
}
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode = d_inode(dentry);
struct btrfs_root *dest = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv block_rsv;
u64 root_flags;
+ u64 qgroup_reserved = 0;
int ret;
+ down_write(&fs_info->subvol_sem);
+
/*
* Don't allow to delete a subvolume with send in progress. This is
* inside the inode lock so the error handling that has to drop the bit
@@ -4459,26 +4502,26 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
- dest->root_key.objectid);
- return -EPERM;
+ btrfs_root_id(dest));
+ ret = -EPERM;
+ goto out_up_write;
}
if (atomic_read(&dest->nr_swapfiles)) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
- root->root_key.objectid);
- return -EPERM;
+ btrfs_root_id(root));
+ ret = -EPERM;
+ goto out_up_write;
}
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- down_write(&fs_info->subvol_sem);
-
ret = may_destroy_subvol(dest);
if (ret)
- goto out_up_write;
+ goto out_undead;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
@@ -4488,13 +4531,21 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
if (ret)
- goto out_up_write;
+ goto out_undead;
+ qgroup_reserved = block_rsv.qgroup_rsv_reserved;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_release;
}
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+ qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
@@ -4520,7 +4571,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
fs_info->tree_root,
- dest->root_key.objectid);
+ btrfs_root_id(dest));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4528,8 +4579,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
}
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
- BTRFS_UUID_KEY_SUBVOL,
- dest->root_key.objectid);
+ BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4538,7 +4588,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_uuid_tree_remove(trans,
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- dest->root_key.objectid);
+ btrfs_root_id(dest));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4553,16 +4603,20 @@ out_end_trans:
ret = btrfs_end_transaction(trans);
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(root, &block_rsv);
-out_up_write:
- up_write(&fs_info->subvol_sem);
+ btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
+ if (qgroup_reserved)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
+out_undead:
if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- } else {
+ }
+out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (!ret) {
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
@@ -4575,7 +4629,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- int err = 0;
+ int ret = 0;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
struct fscrypt_name fname;
@@ -4591,33 +4645,33 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
}
- err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
- if (err)
- return err;
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
/* This needs to handle no-key deletions later on */
trans = __unlink_start_trans(BTRFS_I(dir));
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_notrans;
}
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out;
}
- err = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (err)
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret)
goto out;
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name);
- if (!err) {
+ if (!ret) {
btrfs_i_size_write(BTRFS_I(inode), 0);
/*
* Propagate the last_unlink_trans value of the deleted dir to
@@ -4639,7 +4693,7 @@ out_notrans:
btrfs_btree_balance_dirty(fs_info);
fscrypt_free_filename(&fname);
- return err;
+ return ret;
}
/*
@@ -4667,7 +4721,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
- struct page *page;
+ struct folio *folio;
gfp_t mask = btrfs_alloc_write_mask(mapping);
size_t write_bytes = blocksize;
int ret = 0;
@@ -4699,8 +4753,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
goto out;
}
again:
- page = find_or_create_page(mapping, index, mask);
- if (!page) {
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+ if (IS_ERR(folio)) {
btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
@@ -4708,15 +4763,15 @@ again:
goto out;
}
- if (!PageUptodate(page)) {
- ret = btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ ret = btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto out_unlock;
}
@@ -4725,22 +4780,22 @@ again:
/*
* We unlock the page after the io is completed and then re-lock it
* above. release_folio() could have come in between that and cleared
- * PagePrivate(), but left the page in the mapping. Set the page mapped
+ * folio private, but left the page in the mapping. Set the page mapped
* here to make sure it's properly set for the subpage stuff.
*/
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto out_unlock;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
unlock_extent(io_tree, block_start, block_end, &cached_state);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -4761,15 +4816,17 @@ again:
if (!len)
len = blocksize - offset;
if (front)
- memzero_page(page, (block_start - page_offset(page)),
- offset);
+ folio_zero_range(folio, block_start - folio_pos(folio),
+ offset);
else
- memzero_page(page, (block_start - page_offset(page)) + offset,
- len);
- }
- btrfs_page_clear_checked(fs_info, page, block_start,
- block_end + 1 - block_start);
- btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+ folio_zero_range(folio,
+ (block_start - folio_pos(folio)) + offset,
+ len);
+ }
+ btrfs_folio_clear_checked(fs_info, folio, block_start,
+ block_end + 1 - block_start);
+ btrfs_folio_set_dirty(fs_info, folio, block_start,
+ block_end + 1 - block_start);
unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -4785,8 +4842,8 @@ out_unlock:
block_start, blocksize, true);
}
btrfs_delalloc_release_extents(inode, blocksize);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out:
if (only_release_metadata)
btrfs_check_nocow_unlock(inode);
@@ -4860,16 +4917,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
u64 last_byte;
u64 cur_offset;
u64 hole_size;
- int err = 0;
+ int ret = 0;
/*
* If our size started in the middle of a block we need to zero out the
* rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
- err = btrfs_truncate_block(inode, oldsize, 0, 0);
- if (err)
- return err;
+ ret = btrfs_truncate_block(inode, oldsize, 0, 0);
+ if (ret)
+ return ret;
if (size <= hole_start)
return 0;
@@ -4878,10 +4935,9 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
&cached_state);
cur_offset = hole_start;
while (1) {
- em = btrfs_get_extent(inode, NULL, 0, cur_offset,
- block_end - cur_offset);
+ em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
if (IS_ERR(em)) {
- err = PTR_ERR(em);
+ ret = PTR_ERR(em);
em = NULL;
break;
}
@@ -4889,16 +4945,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
struct extent_map *hole_em;
- err = maybe_insert_hole(inode, cur_offset, hole_size);
- if (err)
+ ret = maybe_insert_hole(inode, cur_offset, hole_size);
+ if (ret)
break;
- err = btrfs_inode_set_file_extent_range(inode,
+ ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
- if (err)
+ if (ret)
break;
hole_em = alloc_extent_map();
@@ -4917,15 +4973,14 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = btrfs_get_fs_generation(fs_info);
- err = btrfs_replace_extent_map_range(inode, hole_em, true);
+ ret = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
} else {
- err = btrfs_inode_set_file_extent_range(inode,
+ ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
- if (err)
+ if (ret)
break;
}
next:
@@ -4937,7 +4992,7 @@ next:
}
free_extent_map(em);
unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
- return err;
+ return ret;
}
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
@@ -4991,7 +5046,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
if (btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode,
@@ -5132,7 +5187,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
*/
if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
- end - start + 1);
+ end - start + 1, NULL);
clear_extent_bit(io_tree, start, end,
EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
@@ -5194,7 +5249,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
void btrfs_evict_inode(struct inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv = NULL;
@@ -5208,11 +5263,12 @@ void btrfs_evict_inode(struct inode *inode)
return;
}
+ fs_info = inode_to_fs_info(inode);
evict_inode_truncate_pages(inode);
if (inode->i_nlink &&
((btrfs_root_refs(&root->root_item) != 0 &&
- root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+ btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
btrfs_is_free_space_inode(BTRFS_I(inode))))
goto out;
@@ -5224,7 +5280,7 @@ void btrfs_evict_inode(struct inode *inode)
if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
- root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
+ btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
goto out;
}
@@ -5396,7 +5452,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
}
err = -ENOENT;
- key.objectid = dir->root->root_key.objectid;
+ key.objectid = btrfs_root_id(dir->root);
key.type = BTRFS_ROOT_REF_KEY;
key.offset = location->objectid;
@@ -5505,7 +5561,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
BTRFS_I(inode)->location.offset = 0;
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
- BUG_ON(args->root && !BTRFS_I(inode)->root);
if (args->root && args->root == args->root->fs_info->tree_root &&
args->ino != BTRFS_BTREE_INODE_OBJECTID)
@@ -5633,7 +5688,7 @@ static inline u8 btrfs_inode_type(struct inode *inode)
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
@@ -6172,7 +6227,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct inode *dir = args->dir;
struct inode *inode = args->inode;
const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
struct btrfs_key *location;
@@ -6217,6 +6272,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * We don't have any capability xattrs set here yet, shortcut any
+ * queries for the xattrs here. If we add them later via the inode
+ * security init path or any other path this flag will be cleared.
+ */
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ /*
* Subvolumes don't inherit flags from their parent directory.
* Originally this was probably by accident, but we probably can't
* change it now without compatibility issues.
@@ -6349,8 +6411,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_err(fs_info,
"error inheriting props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
- ret);
+ btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
}
/*
@@ -6423,7 +6484,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, key.objectid,
- root->root_key.objectid, parent_ino,
+ btrfs_root_id(root), parent_ino,
index, name);
} else if (add_backref) {
ret = btrfs_insert_inode_ref(trans, root, name,
@@ -6466,7 +6527,7 @@ fail_dir_item:
u64 local_index;
int err;
err = btrfs_del_root_ref(trans, key.objectid,
- root->root_key.objectid, parent_ino,
+ btrfs_root_id(root), parent_ino,
&local_index, name);
if (err)
btrfs_abort_transaction(trans, err);
@@ -6487,7 +6548,7 @@ fail_dir_item:
static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
struct inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_new_inode_args new_inode_args = {
.dir = dir,
@@ -6557,14 +6618,14 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct fscrypt_name fname;
u64 index;
int err;
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
- if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
+ if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6721,7 +6782,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
*
* @inode: file to search in
* @page: page to read extent data into if the extent is inline
- * @pg_offset: offset into @page to copy to
* @start: file offset
* @len: length of range starting at @start
*
@@ -6735,8 +6795,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
* Return: ERR_PTR on error, non-NULL extent_map on success.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len)
+ struct page *page, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
@@ -6879,7 +6938,6 @@ next:
* ensured by tree-checker and inline extent creation path.
* Thus all members representing file offsets should be zero.
*/
- ASSERT(pg_offset == 0);
ASSERT(extent_start == 0);
ASSERT(em->start == 0);
@@ -6914,7 +6972,7 @@ insert:
}
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
@@ -6983,8 +7041,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
int ret;
alloc_hint = get_extent_allocation_hint(inode, start, len);
+again:
ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
0, alloc_hint, &ins, 1, 1);
+ if (ret == -EAGAIN) {
+ ASSERT(btrfs_is_zoned(fs_info));
+ wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ goto again;
+ }
if (ret)
return ERR_PTR(ret);
@@ -7036,7 +7101,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes, bool nowait, bool strict)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct can_nocow_file_extent_args nocow_args = { 0 };
struct btrfs_path *path;
int ret;
@@ -7234,11 +7299,49 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
struct extent_map *em;
int ret;
+ /*
+ * Note the missing NOCOW type.
+ *
+ * For pure NOCOW writes, we should not create an io extent map, but
+ * just reuse the existing one.
+ * Only PREALLOC writes (NOCOW write into preallocated range) can
+ * create an io extent map.
+ */
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
type == BTRFS_ORDERED_COMPRESSED ||
- type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
+ switch (type) {
+ case BTRFS_ORDERED_PREALLOC:
+ /* Uncompressed extents. */
+ ASSERT(block_len == len);
+
+ /* We're only referring to part of a larger preallocated extent. */
+ ASSERT(block_len <= ram_bytes);
+ break;
+ case BTRFS_ORDERED_REGULAR:
+ /* Uncompressed extents. */
+ ASSERT(block_len == len);
+
+ /* COW results in a new extent matching our file extent size. */
+ ASSERT(orig_block_len == len);
+ ASSERT(ram_bytes == len);
+
+ /* Since it's a new extent, we should not have any offset. */
+ ASSERT(orig_start == start);
+ break;
+ case BTRFS_ORDERED_COMPRESSED:
+ /* Must be compressed. */
+ ASSERT(compress_type != BTRFS_COMPRESS_NONE);
+
+ /*
+ * An encoded write can make us refer to part of the
+ * uncompressed extent.
+ */
+ ASSERT(len <= ram_bytes);
+ break;
+ }
+
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7251,13 +7354,9 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
- if (type == BTRFS_ORDERED_PREALLOC) {
- set_bit(EXTENT_FLAG_FILLING, &em->flags);
- } else if (type == BTRFS_ORDERED_COMPRESSED) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
+ em->flags |= EXTENT_FLAG_PINNED;
+ if (type == BTRFS_ORDERED_COMPRESSED)
+ extent_map_set_compression(em, compress_type);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
@@ -7277,7 +7376,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
unsigned int iomap_flags)
{
const bool nowait = (iomap_flags & IOMAP_NOWAIT);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_map *em = *map;
int type;
u64 block_start, orig_start, orig_block_len, ram_bytes;
@@ -7297,10 +7396,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
* just use the extent.
*
*/
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ if ((em->flags & EXTENT_FLAG_PREALLOC) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
type = BTRFS_ORDERED_PREALLOC;
else
type = BTRFS_ORDERED_NOCOW;
@@ -7417,7 +7516,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
struct iomap *srcmap)
{
struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = iter->private;
@@ -7515,7 +7614,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (ret < 0)
goto err;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto unlock_err;
@@ -7535,7 +7634,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ if (extent_map_is_compressed(em) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
/*
@@ -7631,7 +7730,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* that, since we have locked only the parts we are performing I/O in.
*/
if ((em->block_start == EXTENT_MAP_HOLE) ||
- (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else {
@@ -7795,6 +7894,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
int ret;
ret = fiemap_prep(inode, fieinfo, start, &len, 0);
@@ -7820,18 +7920,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
- return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
-}
+ btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
-static int btrfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return extent_writepages(mapping, wbc);
-}
+ /*
+ * We did an initial flush to avoid holding the inode's lock while
+ * triggering writeback and waiting for the completion of IO and ordered
+ * extents. Now after we locked the inode we do it again, because it's
+ * possible a new write may have happened in between those two steps.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+ if (ret) {
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+ }
+ }
-static void btrfs_readahead(struct readahead_control *rac)
-{
- extent_readahead(rac);
+ ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ return ret;
}
/*
@@ -7843,14 +7951,15 @@ static void btrfs_readahead(struct readahead_control *rac)
*/
static void wait_subpage_spinlock(struct page *page)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = page_to_fs_info(page);
+ struct folio *folio = page_folio(page);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, page->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7869,13 +7978,12 @@ static void wait_subpage_spinlock(struct page *page)
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- int ret = try_release_extent_mapping(&folio->page, gfp_flags);
-
- if (ret == 1) {
+ if (try_release_extent_mapping(&folio->page, gfp_flags)) {
wait_subpage_spinlock(&folio->page);
clear_page_extent_mapped(&folio->page);
+ return true;
}
- return ret;
+ return false;
}
static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
@@ -7909,7 +8017,7 @@ static int btrfs_migrate_folio(struct address_space *mapping,
static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
- struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
+ struct btrfs_inode *inode = folio_to_inode(folio);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
@@ -7988,7 +8096,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+ if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -7997,7 +8105,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*/
goto next;
}
- btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+ btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8052,7 +8160,7 @@ next:
* reserved data space.
* Since the IO will never happen for this page.
*/
- btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
+ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
if (!inode_evicting) {
clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_UPTODATE |
@@ -8067,176 +8175,12 @@ next:
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
- btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+ btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
clear_page_extent_mapped(&folio->page);
}
-/*
- * btrfs_page_mkwrite() is not allowed to change the file size as it gets
- * called from a page fault handler when a page is first dirtied. Hence we must
- * be careful to check for EOF conditions here. We set the page up correctly
- * for a written page which means we get ENOSPC checking when writing into
- * holes and correct delalloc and unwritten extent mapping on filesystems that
- * support these features.
- *
- * We are not allowed to take the i_mutex here so we have to play games to
- * protect against truncate races as the page could now be beyond EOF. Because
- * truncate_setsize() writes the inode size before removing pages, once we have
- * the page lock we can determine safely if the page is beyond EOF. If it is not
- * beyond EOF, then the page is guaranteed safe against truncation until we
- * unlock the page.
- */
-vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
-{
- struct page *page = vmf->page;
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_changeset *data_reserved = NULL;
- unsigned long zero_start;
- loff_t size;
- vm_fault_t ret;
- int ret2;
- int reserved = 0;
- u64 reserved_space;
- u64 page_start;
- u64 page_end;
- u64 end;
-
- reserved_space = PAGE_SIZE;
-
- sb_start_pagefault(inode->i_sb);
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- end = page_end;
-
- /*
- * Reserving delalloc space after obtaining the page lock can lead to
- * deadlock. For example, if a dirty page is locked by this function
- * and the call to btrfs_delalloc_reserve_space() ends up triggering
- * dirty page write out, then the btrfs_writepages() function could
- * end up waiting indefinitely to get a lock on the page currently
- * being processed by btrfs_page_mkwrite() function.
- */
- ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- page_start, reserved_space);
- if (!ret2) {
- ret2 = file_update_time(vmf->vma->vm_file);
- reserved = 1;
- }
- if (ret2) {
- ret = vmf_error(ret2);
- if (reserved)
- goto out;
- goto out_noreserve;
- }
-
- ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
-again:
- down_read(&BTRFS_I(inode)->i_mmap_lock);
- lock_page(page);
- size = i_size_read(inode);
-
- if ((page->mapping != inode->i_mapping) ||
- (page_start >= size)) {
- /* page got truncated out from underneath us */
- goto out_unlock;
- }
- wait_on_page_writeback(page);
-
- lock_extent(io_tree, page_start, page_end, &cached_state);
- ret2 = set_page_extent_mapped(page);
- if (ret2 < 0) {
- ret = vmf_error(ret2);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- goto out_unlock;
- }
-
- /*
- * we can't set the delalloc bits if there are pending ordered
- * extents. Drop our locks and wait for them to finish
- */
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
- PAGE_SIZE);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- unlock_page(page);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
- btrfs_start_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
-
- if (page->index == ((size - 1) >> PAGE_SHIFT)) {
- reserved_space = round_up(size - page_start,
- fs_info->sectorsize);
- if (reserved_space < PAGE_SIZE) {
- end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, page_start,
- PAGE_SIZE - reserved_space, true);
- }
- }
-
- /*
- * page_mkwrite gets called when the page is firstly dirtied after it's
- * faulted in, but write(2) could also dirty a page and set delalloc
- * bits, thus in this case for space account reason, we still need to
- * clear any delalloc bits within this page range since we have to
- * reserve data&meta space before lock_page() (see above comments).
- */
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
-
- ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
- &cached_state);
- if (ret2) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- ret = VM_FAULT_SIGBUS;
- goto out_unlock;
- }
-
- /* page is wholly or partially inside EOF */
- if (page_start + PAGE_SIZE > size)
- zero_start = offset_in_page(size);
- else
- zero_start = PAGE_SIZE;
-
- if (zero_start != PAGE_SIZE)
- memzero_page(page, zero_start, PAGE_SIZE - zero_start);
-
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
- btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
- btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
-
- btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
-
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
-
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return VM_FAULT_LOCKED;
-
-out_unlock:
- unlock_page(page);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
-out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
- reserved_space, (ret != 0));
-out_noreserve:
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return ret;
-}
-
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
{
struct btrfs_truncate_control control = {
@@ -8455,10 +8399,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
+ struct extent_io_tree *file_extent_tree = NULL;
+
+ /* Self tests may pass a NULL fs_info. */
+ if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+ if (!file_extent_tree)
+ return NULL;
+ }
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
- if (!ei)
+ if (!ei) {
+ kfree(file_extent_tree);
return NULL;
+ }
ei->root = NULL;
ei->generation = 0;
@@ -8494,10 +8448,18 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
+
+ /* This io tree sets the valid inode. */
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
- extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT);
+
+ ei->file_extent_tree = file_extent_tree;
+ if (file_extent_tree) {
+ extent_io_tree_init(fs_info, ei->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT);
+ /* Lockdep class is set only for the file extent tree. */
+ lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
+ }
mutex_init(&ei->log_mutex);
spin_lock_init(&ei->ordered_tree_lock);
ei->ordered_tree = RB_ROOT;
@@ -8514,12 +8476,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
void btrfs_test_destroy_inode(struct inode *inode)
{
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
void btrfs_free_inode(struct inode *inode)
{
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@@ -8616,7 +8580,7 @@ int __init btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
init_once);
if (!btrfs_inode_cachep)
goto fail;
@@ -8639,7 +8603,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
u64 delalloc_bytes;
u64 inode_bytes;
struct inode *inode = d_inode(path->dentry);
- u32 blocksize = inode->i_sb->s_blocksize;
+ u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
u32 bi_flags = BTRFS_I(inode)->flags;
u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
@@ -8665,6 +8629,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
+ stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->result_mask |= STATX_SUBVOL;
+
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
inode_bytes = inode_get_bytes(inode);
@@ -8679,7 +8646,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_dir,
struct dentry *new_dentry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
struct btrfs_trans_handle *trans;
unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -8931,7 +8898,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
struct btrfs_new_inode_args whiteout_args = {
.dir = old_dir,
.dentry = old_dentry,
@@ -9373,7 +9340,7 @@ out:
static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
@@ -9484,7 +9451,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
- int qgroup_released;
+ u64 qgroup_released = 0;
int ret;
memset(&stack_fi, 0, sizeof(stack_fi));
@@ -9497,9 +9464,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
/* Encryption and other encoding is reserved and all 0 */
- qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
- if (qgroup_released < 0)
- return ERR_PTR(qgroup_released);
+ ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
+ if (ret < 0)
+ return ERR_PTR(ret);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
@@ -9544,7 +9511,7 @@ free_qgroup:
* or we leak qgroup data reservation.
*/
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid, qgroup_released,
+ btrfs_root_id(inode->root), qgroup_released,
BTRFS_QGROUP_RSV_DATA);
return ERR_PTR(ret);
}
@@ -9554,7 +9521,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
loff_t actual_len, u64 *alloc_hint,
struct btrfs_trans_handle *trans)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
@@ -9625,7 +9592,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
@@ -9706,7 +9673,7 @@ static int btrfs_permission(struct mnt_idmap *idmap,
static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode;
@@ -9778,7 +9745,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- btrfs_page_set_writeback(fs_info, page, start, len);
+ /* This is for data, which doesn't yet support larger folios. */
+ ASSERT(folio_order(page_folio(page)) == 0);
+ btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
put_page(page);
index++;
}
@@ -9987,7 +9956,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- ret = btrfs_alloc_page_array(nr_pages, pages);
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
if (ret) {
ret = -ENOMEM;
goto out;
@@ -10078,7 +10047,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
cond_resched();
}
- em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+ em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_unlock_extent;
@@ -10106,12 +10075,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->len = min_t(u64, extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ (em->flags & EXTENT_FLAG_PREALLOC)) {
disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ } else if (extent_map_is_compressed(em)) {
disk_bytenr = em->block_start;
/*
* Bail if the buffer isn't large enough to return the whole
@@ -10126,7 +10095,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- em->compress_type);
+ extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
@@ -10190,8 +10159,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
size_t orig_count;
u64 start, end;
u64 num_bytes, ram_bytes, disk_num_bytes;
- unsigned long nr_pages, i;
- struct page **pages;
+ unsigned long nr_folios, i;
+ struct folio **folios;
struct btrfs_key ins;
bool extent_reserved = false;
struct extent_map *em;
@@ -10222,6 +10191,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
return -EINVAL;
+ /*
+ * Compressed extents should always have checksums, so error out if we
+ * have a NOCOW file or inode was created while mounted with NODATASUM.
+ */
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ return -EINVAL;
+
orig_count = iov_iter_count(from);
/* The extent size must be sane. */
@@ -10273,24 +10249,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
* isn't.
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
- nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
- pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
- if (!pages)
+ nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+ folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!folios)
return -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_folios; i++) {
size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
char *kaddr;
- pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!pages[i]) {
+ folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
+ if (!folios[i]) {
ret = -ENOMEM;
- goto out_pages;
+ goto out_folios;
}
- kaddr = kmap_local_page(pages[i]);
+ kaddr = kmap_local_folio(folios[i], 0);
if (copy_from_iter(kaddr, bytes, from) != bytes) {
kunmap_local(kaddr);
ret = -EFAULT;
- goto out_pages;
+ goto out_folios;
}
if (bytes < PAGE_SIZE)
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
@@ -10302,12 +10278,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
if (ret)
- goto out_pages;
+ goto out_folios;
ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
if (ret)
- goto out_pages;
+ goto out_folios;
lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
@@ -10335,10 +10311,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
goto out_qgroup_free_data;
/* Try an inline extent first. */
- if (start == 0 && encoded->unencoded_len == encoded->len &&
- encoded->unencoded_offset == 0) {
- ret = cow_file_range_inline(inode, encoded->len, orig_count,
- compression, pages, true);
+ if (encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0 &&
+ can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
+ ret = __cow_file_range_inline(inode, start, encoded->len,
+ orig_count, compression, folios[0],
+ true);
if (ret <= 0) {
if (ret == 0)
ret = orig_count;
@@ -10382,7 +10360,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
btrfs_delalloc_release_extents(inode, num_bytes);
- btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
+ btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
ret = orig_count;
goto out;
@@ -10394,7 +10372,7 @@ out_delalloc_release:
btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
out_qgroup_free_data:
if (ret < 0)
- btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
+ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
out_free_data_space:
/*
* If btrfs_reserve_extent() succeeded, then we already decremented
@@ -10404,12 +10382,12 @@ out_free_data_space:
btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
unlock_extent(io_tree, start, end, &cached_state);
-out_pages:
- for (i = 0; i < nr_pages; i++) {
- if (pages[i])
- __free_page(pages[i]);
+out_folios:
+ for (i = 0; i < nr_folios; i++) {
+ if (folios[i])
+ __folio_put(folios[i]);
}
- kvfree(pages);
+ kvfree(folios);
out:
if (ret >= 0)
iocb->ki_pos += encoded->len;
@@ -10557,6 +10535,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
+ struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
@@ -10635,7 +10614,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
- root->root_key.objectid);
+ btrfs_root_id(root));
return -EPERM;
}
atomic_inc(&root->nr_swapfiles);
@@ -10650,7 +10629,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct btrfs_block_group *bg;
u64 len = isize - start;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -10673,7 +10652,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ if (extent_map_is_compressed(em)) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
@@ -10696,13 +10675,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
- em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
goto out;
}
- if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
btrfs_warn(fs_info,
"swapfile must have single data profile");
ret = -EINVAL;
@@ -10710,23 +10689,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
}
if (device == NULL) {
- device = em->map_lookup->stripes[0].dev;
+ device = map->stripes[0].dev;
ret = btrfs_add_swapfile_pin(inode, device, false);
if (ret == 1)
ret = 0;
else if (ret)
goto out;
- } else if (device != em->map_lookup->stripes[0].dev) {
+ } else if (device != map->stripes[0].dev) {
btrfs_warn(fs_info, "swapfile must be on one device");
ret = -EINVAL;
goto out;
}
- physical_block_start = (em->map_lookup->stripes[0].physical +
- (logical_block_start - em->start));
- len = min(len, em->len - (logical_block_start - em->start));
- free_extent_map(em);
- em = NULL;
+ physical_block_start = (map->stripes[0].physical +
+ (logical_block_start - map->start));
+ len = min(len, map->chunk_len - (logical_block_start - map->start));
+ btrfs_free_chunk_map(map);
+ map = NULL;
bg = btrfs_lookup_block_group(fs_info, logical_block_start);
if (!bg) {
@@ -10779,6 +10758,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
+ if (!IS_ERR_OR_NULL(map))
+ btrfs_free_chunk_map(map);
unlock_extent(io_tree, 0, isize - 1, &cached_state);
@@ -10859,7 +10840,7 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
if (ordered) {
btrfs_err(root->fs_info,
"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
- start, end, btrfs_ino(inode), root->root_key.objectid,
+ start, end, btrfs_ino(inode), btrfs_root_id(root),
ordered->file_offset,
ordered->file_offset + ordered->num_bytes - 1);
btrfs_put_ordered_extent(ordered);
@@ -10868,6 +10849,65 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
ASSERT(ordered == NULL);
}
+/*
+ * Find the first inode with a minimum number.
+ *
+ * @root:	The root to search for.
+ * @min_ino:	The minimum inode number.
+ *
+ * Find the first inode in the @root with a number >= @min_ino and return it
+ * with the reference taken by igrab(). The search is done under
+ * @root->inode_lock, which is always released before returning.
+ *
+ * Returns NULL if no such inode found.
+ */
+struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
+{
+	struct rb_node *node;
+	struct rb_node *prev;
+	struct btrfs_inode *inode;
+
+	spin_lock(&root->inode_lock);
+again:
+	/* Descend the tree looking for an exact match on @min_ino. */
+	node = root->inode_tree.rb_node;
+	prev = NULL;
+	while (node) {
+		prev = node;
+		inode = rb_entry(node, struct btrfs_inode, rb_node);
+		if (min_ino < btrfs_ino(inode))
+			node = node->rb_left;
+		else if (min_ino > btrfs_ino(inode))
+			node = node->rb_right;
+		else
+			break;
+	}
+
+	/*
+	 * No exact match: starting from the last node visited, walk forward
+	 * to the first inode whose number is not below @min_ino.
+	 */
+	if (!node) {
+		while (prev) {
+			inode = rb_entry(prev, struct btrfs_inode, rb_node);
+			if (min_ino <= btrfs_ino(inode)) {
+				node = prev;
+				break;
+			}
+			prev = rb_next(prev);
+		}
+	}
+
+	while (node) {
+		/*
+		 * Look at the current candidate @node, not @prev: @prev only
+		 * matches @node on the first iteration and is stale once we
+		 * advance with rb_next() below.
+		 */
+		inode = rb_entry(node, struct btrfs_inode, rb_node);
+		if (igrab(&inode->vfs_inode)) {
+			spin_unlock(&root->inode_lock);
+			return inode;
+		}
+
+		/* This inode could not be grabbed, continue after it. */
+		min_ino = btrfs_ino(inode) + 1;
+		/* Restart the lookup from @min_ino if we rescheduled. */
+		if (cond_resched_lock(&root->inode_lock))
+			goto again;
+
+		node = rb_next(node);
+	}
+	spin_unlock(&root->inode_lock);
+
+	return NULL;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -10923,7 +10963,7 @@ static const struct address_space_operations btrfs_aops = {
.release_folio = btrfs_release_folio,
.migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 752acff2c734..efd5d6e9589e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -34,11 +34,9 @@
#include "export.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
#include "backref.h"
-#include "rcu-string.h"
#include "send.h"
#include "dev-replace.h"
#include "props.h"
@@ -47,9 +45,7 @@
#include "tree-log.h"
#include "compression.h"
#include "space-info.h"
-#include "delalloc-space.h"
#include "block-group.h"
-#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
@@ -231,6 +227,20 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * Check that the path stored in a v1 ioctl argument is NUL terminated.
+ *
+ * Returns 0 if a NUL byte is found within vol_args->name, -ENAMETOOLONG if
+ * the whole buffer is filled without one.
+ */
+int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args)
+{
+	const void *terminator = memchr(vol_args->name, 0, sizeof(vol_args->name));
+
+	return terminator ? 0 : -ENAMETOOLONG;
+}
+
+/*
+ * Check that the name stored in a v2 ioctl argument is NUL terminated.
+ *
+ * Returns 0 if a NUL byte is found within vol_args2->name, -ENAMETOOLONG if
+ * the whole buffer is filled without one.
+ */
+static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2)
+{
+	const void *terminator = memchr(vol_args2->name, 0, sizeof(vol_args2->name));
+
+	return terminator ? 0 : -ENAMETOOLONG;
+}
+
/*
* Set flags/xflags from the internal inode flags. The remaining items of
* fsxattr are zeroed.
@@ -247,7 +257,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa)
{
struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_inode *binode = BTRFS_I(inode);
struct btrfs_root *root = binode->root;
struct btrfs_trans_handle *trans;
@@ -528,7 +538,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
* block group is in the logical address space, which can be any
* sectorsize aligned bytenr in the range [0, U64_MAX].
*/
- if (range.len < fs_info->sb->s_blocksize)
+ if (range.len < fs_info->sectorsize)
return -EINVAL;
range.minlen = max(range.minlen, minlen);
@@ -584,7 +594,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *dentry,
struct btrfs_qgroup_inherit *inherit)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_root_item *root_item;
@@ -603,6 +613,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
int ret;
dev_t anon_dev;
u64 objectid;
+ u64 qgroup_reserved = 0;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
@@ -640,19 +651,24 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
trans_num_items, false);
if (ret)
goto out_new_inode_args;
+ qgroup_reserved = block_rsv.qgroup_rsv_reserved;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_subvolume_release_metadata(root, &block_rsv);
- goto out_new_inode_args;
+ goto out_release_rsv;
}
+ ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
+ if (ret)
+ goto out;
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+ qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
/* Tree log can't currently deal with an inode which is a new root. */
btrfs_set_log_full_commit(trans);
- ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit);
+ ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit);
if (ret)
goto out;
@@ -721,7 +737,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
free_extent_buffer(leaf);
leaf = NULL;
- new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
+ new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
@@ -757,9 +773,11 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
out:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- btrfs_subvolume_release_metadata(root, &block_rsv);
-
btrfs_end_transaction(trans);
+out_release_rsv:
+ btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
+ if (qgroup_reserved)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
@@ -776,11 +794,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct dentry *dentry, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
+ struct btrfs_block_rsv *block_rsv;
+ u64 qgroup_reserved = 0;
int ret;
/* We do not support snapshotting right now. */
@@ -790,6 +810,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
return -EOPNOTSUPP;
}
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return -ENOENT;
+
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
@@ -814,19 +837,19 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto free_pending;
}
- btrfs_init_block_rsv(&pending_snapshot->block_rsv,
- BTRFS_BLOCK_RSV_TEMP);
+ block_rsv = &pending_snapshot->block_rsv;
+ btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
* 1 to add dir item
* 1 to add dir index
* 1 to update parent inode item
*/
trans_num_items = create_subvol_num_items(inherit) + 3;
- ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
- &pending_snapshot->block_rsv,
+ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv,
trans_num_items, false);
if (ret)
goto free_pending;
+ qgroup_reserved = block_rsv->qgroup_rsv_reserved;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -839,6 +862,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
ret = PTR_ERR(trans);
goto fail;
}
+ ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto fail;
+ }
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+ qgroup_reserved = 0;
trans->pending_snapshot = pending_snapshot;
@@ -868,7 +898,9 @@ fail:
if (ret && pending_snapshot->snap)
pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
- btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
+ btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL);
+ if (qgroup_reserved)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
free_pending:
if (pending_snapshot->anon_dev)
free_anon_bdev(pending_snapshot->anon_dev);
@@ -907,7 +939,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
if (d_really_is_negative(victim))
return -ENOENT;
- BUG_ON(d_inode(victim->d_parent) != dir);
+ /* The @victim is not inside @dir. */
+ if (d_inode(victim->d_parent) != dir)
+ return -EINVAL;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
@@ -959,7 +993,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
struct btrfs_qgroup_inherit *inherit)
{
struct inode *dir = d_inode(parent->dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct dentry *dentry;
struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
int error;
@@ -1094,7 +1128,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
u64 new_size;
u64 old_size;
u64 devid = 1;
@@ -1125,7 +1159,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = PTR_ERR(vol_args);
goto out_drop;
}
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out_free;
+
sizestr = vol_args->name;
cancel = (strcmp("cancel", sizestr) == 0);
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
@@ -1290,6 +1327,15 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
* are limited to own subvolumes only
*/
ret = -EPERM;
+ } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * Snapshots must be made with the src_inode referring
+ * to the subvolume inode, otherwise the permission
+ * checking above is useless because we may have
+ * permission on a lower directory but not the subvol
+ * itself.
+ */
+ ret = -EINVAL;
} else {
ret = btrfs_mksnapshot(&file->f_path, idmap,
name, namelen,
@@ -1316,12 +1362,15 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out;
ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
vol_args->name, vol_args->fd, subvol,
false, NULL);
+out:
kfree(vol_args);
return ret;
}
@@ -1340,7 +1389,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
+ if (ret < 0)
+ goto free_args;
if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
ret = -EOPNOTSUPP;
@@ -1350,7 +1401,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
- u64 nums;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
if (vol_args->size < sizeof(*inherit) ||
vol_args->size > PAGE_SIZE) {
@@ -1363,19 +1414,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
goto free_args;
}
- if (inherit->num_qgroups > PAGE_SIZE ||
- inherit->num_ref_copies > PAGE_SIZE ||
- inherit->num_excl_copies > PAGE_SIZE) {
- ret = -EINVAL;
- goto free_inherit;
- }
-
- nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
- 2 * inherit->num_excl_copies;
- if (vol_args->size != struct_size(inherit, qgroups, nums)) {
- ret = -EINVAL;
+ ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size);
+ if (ret < 0)
goto free_inherit;
- }
}
ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
@@ -1393,7 +1434,7 @@ free_args:
static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
void __user *arg)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
u64 flags = 0;
@@ -1416,7 +1457,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
u64 root_flags;
@@ -1469,7 +1510,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
spin_unlock(&root->root_item_lock);
btrfs_warn(fs_info,
"Attempt to set subvolume %llu read-write during send",
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -EPERM;
goto out_drop_sem;
}
@@ -1528,7 +1569,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
@@ -1660,10 +1701,10 @@ out:
static noinline int search_ioctl(struct inode *inode,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf)
{
- struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *info = inode_to_fs_info(inode);
struct btrfs_root *root;
struct btrfs_key key;
struct btrfs_path *path;
@@ -1733,7 +1774,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
int ret;
- size_t buf_size;
+ u64 buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1763,8 +1804,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
int ret;
- size_t buf_size;
- const size_t buf_limit = SZ_16M;
+ u64 buf_size;
+ const u64 buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1878,7 +1919,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct super_block *sb = inode->i_sb;
struct btrfs_key upper_limit = BTRFS_I(inode)->location;
- u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
+ u64 treeid = btrfs_root_id(BTRFS_I(inode)->root);
u64 dirid = args->dirid;
unsigned long item_off;
unsigned long item_len;
@@ -2050,7 +2091,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
* path is reset so it's consistent with btrfs_search_path_in_tree.
*/
if (args->treeid == 0)
- args->treeid = root->root_key.objectid;
+ args->treeid = btrfs_root_id(root);
if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
args->name[0] = 0;
@@ -2146,7 +2187,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
fs_info = BTRFS_I(inode)->root->fs_info;
/* Get root_item of inode's subvolume */
- key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
root = btrfs_get_fs_root(fs_info, key.objectid, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
@@ -2261,7 +2302,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
return PTR_ERR(rootrefs);
}
- objectid = root->root_key.objectid;
+ objectid = btrfs_root_id(root);
key.objectid = objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = rootrefs->min_treeid;
@@ -2334,9 +2375,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
bool destroy_v2)
{
struct dentry *parent = file->f_path.dentry;
- struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
struct dentry *dentry;
struct inode *dir = d_inode(parent);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
@@ -2345,7 +2386,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct mnt_idmap *idmap = file_mnt_idmap(file);
char *subvol_name, *subvol_name_ptr = NULL;
int subvol_namelen;
- int err = 0;
+ int ret = 0;
bool destroy_parent = false;
/* We don't support snapshots with extent tree v2 yet. */
@@ -2361,7 +2402,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
return PTR_ERR(vol_args2);
if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
- err = -EOPNOTSUPP;
+ ret = -EOPNOTSUPP;
goto out;
}
@@ -2370,29 +2411,31 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* name, same as v1 currently does.
*/
if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
- vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0;
+ ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2);
+ if (ret < 0)
+ goto out;
subvol_name = vol_args2->name;
- err = mnt_want_write_file(file);
- if (err)
+ ret = mnt_want_write_file(file);
+ if (ret)
goto out;
} else {
struct inode *old_dir;
if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out;
}
- err = mnt_want_write_file(file);
- if (err)
+ ret = mnt_want_write_file(file);
+ if (ret)
goto out;
dentry = btrfs_get_dentry(fs_info->sb,
BTRFS_FIRST_FREE_OBJECTID,
vol_args2->subvolid, 0);
if (IS_ERR(dentry)) {
- err = PTR_ERR(dentry);
+ ret = PTR_ERR(dentry);
goto out_drop_write;
}
@@ -2412,7 +2455,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
dput(dentry);
if (IS_ERR(parent)) {
- err = PTR_ERR(parent);
+ ret = PTR_ERR(parent);
goto out_drop_write;
}
old_dir = dir;
@@ -2436,14 +2479,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* to delete without an idmapped mount.
*/
if (old_dir != dir && idmap != &nop_mnt_idmap) {
- err = -EOPNOTSUPP;
+ ret = -EOPNOTSUPP;
goto free_parent;
}
subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
fs_info, vol_args2->subvolid);
if (IS_ERR(subvol_name_ptr)) {
- err = PTR_ERR(subvol_name_ptr);
+ ret = PTR_ERR(subvol_name_ptr);
goto free_parent;
}
/* subvol_name_ptr is already nul terminated */
@@ -2454,11 +2497,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_PATH_NAME_MAX] = 0;
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out;
+
subvol_name = vol_args->name;
- err = mnt_want_write_file(file);
- if (err)
+ ret = mnt_want_write_file(file);
+ if (ret)
goto out;
}
@@ -2466,26 +2512,26 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (strchr(subvol_name, '/') ||
strncmp(subvol_name, "..", subvol_namelen) == 0) {
- err = -EINVAL;
+ ret = -EINVAL;
goto free_subvol_name;
}
if (!S_ISDIR(dir->i_mode)) {
- err = -ENOTDIR;
+ ret = -ENOTDIR;
goto free_subvol_name;
}
- err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (err == -EINTR)
+ ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+ if (ret == -EINTR)
goto free_subvol_name;
dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen);
if (IS_ERR(dentry)) {
- err = PTR_ERR(dentry);
+ ret = PTR_ERR(dentry);
goto out_unlock_dir;
}
if (d_really_is_negative(dentry)) {
- err = -ENOENT;
+ ret = -ENOENT;
goto out_dput;
}
@@ -2505,7 +2551,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* Users who want to delete empty subvols should try
* rmdir(2).
*/
- err = -EPERM;
+ ret = -EPERM;
if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
goto out_dput;
@@ -2516,29 +2562,29 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* of the subvol, not a random directory contained
* within it.
*/
- err = -EINVAL;
+ ret = -EINVAL;
if (root == dest)
goto out_dput;
- err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
- if (err)
+ ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
+ if (ret)
goto out_dput;
}
/* check if subvolume may be deleted by a user */
- err = btrfs_may_delete(idmap, dir, dentry, 1);
- if (err)
+ ret = btrfs_may_delete(idmap, dir, dentry, 1);
+ if (ret)
goto out_dput;
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out_dput;
}
btrfs_inode_lock(BTRFS_I(inode), 0);
- err = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
+ ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
btrfs_inode_unlock(BTRFS_I(inode), 0);
- if (!err)
+ if (!ret)
d_delete_notify(dir, dentry);
out_dput:
@@ -2555,7 +2601,7 @@ out_drop_write:
out:
kfree(vol_args2);
kfree(vol_args);
- return err;
+ return ret;
}
static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
@@ -2599,6 +2645,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EFAULT;
goto out;
}
+ if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
/* compression requires us to start the IO */
if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
@@ -2661,12 +2711,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
goto out;
}
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out_free;
+
ret = btrfs_init_new_device(fs_info, vol_args->name);
if (!ret)
btrfs_info(fs_info, "disk added %s", vol_args->name);
+out_free:
kfree(vol_args);
out:
if (restore_op)
@@ -2680,9 +2734,9 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_vol_args_v2 *vol_args;
- struct bdev_handle *bdev_handle = NULL;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2698,7 +2752,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto out;
}
- vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
+ if (ret < 0)
+ goto out;
+
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
args.devid = vol_args->devid;
} else if (!strcmp("cancel", vol_args->name)) {
@@ -2719,7 +2776,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto err_drop;
/* Exclusive operation is now claimed */
- ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
btrfs_exclop_finish(fs_info);
@@ -2733,8 +2790,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
err_drop:
mnt_drop_write_file(file);
- if (bdev_handle)
- bdev_release(bdev_handle);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2745,9 +2802,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_vol_args *vol_args;
- struct bdev_handle *bdev_handle = NULL;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2758,7 +2815,10 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out_free;
+
if (!strcmp("cancel", vol_args->name)) {
cancel = true;
} else {
@@ -2774,17 +2834,18 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
mnt_drop_write_file(file);
- if (bdev_handle)
- bdev_release(bdev_handle);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
+out_free:
kfree(vol_args);
return ret;
}
@@ -2888,7 +2949,7 @@ out:
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
@@ -2920,7 +2981,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(new_root->root_key.objectid)) {
+ if (!is_fstree(btrfs_root_id(new_root))) {
ret = -ENOENT;
goto out_free;
}
@@ -3162,7 +3223,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
struct btrfs_ioctl_scrub_args *sa;
int ret;
@@ -3680,7 +3741,7 @@ out:
static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_quota_ctl_args *sa;
int ret;
@@ -3697,15 +3758,43 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
goto drop_write;
}
- down_write(&fs_info->subvol_sem);
-
switch (sa->cmd) {
case BTRFS_QUOTA_CTL_ENABLE:
case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA:
+ down_write(&fs_info->subvol_sem);
ret = btrfs_quota_enable(fs_info, sa);
+ up_write(&fs_info->subvol_sem);
break;
case BTRFS_QUOTA_CTL_DISABLE:
+ /*
+ * Lock the cleaner mutex to prevent races with concurrent
+ * relocation, because relocation may be building backrefs for
+ * blocks of the quota root while we are deleting the root. This
+ * is like dropping fs roots of deleted snapshots/subvolumes, we
+ * need the same protection.
+ *
+ * This also prevents races between concurrent tasks trying to
+ * disable quotas, because we will unlock and relock
+ * qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ *
+ * We take this here because we have the dependency of
+ *
+ * inode_lock -> subvol_sem
+ *
+ * because of rename. With relocation we can prealloc extents,
+ * so that makes the dependency chain
+ *
+ * cleaner_mutex -> inode_lock -> subvol_sem
+ *
+ * so we must take the cleaner_mutex here before we take the
+ * subvol_sem. The deadlock can't actually happen, but this
+ * quiets lockdep.
+ */
+ mutex_lock(&fs_info->cleaner_mutex);
+ down_write(&fs_info->subvol_sem);
ret = btrfs_quota_disable(fs_info);
+ up_write(&fs_info->subvol_sem);
+ mutex_unlock(&fs_info->cleaner_mutex);
break;
default:
ret = -EINVAL;
@@ -3713,7 +3802,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
}
kfree(sa);
- up_write(&fs_info->subvol_sem);
drop_write:
mnt_drop_write_file(file);
return ret;
@@ -3722,7 +3810,7 @@ drop_write:
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_qgroup_assign_args *sa;
struct btrfs_trans_handle *trans;
@@ -3799,6 +3887,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
+ if (sa->create && is_fstree(sa->qgroupid)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3854,7 +3947,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
qgroupid = sa->qgroupid;
if (!qgroupid) {
/* take the current subvol as qgroup */
- qgroupid = root->root_key.objectid;
+ qgroupid = btrfs_root_id(root);
}
ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim);
@@ -3873,7 +3966,7 @@ drop_write:
static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_quota_rescan_args *qsa;
int ret;
@@ -3937,7 +4030,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
struct btrfs_ioctl_received_subvol_args *sa)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
struct btrfs_trans_handle *trans;
@@ -3985,7 +4078,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
!btrfs_is_empty_uuid(root_item->received_uuid)) {
ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- root->root_key.objectid);
+ btrfs_root_id(root));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -4009,7 +4102,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
ret = btrfs_uuid_tree_add(trans, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- root->root_key.objectid);
+ btrfs_root_id(root));
if (ret < 0 && ret != -EEXIST) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -4125,7 +4218,7 @@ static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_trans_handle *trans;
@@ -4268,7 +4361,7 @@ check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \
static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_ioctl_feature_flags flags[2];
@@ -4356,6 +4449,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat
arg->clone_sources = compat_ptr(args32.clone_sources);
arg->parent_root = args32.parent_root;
arg->flags = args32.flags;
+ arg->version = args32.version;
memcpy(arg->reserved, args32.reserved,
sizeof(args32.reserved));
#else
@@ -4523,29 +4617,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
if (ret < 0)
goto out_acct;
- file_start_write(file);
-
if (iov_iter_count(&iter) == 0) {
ret = 0;
- goto out_end_write;
+ goto out_iov;
}
pos = args.offset;
ret = rw_verify_area(WRITE, file, &pos, args.len);
if (ret < 0)
- goto out_end_write;
+ goto out_iov;
init_sync_kiocb(&kiocb, file);
ret = kiocb_set_rw_flags(&kiocb, 0);
if (ret)
- goto out_end_write;
+ goto out_iov;
kiocb.ki_pos = pos;
+ file_start_write(file);
+
ret = btrfs_do_write_iter(&kiocb, &iter, &args);
if (ret > 0)
fsnotify_modify(file);
-out_end_write:
file_end_write(file);
+out_iov:
kfree(iov);
out_acct:
if (ret > 0)
@@ -4558,7 +4652,7 @@ long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
void __user *argp = (void __user *)arg;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index d51b9a2f2f6e..2c5dc25ec670 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -3,6 +3,15 @@
#ifndef BTRFS_IOCTL_H
#define BTRFS_IOCTL_H
+#include <linux/types.h>
+
+struct file;
+struct dentry;
+struct mnt_idmap;
+struct fileattr;
+struct btrfs_fs_info;
+struct btrfs_ioctl_balance_args;
+
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 74d8e2003f58..6a0b7abb5bd9 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -13,7 +13,6 @@
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
-#include "accessors.h"
/*
* Lockdep class keys for extent_buffer->lock's in this root. For a given
@@ -85,7 +84,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int
{
struct btrfs_lockdep_keyset *ks;
- BUG_ON(level >= ARRAY_SIZE(ks->keys));
+ ASSERT(level < ARRAY_SIZE(ks->keys));
/* Find the matching keyset, id 0 is the default entry */
for (ks = btrfs_lockdep_keysets; ks->id; ks++)
@@ -98,7 +97,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int
void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
- btrfs_set_buffer_lockdep_class(root->root_key.objectid,
+ btrfs_set_buffer_lockdep_class(btrfs_root_id(root),
eb, btrfs_header_level(eb));
}
@@ -130,14 +129,14 @@ static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
*/
/*
- * __btrfs_tree_read_lock - lock extent buffer for read
+ * btrfs_tree_read_lock_nested - lock extent buffer for read
* @eb: the eb to be locked
* @nest: the nesting level to be used for lockdep
*
* This takes the read lock on the extent buffer, using the specified nesting
* level for lockdep purposes.
*/
-void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
u64 start_ns = 0;
@@ -148,11 +147,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne
trace_btrfs_tree_read_lock(eb, start_ns);
}
-void btrfs_tree_read_lock(struct extent_buffer *eb)
-{
- __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
-}
-
/*
* Try-lock for read.
*
@@ -199,7 +193,7 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
*
* Returns with the eb->lock write locked.
*/
-void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
{
u64 start_ns = 0;
@@ -212,11 +206,6 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
trace_btrfs_tree_lock(eb, start_ns);
}
-void btrfs_tree_lock(struct extent_buffer *eb)
-{
- __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
-}
-
/*
* Release the write lock.
*/
@@ -375,8 +364,12 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
- atomic_dec(&lock->writers);
- cond_wake_up(&lock->pending_readers);
+ /*
+ * atomic_dec_and_test() implies a full barrier, so woken up readers are
+ * guaranteed to see the decrement.
+ */
+ if (atomic_dec_and_test(&lock->writers))
+ wake_up(&lock->pending_readers);
}
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 7d6ee1e609bf..1bc8e6738879 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -8,8 +8,14 @@
#include <linux/atomic.h>
#include <linux/wait.h>
+#include <linux/lockdep.h>
#include <linux/percpu_counter.h>
#include "extent_io.h"
+#include "locking.h"
+
+struct extent_buffer;
+struct btrfs_path;
+struct btrfs_root;
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
@@ -157,14 +163,22 @@ enum btrfs_lockdep_trans_states {
static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
"too many lock subclasses defined");
-struct btrfs_path;
+void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
+
+static inline void btrfs_tree_lock(struct extent_buffer *eb)
+{
+ btrfs_tree_lock_nested(eb, BTRFS_NESTING_NORMAL);
+}
-void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
-void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
-void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
-void btrfs_tree_read_lock(struct extent_buffer *eb);
+void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
+
+static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+ btrfs_tree_read_lock_nested(eb, BTRFS_NESTING_NORMAL);
+}
+
void btrfs_tree_read_unlock(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c
index 0fe0ae54ac67..fd88af17d8d9 100644
--- a/fs/btrfs/lru_cache.c
+++ b/fs/btrfs/lru_cache.c
@@ -9,7 +9,7 @@
*
* @cache: The cache.
* @max_size: Maximum size (number of entries) for the cache.
- * Use 0 for unlimited size, it's the user's responsability to
+ * Use 0 for unlimited size, it's the user's responsibility to
* trim the cache in that case.
*/
void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size)
diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h
index 00328c856be6..e32906ab6faa 100644
--- a/fs/btrfs/lru_cache.h
+++ b/fs/btrfs/lru_cache.h
@@ -3,8 +3,10 @@
#ifndef BTRFS_LRU_CACHE_H
#define BTRFS_LRU_CACHE_H
+#include <linux/types.h>
#include <linux/maple_tree.h>
#include <linux/list.h>
+#include "lru_cache.h"
/*
* A cache entry. This is meant to be embedded in a structure of a user of
@@ -50,11 +52,6 @@ struct btrfs_lru_cache {
#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \
list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list)
-static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache)
-{
- return cache->size;
-}
-
static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry(
struct btrfs_lru_cache *cache)
{
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d3fcfc628a4f..1c396ac167aa 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -130,17 +130,17 @@ static inline size_t read_compress_length(const char *buf)
*/
static int copy_compressed_data_to_page(char *compressed_data,
size_t compressed_size,
- struct page **out_pages,
- unsigned long max_nr_page,
+ struct folio **out_folios,
+ unsigned long max_nr_folio,
u32 *cur_out,
const u32 sectorsize)
{
u32 sector_bytes_left;
u32 orig_out;
- struct page *cur_page;
+ struct folio *cur_folio;
char *kaddr;
- if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
return -E2BIG;
/*
@@ -149,16 +149,16 @@ static int copy_compressed_data_to_page(char *compressed_data,
*/
ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
- cur_page = out_pages[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out / PAGE_SIZE];
/* Allocate a new page */
- if (!cur_page) {
- cur_page = alloc_page(GFP_NOFS);
- if (!cur_page)
+ if (!cur_folio) {
+ cur_folio = btrfs_alloc_compr_folio();
+ if (!cur_folio)
return -ENOMEM;
- out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ out_folios[*cur_out / PAGE_SIZE] = cur_folio;
}
- kaddr = kmap_local_page(cur_page);
+ kaddr = kmap_local_folio(cur_folio, 0);
write_compress_length(kaddr + offset_in_page(*cur_out),
compressed_size);
*cur_out += LZO_LEN;
@@ -172,18 +172,18 @@ static int copy_compressed_data_to_page(char *compressed_data,
kunmap_local(kaddr);
- if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
return -E2BIG;
- cur_page = out_pages[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out / PAGE_SIZE];
/* Allocate a new page */
- if (!cur_page) {
- cur_page = alloc_page(GFP_NOFS);
- if (!cur_page)
+ if (!cur_folio) {
+ cur_folio = btrfs_alloc_compr_folio();
+ if (!cur_folio)
return -ENOMEM;
- out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ out_folios[*cur_out / PAGE_SIZE] = cur_folio;
}
- kaddr = kmap_local_page(cur_page);
+ kaddr = kmap_local_folio(cur_folio, 0);
memcpy(kaddr + offset_in_page(*cur_out),
compressed_data + *cur_out - orig_out, copy_len);
@@ -209,15 +209,15 @@ out:
return 0;
}
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out)
+int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
- struct page *page_in = NULL;
+ const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize;
+ struct folio *folio_in = NULL;
char *sizes_ptr;
- const unsigned long max_nr_page = *out_pages;
+ const unsigned long max_nr_folio = *out_folios;
int ret = 0;
/* Points to the file offset of input data */
u64 cur_in = start;
@@ -225,8 +225,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u32 cur_out = 0;
u32 len = *total_out;
- ASSERT(max_nr_page > 0);
- *out_pages = 0;
+ ASSERT(max_nr_folio > 0);
+ *out_folios = 0;
*total_out = 0;
*total_in = 0;
@@ -243,15 +243,16 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
size_t out_len;
/* Get the input page first */
- if (!page_in) {
- page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
- ASSERT(page_in);
+ if (!folio_in) {
+ ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in);
+ if (ret < 0)
+ goto out;
}
/* Compress at most one sector of data each time */
in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
ASSERT(in_len);
- data_in = kmap_local_page(page_in);
+ data_in = kmap_local_folio(folio_in, 0);
ret = lzo1x_1_compress(data_in +
offset_in_page(cur_in), in_len,
workspace->cbuf, &out_len,
@@ -264,7 +265,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
- pages, max_nr_page,
+ folios, max_nr_folio,
&cur_out, sectorsize);
if (ret < 0)
goto out;
@@ -282,13 +283,13 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we have reached page boundary */
if (PAGE_ALIGNED(cur_in)) {
- put_page(page_in);
- page_in = NULL;
+ folio_put(folio_in);
+ folio_in = NULL;
}
}
/* Store the size of all chunks of compressed data */
- sizes_ptr = kmap_local_page(pages[0]);
+ sizes_ptr = kmap_local_folio(folios[0], 0);
write_compress_length(sizes_ptr, cur_out);
kunmap_local(sizes_ptr);
@@ -296,9 +297,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_out = cur_out;
*total_in = cur_in - start;
out:
- if (page_in)
- put_page(page_in);
- *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
+ if (folio_in)
+ folio_put(folio_in);
+ *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE);
return ret;
}
@@ -313,15 +314,15 @@ static void copy_compressed_segment(struct compressed_bio *cb,
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- struct page *cur_page;
+ struct folio *cur_folio;
u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
orig_in + len - *cur_in);
ASSERT(copy_len);
- cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+ cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE];
- memcpy_from_page(dest + *cur_in - orig_in, cur_page,
- offset_in_page(*cur_in), copy_len);
+ memcpy_from_folio(dest + *cur_in - orig_in, cur_folio,
+ offset_in_folio(cur_folio, *cur_in), copy_len);
*cur_in += copy_len;
}
@@ -341,7 +342,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Bytes decompressed so far */
u32 cur_out = 0;
- kaddr = kmap_local_page(cb->compressed_pages[0]);
+ kaddr = kmap_local_folio(cb->compressed_folios[0], 0);
len_in = read_compress_length(kaddr);
kunmap_local(kaddr);
cur_in += LZO_LEN;
@@ -363,7 +364,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Go through each lzo segment */
while (cur_in < len_in) {
- struct page *cur_page;
+ struct folio *cur_folio;
/* Length of the compressed segment */
u32 seg_len;
u32 sector_bytes_left;
@@ -375,9 +376,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
*/
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
- cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
- ASSERT(cur_page);
- kaddr = kmap_local_page(cur_page);
+ cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE];
+ ASSERT(cur_folio);
+ kaddr = kmap_local_folio(cur_folio, 0);
seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
kunmap_local(kaddr);
cur_in += LZO_LEN;
@@ -425,16 +426,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page);
+ const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
- char *kaddr;
- unsigned long bytes;
if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
return -EUCLEAN;
@@ -451,7 +452,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
}
data_in += LZO_LEN;
- out_len = PAGE_SIZE;
+ out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
pr_warn("BTRFS: decompress failed!\n");
@@ -459,29 +460,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
goto out;
}
- if (out_len < start_byte) {
+ ASSERT(out_len <= sectorsize);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len);
+ /* Early end, considered as an error. */
+ if (unlikely(out_len < destlen)) {
ret = -EIO;
- goto out;
+ memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len);
}
-
- /*
- * the caller is already checking against PAGE_SIZE, but lets
- * move this check closer to the memcpy/memset
- */
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes = min_t(unsigned long, destlen, out_len - start_byte);
-
- kaddr = kmap_local_page(dest_page);
- memcpy(kaddr, workspace->buf + start_byte, bytes);
-
- /*
- * btrfs_getblock is doing a zero on the tail of the page too,
- * but this will cover anything missing from the decompressed
- * data.
- */
- if (bytes < destlen)
- memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index b8f9c9e56c8c..210d9c82e2ae 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -3,13 +3,11 @@
#include "fs.h"
#include "messages.h"
#include "discard.h"
-#include "transaction.h"
-#include "space-info.h"
#include "super.h"
#ifdef CONFIG_PRINTK
-#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_PREFACE " state "
#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1)
/*
@@ -287,7 +285,7 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
* panic or BUGs, depending on mount options.
*/
__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int error, const char *fmt, ...)
{
char *s_id = "<unknown>";
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 4d04c1fa5899..08a9272399d2 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -194,7 +194,7 @@ const char * __attribute_const__ btrfs_decode_error(int error);
__printf(5, 6)
__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int error, const char *fmt, ...);
/*
* If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 40f2d9f1a17a..dde4904aead9 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_MISC_H
#define BTRFS_MISC_H
+#include <linux/types.h>
+#include <linux/bitmap.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/math64.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 574e8a55e24a..c5bdd674f55c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -19,7 +19,6 @@
#include "qgroup.h"
#include "subpage.h"
#include "file.h"
-#include "super.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -152,11 +151,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
{
struct btrfs_ordered_extent *entry;
int ret;
+ u64 qgroup_rsv = 0;
if (flags &
((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
/* For nocow write, we can release the qgroup rsv right now */
- ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
+ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
if (ret < 0)
return ERR_PTR(ret);
} else {
@@ -164,7 +164,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
* The ordered extent has reserved qgroup space, release now
* and pass the reserved number for qgroup_record to free.
*/
- ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
+ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
if (ret < 0)
return ERR_PTR(ret);
}
@@ -182,7 +182,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
- entry->qgroup_rsv = ret;
+ entry->qgroup_rsv = qgroup_rsv;
entry->flags = flags;
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
@@ -294,6 +294,12 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
spin_unlock_irq(&inode->ordered_tree_lock);
}
+void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
+{
+ if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ mapping_set_error(ordered->inode->i_mapping, -EIO);
+}
+
static void finish_ordered_fn(struct btrfs_work *work)
{
struct btrfs_ordered_extent *ordered_extent;
@@ -322,16 +328,17 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
*
* If there's no such bit, we need to skip to next range.
*/
- if (!btrfs_page_test_ordered(fs_info, page, file_offset, len))
+ if (!btrfs_folio_test_ordered(fs_info, page_folio(page),
+ file_offset, len))
return false;
- btrfs_page_clear_ordered(fs_info, page, file_offset, len);
+ btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len);
}
/* Now we're fine to update the accounting. */
if (WARN_ON_ONCE(len > ordered->bytes_left)) {
btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
- inode->root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(inode->root), btrfs_ino(inode),
ordered->file_offset, ordered->num_bytes,
len, ordered->bytes_left);
ordered->bytes_left = 0;
@@ -599,7 +606,9 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
release = entry->disk_num_bytes;
else
release = entry->num_bytes;
- btrfs_delalloc_release_metadata(btrfs_inode, release, false);
+ btrfs_delalloc_release_metadata(btrfs_inode, release,
+ test_bit(BTRFS_ORDERED_IOERR,
+ &entry->flags));
}
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
@@ -1185,6 +1194,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
ordered->disk_bytenr += len;
ordered->num_bytes -= len;
ordered->disk_num_bytes -= len;
+ ordered->ram_bytes -= len;
if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
ASSERT(ordered->bytes_left == 0);
@@ -1232,10 +1242,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
int __init ordered_data_init(void)
{
- btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
- sizeof(struct btrfs_ordered_extent), 0,
- SLAB_MEM_SPREAD,
- NULL);
+ btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0);
if (!btrfs_ordered_extent_cache)
return -ENOMEM;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 567a6d3d4712..b6f6c6b91732 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -6,6 +6,21 @@
#ifndef BTRFS_ORDERED_DATA_H
#define BTRFS_ORDERED_DATA_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/refcount.h>
+#include <linux/completion.h>
+#include <linux/rbtree.h>
+#include <linux/wait.h>
+#include "async-thread.h"
+
+struct inode;
+struct page;
+struct extent_state;
+struct btrfs_inode;
+struct btrfs_root;
+struct btrfs_fs_info;
+
struct btrfs_ordered_sum {
/*
* Logical start address and length for of the blocks covered by
@@ -97,13 +112,6 @@ struct btrfs_ordered_extent {
u64 bytes_left;
/*
- * the end of the ordered extent which is behind it but
- * didn't update disk_i_size. Please see the comment of
- * btrfs_ordered_update_i_size();
- */
- u64 outstanding_isize;
-
- /*
* If we get truncated we need to adjust the file extent we enter for
* this ordered extent so that we do not expose stale data.
*/
@@ -195,6 +203,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
struct btrfs_ordered_extent *ordered, u64 len);
+void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 7a1b021b5669..6195a2215b8f 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -4,7 +4,6 @@
*/
#include "ctree.h"
-#include "disk-io.h"
#include "orphan.h"
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h
index 3faab5cbb59a..aa54a88a60de 100644
--- a/fs/btrfs/orphan.h
+++ b/fs/btrfs/orphan.h
@@ -3,6 +3,11 @@
#ifndef BTRFS_ORPHAN_H
#define BTRFS_ORPHAN_H
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_root;
+
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index c42bc666d5ee..8504bf1702c7 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -9,6 +9,9 @@
/* Buffer size to contain tree name and possibly additional data (offset) */
#define BTRFS_ROOT_NAME_BUF_LEN 48
+struct extent_buffer;
+struct btrfs_key;
+
void btrfs_print_leaf(const struct extent_buffer *l);
void btrfs_print_tree(const struct extent_buffer *c, bool follow);
const char *btrfs_root_name(const struct btrfs_key *key, char *buf);
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index f9bf591a0718..155570e20f45 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -4,6 +4,7 @@
*/
#include <linux/hashtable.h>
+#include <linux/xattr.h>
#include "messages.h"
#include "props.h"
#include "btrfs_inode.h"
@@ -267,7 +268,7 @@ static void inode_prop_iterator(void *ctx,
btrfs_warn(root->fs_info,
"error applying prop %s to ino %llu (root %llu): %d",
handler->xattr_name, btrfs_ino(BTRFS_I(inode)),
- root->root_key.objectid, ret);
+ btrfs_root_id(root), ret);
else
set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
}
@@ -302,7 +303,7 @@ static int prop_compression_validate(const struct btrfs_inode *inode,
static int prop_compression_apply(struct inode *inode, const char *value,
size_t len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
int type;
/* Reset to defaults */
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 6e283196e38a..f60cd89feb29 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -6,7 +6,12 @@
#ifndef BTRFS_PROPS_H
#define BTRFS_PROPS_H
-#include "ctree.h"
+#include <linux/compiler_types.h>
+
+struct inode;
+struct btrfs_inode;
+struct btrfs_path;
+struct btrfs_trans_handle;
int __init btrfs_props_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index edb84cc03237..eb28141d5c37 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -194,7 +194,7 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
*
* Must be called with qgroup_lock held and @prealloc preallocated.
*
- * The control on the lifespan of @prealloc would be transfered to this
+ * The control on the lifespan of @prealloc would be transferred to this
* function, thus caller should no longer touch @prealloc.
*/
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -1324,7 +1324,7 @@ static int flush_reservations(struct btrfs_fs_info *fs_info)
trans = btrfs_join_transaction(fs_info->tree_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
return ret;
}
@@ -1342,16 +1342,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
lockdep_assert_held_write(&fs_info->subvol_sem);
/*
- * Lock the cleaner mutex to prevent races with concurrent relocation,
- * because relocation may be building backrefs for blocks of the quota
- * root while we are deleting the root. This is like dropping fs roots
- * of deleted snapshots/subvolumes, we need the same protection.
- *
- * This also prevents races between concurrent tasks trying to disable
- * quotas, because we will unlock and relock qgroup_ioctl_lock across
- * BTRFS_FS_QUOTA_ENABLED changes.
+ * Relocation will mess with backrefs, so make sure we have the
+ * cleaner_mutex held to protect us from relocate.
*/
- mutex_lock(&fs_info->cleaner_mutex);
+ lockdep_assert_held(&fs_info->cleaner_mutex);
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
@@ -1373,9 +1367,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
+ /*
+ * We have nothing held here and no trans handle, just return the error
+ * if there is one.
+ */
ret = flush_reservations(fs_info);
if (ret)
- goto out_unlock_cleaner;
+ return ret;
/*
* 1 For the root item
@@ -1439,9 +1437,6 @@ out:
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_commit_transaction(trans);
-out_unlock_cleaner:
- mutex_unlock(&fs_info->cleaner_mutex);
-
return ret;
}
@@ -1541,18 +1536,15 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info,
{
struct btrfs_qgroup *qgroup;
int ret = 1;
- int err = 0;
qgroup = find_qgroup_rb(fs_info, src);
if (!qgroup)
goto out;
if (qgroup->excl == qgroup->rfer) {
- ret = 0;
- err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
- if (err < 0) {
- ret = err;
+ ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
+ if (ret < 0)
goto out;
- }
+ ret = 0;
}
out:
if (ret)
@@ -1736,6 +1728,15 @@ out:
return ret;
}
+static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+{
+ return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
+ qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+}
+
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1755,6 +1756,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* Check if there are no children of this qgroup */
if (!list_empty(&qgroup->members)) {
ret = -EBUSY;
@@ -1888,7 +1894,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
u64 bytenr = record->bytenr;
if (!btrfs_qgroup_full_accounting(fs_info))
- return 0;
+ return 1;
lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
@@ -2491,8 +2497,8 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
- BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
- BUG_ON(root_eb == NULL);
+ ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL);
+ ASSERT(root_eb != NULL);
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
@@ -2847,8 +2853,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (nr_old_roots == 0 && nr_new_roots == 0)
goto out_free;
- BUG_ON(!fs_info->quota_root);
-
trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
num_bytes, nr_old_roots, nr_new_roots);
@@ -2875,12 +2879,18 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
num_bytes, seq);
/*
+ * We're done using the iterator, release all its qgroups while holding
+ * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup()
+ * and trigger use-after-free accesses to qgroups.
+ */
+ qgroup_iterator_nested_clean(&qgroups);
+
+ /*
* Bump qgroup_seq to avoid seq overlap
*/
fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
spin_unlock(&fs_info->qgroup_lock);
out_free:
- qgroup_iterator_nested_clean(&qgroups);
ulist_free(old_roots);
ulist_free(new_roots);
return ret;
@@ -2939,11 +2949,6 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
ctx.roots = NULL;
}
- /* Free the reserved data space */
- btrfs_qgroup_free_refroot(fs_info,
- record->data_rsv_refroot,
- record->data_rsv,
- BTRFS_QGROUP_RSV_DATA);
/*
* Use BTRFS_SEQ_LAST as time_seq to do special search,
* which doesn't lock tree or delayed_refs and search
@@ -2967,6 +2972,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
record->old_roots = NULL;
new_roots = NULL;
}
+ /* Free the reserved data space */
+ btrfs_qgroup_free_refroot(fs_info,
+ record->data_rsv_refroot,
+ record->data_rsv,
+ BTRFS_QGROUP_RSV_DATA);
cleanup:
ulist_free(record->old_roots);
ulist_free(new_roots);
@@ -3028,6 +3038,56 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
return ret;
}
+/*
+ * Validate a btrfs_qgroup_inherit structure before it is used.
+ *
+ * @fs_info: filesystem the request applies to
+ * @inherit: structure to validate
+ * @size:    size in bytes claimed for @inherit; it must cover the fixed
+ *           header plus exactly inherit->num_qgroups trailing qgroup ids
+ *
+ * Return 0 if qgroups are not enabled or if @inherit passes every check.
+ * Return -EINVAL for a bad @size, for any rfer/excl copy request, or for a
+ * level 0 qgroup id in the list.
+ * Return -EOPNOTSUPP if unsupported flags are set.
+ * Return -ENOENT if a listed qgroup does not exist.
+ */
+int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
+			       struct btrfs_qgroup_inherit *inherit,
+			       size_t size)
+{
+	if (!btrfs_qgroup_enabled(fs_info))
+		return 0;
+
+	/*
+	 * Check @size before touching any member of @inherit: a buffer
+	 * shorter than the fixed header must be rejected without being read.
+	 */
+	if (size < sizeof(*inherit) || size > PAGE_SIZE)
+		return -EINVAL;
+	if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
+		return -EOPNOTSUPP;
+
+	/*
+	 * In the past we allowed btrfs_qgroup_inherit to request copying
+	 * rfer/excl numbers directly from other qgroups. This behavior has
+	 * been disabled in userspace for a very long time, but here we should
+	 * also disable it in kernel, as this behavior is known to mark qgroup
+	 * inconsistent, and a rescan would wipe out the changes anyway.
+	 *
+	 * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies.
+	 */
+	if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0)
+		return -EINVAL;
+
+	/* The trailing array must match num_qgroups exactly. */
+	if (size != struct_size(inherit, qgroups, inherit->num_qgroups))
+		return -EINVAL;
+
+	/*
+	 * Now check all the remaining qgroups, they should all:
+	 *
+	 * - Exist
+	 * - Be higher level qgroups.
+	 */
+	for (int i = 0; i < inherit->num_qgroups; i++) {
+		struct btrfs_qgroup *qgroup;
+		u64 qgroupid = inherit->qgroups[i];
+
+		if (btrfs_qgroup_level(qgroupid) == 0)
+			return -EINVAL;
+
+		/* Only existence matters, so drop the lock before deciding. */
+		spin_lock(&fs_info->qgroup_lock);
+		qgroup = find_qgroup_rb(fs_info, qgroupid);
+		spin_unlock(&fs_info->qgroup_lock);
+		if (!qgroup)
+			return -ENOENT;
+	}
+	return 0;
+}
+
static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
u64 inode_rootid,
struct btrfs_qgroup_inherit **inherit)
@@ -3063,13 +3123,69 @@ static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
qgids = res->qgroups;
list_for_each_entry(qg_list, &inode_qg->groups, next_group)
- qgids[i] = qg_list->group->qgroupid;
+ qgids[i++] = qg_list->group->qgroupid;
*inherit = res;
return 0;
}
/*
+ * Check if we can skip rescan when inheriting qgroups. If @src has a single
+ * @parent, and that @parent is owning all its bytes exclusively, we can skip
+ * the full rescan, by just adding nodesize to the @parent's excl/rfer.
+ *
+ * Return <0 for fatal errors (like srcid/parentid has no qgroup).
+ * Return 0 if a quick inherit is done.
+ * Return >0 if a quick inherit is not possible, and a full rescan is needed.
+ */
+static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info,
+					 u64 srcid, u64 parentid)
+{
+	struct btrfs_qgroup *src_qg;
+	struct btrfs_qgroup *parent_qg;
+	struct btrfs_qgroup_list *entry;
+	bool seen_parent = false;
+
+	/* Both qgroups must exist, otherwise this is a fatal error. */
+	src_qg = find_qgroup_rb(fs_info, srcid);
+	if (!src_qg)
+		return -ENOENT;
+	parent_qg = find_qgroup_rb(fs_info, parentid);
+	if (!parent_qg)
+		return -ENOENT;
+
+	list_for_each_entry(entry, &src_qg->groups, next_group) {
+		/* A different parent rules out the quick update. */
+		if (entry->group->qgroupid != parentid)
+			return 1;
+		/*
+		 * A second parent entry means accounting consistency can't
+		 * be guaranteed.
+		 */
+		if (seen_parent)
+			return 1;
+		seen_parent = true;
+	}
+
+	/*
+	 * The source has no parent qgroup at all, but the new qgroup would
+	 * have one, so the numbers would become inconsistent.
+	 */
+	if (!seen_parent)
+		return 1;
+
+	/*
+	 * If the parent does not exclusively own all of its bytes, we can't
+	 * tell whether the source has bytes not fully owned by the parent.
+	 */
+	if (parent_qg->rfer != parent_qg->excl)
+		return 1;
+
+	/* Quick inherit: account one nodesize to the parent. */
+	parent_qg->rfer += fs_info->nodesize;
+	parent_qg->excl += fs_info->nodesize;
+	return 0;
+}
+
+/*
* Copy the accounting information between qgroups. This is necessary
* when a snapshot or a subvolume is created. Throwing an error will
* cause a transaction abort so we take extra care here to only error
@@ -3237,6 +3353,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
qgroup_dirty(fs_info, dstgroup);
qgroup_dirty(fs_info, srcgroup);
+
+ /*
+ * If the source qgroup has parent but the new one doesn't,
+ * we need a full rescan.
+ */
+ if (!inherit && !list_empty(&srcgroup->groups))
+ need_rescan = true;
}
if (!inherit)
@@ -3251,14 +3374,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (ret)
goto unlock;
}
+ if (srcid) {
+ /* Check if we can do a quick inherit. */
+ ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups);
+ if (ret < 0)
+ goto unlock;
+ if (ret > 0)
+ need_rescan = true;
+ ret = 0;
+ }
++i_qgroups;
-
- /*
- * If we're doing a snapshot, and adding the snapshot to a new
- * qgroup, the numbers are guaranteed to be incorrect.
- */
- if (srcid)
- need_rescan = true;
}
for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
@@ -3340,7 +3465,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
{
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 ref_root = root->root_key.objectid;
+ u64 ref_root = btrfs_root_id(root);
int ret = 0;
LIST_HEAD(qgroup_list);
@@ -3575,7 +3700,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- int err = -ENOMEM;
int ret = 0;
bool stopped = false;
bool did_leaf_rescans = false;
@@ -3584,8 +3708,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
return;
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ ret = -ENOMEM;
goto out;
+ }
/*
* Rescan should only search for commit root, and any later difference
* should be recorded by qgroup
@@ -3593,18 +3719,17 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path->search_commit_root = 1;
path->skip_locking = 1;
- err = 0;
- while (!err && !(stopped = rescan_should_stop(fs_info))) {
+ while (!ret && !(stopped = rescan_should_stop(fs_info))) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
break;
}
- err = qgroup_rescan_leaf(trans, path);
+ ret = qgroup_rescan_leaf(trans, path);
did_leaf_rescans = true;
- if (err > 0)
+ if (ret > 0)
btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
@@ -3614,10 +3739,10 @@ out:
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (err > 0 &&
+ if (ret > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- } else if (err < 0 || stopped) {
+ } else if (ret < 0 || stopped) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3632,11 +3757,11 @@ out:
if (did_leaf_rescans) {
trans = btrfs_start_transaction(fs_info->quota_root, 1);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
btrfs_err(fs_info,
"fail to start transaction for status update: %d",
- err);
+ ret);
}
} else {
trans = NULL;
@@ -3647,11 +3772,11 @@ out:
fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (trans) {
- ret = update_qgroup_status_item(trans);
- if (ret < 0) {
- err = ret;
- btrfs_err(fs_info, "fail to update qgroup status: %d",
- err);
+ int ret2 = update_qgroup_status_item(trans);
+
+ if (ret2 < 0) {
+ ret = ret2;
+ btrfs_err(fs_info, "fail to update qgroup status: %d", ret);
}
}
fs_info->qgroup_rescan_running = false;
@@ -3668,11 +3793,11 @@ out:
btrfs_info(fs_info, "qgroup scan paused");
} else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
btrfs_info(fs_info, "qgroup scan cancelled");
- } else if (err >= 0) {
+ } else if (ret >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
- err > 0 ? " (inconsistency flag cleared)" : "");
+ ret > 0 ? " (inconsistency flag cleared)" : "");
} else {
- btrfs_err(fs_info, "qgroup scan failed with %d", err);
+ btrfs_err(fs_info, "qgroup scan failed with %d", ret);
}
}
@@ -3981,7 +4106,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(root->root_key.objectid) || len == 0)
+ !is_fstree(btrfs_root_id(root)) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -4051,13 +4176,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
/* Free ranges specified by @reserved, normally in error path */
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len)
+ struct extent_changeset *reserved,
+ u64 start, u64 len, u64 *freed_ret)
{
struct btrfs_root *root = inode->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct extent_changeset changeset;
- int freed = 0;
+ u64 freed = 0;
int ret;
extent_changeset_init(&changeset);
@@ -4096,9 +4222,11 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
goto out;
freed += changeset.bytes_changed;
}
- btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
+ btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed,
BTRFS_QGROUP_RSV_DATA);
- ret = freed;
+ if (freed_ret)
+ *freed_ret = freed;
+ ret = 0;
out:
extent_changeset_release(&changeset);
return ret;
@@ -4106,7 +4234,7 @@ out:
static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len,
- int free)
+ u64 *released, int free)
{
struct extent_changeset changeset;
int trace_op = QGROUP_RELEASE;
@@ -4122,7 +4250,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
- return qgroup_free_reserved_data(inode, reserved, start, len);
+ return qgroup_free_reserved_data(inode, reserved, start, len, released);
extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
EXTENT_QGROUP_RESERVED, &changeset);
@@ -4135,9 +4263,10 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
changeset.bytes_changed, trace_op);
if (free)
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
- ret = changeset.bytes_changed;
+ if (released)
+ *released = changeset.bytes_changed;
out:
extent_changeset_release(&changeset);
return ret;
@@ -4156,9 +4285,10 @@ out:
* NOTE: This function may sleep for memory allocation.
*/
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len)
+ struct extent_changeset *reserved,
+ u64 start, u64 len, u64 *freed)
{
- return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
+ return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1);
}
/*
@@ -4176,9 +4306,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
{
- return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
+ return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0);
}
static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
@@ -4228,7 +4358,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
int ret;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(root->root_key.objectid) || num_bytes == 0)
+ !is_fstree(btrfs_root_id(root)) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
@@ -4273,13 +4403,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(root->root_key.objectid))
+ !is_fstree(btrfs_root_id(root)))
return;
/* TODO: Update trace point to handle such free */
trace_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
- btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
}
@@ -4289,7 +4419,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(root->root_key.objectid))
+ !is_fstree(btrfs_root_id(root)))
return;
/*
@@ -4300,8 +4430,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
- btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
- num_bytes, type);
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
@@ -4326,8 +4455,9 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
qgroup_rsv_release(fs_info, qgroup, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
- qgroup_rsv_add(fs_info, qgroup, num_bytes,
- BTRFS_QGROUP_RSV_META_PERTRANS);
+ if (!sb_rdonly(fs_info->sb))
+ qgroup_rsv_add(fs_info, qgroup, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
list_for_each_entry(glist, &qgroup->groups, next_group)
qgroup_iterator_add(&qgroup_list, glist->group);
@@ -4348,13 +4478,15 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(root->root_key.objectid))
+ !is_fstree(btrfs_root_id(root)))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
- qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
+ qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes);
+ if (!sb_rdonly(fs_info->sb))
+ add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}
/*
@@ -4381,7 +4513,7 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
btrfs_ino(inode), unode->val, unode->aux);
}
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
@@ -4567,7 +4699,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+ if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root)
return 0;
spin_lock(&blocks->lock);
@@ -4649,6 +4781,17 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
*root = RB_ROOT;
}
+/*
+ * Free @rsv_bytes of BTRFS_QGROUP_RSV_DATA reservation for @root.
+ *
+ * Does nothing unless the qgroup mode is BTRFS_QGROUP_MODE_SIMPLE and
+ * @root passes is_fstree().
+ */
+void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
+{
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
+	    is_fstree(root))
+		btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes,
+					  BTRFS_QGROUP_RSV_DATA);
+}
+
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
struct btrfs_squota_delta *delta)
{
@@ -4693,8 +4836,5 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
out:
spin_unlock(&fs_info->qgroup_lock);
- if (!ret && delta->rsv_bytes)
- btrfs_qgroup_free_refroot(fs_info, root, delta->rsv_bytes,
- BTRFS_QGROUP_RSV_DATA);
return ret;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 855a4f978761..706640be0ec2 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,12 +6,22 @@
#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H
+#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/kobject.h>
-#include "ulist.h"
-#include "delayed-ref.h"
-#include "misc.h"
+#include <linux/list.h>
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
+struct extent_changeset;
+struct btrfs_delayed_extent_op;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_ioctl_quota_ctl_args;
+struct btrfs_trans_handle;
+struct btrfs_delayed_ref_root;
+struct btrfs_inode;
/*
* Btrfs qgroup overview
@@ -274,8 +284,6 @@ struct btrfs_squota_delta {
u64 root;
/* The number of bytes in the extent being counted. */
u64 num_bytes;
- /* The number of bytes reserved for this extent. */
- u64 rsv_bytes;
/* The generation the extent was created in. */
u64 generation;
/* Whether we are using or freeing the extent. */
@@ -323,7 +331,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit);
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
-struct btrfs_delayed_extent_op;
int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
@@ -343,6 +350,9 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct ulist *new_roots);
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
+int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_inherit *inherit,
+ size_t size);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
u64 objectid, u64 inode_rootid,
struct btrfs_qgroup_inherit *inherit);
@@ -358,10 +368,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
/* New io_tree based accurate qgroup reserve API */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released);
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start,
- u64 len);
+ u64 len, u64 *freed);
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
@@ -422,6 +432,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
+void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes);
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
struct btrfs_squota_delta *delta);
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 944e8f1862aa..6af6b4b9a32e 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -11,7 +11,6 @@
#include "disk-io.h"
#include "raid-stripe-tree.h"
#include "volumes.h"
-#include "misc.h"
#include "print-tree.h"
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
@@ -145,7 +144,7 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
btrfs_put_bioc(bioc);
}
- return ret;
+ return 0;
}
int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index cdb58b38fcb5..c9c258f84903 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -6,6 +6,10 @@
#ifndef BTRFS_RAID_STRIPE_TREE_H
#define BTRFS_RAID_STRIPE_TREE_H
+#include <linux/types.h>
+#include <uapi/linux/btrfs_tree.h>
+#include "fs.h"
+
#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID1_MASK | \
BTRFS_BLOCK_GROUP_RAID0 | \
@@ -13,6 +17,7 @@
struct btrfs_io_context;
struct btrfs_io_stripe;
+struct btrfs_fs_info;
struct btrfs_ordered_extent;
struct btrfs_trans_handle;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 3e014b9370a3..831fac45e70f 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -14,7 +14,6 @@
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
-#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
@@ -332,12 +331,11 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
static void merge_rbio(struct btrfs_raid_bio *dest,
struct btrfs_raid_bio *victim)
{
- bio_list_merge(&dest->bio_list, &victim->bio_list);
+ bio_list_merge_init(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
/* Also inherit the bitmaps from @victim. */
bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
dest->stripe_nsectors);
- bio_list_init(&victim->bio_list);
}
/*
@@ -918,6 +916,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
*/
ASSERT(stripe_nsectors <= BITS_PER_LONG);
+ /*
+ * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 255
+ * (limited by u8).
+ */
+ ASSERT(real_stripes >= 2);
+ ASSERT(real_stripes <= U8_MAX);
+
rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -955,6 +960,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
+ ASSERT(rbio->nr_data > 0);
return rbio;
}
@@ -964,7 +970,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
int ret;
- ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
if (ret < 0)
return ret;
/* Mapping all sectors */
@@ -979,7 +985,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
int ret;
ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
- rbio->stripe_pages + data_pages);
+ rbio->stripe_pages + data_pages, 0);
if (ret < 0)
return ret;
@@ -1181,6 +1187,26 @@ static inline void bio_list_put(struct bio_list *bio_list)
bio_put(bio);
}
+/*
+ * Sanity check the stripe counts of @rbio.
+ *
+ * The checks only run when both CONFIG_BTRFS_DEBUG and CONFIG_BTRFS_ASSERT
+ * are enabled.
+ */
+static void assert_rbio(struct btrfs_raid_bio *rbio)
+{
+	if (IS_ENABLED(CONFIG_BTRFS_DEBUG) &&
+	    IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
+		/* At least two stripes are needed (2 disks RAID5). */
+		ASSERT(rbio->real_stripes >= 2);
+		ASSERT(rbio->nr_data > 0);
+
+		/* The data stripe count must stay below the total. */
+		ASSERT(rbio->nr_data < rbio->real_stripes);
+	}
+}
+
/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
@@ -1212,6 +1238,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
pointers[stripe++] = kmap_local_page(sector->page) +
sector->pgoff;
+ assert_rbio(rbio);
raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
@@ -1530,7 +1557,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
const int data_pages = rbio->nr_data * rbio->stripe_npages;
int ret;
- ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+ ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0);
if (ret < 0)
return ret;
@@ -1549,7 +1576,6 @@ struct btrfs_plug_cb {
struct blk_plug_cb cb;
struct btrfs_fs_info *info;
struct list_head rbio_list;
- struct work_struct work;
};
/*
@@ -2474,6 +2500,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
}
if (has_qstripe) {
+ assert_rbio(rbio);
/* RAID6, call the library function to fill in our P/Q */
raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 45e6ff78316f..0d7b4c2fb6ae 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,9 +7,18 @@
#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/bio.h>
+#include <linux/refcount.h>
#include <linux/workqueue.h>
#include "volumes.h"
+struct page;
+struct sector_ptr;
+struct btrfs_fs_info;
+
enum btrfs_rbio_ops {
BTRFS_RBIO_WRITE,
BTRFS_RBIO_READ_REBUILD,
@@ -164,7 +173,7 @@ struct raid56_bio_trace_info {
u8 stripe_nr;
};
-static inline int nr_data_stripes(const struct map_lookup *map)
+static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
{
return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index 5c2b66d155ef..1c2d7cb1fe6f 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -6,6 +6,12 @@
#ifndef BTRFS_RCU_STRING_H
#define BTRFS_RCU_STRING_H
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+#include <linux/printk.h>
+
struct rcu_string {
struct rcu_head rcu;
char str[];
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 1f62976bee82..cf531255ab76 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -673,7 +673,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
int ret = 0;
bool metadata;
u64 bytenr = generic_ref->bytenr;
- u64 num_bytes = generic_ref->len;
+ u64 num_bytes = generic_ref->num_bytes;
u64 parent = generic_ref->parent;
u64 ref_root = 0;
u64 owner = 0;
@@ -684,11 +684,11 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.ref_root;
+ ref_root = generic_ref->ref_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.ref_root;
- owner = generic_ref->data_ref.ino;
+ ref_root = generic_ref->ref_root;
+ owner = generic_ref->data_ref.objectid;
offset = generic_ref->data_ref.offset;
}
metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
@@ -794,6 +794,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
+ kfree(re);
goto out_unlock;
} else if (be->num_refs == 0) {
btrfs_err(fs_info,
@@ -803,6 +804,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
+ kfree(re);
goto out_unlock;
}
@@ -887,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
out_unlock:
spin_unlock(&fs_info->ref_verify_lock);
out:
- if (ret)
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
return ret;
}
@@ -1019,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
}
}
if (ret) {
- btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index 855de37719b5..3511e1a5c96b 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -6,7 +6,16 @@
#ifndef BTRFS_REF_VERIFY_H
#define BTRFS_REF_VERIFY_H
+#include <linux/types.h>
+#include <linux/rbtree_types.h>
+
+struct btrfs_fs_info;
+struct btrfs_ref;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+
+#include <linux/spinlock.h>
+
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f88b0c2ac3fe..d0a3fcecc46a 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -141,9 +141,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (datal < block_size)
memzero_page(page, datal, block_size - datal);
- btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
- btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
- btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
+ btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size);
+ btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size);
out_unlock:
if (page) {
unlock_page(page);
@@ -174,7 +174,7 @@ static int clone_copy_inline_extent(struct inode *dst,
char *inline_data,
struct btrfs_trans_handle **trans_out)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(dst);
struct btrfs_root *root = BTRFS_I(dst)->root;
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
@@ -337,7 +337,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
const u64 destoff, int no_time_update)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_path *path = NULL;
struct extent_buffer *leaf;
struct btrfs_trans_handle *trans;
@@ -616,35 +616,6 @@ out:
return ret;
}
-static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
-}
-
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- u64 range1_end = loff1 + len - 1;
- u64 range2_end = loff2 + len - 1;
-
- if (inode1 < inode2) {
- swap(inode1, inode2);
- swap(loff1, loff2);
- swap(range1_end, range2_end);
- } else if (inode1 == inode2 && loff2 < loff1) {
- swap(loff1, loff2);
- swap(range1_end, range2_end);
- }
-
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
-
- btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
- btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
-}
-
static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
{
if (inode1 < inode2)
@@ -662,17 +633,21 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
+ const u64 end = dst_loff + len - 1;
+ struct extent_state *cached_state = NULL;
struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
- const u64 bs = fs_info->sb->s_blocksize;
+ const u64 bs = fs_info->sectorsize;
int ret;
/*
- * Lock destination range to serialize with concurrent readahead() and
- * source range to serialize with relocation.
+ * Lock destination range to serialize with concurrent readahead(), and
+ * we are safe from concurrency with relocation of source extents
+ * because we have already locked the inode's i_mmap_lock in exclusive
+ * mode.
*/
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+ unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
btrfs_btree_balance_dirty(fs_info);
@@ -690,7 +665,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
if (root_dst->send_in_progress) {
btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
- root_dst->root_key.objectid,
+ btrfs_root_id(root_dst),
root_dst->send_in_progress);
spin_unlock(&root_dst->root_item_lock);
return -EAGAIN;
@@ -724,13 +699,15 @@ out:
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
u64 off, u64 olen, u64 destoff)
{
+ struct extent_state *cached_state = NULL;
struct inode *inode = file_inode(file);
struct inode *src = file_inode(file_src);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
int ret;
int wb_ret;
u64 len = olen;
- u64 bs = fs_info->sb->s_blocksize;
+ u64 bs = fs_info->sectorsize;
+ u64 end;
/*
* VFS's generic_remap_file_range_prep() protects us from cloning the
@@ -763,12 +740,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
}
/*
- * Lock destination range to serialize with concurrent readahead() and
- * source range to serialize with relocation.
+ * Lock destination range to serialize with concurrent readahead(), and
+ * we are safe from concurrency with relocation of source extents
+ * because we have already locked the inode's i_mmap_lock in exclusive
+ * mode.
*/
- btrfs_double_extent_lock(src, off, inode, destoff, len);
+ end = destoff + len - 1;
+ lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- btrfs_double_extent_unlock(src, off, inode, destoff, len);
+ unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
/*
* We may have copied an inline extent into a page of the destination
@@ -796,7 +776,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
- u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
+ u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize;
u64 wb_len;
int ret;
diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h
index ecb309b4dad0..1e291f7d85c4 100644
--- a/fs/btrfs/reflink.h
+++ b/fs/btrfs/reflink.h
@@ -3,7 +3,9 @@
#ifndef BTRFS_REFLINK_H
#define BTRFS_REFLINK_H
-#include <linux/fs.h>
+#include <linux/types.h>
+
+struct file;
loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f5d9e5f74a52..8b24bb5a0aa1 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -473,20 +473,19 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
struct btrfs_backref_node *node = NULL;
struct btrfs_backref_edge *edge;
int ret;
- int err = 0;
iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
if (!iter)
return ERR_PTR(-ENOMEM);
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
node = btrfs_backref_alloc_node(cache, bytenr, level);
if (!node) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -497,10 +496,9 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
do {
ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
node_key, cur);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
+
edge = list_first_entry_or_null(&cache->pending_edge,
struct btrfs_backref_edge, list[UPPER]);
/*
@@ -515,19 +513,18 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
/* Finish the upper linkage of newly added edges/nodes */
ret = btrfs_backref_finish_upper_links(cache, node);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (handle_useless_nodes(rc, node))
node = NULL;
out:
- btrfs_backref_iter_free(iter);
+ btrfs_free_path(iter->path);
+ kfree(iter);
btrfs_free_path(path);
- if (err) {
+ if (ret) {
btrfs_backref_error_cleanup(cache, node);
- return ERR_PTR(err);
+ return ERR_PTR(ret);
}
ASSERT(!node || !node->detached);
ASSERT(list_empty(&cache->useless_node) &&
@@ -753,7 +750,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = objectid;
- if (root->root_key.objectid == objectid) {
+ if (btrfs_root_id(root) == objectid) {
u64 commit_root_gen;
/* called by btrfs_init_reloc_root */
@@ -797,7 +794,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
btrfs_set_root_level(root_item, btrfs_header_level(eb));
btrfs_set_root_generation(root_item, trans->transid);
- if (root->root_key.objectid == objectid) {
+ if (btrfs_root_id(root) == objectid) {
btrfs_set_root_refs(root_item, 0);
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
@@ -875,8 +872,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
* We are merging reloc roots, we do not need new reloc trees. Also
* reloc trees never need their own reloc tree.
*/
- if (!rc->create_reloc_tree ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
return 0;
if (!trans->reloc_reserved) {
@@ -884,7 +880,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
trans->block_rsv = rc->block_rsv;
clear_rsv = 1;
}
- reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+ reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
if (clear_rsv)
trans->block_rsv = rsv;
if (IS_ERR(reloc_root))
@@ -951,60 +947,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
}
/*
- * helper to find first cached inode with inode number >= objectid
- * in a subvolume
- */
-static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
-{
- struct rb_node *node;
- struct rb_node *prev;
- struct btrfs_inode *entry;
- struct inode *inode;
-
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- entry = rb_entry(node, struct btrfs_inode, rb_node);
-
- if (objectid < btrfs_ino(entry))
- node = node->rb_left;
- else if (objectid > btrfs_ino(entry))
- node = node->rb_right;
- else
- break;
- }
- if (!node) {
- while (prev) {
- entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= btrfs_ino(entry)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
- while (node) {
- entry = rb_entry(node, struct btrfs_inode, rb_node);
- inode = igrab(&entry->vfs_inode);
- if (inode) {
- spin_unlock(&root->inode_lock);
- return inode;
- }
-
- objectid = btrfs_ino(entry) + 1;
- if (cond_resched_lock(&root->inode_lock))
- goto again;
-
- node = rb_next(node);
- }
- spin_unlock(&root->inode_lock);
- return NULL;
-}
-
-/*
* get new location of data
*/
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
@@ -1064,7 +1006,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
u64 parent;
u64 bytenr;
u64 new_bytenr = 0;
@@ -1080,7 +1022,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
return 0;
/* reloc trees always use full backref */
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
parent = leaf->start;
else
parent = 0;
@@ -1109,15 +1051,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
* if we are modifying block in fs tree, wait for read_folio
* to complete and drop the extent cache
*/
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
if (first) {
- inode = find_next_inode(root, key.objectid);
+ inode = btrfs_find_first_inode(root, key.objectid);
first = 0;
- } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
- inode = find_next_inode(root, key.objectid);
+ } else if (inode && btrfs_ino(inode) < key.objectid) {
+ btrfs_add_delayed_iput(inode);
+ inode = btrfs_find_first_inode(root, key.objectid);
}
- if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
+ if (inode && btrfs_ino(inode) == key.objectid) {
struct extent_state *cached_state = NULL;
end = key.offset +
@@ -1126,16 +1068,20 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
fs_info->sectorsize));
WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
end--;
- ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end,
- &cached_state);
- if (!ret)
+ /* Take mmap lock to serialize with reflinks. */
+ if (!down_read_trylock(&inode->i_mmap_lock))
+ continue;
+ ret = try_lock_extent(&inode->io_tree, key.offset,
+ end, &cached_state);
+ if (!ret) {
+ up_read(&inode->i_mmap_lock);
continue;
+ }
- btrfs_drop_extent_map_range(BTRFS_I(inode),
- key.offset, end, true);
- unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end, &cached_state);
+ btrfs_drop_extent_map_range(inode, key.offset, end, true);
+ unlock_extent(&inode->io_tree, key.offset, end,
+ &cached_state);
+ up_read(&inode->i_mmap_lock);
}
}
@@ -1153,22 +1099,28 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
dirty = 1;
key.offset -= btrfs_file_extent_offset(leaf, fi);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- num_bytes, parent, root->root_key.objectid);
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset,
- root->root_key.objectid, false);
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = new_bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = parent;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_header_owner(leaf);
+ btrfs_init_data_ref(&ref, key.objectid, key.offset,
+ btrfs_root_id(root), false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, parent, root->root_key.objectid);
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset,
- root->root_key.objectid, false);
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = parent;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_header_owner(leaf);
+ btrfs_init_data_ref(&ref, key.objectid, key.offset,
+ btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1178,7 +1130,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (dirty)
btrfs_mark_buffer_dirty(trans, leaf);
if (inode)
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ btrfs_add_delayed_iput(inode);
return ret;
}
@@ -1224,8 +1176,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
int ret;
int slot;
- ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
- ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
@@ -1377,20 +1329,26 @@ again:
path->slots[level], old_ptr_gen);
btrfs_mark_buffer_dirty(trans, path->nodes[level]);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
- blocksize, path->nodes[level]->start,
- src->root_key.objectid);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
- 0, true);
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = old_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = path->nodes[level]->start;
+ ref.owning_root = btrfs_root_id(src);
+ ref.ref_root = btrfs_root_id(src);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- blocksize, 0, dest->root_key.objectid);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
- true);
+
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = new_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = 0;
+ ref.owning_root = btrfs_root_id(dest);
+ ref.ref_root = btrfs_root_id(dest);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1398,10 +1356,13 @@ again:
}
/* We don't know the real owning_root, use 0. */
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
- blocksize, path->nodes[level]->start, 0);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
- 0, true);
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = new_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = path->nodes[level]->start;
+ ref.owning_root = 0;
+ ref.ref_root = btrfs_root_id(src);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1409,10 +1370,13 @@ again:
}
/* We don't know the real owning_root, use 0. */
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
- blocksize, 0, 0);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
- 0, true);
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = old_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = 0;
+ ref.owning_root = 0;
+ ref.ref_root = btrfs_root_id(dest);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1520,7 +1484,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
const struct btrfs_key *max_key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
u64 objectid;
u64 start, end;
u64 ino;
@@ -1530,23 +1494,24 @@ static int invalidate_extent_cache(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
cond_resched();
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
if (objectid > max_key->objectid)
break;
- inode = find_next_inode(root, objectid);
+ inode = btrfs_find_first_inode(root, objectid);
if (!inode)
break;
- ino = btrfs_ino(BTRFS_I(inode));
+ ino = btrfs_ino(inode);
if (ino > max_key->objectid) {
- iput(inode);
+ iput(&inode->vfs_inode);
break;
}
objectid = ino + 1;
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->vfs_inode.i_mode))
continue;
if (unlikely(min_key->objectid == ino)) {
@@ -1579,9 +1544,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for read_folio to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+ lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_drop_extent_map_range(inode, start, end, true);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
}
return 0;
}
@@ -1616,7 +1581,7 @@ static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
int ret;
/* @root must be a subvolume tree root with a valid reloc tree */
- ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
ASSERT(reloc_root);
reloc_root_item = &reloc_root->root_item;
@@ -1645,7 +1610,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
reloc_dirty_list) {
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
/* Merged subvolume, cleanup its reloc root */
struct btrfs_root *reloc_root = root->reloc_root;
@@ -1920,13 +1885,13 @@ again:
if (root->reloc_root) {
btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
- root->root_key.objectid,
- root->reloc_root->root_key.objectid,
+ btrfs_root_id(root),
+ btrfs_root_id(root->reloc_root),
root->reloc_root->root_key.type,
root->reloc_root->root_key.offset,
btrfs_root_generation(
&root->reloc_root->root_item),
- reloc_root->root_key.objectid,
+ btrfs_root_id(reloc_root),
reloc_root->root_key.type,
reloc_root->root_key.offset,
btrfs_root_generation(
@@ -1934,8 +1899,8 @@ again:
} else {
btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
- root->root_key.objectid,
- reloc_root->root_key.objectid,
+ btrfs_root_id(root),
+ btrfs_root_id(reloc_root),
reloc_root->root_key.type,
reloc_root->root_key.offset,
btrfs_root_generation(
@@ -2192,7 +2157,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
return ERR_PTR(-EUCLEAN);
}
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
ret = record_reloc_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
@@ -2299,7 +2264,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return root;
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID)
fs_root = root;
if (next != node)
@@ -2315,9 +2280,8 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
return fs_root;
}
-static noinline_for_stack
-u64 calcu_metadata_size(struct reloc_control *rc,
- struct btrfs_backref_node *node, int reserve)
+static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
+ struct btrfs_backref_node *node)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_backref_node *next = node;
@@ -2326,12 +2290,12 @@ u64 calcu_metadata_size(struct reloc_control *rc,
u64 num_bytes = 0;
int index = 0;
- BUG_ON(reserve && node->processed);
+ BUG_ON(node->processed);
while (next) {
cond_resched();
while (1) {
- if (next->processed && (reserve || next != node))
+ if (next->processed)
break;
num_bytes += fs_info->nodesize;
@@ -2359,7 +2323,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
int ret;
u64 tmp;
- num_bytes = calcu_metadata_size(rc, node, 1) * 2;
+ num_bytes = calcu_metadata_size(rc, node) * 2;
trans->block_rsv = rc->block_rsv;
rc->reserved_bytes += num_bytes;
@@ -2422,8 +2386,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
- struct btrfs_ref ref = { 0 };
-
cond_resched();
upper = edge->node[UPPER];
@@ -2511,19 +2473,23 @@ static int do_relocation(struct btrfs_trans_handle *trans,
*/
ASSERT(node->eb == eb);
} else {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = node->eb->start,
+ .num_bytes = blocksize,
+ .parent = upper->eb->start,
+ .owning_root = btrfs_header_owner(upper->eb),
+ .ref_root = btrfs_header_owner(upper->eb),
+ };
+
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
btrfs_set_node_ptr_generation(upper->eb, slot,
trans->transid);
btrfs_mark_buffer_dirty(trans, upper->eb);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- node->eb->start, blocksize,
- upper->eb->start,
- btrfs_header_owner(upper->eb));
btrfs_init_tree_ref(&ref, node->level,
- btrfs_header_owner(upper->eb),
- root->root_key.objectid, false);
+ btrfs_root_id(root), false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
@@ -2775,12 +2741,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct tree_block *block;
struct tree_block *next;
- int ret;
- int err = 0;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out_free_blocks;
}
@@ -2795,8 +2760,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Get first keys */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready) {
- err = get_tree_block_key(fs_info, block);
- if (err)
+ ret = get_tree_block_key(fs_info, block);
+ if (ret)
goto out_free_path;
}
}
@@ -2806,25 +2771,23 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
node = build_backref_tree(trans, rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
- err = PTR_ERR(node);
+ ret = PTR_ERR(node);
goto out;
}
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
break;
- }
}
out:
- err = finish_pending_nodes(trans, rc, path, err);
+ ret = finish_pending_nodes(trans, rc, path, ret);
out_free_path:
btrfs_free_path(path);
out_free_blocks:
free_block_list(blocks);
- return err;
+ return ret;
}
static noinline_for_stack int prealloc_file_extent_cluster(
@@ -2849,7 +2812,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
* btrfs_do_readpage() call of previously relocated file cluster.
*
* If the current cluster starts in the above range, btrfs_do_readpage()
- * will skip the read, and relocate_one_page() will later writeback
+ * will skip the read, and relocate_one_folio() will later writeback
* the padding zeros as new data, causing data corruption.
*
* Here we have to manually invalidate the range (i_size, PAGE_END + 1).
@@ -2858,7 +2821,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
struct address_space *mapping = inode->vfs_inode.i_mapping;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- struct page *page;
+ struct folio *folio;
ASSERT(sectorsize < PAGE_SIZE);
ASSERT(IS_ALIGNED(i_size, sectorsize));
@@ -2889,16 +2852,16 @@ static noinline_for_stack int prealloc_file_extent_cluster(
clear_extent_bits(&inode->io_tree, i_size,
round_up(i_size, PAGE_SIZE) - 1,
EXTENT_UPTODATE);
- page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
+ folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
/*
* If page is freed we don't need to do anything then, as we
* will re-read the whole page anyway.
*/
- if (page) {
- btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+ if (!IS_ERR(folio)) {
+ btrfs_subpage_clear_uptodate(fs_info, folio, i_size,
round_up(i_size, PAGE_SIZE) - i_size);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
}
@@ -2951,7 +2914,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->len = end + 1 - start;
em->block_len = em->len;
em->block_start = block_start;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags |= EXTENT_FLAG_PINNED;
lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
@@ -2983,68 +2946,71 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
return cluster->boundary[cluster_nr + 1] - 1;
}
-static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
- const struct file_extent_cluster *cluster,
- int *cluster_nr, unsigned long page_index)
+static int relocate_one_folio(struct inode *inode, struct file_ra_state *ra,
+ const struct file_extent_cluster *cluster,
+ int *cluster_nr, unsigned long index)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
u64 offset = BTRFS_I(inode)->index_cnt;
const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
- struct page *page;
- u64 page_start;
- u64 page_end;
+ struct folio *folio;
+ u64 folio_start;
+ u64 folio_end;
u64 cur;
int ret;
- ASSERT(page_index <= last_index);
- page = find_lock_page(inode->i_mapping, page_index);
- if (!page) {
+ ASSERT(index <= last_index);
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (IS_ERR(folio)) {
page_cache_sync_readahead(inode->i_mapping, ra, NULL,
- page_index, last_index + 1 - page_index);
- page = find_or_create_page(inode->i_mapping, page_index, mask);
- if (!page)
- return -ENOMEM;
+ index, last_index + 1 - index);
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
}
- if (PageReadahead(page))
+ WARN_ON(folio_order(folio));
+
+ if (folio_test_readahead(folio))
page_cache_async_readahead(inode->i_mapping, ra, NULL,
- page_folio(page), page_index,
- last_index + 1 - page_index);
+ folio, index,
+ last_index + 1 - index);
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
- goto release_page;
+ goto release_folio;
}
}
/*
- * We could have lost page private when we dropped the lock to read the
- * page above, make sure we set_page_extent_mapped here so we have any
+ * We could have lost folio private when we dropped the lock to read the
+ * folio above, make sure we set_folio_extent_mapped here so we have any
* of the subpage blocksize stuff we need in place.
*/
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
- goto release_page;
+ goto release_folio;
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
+ folio_start = folio_pos(folio);
+ folio_end = folio_start + PAGE_SIZE - 1;
/*
* Start from the cluster, as for subpage case, the cluster can start
- * inside the page.
+ * inside the folio.
*/
- cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
- while (cur <= page_end) {
+ cur = max(folio_start, cluster->boundary[*cluster_nr] - offset);
+ while (cur <= folio_end) {
struct extent_state *cached_state = NULL;
u64 extent_start = cluster->boundary[*cluster_nr] - offset;
u64 extent_end = get_cluster_boundary_end(cluster,
*cluster_nr) - offset;
- u64 clamped_start = max(page_start, extent_start);
- u64 clamped_end = min(page_end, extent_end);
+ u64 clamped_start = max(folio_start, extent_start);
+ u64 clamped_end = min(folio_end, extent_end);
u32 clamped_len = clamped_end + 1 - clamped_start;
/* Reserve metadata for this range */
@@ -3052,7 +3018,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
clamped_len, clamped_len,
false);
if (ret)
- goto release_page;
+ goto release_folio;
/* Mark the range delalloc and dirty for later writeback */
lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
@@ -3068,19 +3034,18 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
clamped_len, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
clamped_len);
- goto release_page;
+ goto release_folio;
}
- btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+ btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len);
/*
- * Set the boundary if it's inside the page.
+ * Set the boundary if it's inside the folio.
* Data relocation requires the destination extents to have the
* same size as the source.
* EXTENT_BOUNDARY bit prevents current extent from being merged
* with previous extent.
*/
- if (in_range(cluster->boundary[*cluster_nr] - offset,
- page_start, PAGE_SIZE)) {
+ if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) {
u64 boundary_start = cluster->boundary[*cluster_nr] -
offset;
u64 boundary_end = boundary_start +
@@ -3103,8 +3068,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
break;
}
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
@@ -3112,9 +3077,9 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
ret = -ECANCELED;
return ret;
-release_page:
- unlock_page(page);
- put_page(page);
+release_folio:
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
@@ -3149,7 +3114,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
last_index = (cluster->end - offset) >> PAGE_SHIFT;
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
- ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
+ ret = relocate_one_folio(inode, ra, cluster, &cluster_nr, index);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
@@ -3926,7 +3891,7 @@ static noinline_for_stack struct inode *create_reloc_inode(
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
u64 objectid;
- int err = 0;
+ int ret = 0;
root = btrfs_grab_root(fs_info->data_reloc_root);
trans = btrfs_start_transaction(root, 6);
@@ -3935,31 +3900,31 @@ static noinline_for_stack struct inode *create_reloc_inode(
return ERR_CAST(trans);
}
- err = btrfs_get_free_objectid(root, &objectid);
- if (err)
+ ret = btrfs_get_free_objectid(root, &objectid);
+ if (ret)
goto out;
- err = __insert_orphan_inode(trans, root, objectid);
- if (err)
+ ret = __insert_orphan_inode(trans, root, objectid);
+ if (ret)
goto out;
inode = btrfs_iget(fs_info->sb, objectid, root);
if (IS_ERR(inode)) {
delete_orphan_inode(trans, root, objectid);
- err = PTR_ERR(inode);
+ ret = PTR_ERR(inode);
inode = NULL;
goto out;
}
BTRFS_I(inode)->index_cnt = group->start;
- err = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- if (err) {
+ if (ret) {
iput(inode);
- inode = ERR_PTR(err);
+ inode = ERR_PTR(ret);
}
return inode;
}
@@ -4437,9 +4402,11 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
disk_bytenr + ordered->num_bytes - 1,
- &list, 0, false);
- if (ret)
+ &list, false);
+ if (ret < 0) {
+ btrfs_mark_ordered_extent_error(ordered);
return ret;
+ }
while (!list_empty(&list)) {
struct btrfs_ordered_sum *sums =
@@ -4489,8 +4456,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
btrfs_root_last_snapshot(&root->root_item))
first_cow = 1;
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
- rc->create_reloc_tree) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
WARN_ON(!first_cow && level == 0);
node = rc->backref_cache.path[level];
@@ -4583,8 +4549,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
}
new_root = pending->snap;
- reloc_root = create_reloc_root(trans, root->reloc_root,
- new_root->root_key.objectid);
+ reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
if (IS_ERR(reloc_root))
return PTR_ERR(reloc_root);
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 5fb60f2deb53..788c86d8633a 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -3,6 +3,15 @@
#ifndef BTRFS_RELOCATION_H
#define BTRFS_RELOCATION_H
+#include <linux/types.h>
+
+struct extent_buffer;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_trans_handle;
+struct btrfs_ordered_extent;
+struct btrfs_pending_snapshot;
+
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 603ad1459368..33962671a96c 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -10,7 +10,6 @@
#include "messages.h"
#include "transaction.h"
#include "disk-io.h"
-#include "print-tree.h"
#include "qgroup.h"
#include "space-info.h"
#include "accessors.h"
@@ -82,7 +81,14 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
if (ret > 0)
goto out;
} else {
- BUG_ON(ret == 0); /* Logical error */
+ /*
+ * Key with offset -1 found, there would have to exist a root
+ * with such id, but this is out of the valid range.
+ */
+ if (ret == 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
if (path->slots[0] == 0)
goto out;
path->slots[0]--;
@@ -142,8 +148,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
if (ret > 0) {
btrfs_crit(fs_info,
"unable to find root key (%llu %u %llu) in tree %llu",
- key->objectid, key->type, key->offset,
- root->root_key.objectid);
+ key->objectid, key->type, key->offset, btrfs_root_id(root));
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -323,8 +328,11 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
goto out;
-
- BUG_ON(ret != 0);
+ if (ret != 0) {
+ /* The root must exist but we did not find it by the key. */
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = btrfs_del_item(trans, root, path);
out:
@@ -539,13 +547,3 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
}
return ret;
}
-
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- u64 qgroup_to_release;
-
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
- btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
-}
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
index 8b2c3859e464..8f5739e732b9 100644
--- a/fs/btrfs/root-tree.h
+++ b/fs/btrfs/root-tree.h
@@ -3,13 +3,21 @@
#ifndef BTRFS_ROOT_TREE_H
#define BTRFS_ROOT_TREE_H
+#include <linux/types.h>
+
struct fscrypt_str;
+struct extent_buffer;
+struct btrfs_key;
+struct btrfs_root;
+struct btrfs_root_item;
+struct btrfs_path;
+struct btrfs_fs_info;
+struct btrfs_block_rsv;
+struct btrfs_trans_handle;
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv);
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 sequence,
const struct fscrypt_str *name);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9ce5be21b036..4b22cfe9a98c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -43,7 +43,7 @@ struct scrub_ctx;
/*
* The following value only influences the performance.
*
- * This detemines how many stripes would be submitted in one go,
+ * This determines how many stripes would be submitted in one go,
* which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
*/
#define SCRUB_STRIPES_PER_GROUP 8
@@ -192,7 +192,6 @@ struct scrub_ctx {
int cur_stripe;
atomic_t cancel_req;
int readonly;
- int sectors_per_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
@@ -262,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
+ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0);
if (ret < 0)
goto error;
@@ -710,7 +709,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
/* Metadata, verify the full tree block. */
if (sector->is_metadata) {
/*
- * Check if the tree block crosses the stripe boudary. If
+ * Check if the tree block crosses the stripe boundary. If
* crossed the boundary, we cannot verify it but only give a
* warning.
*
@@ -884,7 +883,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
/*
* Init needed infos for error reporting.
*
- * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio()
+ * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
* thus no need for dev/physical, error reporting still needs dev and physical.
*/
if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
@@ -1013,6 +1012,7 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
struct btrfs_fs_info *fs_info = sctx->fs_info;
int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
stripe->bg->length);
+ unsigned long repaired;
int mirror;
int i;
@@ -1079,16 +1079,15 @@ out:
* Submit the repaired sectors. For zoned case, we cannot do repair
* in-place, but queue the bg to be relocated.
*/
- if (btrfs_is_zoned(fs_info)) {
- if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap,
+ stripe->nr_sectors);
+ if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
+ if (btrfs_is_zoned(fs_info)) {
btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
- } else if (!sctx->readonly) {
- unsigned long repaired;
-
- bitmap_andnot(&repaired, &stripe->init_error_bitmap,
- &stripe->error_bitmap, stripe->nr_sectors);
- scrub_write_sectors(sctx, stripe, repaired, false);
- wait_scrub_stripe_io(stripe);
+ } else {
+ scrub_write_sectors(sctx, stripe, repaired, false);
+ wait_scrub_stripe_io(stripe);
+ }
}
scrub_stripe_report_errors(sctx, stripe);
@@ -1099,12 +1098,22 @@ out:
static void scrub_read_endio(struct btrfs_bio *bbio)
{
struct scrub_stripe *stripe = bbio->private;
+ struct bio_vec *bvec;
+ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+ int num_sectors;
+ u32 bio_size = 0;
+ int i;
+
+ ASSERT(sector_nr < stripe->nr_sectors);
+ bio_for_each_bvec_all(bvec, &bbio->bio, i)
+ bio_size += bvec->bv_len;
+ num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
- bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+ bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
} else {
- bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+ bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1280,7 +1289,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
* return 0 if it is a data stripe, 1 means parity stripe.
*/
static int get_raid56_logic_offset(u64 physical, int num,
- struct map_lookup *map, u64 *offset,
+ struct btrfs_chunk_map *map, u64 *offset,
u64 *stripe_start)
{
int i;
@@ -1381,8 +1390,15 @@ static int find_first_extent_item(struct btrfs_root *extent_root,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
+ btrfs_release_path(path);
+ return -EUCLEAN;
+ }
- ASSERT(ret > 0);
/*
* Here we intentionally pass 0 as @min_objectid, as there could be
* an extent item starting before @search_start.
@@ -1409,14 +1425,11 @@ search_forward:
if (ret > 0)
break;
next:
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(extent_root, path);
- if (ret) {
- /* Either no more item or fatal error */
- btrfs_release_path(path);
- return ret;
- }
+ ret = btrfs_next_item(extent_root, path);
+ if (ret) {
+ /* Either no more items or a fatal error. */
+ btrfs_release_path(path);
+ return ret;
}
}
btrfs_release_path(path);
@@ -1640,6 +1653,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
u64 stripe_len = BTRFS_STRIPE_LEN;
int mirror = stripe->mirror_num;
int i;
@@ -1650,6 +1666,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
struct page *page = scrub_stripe_get_page(stripe, i);
unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+ /* We're beyond the chunk boundary, no need to read anymore. */
+ if (i >= nr_sectors)
+ break;
+
/* The current sector cannot be merged, submit the bio. */
if (bbio &&
((i > 0 &&
@@ -1705,6 +1725,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
ASSERT(stripe->bg);
@@ -1719,14 +1742,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
- /* Read the whole stripe. */
bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
- for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+ /* Read the whole range inside the chunk boundary. */
+ for (unsigned int cur = 0; cur < nr_sectors; cur++) {
+ struct page *page = scrub_stripe_get_page(stripe, cur);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
int ret;
- ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
/* We should have allocated enough bio vectors. */
- ASSERT(ret == PAGE_SIZE);
+ ASSERT(ret == fs_info->sectorsize);
}
atomic_inc(&stripe->pending_io);
@@ -1816,7 +1841,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
if (sctx->is_dev_replace) {
/*
* For dev-replace, if we know there is something wrong with
- * metadata, we should immedately abort.
+ * metadata, we should immediately abort.
*/
for (int i = 0; i < nr_stripes; i++) {
if (stripe_has_metadata_error(&sctx->stripes[i])) {
@@ -1868,6 +1893,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
*/
ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);
+ /* @found_logical_ret must be specified. */
+ ASSERT(found_logical_ret);
+
stripe = &sctx->stripes[sctx->cur_stripe];
scrub_reset_stripe(stripe);
ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
@@ -1876,8 +1904,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
/* Either >0 as no more extents or <0 for error. */
if (ret)
return ret;
- if (found_logical_ret)
- *found_logical_ret = stripe->logical;
+ *found_logical_ret = stripe->logical;
sctx->cur_stripe++;
/* We filled one group, submit it. */
@@ -1896,7 +1923,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
u64 full_stripe_start)
{
DECLARE_COMPLETION_ONSTACK(io_done);
@@ -2065,7 +2092,7 @@ out:
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
u64 logical_start, u64 logical_length,
struct btrfs_device *device,
u64 physical, int mirror_num)
@@ -2080,7 +2107,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
/* Go through each extent items inside the logical range */
while (cur_logical < logical_end) {
- u64 found_logical;
+ u64 found_logical = U64_MAX;
u64 cur_physical = physical + cur_logical - logical_start;
/* Canceled? */
@@ -2115,6 +2142,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
if (ret < 0)
break;
+ /* queue_scrub_stripe() returned 0, @found_logical must be updated. */
+ ASSERT(found_logical != U64_MAX);
cur_logical = found_logical + BTRFS_STRIPE_LEN;
/* Don't hold CPU for too long time */
@@ -2124,7 +2153,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
/* Calculate the full stripe length for simple stripe based profiles */
-static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
{
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
@@ -2133,7 +2162,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
}
/* Get the logical bytenr for the stripe */
-static u64 simple_stripe_get_logical(struct map_lookup *map,
+static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
struct btrfs_block_group *bg,
int stripe_index)
{
@@ -2150,7 +2179,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
}
/* Get the mirror number for the stripe */
-static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
{
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
@@ -2162,7 +2191,7 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
static int scrub_simple_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct btrfs_device *device,
int stripe_index)
{
@@ -2195,18 +2224,17 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct extent_map *em,
+ struct btrfs_chunk_map *map,
struct btrfs_device *scrub_dev,
int stripe_index)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct map_lookup *map = em->map_lookup;
const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
int ret;
int ret2;
u64 physical = map->stripes[stripe_index].physical;
- const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+ const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
const u64 physical_end = physical + dev_stripe_len;
u64 logical;
u64 logic_end;
@@ -2369,17 +2397,12 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
u64 dev_extent_len)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
int i;
int ret = 0;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, bg->start, bg->length);
- read_unlock(&map_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
+ if (!map) {
/*
* Might have been an unused block group deleted by the cleaner
* kthread or relocation.
@@ -2391,22 +2414,21 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
return ret;
}
- if (em->start != bg->start)
+ if (map->start != bg->start)
goto out;
- if (em->len < dev_extent_len)
+ if (map->chunk_len < dev_extent_len)
goto out;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
+ ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
if (ret)
goto out;
}
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2790,7 +2812,17 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
gen = btrfs_get_last_trans_committed(fs_info);
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- bytenr = btrfs_sb_offset(i);
+ ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr);
+ if (ret == -ENOENT)
+ break;
+
+ if (ret) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.super_errors++;
+ spin_unlock(&sctx->stat_lock);
+ continue;
+ }
+
if (bytenr + BTRFS_SUPER_INFO_SIZE >
scrub_dev->commit_total_bytes)
break;
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h
index 7639103ebf9d..f0df597b75c7 100644
--- a/fs/btrfs/scrub.h
+++ b/fs/btrfs/scrub.h
@@ -3,6 +3,12 @@
#ifndef BTRFS_SCRUB_H
#define BTRFS_SCRUB_H
+#include <linux/types.h>
+
+struct btrfs_fs_info;
+struct btrfs_device;
+struct btrfs_scrub_progress;
+
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3b929f0e8f04..3dd4a48479a9 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -25,7 +25,6 @@
#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
-#include "xattr.h"
#include "print-tree.h"
#include "accessors.h"
#include "dir-item.h"
@@ -393,9 +392,8 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
btrfs_err(sctx->send_root->fs_info,
"Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
result_string, what, sctx->cmp_key->objectid,
- sctx->send_root->root_key.objectid,
- (sctx->parent_root ?
- sctx->parent_root->root_key.objectid : 0));
+ btrfs_root_id(sctx->send_root),
+ (sctx->parent_root ? btrfs_root_id(sctx->parent_root) : 0));
}
__maybe_unused
@@ -777,7 +775,12 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
if (WARN_ON(!sctx->send_buf))
return -EINVAL;
- BUG_ON(sctx->send_size);
+ if (unlikely(sctx->send_size != 0)) {
+ btrfs_err(sctx->send_root->fs_info,
+ "send: command header buffer not empty cmd %d offset %llu",
+ cmd, sctx->send_off);
+ return -EINVAL;
+ }
sctx->send_size += sizeof(*hdr);
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
@@ -1070,7 +1073,15 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
ret = PTR_ERR(start);
goto out;
}
- BUG_ON(start < p->buf);
+ if (unlikely(start < p->buf)) {
+ btrfs_err(root->fs_info,
+ "send: path ref buffer underflow for key (%llu %u %llu)",
+ found_key->objectid,
+ found_key->type,
+ found_key->offset);
+ ret = -EINVAL;
+ goto out;
+ }
}
p->start = start;
} else {
@@ -1304,9 +1315,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
u64 root = (u64)(uintptr_t)key;
const struct clone_root *cr = elt;
- if (root < cr->root->root_key.objectid)
+ if (root < btrfs_root_id(cr->root))
return -1;
- if (root > cr->root->root_key.objectid)
+ if (root > btrfs_root_id(cr->root))
return 1;
return 0;
}
@@ -1316,9 +1327,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
const struct clone_root *cr1 = e1;
const struct clone_root *cr2 = e2;
- if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
+ if (btrfs_root_id(cr1->root) < btrfs_root_id(cr2->root))
return -1;
- if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
+ if (btrfs_root_id(cr1->root) > btrfs_root_id(cr2->root))
return 1;
return 0;
}
@@ -1406,7 +1417,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
struct btrfs_lru_cache_entry *raw_entry;
struct backref_cache_entry *entry;
- if (btrfs_lru_cache_size(&sctx->backref_cache) == 0)
+ if (sctx->backref_cache.size == 0)
return false;
/*
@@ -1504,7 +1515,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
* transaction handle or holding fs_info->commit_root_sem, so no need
* to take any lock here.
*/
- if (btrfs_lru_cache_size(&sctx->backref_cache) == 1)
+ if (sctx->backref_cache.size == 1)
sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
}
@@ -1766,7 +1777,7 @@ static int read_symlink(struct btrfs_root *root,
*/
btrfs_err(root->fs_info,
"Found empty symlink inode %llu at root %llu",
- ino, root->root_key.objectid);
+ ino, btrfs_root_id(root));
ret = -EIO;
goto out;
}
@@ -2520,7 +2531,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
}
- key.objectid = send_root->root_key.objectid;
+ key.objectid = btrfs_root_id(send_root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
@@ -2536,7 +2547,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
- key.objectid != send_root->root_key.objectid) {
+ key.objectid != btrfs_root_id(send_root)) {
ret = -ENOENT;
goto out;
}
@@ -2809,8 +2820,7 @@ static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
static int trim_dir_utimes_cache(struct send_ctx *sctx)
{
- while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) >
- SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
+ while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
struct btrfs_lru_cache_entry *lru;
int ret;
@@ -4182,7 +4192,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* This should never happen as the root dir always has the same ref
* which is always '..'
*/
- BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+ if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) {
+ btrfs_err(fs_info,
+ "send: unexpected inode %llu in process_recorded_refs()",
+ sctx->cur_ino);
+ ret = -EINVAL;
+ goto out;
+ }
valid_path = fs_path_alloc();
if (!valid_path) {
@@ -5257,10 +5273,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct page *page;
+ struct folio *folio;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
+ struct address_space *mapping = sctx->cur_inode->i_mapping;
int ret;
ret = put_data_header(sctx, len);
@@ -5273,44 +5290,44 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
- page = find_lock_page(sctx->cur_inode->i_mapping, index);
- if (!page) {
- page_cache_sync_readahead(sctx->cur_inode->i_mapping,
+ folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ page_cache_sync_readahead(mapping,
&sctx->ra, NULL, index,
last_index + 1 - index);
- page = find_or_create_page(sctx->cur_inode->i_mapping,
- index, GFP_KERNEL);
- if (!page) {
- ret = -ENOMEM;
+ folio = filemap_grab_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
break;
}
}
- if (PageReadahead(page))
- page_cache_async_readahead(sctx->cur_inode->i_mapping,
- &sctx->ra, NULL, page_folio(page),
+ WARN_ON(folio_order(folio));
+
+ if (folio_test_readahead(folio))
+ page_cache_async_readahead(mapping, &sctx->ra, NULL, folio,
index, last_index + 1 - index);
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
+ if (!folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
btrfs_err(fs_info,
"send: IO error at offset %llu for inode %llu root %llu",
- page_offset(page), sctx->cur_ino,
- sctx->send_root->root_key.objectid);
- put_page(page);
+ folio_pos(folio), sctx->cur_ino,
+ btrfs_root_id(sctx->send_root));
+ folio_put(folio);
ret = -EIO;
break;
}
}
- memcpy_from_page(sctx->send_buf + sctx->send_size, page,
- pg_offset, cur_len);
- unlock_page(page);
- put_page(page);
+ memcpy_from_folio(sctx->send_buf + sctx->send_size, folio,
+ pg_offset, cur_len);
+ folio_unlock(folio);
+ folio_put(folio);
index++;
pg_offset = 0;
len -= cur_len;
@@ -5371,7 +5388,7 @@ static int send_clone(struct send_ctx *sctx,
btrfs_debug(sctx->send_root->fs_info,
"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, clone_root->root->root_key.objectid,
+ offset, len, btrfs_root_id(clone_root->root),
clone_root->ino, clone_root->offset);
p = fs_path_alloc();
@@ -6140,7 +6157,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
int ret = 0;
u64 offset = key->offset;
u64 end;
- u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
+ u64 bs = sctx->send_root->fs_info->sectorsize;
end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
if (offset >= end)
@@ -6458,21 +6475,18 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
return 0;
- if (sctx->cur_inode_last_extent == (u64)-1) {
- ret = get_last_extent(sctx, key->offset - 1);
- if (ret)
- return ret;
- }
-
- if (path->slots[0] == 0 &&
- sctx->cur_inode_last_extent < key->offset) {
- /*
- * We might have skipped entire leafs that contained only
- * file extent items for our current inode. These leafs have
- * a generation number smaller (older) than the one in the
- * current leaf and the leaf our last extent came from, and
- * are located between these 2 leafs.
- */
+ /*
+ * Get last extent's end offset (exclusive) if we haven't determined it
+ * yet (we're processing the first file extent item that is new), or if
+ * we're at the first slot of a leaf and the last extent's end is less
+ * than the current extent's offset, because we might have skipped
+ * entire leaves that contained only file extent items for our current
+ * inode. These leaves have a generation number smaller (older) than the
+ * one in the current leaf and the leaf our last extent came from, and
+ * are located between these 2 leaves.
+ */
+ if ((sctx->cur_inode_last_extent == (u64)-1) ||
+ (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) {
ret = get_last_extent(sctx, key->offset - 1);
if (ret)
return ret;
@@ -6705,11 +6719,20 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (ret)
goto out;
}
- if (sctx->cur_inode_last_extent <
- sctx->cur_inode_size) {
- ret = send_hole(sctx, sctx->cur_inode_size);
- if (ret)
+ if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
+ ret = range_is_hole_in_parent(sctx,
+ sctx->cur_inode_last_extent,
+ sctx->cur_inode_size);
+ if (ret < 0) {
goto out;
+ } else if (ret == 0) {
+ ret = send_hole(sctx, sctx->cur_inode_size);
+ if (ret < 0)
+ goto out;
+ } else {
+ /* Range is already a hole, skip. */
+ ret = 0;
+ }
}
}
if (need_truncate) {
@@ -7314,7 +7337,7 @@ static int search_key_again(const struct send_ctx *sctx,
"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
key->objectid, key->type, key->offset,
(root == sctx->parent_root ? "parent" : "send"),
- root->root_key.objectid, path->lowest_level,
+ btrfs_root_id(root), path->lowest_level,
path->slots[path->lowest_level]);
return -EUCLEAN;
}
@@ -7420,8 +7443,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen
u64 reada_done = 0;
lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
+ ASSERT(*level != 0);
- BUG_ON(*level == 0);
eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb))
return PTR_ERR(eb);
@@ -8048,7 +8071,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
if (root->send_in_progress < 0)
btrfs_err(root->fs_info,
"send_in_progress unbalanced %d root %llu",
- root->send_in_progress, root->root_key.objectid);
+ root->send_in_progress, btrfs_root_id(root));
spin_unlock(&root->root_item_lock);
}
@@ -8056,7 +8079,7 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
{
btrfs_warn_rl(root->fs_info,
"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
- root->root_key.objectid, root->dedupe_in_progress);
+ btrfs_root_id(root), root->dedupe_in_progress);
}
long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
@@ -8111,7 +8134,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
}
if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
goto out;
}
@@ -8158,7 +8181,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
}
sctx->send_filp = fget(arg->send_fd);
- if (!sctx->send_filp) {
+ if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
ret = -EBADF;
goto out;
}
@@ -8205,8 +8228,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
- arg->clone_sources_count + 1,
+ sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
+ sizeof(*sctx->clone_roots),
GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 4f5509cb1803..dd1c9f02b011 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -8,6 +8,11 @@
#define BTRFS_SEND_H
#include <linux/types.h>
+#include <linux/sizes.h>
+#include <linux/align.h>
+
+struct inode;
+struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
/* Conditional support for the upcoming protocol version. */
@@ -25,9 +30,6 @@
#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE)
-struct inode;
-struct btrfs_ioctl_send_args;
-
enum btrfs_tlv_type {
BTRFS_TLV_U8,
BTRFS_TLV_U16,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 571bb13587d5..d620323d08ea 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -9,7 +9,6 @@
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
-#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
@@ -856,7 +855,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
- u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+ const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
u64 ordered, delalloc;
u64 thresh;
u64 used;
@@ -956,8 +955,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
- used += fs_info->delayed_refs_rsv.reserved +
- fs_info->delayed_block_rsv.reserved;
+ used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
+ btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);
else
used += space_info->bytes_may_use - global_rsv_size;
@@ -1173,7 +1172,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
enum btrfs_flush_state flush;
u64 delalloc_size = 0;
u64 to_reclaim, block_rsv_size;
- u64 global_rsv_size = global_rsv->reserved;
+ const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
loops++;
@@ -1185,9 +1184,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* assume it's tied up in delalloc reservations.
*/
block_rsv_size = global_rsv_size +
- delayed_block_rsv->reserved +
- delayed_refs_rsv->reserved +
- trans_rsv->reserved;
+ btrfs_block_rsv_reserved(delayed_block_rsv) +
+ btrfs_block_rsv_reserved(delayed_refs_rsv) +
+ btrfs_block_rsv_reserved(trans_rsv);
if (block_rsv_size < space_info->bytes_may_use)
delalloc_size = space_info->bytes_may_use - block_rsv_size;
@@ -1207,16 +1206,16 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
to_reclaim = delalloc_size;
flush = FLUSH_DELALLOC;
} else if (space_info->bytes_pinned >
- (delayed_block_rsv->reserved +
- delayed_refs_rsv->reserved)) {
+ (btrfs_block_rsv_reserved(delayed_block_rsv) +
+ btrfs_block_rsv_reserved(delayed_refs_rsv))) {
to_reclaim = space_info->bytes_pinned;
flush = COMMIT_TRANS;
- } else if (delayed_block_rsv->reserved >
- delayed_refs_rsv->reserved) {
- to_reclaim = delayed_block_rsv->reserved;
+ } else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
+ btrfs_block_rsv_reserved(delayed_refs_rsv)) {
+ to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
flush = FLUSH_DELAYED_ITEMS_NR;
} else {
- to_reclaim = delayed_refs_rsv->reserved;
+ to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
flush = FLUSH_DELAYED_REFS_NR;
}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 92c595fed1b0..a733458fd13b 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -4,8 +4,17 @@
#define BTRFS_SPACE_INFO_H
#include <trace/events/btrfs.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/kobject.h>
+#include <linux/lockdep.h>
+#include <linux/wait.h>
+#include <linux/rwsem.h>
#include "volumes.h"
+struct btrfs_fs_info;
+struct btrfs_block_group;
+
/*
* Different levels for to flush space when doing space reservations.
*
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 1b999c6e4193..54736f6238e6 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -64,7 +64,7 @@
* This means a slightly higher tree locking latency.
*/
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
{
if (fs_info->sectorsize >= PAGE_SIZE)
return false;
@@ -74,8 +74,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
* mapping. And if page->mapping->host is data inode, it's subpage.
* As we have ruled our sectorsize >= PAGE_SIZE case already.
*/
- if (!page->mapping || !page->mapping->host ||
- is_data_inode(page->mapping->host))
+ if (!mapping || !mapping->host || is_data_inode(mapping->host))
return true;
/*
@@ -112,11 +111,14 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector
subpage_info->checked_offset = cur;
cur += nr_bits;
+ subpage_info->locked_offset = cur;
+ cur += nr_bits;
+
subpage_info->total_nr_bits = cur;
}
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page, enum btrfs_subpage_type type)
+ struct folio *folio, enum btrfs_subpage_type type)
{
struct btrfs_subpage *subpage;
@@ -124,31 +126,30 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
* We have cases like a dummy extent buffer page, which is not mapped
* and doesn't need to be locked.
*/
- if (page->mapping)
- ASSERT(PageLocked(page));
+ if (folio->mapping)
+ ASSERT(folio_test_locked(folio));
- /* Either not subpage, or the page already has private attached */
- if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
+ /* Either not subpage, or the folio already has private attached. */
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio))
return 0;
subpage = btrfs_alloc_subpage(fs_info, type);
if (IS_ERR(subpage))
return PTR_ERR(subpage);
- attach_page_private(page, subpage);
+ folio_attach_private(folio, subpage);
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- /* Either not subpage, or already detached */
- if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
+ /* Either not subpage, or already detached. */
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio))
return;
- subpage = detach_page_private(page);
+ subpage = folio_detach_private(folio);
ASSERT(subpage);
btrfs_free_subpage(subpage);
}
@@ -188,78 +189,109 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage)
* This is important for eb allocation, to prevent race with last eb freeing
* of the same page.
* With the eb_refs increased before the eb inserted into radix tree,
- * detach_extent_buffer_page() won't detach the page private while we're still
+ * detach_extent_buffer_page() won't detach the folio private while we're still
* allocating the extent buffer.
*/
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->mapping);
- lockdep_assert_held(&page->mapping->private_lock);
+ ASSERT(folio_test_private(folio) && folio->mapping);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
atomic_inc(&subpage->eb_refs);
}
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->mapping);
- lockdep_assert_held(&page->mapping->private_lock);
+ ASSERT(folio_test_private(folio) && folio->mapping);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
ASSERT(atomic_read(&subpage->eb_refs));
atomic_dec(&subpage->eb_refs);
}
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
+ /* For subpage support, the folio must be single page. */
+ ASSERT(folio_order(folio) == 0);
+
/* Basic checks */
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(len, fs_info->sectorsize));
/*
* The range check only works for mapped page, we can still have
* unmapped page like dummy extent buffer pages.
*/
- if (page->mapping)
- ASSERT(page_offset(page) <= start &&
- start + len <= page_offset(page) + PAGE_SIZE);
+ if (folio->mapping)
+ ASSERT(folio_pos(folio) <= start &&
+ start + len <= folio_pos(folio) + PAGE_SIZE);
}
+#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
+({ \
+ unsigned int start_bit; \
+ \
+ btrfs_subpage_assert(fs_info, folio, start, len); \
+ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ start_bit += fs_info->subpage_info->name##_offset; \
+ start_bit; \
+})
+
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = len >> fs_info->sectorsize_bits;
+ unsigned long flags;
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ /*
+ * Even though it's just for reading the page, no one should have
+ * locked the subpage range.
+ */
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ bitmap_set(subpage->bitmaps, start_bit, nbits);
atomic_add(nbits, &subpage->readers);
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = len >> fs_info->sectorsize_bits;
+ unsigned long flags;
bool is_data;
bool last;
- btrfs_subpage_assert(fs_info, page, start, len);
- is_data = is_data_inode(page->mapping->host);
+ btrfs_subpage_assert(fs_info, folio, start, len);
+ is_data = is_data_inode(folio->mapping->host);
+
+ spin_lock_irqsave(&subpage->lock, flags);
+
+ /* The range should have already been locked. */
+ ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
ASSERT(atomic_read(&subpage->readers) >= nbits);
+
+ bitmap_clear(subpage->bitmaps, start_bit, nbits);
last = atomic_sub_and_test(nbits, &subpage->readers);
/*
@@ -270,49 +302,60 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
* As we want the atomic_sub_and_test() to be always executed.
*/
if (is_data && last)
- unlock_page(page);
+ folio_unlock(folio);
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
-static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
{
u64 orig_start = *start;
u32 orig_len = *len;
- *start = max_t(u64, page_offset(page), orig_start);
+ *start = max_t(u64, folio_pos(folio), orig_start);
/*
* For certain call sites like btrfs_drop_pages(), we may have pages
* beyond the target range. In that case, just set @len to 0, subpage
* helpers can handle @len == 0 without any problem.
*/
- if (page_offset(page) >= orig_start + orig_len)
+ if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ *len = min_t(u64, folio_pos(folio) + PAGE_SIZE,
orig_start + orig_len) - *start;
}
-void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
+ unsigned long flags;
int ret;
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
+ spin_lock_irqsave(&subpage->lock, flags);
ASSERT(atomic_read(&subpage->readers) == 0);
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ bitmap_set(subpage->bitmaps, start_bit, nbits);
ret = atomic_add_return(nbits, &subpage->writers);
ASSERT(ret == nbits);
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
-bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
+ unsigned long flags;
+ bool last;
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
+ spin_lock_irqsave(&subpage->lock, flags);
/*
* We have call sites passing @locked_page into
* extent_clear_unlock_delalloc() for compression path.
@@ -320,15 +363,22 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
* This @locked_page is locked by plain lock_page(), thus its
* subpage::writers is 0. Handle them in a special way.
*/
- if (atomic_read(&subpage->writers) == 0)
+ if (atomic_read(&subpage->writers) == 0) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
return true;
+ }
ASSERT(atomic_read(&subpage->writers) >= nbits);
- return atomic_sub_and_test(nbits, &subpage->writers);
+ /* The target range should have been locked. */
+ ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
+ bitmap_clear(subpage->bitmaps, start_bit, nbits);
+ last = atomic_sub_and_test(nbits, &subpage->writers);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ return last;
}
/*
- * Lock a page for delalloc page writeback.
+ * Lock a folio for delalloc page writeback.
*
* Return -EAGAIN if the page is not properly initialized.
* Return 0 with the page locked, and writer counter updated.
@@ -337,43 +387,35 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
* it's really the correct page, as the caller is using
* filemap_get_folios_contig(), which can race with page invalidating.
*/
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
- lock_page(page);
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_lock(folio);
return 0;
}
- lock_page(page);
- if (!PagePrivate(page) || !page->private) {
- unlock_page(page);
+ folio_lock(folio);
+ if (!folio_test_private(folio) || !folio_get_private(folio)) {
+ folio_unlock(folio);
return -EAGAIN;
}
- btrfs_subpage_clamp_range(page, &start, &len);
- btrfs_subpage_start_writer(fs_info, page, start, len);
+ btrfs_subpage_clamp_range(folio, &start, &len);
+ btrfs_subpage_start_writer(fs_info, folio, start, len);
return 0;
}
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
- return unlock_page(page);
- btrfs_subpage_clamp_range(page, &start, &len);
- if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
- unlock_page(page);
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
+ btrfs_subpage_clamp_range(folio, &start, &len);
+ if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
+ folio_unlock(folio);
}
-#define subpage_calc_start_bit(fs_info, page, name, start, len) \
-({ \
- unsigned int start_bit; \
- \
- btrfs_subpage_assert(fs_info, page, start, len); \
- start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- start_bit += fs_info->subpage_info->name##_offset; \
- start_bit; \
-})
-
#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
bitmap_test_range_all_set(subpage->bitmaps, \
fs_info->subpage_info->name##_offset, \
@@ -385,46 +427,46 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
fs_info->subpage_info->bitmap_nr_bits)
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
}
/*
@@ -438,10 +480,10 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
* extra handling for tree blocks.
*/
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
bool last = false;
@@ -455,101 +497,102 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
}
void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
bool last;
- last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len);
if (last)
- clear_page_dirty_for_io(page);
+ folio_clear_dirty_for_io(folio);
}
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- set_page_writeback(page);
+ if (!folio_test_writeback(folio))
+ folio_start_writeback(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
- ASSERT(PageWriteback(page));
- end_page_writeback(page);
+ ASSERT(folio_test_writeback(folio));
+ folio_end_writeback(folio);
}
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- SetPageOrdered(page);
+ folio_set_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
- ClearPageOrdered(page);
+ folio_clear_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
- SetPageChecked(page);
+ folio_set_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- ClearPageChecked(page);
+ folio_clear_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -559,10 +602,10 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
*/
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+ struct folio *folio, u64 start, u32 len) \
{ \
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ struct btrfs_subpage *subpage = folio_get_private(folio); \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \
name, start, len); \
unsigned long flags; \
bool ret; \
@@ -584,88 +627,94 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
* in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
* back to regular sectorsize branch.
*/
-#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \
- test_page_func) \
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func, \
+ folio_clear_func, folio_test_func) \
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- set_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_set_func(folio); \
return; \
} \
- btrfs_subpage_set_##name(fs_info, page, start, len); \
+ btrfs_subpage_set_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- clear_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_clear_func(folio); \
return; \
} \
- btrfs_subpage_clear_##name(fs_info, page, start, len); \
+ btrfs_subpage_clear_##name(fs_info, folio, start, len); \
} \
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
- return test_page_func(page); \
- return btrfs_subpage_test_##name(fs_info, page, start, len); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) \
+ return folio_test_func(folio); \
+ return btrfs_subpage_test_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- set_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_set_func(folio); \
return; \
} \
- btrfs_subpage_clamp_range(page, &start, &len); \
- btrfs_subpage_set_##name(fs_info, page, start, len); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ btrfs_subpage_set_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- clear_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_clear_func(folio); \
return; \
} \
- btrfs_subpage_clamp_range(page, &start, &len); \
- btrfs_subpage_clear_##name(fs_info, page, start, len); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ btrfs_subpage_clear_##name(fs_info, folio, start, len); \
} \
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
- return test_page_func(page); \
- btrfs_subpage_clamp_range(page, &start, &len); \
- return btrfs_subpage_test_##name(fs_info, page, start, len); \
-}
-IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
- PageUptodate);
-IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
- PageDirty);
-IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
- PageWriteback);
-IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
- PageOrdered);
-IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) \
+ return folio_test_func(folio); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ return btrfs_subpage_test_##name(fs_info, folio, start, len); \
+}
+IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
+ folio_test_uptodate);
+IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io,
+ folio_test_dirty);
+IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback,
+ folio_test_writeback);
+IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
+ folio_test_ordered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
+ folio_test_checked);
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
* is cleared.
*/
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- ASSERT(!PageDirty(page));
- if (!btrfs_is_subpage(fs_info, page))
+ ASSERT(!folio_test_dirty(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
}
@@ -684,18 +733,20 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* extent_write_locked_range().
* In this case, we have to call subpage helper to handle the case.
*/
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
- u64 start, u32 len)
+void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage;
- ASSERT(PageLocked(page));
+ ASSERT(folio_test_locked(folio));
/* For non-subpage case, we just unlock the page */
- if (!btrfs_is_subpage(fs_info, page))
- return unlock_page(page);
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* For subpage case, there are two types of locked page. With or
@@ -704,12 +755,14 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
* Since we own the page lock, no one else could touch subpage::writers
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->writers) == 0)
+ if (atomic_read(&subpage->writers) == 0) {
/* No writers, locked by plain lock_page() */
- return unlock_page(page);
+ folio_unlock(folio);
+ return;
+ }
/* Have writers, use proper subpage helper to end it */
- btrfs_page_end_writer_lock(fs_info, page, start, len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \
@@ -717,7 +770,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
subpage_info->name##_offset, subpage_info->bitmap_nr_bits)
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
struct btrfs_subpage *subpage;
@@ -729,9 +782,9 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
unsigned long checked_bitmap;
unsigned long flags;
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(subpage_info);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
spin_lock_irqsave(&subpage->lock, flags);
GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap);
@@ -739,12 +792,13 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap);
GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap);
GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &checked_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
- dump_page(page, "btrfs subpage dump");
+ dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
- start, len, page_offset(page),
+ start, len, folio_pos(folio),
subpage_info->bitmap_nr_bits, &uptodate_bitmap,
subpage_info->bitmap_nr_bits, &error_bitmap,
subpage_info->bitmap_nr_bits, &dirty_bitmap,
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 5cbf67ccbdeb..b6dc013b0fdc 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -4,6 +4,11 @@
#define BTRFS_SUBPAGE_H
#include <linux/spinlock.h>
+#include <linux/atomic.h>
+
+struct address_space;
+struct folio;
+struct btrfs_fs_info;
/*
* Extra info for subpage bitmap.
@@ -28,7 +33,7 @@ struct btrfs_subpage_info {
unsigned int total_nr_bits;
/*
- * *_start indicates where the bitmap starts, the length is always
+ * *_offset indicates where the bitmap starts, the length is always
* @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
*/
unsigned int uptodate_offset;
@@ -36,6 +41,16 @@ struct btrfs_subpage_info {
unsigned int writeback_offset;
unsigned int ordered_offset;
unsigned int checked_offset;
+
+ /*
+ * For locked bitmaps, normally it's subpage representation for folio
+ * Locked flag, but metadata is different:
+ *
+ * - Metadata doesn't really lock the folio
+ * It's just to prevent page::private from being cleared before the last
+ * end_page_read().
+ */
+ unsigned int locked_offset;
};
/*
@@ -73,71 +88,64 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page);
+ struct folio *folio, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
/* Allocate additional data where page represents more than one sector */
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page);
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page);
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
-void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
/*
* Template for subpage related operations.
*
- * btrfs_subpage_*() are for call sites where the page has subpage attached and
- * the range is ensured to be inside the page.
+ * btrfs_subpage_*() are for call sites where the folio has subpage attached and
+ * the range is ensured to be inside the folio's single page.
*
- * btrfs_page_*() are for call sites where the page can either be subpage
- * specific or regular page. The function will handle both cases.
- * But the range still needs to be inside the page.
+ * btrfs_folio_*() are for call sites where the folio can be either subpage
+ * specific or a regular folio. The function will handle both cases.
+ * But the range still needs to be inside one single page.
*
- * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't
+ * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't
* need to be inside the page. Those functions will truncate the range
* automatically.
*/
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
+ struct folio *folio, u64 start, u32 len); \
void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
+ struct folio *folio, u64 start, u32 len); \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
@@ -146,13 +154,12 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered);
DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page);
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
- u64 start, u32 len);
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f638dc339693..2dbc930a20f7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -27,19 +27,18 @@
#include <linux/crc32c.h>
#include <linux/btrfs.h>
#include <linux/security.h>
+#include <linux/fs_parser.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "props.h"
#include "xattr.h"
#include "bio.h"
#include "export.h"
#include "compression.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "free-space-cache.h"
#include "backref.h"
@@ -64,27 +63,32 @@
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
-
-/*
- * Types for mounting the default subvolume and a subvolume explicitly
- * requested by subvol=/path. That way the callchain is straightforward and we
- * don't have to play tricks with the mount options and recursive calls to
- * btrfs_mount.
- *
- * The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
- */
static struct file_system_type btrfs_fs_type;
-static struct file_system_type btrfs_root_fs_type;
-
-static int btrfs_remount(struct super_block *sb, int *flags, char *data);
static void btrfs_put_super(struct super_block *sb)
{
- close_ctree(btrfs_sb(sb));
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ btrfs_info(fs_info, "last unmount of filesystem %pU", fs_info->fs_devices->fsid);
+ close_ctree(fs_info);
}
+/* Store the mount options related information. */
+struct btrfs_fs_context {
+ char *subvol_name;
+ u64 subvol_objectid;
+ u64 max_inline;
+ u32 commit_interval;
+ u32 metadata_ratio;
+ u32 thread_pool_size;
+ unsigned long mount_opt;
+ unsigned long compress_type:4;
+ unsigned int compress_level;
+ refcount_t refs;
+};
+
enum {
- Opt_acl, Opt_noacl,
+ Opt_acl,
Opt_clear_cache,
Opt_commit_interval,
Opt_compress,
@@ -94,27 +98,26 @@ enum {
Opt_degraded,
Opt_device,
Opt_fatal_errors,
- Opt_flushoncommit, Opt_noflushoncommit,
+ Opt_flushoncommit,
Opt_max_inline,
- Opt_barrier, Opt_nobarrier,
- Opt_datacow, Opt_nodatacow,
- Opt_datasum, Opt_nodatasum,
- Opt_defrag, Opt_nodefrag,
- Opt_discard, Opt_nodiscard,
+ Opt_barrier,
+ Opt_datacow,
+ Opt_datasum,
+ Opt_defrag,
+ Opt_discard,
Opt_discard_mode,
- Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
Opt_skip_balance,
- Opt_space_cache, Opt_no_space_cache,
+ Opt_space_cache,
Opt_space_cache_version,
- Opt_ssd, Opt_nossd,
- Opt_ssd_spread, Opt_nossd_spread,
+ Opt_ssd,
+ Opt_ssd_spread,
Opt_subvol,
Opt_subvol_empty,
Opt_subvolid,
Opt_thread_pool,
- Opt_treelog, Opt_notreelog,
+ Opt_treelog,
Opt_user_subvol_rm_allowed,
/* Rescue options */
@@ -125,14 +128,10 @@ enum {
Opt_ignoredatacsums,
Opt_rescue_all,
- /* Deprecated options */
- Opt_recovery,
- Opt_inode_cache, Opt_noinode_cache,
-
/* Debugging options */
- Opt_enospc_debug, Opt_noenospc_debug,
+ Opt_enospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
- Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+ Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
@@ -140,785 +139,611 @@ enum {
Opt_err,
};
-static const match_table_t tokens = {
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_clear_cache, "clear_cache"},
- {Opt_commit_interval, "commit=%u"},
- {Opt_compress, "compress"},
- {Opt_compress_type, "compress=%s"},
- {Opt_compress_force, "compress-force"},
- {Opt_compress_force_type, "compress-force=%s"},
- {Opt_degraded, "degraded"},
- {Opt_device, "device=%s"},
- {Opt_fatal_errors, "fatal_errors=%s"},
- {Opt_flushoncommit, "flushoncommit"},
- {Opt_noflushoncommit, "noflushoncommit"},
- {Opt_inode_cache, "inode_cache"},
- {Opt_noinode_cache, "noinode_cache"},
- {Opt_max_inline, "max_inline=%s"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_datacow, "datacow"},
- {Opt_nodatacow, "nodatacow"},
- {Opt_datasum, "datasum"},
- {Opt_nodatasum, "nodatasum"},
- {Opt_defrag, "autodefrag"},
- {Opt_nodefrag, "noautodefrag"},
- {Opt_discard, "discard"},
- {Opt_discard_mode, "discard=%s"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_norecovery, "norecovery"},
- {Opt_ratio, "metadata_ratio=%u"},
- {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
- {Opt_skip_balance, "skip_balance"},
- {Opt_space_cache, "space_cache"},
- {Opt_no_space_cache, "nospace_cache"},
- {Opt_space_cache_version, "space_cache=%s"},
- {Opt_ssd, "ssd"},
- {Opt_nossd, "nossd"},
- {Opt_ssd_spread, "ssd_spread"},
- {Opt_nossd_spread, "nossd_spread"},
- {Opt_subvol, "subvol=%s"},
- {Opt_subvol_empty, "subvol="},
- {Opt_subvolid, "subvolid=%s"},
- {Opt_thread_pool, "thread_pool=%u"},
- {Opt_treelog, "treelog"},
- {Opt_notreelog, "notreelog"},
- {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+enum {
+ Opt_fatal_errors_panic,
+ Opt_fatal_errors_bug,
+};
- /* Rescue options */
- {Opt_rescue, "rescue=%s"},
+static const struct constant_table btrfs_parameter_fatal_errors[] = {
+ { "panic", Opt_fatal_errors_panic },
+ { "bug", Opt_fatal_errors_bug },
+ {}
+};
+
+enum {
+ Opt_discard_sync,
+ Opt_discard_async,
+};
+
+static const struct constant_table btrfs_parameter_discard[] = {
+ { "sync", Opt_discard_sync },
+ { "async", Opt_discard_async },
+ {}
+};
+
+enum {
+ Opt_space_cache_v1,
+ Opt_space_cache_v2,
+};
+
+static const struct constant_table btrfs_parameter_space_cache[] = {
+ { "v1", Opt_space_cache_v1 },
+ { "v2", Opt_space_cache_v2 },
+ {}
+};
+
+enum {
+ Opt_rescue_usebackuproot,
+ Opt_rescue_nologreplay,
+ Opt_rescue_ignorebadroots,
+ Opt_rescue_ignoredatacsums,
+ Opt_rescue_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_rescue[] = {
+ { "usebackuproot", Opt_rescue_usebackuproot },
+ { "nologreplay", Opt_rescue_nologreplay },
+ { "ignorebadroots", Opt_rescue_ignorebadroots },
+ { "ibadroots", Opt_rescue_ignorebadroots },
+ { "ignoredatacsums", Opt_rescue_ignoredatacsums },
+ { "idatacsums", Opt_rescue_ignoredatacsums },
+ { "all", Opt_rescue_parameter_all },
+ {}
+};
+
+#ifdef CONFIG_BTRFS_DEBUG
+enum {
+ Opt_fragment_parameter_data,
+ Opt_fragment_parameter_metadata,
+ Opt_fragment_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_fragment[] = {
+ { "data", Opt_fragment_parameter_data },
+ { "metadata", Opt_fragment_parameter_metadata },
+ { "all", Opt_fragment_parameter_all },
+ {}
+};
+#endif
+
+static const struct fs_parameter_spec btrfs_fs_parameters[] = {
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_flag_no("autodefrag", Opt_defrag),
+ fsparam_flag_no("barrier", Opt_barrier),
+ fsparam_flag("clear_cache", Opt_clear_cache),
+ fsparam_u32("commit", Opt_commit_interval),
+ fsparam_flag("compress", Opt_compress),
+ fsparam_string("compress", Opt_compress_type),
+ fsparam_flag("compress-force", Opt_compress_force),
+ fsparam_string("compress-force", Opt_compress_force_type),
+ fsparam_flag_no("datacow", Opt_datacow),
+ fsparam_flag_no("datasum", Opt_datasum),
+ fsparam_flag("degraded", Opt_degraded),
+ fsparam_string("device", Opt_device),
+ fsparam_flag_no("discard", Opt_discard),
+ fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard),
+ fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors),
+ fsparam_flag_no("flushoncommit", Opt_flushoncommit),
+ fsparam_string("max_inline", Opt_max_inline),
+ fsparam_u32("metadata_ratio", Opt_ratio),
+ fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree),
+ fsparam_flag("skip_balance", Opt_skip_balance),
+ fsparam_flag_no("space_cache", Opt_space_cache),
+ fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache),
+ fsparam_flag_no("ssd", Opt_ssd),
+ fsparam_flag_no("ssd_spread", Opt_ssd_spread),
+ fsparam_string("subvol", Opt_subvol),
+ fsparam_flag("subvol=", Opt_subvol_empty),
+ fsparam_u64("subvolid", Opt_subvolid),
+ fsparam_u32("thread_pool", Opt_thread_pool),
+ fsparam_flag_no("treelog", Opt_treelog),
+ fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed),
+
+ /* Rescue options. */
+ fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue),
/* Deprecated, with alias rescue=nologreplay */
- {Opt_nologreplay, "nologreplay"},
+ __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
/* Deprecated, with alias rescue=usebackuproot */
- {Opt_usebackuproot, "usebackuproot"},
+ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
- /* Deprecated options */
- {Opt_recovery, "recovery"},
-
- /* Debugging options */
- {Opt_enospc_debug, "enospc_debug"},
- {Opt_noenospc_debug, "noenospc_debug"},
+ /* Debugging options. */
+ fsparam_flag_no("enospc_debug", Opt_enospc_debug),
#ifdef CONFIG_BTRFS_DEBUG
- {Opt_fragment_data, "fragment=data"},
- {Opt_fragment_metadata, "fragment=metadata"},
- {Opt_fragment_all, "fragment=all"},
+ fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- {Opt_ref_verify, "ref_verify"},
+ fsparam_flag("ref_verify", Opt_ref_verify),
#endif
- {Opt_err, NULL},
+ {}
};
-static const match_table_t rescue_tokens = {
- {Opt_usebackuproot, "usebackuproot"},
- {Opt_nologreplay, "nologreplay"},
- {Opt_ignorebadroots, "ignorebadroots"},
- {Opt_ignorebadroots, "ibadroots"},
- {Opt_ignoredatacsums, "ignoredatacsums"},
- {Opt_ignoredatacsums, "idatacsums"},
- {Opt_rescue_all, "all"},
- {Opt_err, NULL},
-};
-
-static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
- const char *opt_name)
+/* No support for restricting writes to btrfs devices yet... */
+static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
{
- if (fs_info->mount_opt & opt) {
- btrfs_err(fs_info, "%s must be used with ro mount option",
- opt_name);
- return true;
- }
- return false;
+ return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES;
}
-static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *opts;
- char *orig;
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int ret = 0;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ opt = fs_parse(fc, btrfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- while ((p = strsep(&opts, ":")) != NULL) {
- int token;
+ switch (opt) {
+ case Opt_degraded:
+ btrfs_set_opt(ctx->mount_opt, DEGRADED);
+ break;
+ case Opt_subvol_empty:
+ /*
+ * This exists because we used to allow it by accident, so we're
+ * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow
+ * empty subvol= again").
+ */
+ break;
+ case Opt_subvol:
+ kfree(ctx->subvol_name);
+ ctx->subvol_name = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->subvol_name)
+ return -ENOMEM;
+ break;
+ case Opt_subvolid:
+ ctx->subvol_objectid = result.uint_64;
- if (!*p)
- continue;
- token = match_token(p, rescue_tokens, args);
- switch (token){
- case Opt_usebackuproot:
- btrfs_info(info,
- "trying to use backup root at mount time");
- btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
- break;
- case Opt_nologreplay:
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_ignorebadroots:
- btrfs_set_and_info(info, IGNOREBADROOTS,
- "ignoring bad roots");
- break;
- case Opt_ignoredatacsums:
- btrfs_set_and_info(info, IGNOREDATACSUMS,
- "ignoring data csums");
- break;
- case Opt_rescue_all:
- btrfs_info(info, "enabling all of the rescue options");
- btrfs_set_and_info(info, IGNOREDATACSUMS,
- "ignoring data csums");
- btrfs_set_and_info(info, IGNOREBADROOTS,
- "ignoring bad roots");
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_err:
- btrfs_info(info, "unrecognized rescue option '%s'", p);
- ret = -EINVAL;
- goto out;
- default:
- break;
- }
+ /* subvolid=0 means give me the original fs_tree. */
+ if (!ctx->subvol_objectid)
+ ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID;
+ break;
+ case Opt_device: {
+ struct btrfs_device *device;
+ blk_mode_t mode = btrfs_open_mode(fc);
+ mutex_lock(&uuid_mutex);
+ device = btrfs_scan_one_device(param->string, mode, false);
+ mutex_unlock(&uuid_mutex);
+ if (IS_ERR(device))
+ return PTR_ERR(device);
+ break;
}
-out:
- kfree(orig);
- return ret;
-}
-
-/*
- * Regular mount options parser. Everything that is needed only when
- * reading in a new superblock is parsed here.
- * XXX JDM: This needs to be cleaned up for remount.
- */
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags)
-{
- substring_t args[MAX_OPT_ARGS];
- char *p, *num;
- int intarg;
- int ret = 0;
- char *compress_type;
- bool compress_force = false;
- enum btrfs_compression_type saved_compress_type;
- int saved_compress_level;
- bool saved_compress_force;
- int no_compress = 0;
- const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
-
- if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
- btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
- else if (btrfs_free_space_cache_v1_active(info)) {
- if (btrfs_is_zoned(info)) {
- btrfs_info(info,
- "zoned: clearing existing space cache");
- btrfs_set_super_cache_generation(info->super_copy, 0);
+ case Opt_datasum:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NODATASUM);
} else {
- btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
}
- }
-
- /*
- * Even the options are empty, we still need to do extra check
- * against new flags
- */
- if (!options)
- goto check;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_degraded:
- btrfs_info(info, "allowing degraded mounts");
- btrfs_set_opt(info->mount_opt, DEGRADED);
- break;
- case Opt_subvol:
- case Opt_subvol_empty:
- case Opt_subvolid:
- case Opt_device:
- /*
- * These are parsed by btrfs_parse_subvol_options or
- * btrfs_parse_device_options and can be ignored here.
- */
- break;
- case Opt_nodatasum:
- btrfs_set_and_info(info, NODATASUM,
- "setting nodatasum");
- break;
- case Opt_datasum:
- if (btrfs_test_opt(info, NODATASUM)) {
- if (btrfs_test_opt(info, NODATACOW))
- btrfs_info(info,
- "setting datasum, datacow enabled");
- else
- btrfs_info(info, "setting datasum");
- }
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- break;
- case Opt_nodatacow:
- if (!btrfs_test_opt(info, NODATACOW)) {
- if (!btrfs_test_opt(info, COMPRESS) ||
- !btrfs_test_opt(info, FORCE_COMPRESS)) {
- btrfs_info(info,
- "setting nodatacow, compression disabled");
- } else {
- btrfs_info(info, "setting nodatacow");
- }
- }
- btrfs_clear_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- btrfs_set_opt(info->mount_opt, NODATACOW);
- btrfs_set_opt(info->mount_opt, NODATASUM);
- break;
- case Opt_datacow:
- btrfs_clear_and_info(info, NODATACOW,
- "setting datacow");
- break;
- case Opt_compress_force:
- case Opt_compress_force_type:
- compress_force = true;
- fallthrough;
- case Opt_compress:
- case Opt_compress_type:
- saved_compress_type = btrfs_test_opt(info,
- COMPRESS) ?
- info->compress_type : BTRFS_COMPRESS_NONE;
- saved_compress_force =
- btrfs_test_opt(info, FORCE_COMPRESS);
- saved_compress_level = info->compress_level;
- if (token == Opt_compress ||
- token == Opt_compress_force ||
- strncmp(args[0].from, "zlib", 4) == 0) {
- compress_type = "zlib";
-
- info->compress_type = BTRFS_COMPRESS_ZLIB;
- info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
- /*
- * args[0] contains uninitialized data since
- * for these tokens we don't expect any
- * parameter.
- */
- if (token != Opt_compress &&
- token != Opt_compress_force)
- info->compress_level =
- btrfs_compress_str2level(
- BTRFS_COMPRESS_ZLIB,
- args[0].from + 4);
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- no_compress = 0;
- } else if (strncmp(args[0].from, "lzo", 3) == 0) {
- compress_type = "lzo";
- info->compress_type = BTRFS_COMPRESS_LZO;
- info->compress_level = 0;
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- btrfs_set_fs_incompat(info, COMPRESS_LZO);
- no_compress = 0;
- } else if (strncmp(args[0].from, "zstd", 4) == 0) {
- compress_type = "zstd";
- info->compress_type = BTRFS_COMPRESS_ZSTD;
- info->compress_level =
- btrfs_compress_str2level(
- BTRFS_COMPRESS_ZSTD,
- args[0].from + 4);
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
- no_compress = 0;
- } else if (strncmp(args[0].from, "no", 2) == 0) {
- compress_type = "no";
- info->compress_level = 0;
- info->compress_type = 0;
- btrfs_clear_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- compress_force = false;
- no_compress++;
- } else {
- btrfs_err(info, "unrecognized compression value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
-
- if (compress_force) {
- btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
- } else {
- /*
- * If we remount from compress-force=xxx to
- * compress=xxx, we need clear FORCE_COMPRESS
- * flag, otherwise, there is no way for users
- * to disable forcible compression separately.
- */
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- }
- if (no_compress == 1) {
- btrfs_info(info, "use no compression");
- } else if ((info->compress_type != saved_compress_type) ||
- (compress_force != saved_compress_force) ||
- (info->compress_level != saved_compress_level)) {
- btrfs_info(info, "%s %s compression, level %d",
- (compress_force) ? "force" : "use",
- compress_type, info->compress_level);
- }
- compress_force = false;
- break;
- case Opt_ssd:
- btrfs_set_and_info(info, SSD,
- "enabling ssd optimizations");
- btrfs_clear_opt(info->mount_opt, NOSSD);
- break;
- case Opt_ssd_spread:
- btrfs_set_and_info(info, SSD,
- "enabling ssd optimizations");
- btrfs_set_and_info(info, SSD_SPREAD,
- "using spread ssd allocation scheme");
- btrfs_clear_opt(info->mount_opt, NOSSD);
- break;
- case Opt_nossd:
- btrfs_set_opt(info->mount_opt, NOSSD);
- btrfs_clear_and_info(info, SSD,
- "not using ssd optimizations");
- fallthrough;
- case Opt_nossd_spread:
- btrfs_clear_and_info(info, SSD_SPREAD,
- "not using spread ssd allocation scheme");
- break;
- case Opt_barrier:
- btrfs_clear_and_info(info, NOBARRIER,
- "turning on barriers");
- break;
- case Opt_nobarrier:
- btrfs_set_and_info(info, NOBARRIER,
- "turning off barriers");
- break;
- case Opt_thread_pool:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized thread_pool value %s",
- args[0].from);
- goto out;
- } else if (intarg == 0) {
- btrfs_err(info, "invalid value 0 for thread_pool");
- ret = -EINVAL;
- goto out;
- }
- info->thread_pool_size = intarg;
- break;
- case Opt_max_inline:
- num = match_strdup(&args[0]);
- if (num) {
- info->max_inline = memparse(num, NULL);
- kfree(num);
-
- if (info->max_inline) {
- info->max_inline = min_t(u64,
- info->max_inline,
- info->sectorsize);
- }
- btrfs_info(info, "max_inline at %llu",
- info->max_inline);
- } else {
- ret = -ENOMEM;
- goto out;
- }
- break;
- case Opt_acl:
+ break;
+ case Opt_datacow:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ btrfs_set_opt(ctx->mount_opt, NODATACOW);
+ btrfs_set_opt(ctx->mount_opt, NODATASUM);
+ } else {
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ }
+ break;
+ case Opt_compress_force:
+ case Opt_compress_force_type:
+ btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS);
+ fallthrough;
+ case Opt_compress:
+ case Opt_compress_type:
+ if (opt == Opt_compress || opt == Opt_compress_force) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "zlib", 4) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level =
+ btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
+ param->string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "lzo", 3) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_LZO;
+ ctx->compress_level = 0;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "zstd", 4) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_ZSTD;
+ ctx->compress_level =
+ btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
+ param->string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "no", 2) == 0) {
+ ctx->compress_level = 0;
+ ctx->compress_type = 0;
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ } else {
+ btrfs_err(NULL, "unrecognized compression value %s",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case Opt_ssd:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NOSSD);
+ btrfs_clear_opt(ctx->mount_opt, SSD);
+ btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, SSD);
+ btrfs_clear_opt(ctx->mount_opt, NOSSD);
+ }
+ break;
+ case Opt_ssd_spread:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, SSD);
+ btrfs_set_opt(ctx->mount_opt, SSD_SPREAD);
+ btrfs_clear_opt(ctx->mount_opt, NOSSD);
+ }
+ break;
+ case Opt_barrier:
+ if (result.negated)
+ btrfs_set_opt(ctx->mount_opt, NOBARRIER);
+ else
+ btrfs_clear_opt(ctx->mount_opt, NOBARRIER);
+ break;
+ case Opt_thread_pool:
+ if (result.uint_32 == 0) {
+ btrfs_err(NULL, "invalid value 0 for thread_pool");
+ return -EINVAL;
+ }
+ ctx->thread_pool_size = result.uint_32;
+ break;
+ case Opt_max_inline:
+ ctx->max_inline = memparse(param->string, NULL);
+ break;
+ case Opt_acl:
+ if (result.negated) {
+ fc->sb_flags &= ~SB_POSIXACL;
+ } else {
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- info->sb->s_flags |= SB_POSIXACL;
- break;
+ fc->sb_flags |= SB_POSIXACL;
#else
- btrfs_err(info, "support for ACL not compiled in!");
- ret = -EINVAL;
- goto out;
+ btrfs_err(NULL, "support for ACL not compiled in");
+ return -EINVAL;
#endif
- case Opt_noacl:
- info->sb->s_flags &= ~SB_POSIXACL;
- break;
- case Opt_notreelog:
- btrfs_set_and_info(info, NOTREELOG,
- "disabling tree log");
- break;
- case Opt_treelog:
- btrfs_clear_and_info(info, NOTREELOG,
- "enabling tree log");
- break;
- case Opt_norecovery:
- case Opt_nologreplay:
- btrfs_warn(info,
+ }
+ /*
+ * VFS limits the ability to toggle ACL on and off via remount,
+ * despite every file system allowing this. This seems to be
+ * an oversight since we all do, but it'll fail if we're
+ * remounting. So don't set the mask here, we'll check it in
+ * btrfs_reconfigure and do the toggling ourselves.
+ */
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE)
+ fc->sb_flags_mask |= SB_POSIXACL;
+ break;
+ case Opt_treelog:
+ if (result.negated)
+ btrfs_set_opt(ctx->mount_opt, NOTREELOG);
+ else
+ btrfs_clear_opt(ctx->mount_opt, NOTREELOG);
+ break;
+ case Opt_nologreplay:
+ btrfs_warn(NULL,
"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_flushoncommit:
- btrfs_set_and_info(info, FLUSHONCOMMIT,
- "turning on flush-on-commit");
- break;
- case Opt_noflushoncommit:
- btrfs_clear_and_info(info, FLUSHONCOMMIT,
- "turning off flush-on-commit");
- break;
- case Opt_ratio:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized metadata_ratio value %s",
- args[0].from);
- goto out;
- }
- info->metadata_ratio = intarg;
- btrfs_info(info, "metadata ratio %u",
- info->metadata_ratio);
- break;
- case Opt_discard:
- case Opt_discard_mode:
- if (token == Opt_discard ||
- strcmp(args[0].from, "sync") == 0) {
- btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
- btrfs_set_and_info(info, DISCARD_SYNC,
- "turning on sync discard");
- } else if (strcmp(args[0].from, "async") == 0) {
- btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
- btrfs_set_and_info(info, DISCARD_ASYNC,
- "turning on async discard");
- } else {
- btrfs_err(info, "unrecognized discard mode value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- btrfs_clear_opt(info->mount_opt, NODISCARD);
- break;
- case Opt_nodiscard:
- btrfs_clear_and_info(info, DISCARD_SYNC,
- "turning off discard");
- btrfs_clear_and_info(info, DISCARD_ASYNC,
- "turning off async discard");
- btrfs_set_opt(info->mount_opt, NODISCARD);
- break;
- case Opt_space_cache:
- case Opt_space_cache_version:
- /*
- * We already set FREE_SPACE_TREE above because we have
- * compat_ro(FREE_SPACE_TREE) set, and we aren't going
- * to allow v1 to be set for extent tree v2, simply
- * ignore this setting if we're extent tree v2.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- if (token == Opt_space_cache ||
- strcmp(args[0].from, "v1") == 0) {
- btrfs_clear_opt(info->mount_opt,
- FREE_SPACE_TREE);
- btrfs_set_and_info(info, SPACE_CACHE,
- "enabling disk space caching");
- } else if (strcmp(args[0].from, "v2") == 0) {
- btrfs_clear_opt(info->mount_opt,
- SPACE_CACHE);
- btrfs_set_and_info(info, FREE_SPACE_TREE,
- "enabling free space tree");
- } else {
- btrfs_err(info, "unrecognized space_cache value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- break;
- case Opt_rescan_uuid_tree:
- btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
- break;
- case Opt_no_space_cache:
- /*
- * We cannot operate without the free space tree with
- * extent tree v2, ignore this option.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- if (btrfs_test_opt(info, SPACE_CACHE)) {
- btrfs_clear_and_info(info, SPACE_CACHE,
- "disabling disk space caching");
- }
- if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_clear_and_info(info, FREE_SPACE_TREE,
- "disabling free space tree");
- }
- break;
- case Opt_inode_cache:
- case Opt_noinode_cache:
- btrfs_warn(info,
- "the 'inode_cache' option is deprecated and has no effect since 5.11");
- break;
- case Opt_clear_cache:
- /*
- * We cannot clear the free space tree with extent tree
- * v2, ignore this option.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- btrfs_set_and_info(info, CLEAR_CACHE,
- "force clearing of disk cache");
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
+ break;
+ case Opt_flushoncommit:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT);
+ else
+ btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT);
+ break;
+ case Opt_ratio:
+ ctx->metadata_ratio = result.uint_32;
+ break;
+ case Opt_discard:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ btrfs_set_opt(ctx->mount_opt, NODISCARD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ }
+ break;
+ case Opt_discard_mode:
+ switch (result.uint_32) {
+ case Opt_discard_sync:
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
break;
- case Opt_user_subvol_rm_allowed:
- btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ case Opt_discard_async:
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC);
break;
- case Opt_enospc_debug:
- btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+ default:
+ btrfs_err(NULL, "unrecognized discard mode value %s",
+ param->key);
+ return -EINVAL;
+ }
+ btrfs_clear_opt(ctx->mount_opt, NODISCARD);
+ break;
+ case Opt_space_cache:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NOSPACECACHE);
+ btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+ } else {
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+ }
+ break;
+ case Opt_space_cache_version:
+ switch (result.uint_32) {
+ case Opt_space_cache_v1:
+ btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
break;
- case Opt_noenospc_debug:
- btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+ case Opt_space_cache_v2:
+ btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE);
break;
- case Opt_defrag:
- btrfs_set_and_info(info, AUTO_DEFRAG,
- "enabling auto defrag");
+ default:
+ btrfs_err(NULL, "unrecognized space_cache value %s",
+ param->key);
+ return -EINVAL;
+ }
+ break;
+ case Opt_rescan_uuid_tree:
+ btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE);
+ break;
+ case Opt_clear_cache:
+ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_user_subvol_rm_allowed:
+ btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ break;
+ case Opt_enospc_debug:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG);
+ else
+ btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_defrag:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG);
+ else
+ btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG);
+ break;
+ case Opt_usebackuproot:
+ btrfs_warn(NULL,
+ "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead");
+ btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
+
+ /* If we're loading the backup roots we can't trust the space cache. */
+ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_skip_balance:
+ btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE);
+ break;
+ case Opt_fatal_errors:
+ switch (result.uint_32) {
+ case Opt_fatal_errors_panic:
+ btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
break;
- case Opt_nodefrag:
- btrfs_clear_and_info(info, AUTO_DEFRAG,
- "disabling auto defrag");
+ case Opt_fatal_errors_bug:
+ btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
break;
- case Opt_recovery:
- case Opt_usebackuproot:
- btrfs_warn(info,
- "'%s' is deprecated, use 'rescue=usebackuproot' instead",
- token == Opt_recovery ? "recovery" :
- "usebackuproot");
- btrfs_info(info,
- "trying to use backup root at mount time");
- btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ default:
+ btrfs_err(NULL, "unrecognized fatal_errors value %s",
+ param->key);
+ return -EINVAL;
+ }
+ break;
+ case Opt_commit_interval:
+ ctx->commit_interval = result.uint_32;
+ if (ctx->commit_interval == 0)
+ ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ break;
+ case Opt_rescue:
+ switch (result.uint_32) {
+ case Opt_rescue_usebackuproot:
+ btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
break;
- case Opt_skip_balance:
- btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ case Opt_rescue_nologreplay:
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
- case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0) {
- btrfs_set_opt(info->mount_opt,
- PANIC_ON_FATAL_ERROR);
- } else if (strcmp(args[0].from, "bug") == 0) {
- btrfs_clear_opt(info->mount_opt,
- PANIC_ON_FATAL_ERROR);
- } else {
- btrfs_err(info, "unrecognized fatal_errors value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
+ case Opt_rescue_ignorebadroots:
+ btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
break;
- case Opt_commit_interval:
- intarg = 0;
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized commit_interval value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- if (intarg == 0) {
- btrfs_info(info,
- "using default commit interval %us",
- BTRFS_DEFAULT_COMMIT_INTERVAL);
- intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
- } else if (intarg > 300) {
- btrfs_warn(info, "excessive commit interval %d",
- intarg);
- }
- info->commit_interval = intarg;
+ case Opt_rescue_ignoredatacsums:
+ btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
break;
- case Opt_rescue:
- ret = parse_rescue_options(info, args[0].from);
- if (ret < 0) {
- btrfs_err(info, "unrecognized rescue value %s",
- args[0].from);
- goto out;
- }
+ case Opt_rescue_parameter_all:
+ btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
+ btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
+ default:
+ btrfs_info(NULL, "unrecognized rescue option '%s'",
+ param->key);
+ return -EINVAL;
+ }
+ break;
#ifdef CONFIG_BTRFS_DEBUG
- case Opt_fragment_all:
- btrfs_info(info, "fragmenting all space");
- btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
- btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+ case Opt_fragment:
+ switch (result.uint_32) {
+ case Opt_fragment_parameter_all:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
break;
- case Opt_fragment_metadata:
- btrfs_info(info, "fragmenting metadata");
- btrfs_set_opt(info->mount_opt,
- FRAGMENT_METADATA);
+ case Opt_fragment_parameter_metadata:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
break;
- case Opt_fragment_data:
- btrfs_info(info, "fragmenting data");
- btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ case Opt_fragment_parameter_data:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
break;
+ default:
+ btrfs_info(NULL, "unrecognized fragment option '%s'",
+ param->key);
+ return -EINVAL;
+ }
+ break;
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- case Opt_ref_verify:
- btrfs_info(info, "doing ref verification");
- btrfs_set_opt(info->mount_opt, REF_VERIFY);
- break;
+ case Opt_ref_verify:
+ btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
+ break;
#endif
- case Opt_err:
- btrfs_err(info, "unrecognized mount option '%s'", p);
- ret = -EINVAL;
- goto out;
- default:
- break;
- }
+ default:
+ btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
+ return -EINVAL;
}
-check:
- /* We're read-only, don't have to check. */
- if (new_flags & SB_RDONLY)
- goto out;
- if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
- check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
- check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
- ret = -EINVAL;
-out:
- if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(info, CLEAR_CACHE)) {
- btrfs_err(info, "cannot disable free space tree");
- ret = -EINVAL;
- }
- if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
- !btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_err(info, "cannot disable free space tree with block-group-tree feature");
- ret = -EINVAL;
- }
- if (!ret)
- ret = btrfs_check_mountopts_zoned(info);
- if (!ret && !remounting) {
- if (btrfs_test_opt(info, SPACE_CACHE))
- btrfs_info(info, "disk space caching is enabled");
- if (btrfs_test_opt(info, FREE_SPACE_TREE))
- btrfs_info(info, "using free space tree");
- }
- return ret;
+ return 0;
}
/*
- * Parse mount options that are required early in the mount process.
- *
- * All other options will be parsed on much later in the mount process and
- * only when we need to allocate a new super block.
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount code
+ * paths.
+ *
+ * Only the USEBACKUPROOT, CLEAR_CACHE and NOSPACECACHE bits of
+ * fs_info->mount_opt are cleared; nothing else is touched.
*/
-static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
+static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
{
- substring_t args[MAX_OPT_ARGS];
- char *device_name, *opts, *orig, *p;
- struct btrfs_device *device = NULL;
- int error = 0;
+ /* Set by Opt_usebackuproot and by Opt_rescue_usebackuproot. */
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ /* Set by Opt_clear_cache, and also alongside Opt_usebackuproot. */
+ btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+ /* Set when Opt_space_cache is given in its negated form. */
+ btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE);
+}
- lockdep_assert_held(&uuid_mutex);
+/*
+ * Check whether an option that requires a read-only mount was requested.
+ *
+ * @fs_info:   file system used as the context of the error message
+ * @mount_opt: option bits being validated
+ * @opt:       bit(s) of the option to look for in @mount_opt
+ * @opt_name:  option name printed in the error message
+ *
+ * This helper does not look at the read-only state itself; the caller decides
+ * when the check applies.
+ *
+ * Return: true (after logging an error) if @opt is set in @mount_opt, false
+ * otherwise.
+ */
+static bool check_ro_option(struct btrfs_fs_info *fs_info,
+			    unsigned long mount_opt, unsigned long opt,
+			    const char *opt_name)
+{
+	/* Option not requested, nothing to complain about. */
+	if (!(mount_opt & opt))
+		return false;
+
+	btrfs_err(fs_info, "%s must be used with ro mount option", opt_name);
+	return true;
+}
- if (!options)
- return 0;
+/*
+ * Validate a set of mount options against the state of the file system.
+ *
+ * @info:      file system the options are checked against
+ * @mount_opt: option bits to validate; the pointer is also handed to
+ *             btrfs_check_mountopts_zoned()
+ * @flags:     superblock flags, only SB_RDONLY is looked at here
+ *
+ * Every group of checks is run even after an earlier one failed, so each
+ * problem gets its own error message. Within the read-only-only group the
+ * checks are chained with ||, so only the first offending option is reported.
+ *
+ * Return: true if the options are acceptable, false otherwise.
+ */
+bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+ unsigned long flags)
+{
+ bool ret = true;
- /*
- * strsep changes the string, duplicate it because btrfs_parse_options
- * gets called later
- */
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ /* These rescue-style options are rejected unless SB_RDONLY is set. */
+ if (!(flags & SB_RDONLY) &&
+ (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")))
+ ret = false;
- while ((p = strsep(&opts, ",")) != NULL) {
- int token;
+ /* An on-disk free space tree may only be left unused together with CLEAR_CACHE. */
+ if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) {
+ btrfs_err(info, "cannot disable free-space-tree");
+ ret = false;
+ }
+ if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) {
+ btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature");
+ ret = false;
+ }
- if (!*p)
- continue;
+ if (btrfs_check_mountopts_zoned(info, mount_opt))
+ ret = false;
- token = match_token(p, tokens, args);
- if (token == Opt_device) {
- device_name = match_strdup(&args[0]);
- if (!device_name) {
- error = -ENOMEM;
- goto out;
- }
- device = btrfs_scan_one_device(device_name, flags, false);
- kfree(device_name);
- if (IS_ERR(device)) {
- error = PTR_ERR(device);
- goto out;
- }
- }
+ /*
+ * Only announce the cache mode when the options were accepted and we are
+ * not remounting, so a rejected mount does not log a cache as enabled.
+ */
+ if (ret && !test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE))
+ btrfs_info(info, "disk space caching is enabled");
+ if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
+ btrfs_info(info, "using free-space-tree");
}
-out:
- kfree(orig);
- return error;
+ return ret;
}
/*
- * Parse mount options that are related to subvolume id
+ * Choose the SPACE_CACHE / FREE_SPACE_TREE mount options from the state of the
+ * file system when the options set so far have not already decided them.
+ *
+ * This is subtle, we only call this during open_ctree(). We need to pre-load
+ * the mount options with the on-disk settings. Before the new mount API took
+ * effect we would do this on mount and remount. With the new mount API we'll
+ * only do this on the initial mount.
*
- * The value is later passed to mount_subvol()
+ * This isn't a change in behavior, because we're using the current state of the
+ * file system to set the current mount options. If you mounted with special
+ * options to disable these features and then remounted we wouldn't revert the
+ * settings, because mounting without these features cleared the on-disk
+ * settings, so this being called on re-mount is not needed.
*/
-static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
- u64 *subvol_objectid)
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info)
{
- substring_t args[MAX_OPT_ARGS];
- char *opts, *orig, *p;
- int error = 0;
- u64 subvolid;
-
- if (!options)
- return 0;
+ /*
+ * Sector size smaller than the page size: SPACE_CACHE is dropped and
+ * FREE_SPACE_TREE is forced on if it was not already set.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info,
+ "forcing free space tree for sector size %u with page size %lu",
+ fs_info->sectorsize, PAGE_SIZE);
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ }
+ }
/*
- * strsep changes the string, duplicate it because
- * btrfs_parse_device_options gets called later
+ * At this point our mount options are populated, so we only mess with
+ * these settings if we don't have any settings already.
*/
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ return;
- while ((p = strsep(&opts, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+ /*
+ * Zoned with an active v1 cache: zero the cache generation in the super
+ * block copy and return without setting either cache option.
+ */
+ if (btrfs_is_zoned(fs_info) &&
+ btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_info(fs_info, "zoned: clearing existing space cache");
+ btrfs_set_super_cache_generation(fs_info->super_copy, 0);
+ return;
+ }
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_subvol:
- kfree(*subvol_name);
- *subvol_name = match_strdup(&args[0]);
- if (!*subvol_name) {
- error = -ENOMEM;
- goto out;
- }
- break;
- case Opt_subvolid:
- error = match_u64(&args[0], &subvolid);
- if (error)
- goto out;
+ /* SPACE_CACHE is already set, keep it. */
+ if (btrfs_test_opt(fs_info, SPACE_CACHE))
+ return;
- /* we want the original fs_tree */
- if (subvolid == 0)
- subvolid = BTRFS_FS_TREE_OBJECTID;
+ /* NOSPACECACHE is set (negated space_cache), leave both cache options clear. */
+ if (btrfs_test_opt(fs_info, NOSPACECACHE))
+ return;
- *subvol_objectid = subvolid;
- break;
- default:
- break;
- }
- }
+ /*
+ * At this point we don't have explicit options set by the user, set
+ * them ourselves based on the state of the file system.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ else if (btrfs_free_space_cache_v1_active(fs_info))
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+}
-out:
- kfree(orig);
- return error;
+/*
+ * Turn on mount options that depend on the devices backing the file system:
+ * SSD for non-rotating devices and DISCARD_ASYNC for discardable ones, in both
+ * cases only when the options set so far have not decided otherwise.
+ */
+static void set_device_specific_options(struct btrfs_fs_info *fs_info)
+{
+	/* Non-rotating devices get SSD mode unless NOSSD is set. */
+	if (!fs_info->fs_devices->rotating && !btrfs_test_opt(fs_info, NOSSD))
+		btrfs_set_opt(fs_info->mount_opt, SSD);
+
+	/* A discard mode that is already set, or NODISCARD, is left alone. */
+	if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+	    btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
+	    btrfs_test_opt(fs_info, NODISCARD))
+		return;
+
+	/* Nothing to enable when the devices are not discardable. */
+	if (!fs_info->fs_devices->discardable)
+		return;
+
+	/*
+	 * The zoned mode piggy backs on the discard functionality for
+	 * resetting a zone, and a zone reset is fast enough that there is no
+	 * reason to delay it. So async discard is not enabled for zoned mode.
+	 */
+	if (btrfs_is_zoned(fs_info))
+		return;
+
+	btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC);
}
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
@@ -1102,10 +927,6 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- sb->s_flags |= SB_POSIXACL;
-#endif
- sb->s_flags |= SB_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
err = super_setup_bdi(sb);
@@ -1276,10 +1097,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
#endif
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
- seq_printf(seq, ",subvolid=%llu",
- BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
subvol_name = btrfs_get_subvol_name_from_objectid(info,
- BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
if (!IS_ERR(subvol_name)) {
seq_puts(seq, ",subvol=");
seq_escape(seq, subvol_name, " \t\n\\");
@@ -1288,22 +1108,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
return 0;
}
-static int btrfs_test_super(struct super_block *s, void *data)
-{
- struct btrfs_fs_info *p = data;
- struct btrfs_fs_info *fs_info = btrfs_sb(s);
-
- return fs_info->fs_devices == p->fs_devices;
-}
-
-static int btrfs_set_super(struct super_block *s, void *data)
-{
- int err = set_anon_super(s, data);
- if (!err)
- s->s_fs_info = data;
- return err;
-}
-
/*
* subvolumes are identified by ino 256
*/
@@ -1347,7 +1151,7 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
struct super_block *s = root->d_sb;
struct btrfs_fs_info *fs_info = btrfs_sb(s);
struct inode *root_inode = d_inode(root);
- u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+ u64 root_objectid = btrfs_root_id(BTRFS_I(root_inode)->root);
ret = 0;
if (!is_subvolume_inode(root_inode)) {
@@ -1379,200 +1183,6 @@ out:
return root;
}
-/*
- * Find a superblock for the given device / mount point.
- *
- * Note: This is based on mount_bdev from fs/super.c with a few additions
- * for multiple device setup. Make sure to keep it in sync.
- */
-static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
- int flags, const char *device_name, void *data)
-{
- struct block_device *bdev = NULL;
- struct super_block *s;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = NULL;
- struct btrfs_fs_info *fs_info = NULL;
- void *new_sec_opts = NULL;
- blk_mode_t mode = sb_open_mode(flags);
- int error = 0;
-
- if (data) {
- error = security_sb_eat_lsm_opts(data, &new_sec_opts);
- if (error)
- return ERR_PTR(error);
- }
-
- /*
- * Setup a dummy root and fs_info for test/set super. This is because
- * we don't actually fill this stuff out until open_ctree, but we need
- * then open_ctree will properly initialize the file system specific
- * settings later. btrfs_init_fs_info initializes the static elements
- * of the fs_info (locks and such) to make cleanup easier if we find a
- * superblock with our given fs_devices later on at sget() time.
- */
- fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
- if (!fs_info) {
- error = -ENOMEM;
- goto error_sec_opts;
- }
- btrfs_init_fs_info(fs_info);
-
- fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- if (!fs_info->super_copy || !fs_info->super_for_commit) {
- error = -ENOMEM;
- goto error_fs_info;
- }
-
- mutex_lock(&uuid_mutex);
- error = btrfs_parse_device_options(data, mode);
- if (error) {
- mutex_unlock(&uuid_mutex);
- goto error_fs_info;
- }
-
- /*
- * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
- * either a valid device or an error.
- */
- device = btrfs_scan_one_device(device_name, mode, true);
- ASSERT(device != NULL);
- if (IS_ERR(device)) {
- mutex_unlock(&uuid_mutex);
- error = PTR_ERR(device);
- goto error_fs_info;
- }
-
- fs_devices = device->fs_devices;
- fs_info->fs_devices = fs_devices;
-
- error = btrfs_open_devices(fs_devices, mode, fs_type);
- mutex_unlock(&uuid_mutex);
- if (error)
- goto error_fs_info;
-
- if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
- error = -EACCES;
- goto error_close_devices;
- }
-
- bdev = fs_devices->latest_dev->bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
- fs_info);
- if (IS_ERR(s)) {
- error = PTR_ERR(s);
- goto error_close_devices;
- }
-
- if (s->s_root) {
- btrfs_close_devices(fs_devices);
- btrfs_free_fs_info(fs_info);
- if ((flags ^ s->s_flags) & SB_RDONLY)
- error = -EBUSY;
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name,
- s->s_id);
- btrfs_sb(s)->bdev_holder = fs_type;
- error = btrfs_fill_super(s, fs_devices, data);
- }
- if (!error)
- error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
- security_free_mnt_opts(&new_sec_opts);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- return dget(s->s_root);
-
-error_close_devices:
- btrfs_close_devices(fs_devices);
-error_fs_info:
- btrfs_free_fs_info(fs_info);
-error_sec_opts:
- security_free_mnt_opts(&new_sec_opts);
- return ERR_PTR(error);
-}
-
-/*
- * Mount function which is called by VFS layer.
- *
- * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
- * which needs vfsmount* of device's root (/). This means device's root has to
- * be mounted internally in any case.
- *
- * Operation flow:
- * 1. Parse subvol id related options for later use in mount_subvol().
- *
- * 2. Mount device's root (/) by calling vfs_kern_mount().
- *
- * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
- * first place. In order to avoid calling btrfs_mount() again, we use
- * different file_system_type which is not registered to VFS by
- * register_filesystem() (btrfs_root_fs_type). As a result,
- * btrfs_mount_root() is called. The return value will be used by
- * mount_subtree() in mount_subvol().
- *
- * 3. Call mount_subvol() to get the dentry of subvolume. Since there is
- * "btrfs subvolume set-default", mount_subvol() is called always.
- */
-static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
- const char *device_name, void *data)
-{
- struct vfsmount *mnt_root;
- struct dentry *root;
- char *subvol_name = NULL;
- u64 subvol_objectid = 0;
- int error = 0;
-
- error = btrfs_parse_subvol_options(data, &subvol_name,
- &subvol_objectid);
- if (error) {
- kfree(subvol_name);
- return ERR_PTR(error);
- }
-
- /* mount device's root (/) */
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
- if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
- if (flags & SB_RDONLY) {
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
- flags & ~SB_RDONLY, device_name, data);
- } else {
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
- flags | SB_RDONLY, device_name, data);
- if (IS_ERR(mnt_root)) {
- root = ERR_CAST(mnt_root);
- kfree(subvol_name);
- goto out;
- }
-
- down_write(&mnt_root->mnt_sb->s_umount);
- error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
- up_write(&mnt_root->mnt_sb->s_umount);
- if (error < 0) {
- root = ERR_PTR(error);
- mntput(mnt_root);
- kfree(subvol_name);
- goto out;
- }
- }
- }
- if (IS_ERR(mnt_root)) {
- root = ERR_CAST(mnt_root);
- kfree(subvol_name);
- goto out;
- }
-
- /* mount_subvol() will free subvol_name and mnt_root */
- root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
-
-out:
- return root;
-}
-
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
u32 new_pool_size, u32 old_pool_size)
{
@@ -1635,202 +1245,282 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
}
-static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+static int btrfs_remount_rw(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- unsigned old_flags = sb->s_flags;
- unsigned long old_opts = fs_info->mount_opt;
- unsigned long old_compress_type = fs_info->compress_type;
- u64 old_max_inline = fs_info->max_inline;
- u32 old_thread_pool_size = fs_info->thread_pool_size;
- u32 old_metadata_ratio = fs_info->metadata_ratio;
int ret;
- sync_filesystem(sb);
- set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ if (BTRFS_FS_ERROR(fs_info)) {
+ btrfs_err(fs_info,
+ "remounting read-write after error is not allowed");
+ return -EINVAL;
+ }
- if (data) {
- void *new_sec_opts = NULL;
+ if (fs_info->fs_devices->rw_devices == 0)
+ return -EACCES;
- ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
- if (!ret)
- ret = security_sb_remount(sb, new_sec_opts);
- security_free_mnt_opts(&new_sec_opts);
- if (ret)
- goto restore;
+ if (!btrfs_check_rw_degradable(fs_info, NULL)) {
+ btrfs_warn(fs_info,
+ "too many missing devices, writable remount is not allowed");
+ return -EACCES;
}
- ret = btrfs_parse_options(fs_info, data, *flags);
+ if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ btrfs_warn(fs_info,
+ "mount required to replay tree-log, cannot remount read-write");
+ return -EINVAL;
+ }
+
+ /*
+ * NOTE: when remounting with a change that does writes, don't put it
+ * anywhere above this point, as we are not sure to be safe to write
+ * until we pass the above checks.
+ */
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret)
- goto restore;
+ return ret;
- ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY));
- if (ret < 0)
- goto restore;
+ btrfs_clear_sb_rdonly(fs_info->sb);
- btrfs_remount_begin(fs_info, old_opts, *flags);
- btrfs_resize_thread_pool(fs_info,
- fs_info->thread_pool_size, old_thread_pool_size);
+ set_bit(BTRFS_FS_OPEN, &fs_info->flags);
- if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
- (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
- (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
- btrfs_warn(fs_info,
- "remount supports changing free space tree only from ro to rw");
- /* Make sure free space cache options match the state on disk */
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- }
- if (btrfs_free_space_cache_v1_active(fs_info)) {
- btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
- btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
- }
- }
+ /*
+ * If we've gone from readonly -> read-write, we need to get our
+ * sync/async discard lists in the right state.
+ */
+ btrfs_discard_resume(fs_info);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
- goto out;
+ return 0;
+}
- if (*flags & SB_RDONLY) {
- /*
- * this also happens on 'umount -rf' or on shutdown, when
- * the filesystem is busy.
- */
- cancel_work_sync(&fs_info->async_reclaim_work);
- cancel_work_sync(&fs_info->async_data_reclaim_work);
+static int btrfs_remount_ro(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * This also happens on 'umount -rf' or on shutdown, when the
+ * filesystem is busy.
+ */
+ cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
- btrfs_discard_cleanup(fs_info);
+ btrfs_discard_cleanup(fs_info);
- /* wait for the uuid_scan task to finish */
- down(&fs_info->uuid_tree_rescan_sem);
- /* avoid complains from lockdep et al. */
- up(&fs_info->uuid_tree_rescan_sem);
+ /* Wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+ /* Avoid complains from lockdep et al. */
+ up(&fs_info->uuid_tree_rescan_sem);
- btrfs_set_sb_rdonly(sb);
+ btrfs_set_sb_rdonly(fs_info->sb);
- /*
- * Setting SB_RDONLY will put the cleaner thread to
- * sleep at the next loop if it's already active.
- * If it's already asleep, we'll leave unused block
- * groups on disk until we're mounted read-write again
- * unless we clean them up here.
- */
- btrfs_delete_unused_bgs(fs_info);
+ /*
+ * Setting SB_RDONLY will put the cleaner thread to sleep at the next
+ * loop if it's already active. If it's already asleep, we'll leave
+ * unused block groups on disk until we're mounted read-write again
+ * unless we clean them up here.
+ */
+ btrfs_delete_unused_bgs(fs_info);
- /*
- * The cleaner task could be already running before we set the
- * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
- * We must make sure that after we finish the remount, i.e. after
- * we call btrfs_commit_super(), the cleaner can no longer start
- * a transaction - either because it was dropping a dead root,
- * running delayed iputs or deleting an unused block group (the
- * cleaner picked a block group from the list of unused block
- * groups before we were able to in the previous call to
- * btrfs_delete_unused_bgs()).
- */
- wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
- TASK_UNINTERRUPTIBLE);
+ /*
+ * The cleaner task could be already running before we set the flag
+ * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make
+ * sure that after we finish the remount, i.e. after we call
+ * btrfs_commit_super(), the cleaner can no longer start a transaction
+ * - either because it was dropping a dead root, running delayed iputs
+ * or deleting an unused block group (the cleaner picked a block
+ * group from the list of unused block groups before we were able to
+ * in the previous call to btrfs_delete_unused_bgs()).
+ */
+ wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE);
- /*
- * We've set the superblock to RO mode, so we might have made
- * the cleaner task sleep without running all pending delayed
- * iputs. Go through all the delayed iputs here, so that if an
- * unmount happens without remounting RW we don't end up at
- * finishing close_ctree() with a non-empty list of delayed
- * iputs.
- */
- btrfs_run_delayed_iputs(fs_info);
+ /*
+ * We've set the superblock to RO mode, so we might have made the
+ * cleaner task sleep without running all pending delayed iputs. Go
+ * through all the delayed iputs here, so that if an unmount happens
+ * without remounting RW we don't end up at finishing close_ctree()
+ * with a non-empty list of delayed iputs.
+ */
+ btrfs_run_delayed_iputs(fs_info);
- btrfs_dev_replace_suspend_for_unmount(fs_info);
- btrfs_scrub_cancel(fs_info);
- btrfs_pause_balance(fs_info);
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+ btrfs_scrub_cancel(fs_info);
+ btrfs_pause_balance(fs_info);
- /*
- * Pause the qgroup rescan worker if it is running. We don't want
- * it to be still running after we are in RO mode, as after that,
- * by the time we unmount, it might have left a transaction open,
- * so we would leak the transaction and/or crash.
- */
- btrfs_qgroup_wait_for_completion(fs_info, false);
+ /*
+ * Pause the qgroup rescan worker if it is running. We don't want it to
+ * be still running after we are in RO mode, as after that, by the time
+ * we unmount, it might have left a transaction open, so we would leak
+ * the transaction and/or crash.
+ */
+ btrfs_qgroup_wait_for_completion(fs_info, false);
- ret = btrfs_commit_super(fs_info);
- if (ret)
- goto restore;
- } else {
- if (BTRFS_FS_ERROR(fs_info)) {
- btrfs_err(fs_info,
- "Remounting read-write after error is not allowed");
- ret = -EINVAL;
- goto restore;
- }
- if (fs_info->fs_devices->rw_devices == 0) {
- ret = -EACCES;
- goto restore;
- }
+ return btrfs_commit_super(fs_info);
+}
- if (!btrfs_check_rw_degradable(fs_info, NULL)) {
- btrfs_warn(fs_info,
- "too many missing devices, writable remount is not allowed");
- ret = -EACCES;
- goto restore;
- }
+static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+ fs_info->max_inline = ctx->max_inline;
+ fs_info->commit_interval = ctx->commit_interval;
+ fs_info->metadata_ratio = ctx->metadata_ratio;
+ fs_info->thread_pool_size = ctx->thread_pool_size;
+ fs_info->mount_opt = ctx->mount_opt;
+ fs_info->compress_type = ctx->compress_type;
+ fs_info->compress_level = ctx->compress_level;
+}
- if (btrfs_super_log_root(fs_info->super_copy) != 0) {
- btrfs_warn(fs_info,
- "mount required to replay tree-log, cannot remount read-write");
- ret = -EINVAL;
- goto restore;
- }
+static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+ ctx->max_inline = fs_info->max_inline;
+ ctx->commit_interval = fs_info->commit_interval;
+ ctx->metadata_ratio = fs_info->metadata_ratio;
+ ctx->thread_pool_size = fs_info->thread_pool_size;
+ ctx->mount_opt = fs_info->mount_opt;
+ ctx->compress_type = fs_info->compress_type;
+ ctx->compress_level = fs_info->compress_level;
+}
- /*
- * NOTE: when remounting with a change that does writes, don't
- * put it anywhere above this point, as we are not sure to be
- * safe to write until we pass the above checks.
- */
- ret = btrfs_start_pre_rw_mount(fs_info);
- if (ret)
- goto restore;
+#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \
+do { \
+ if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+ btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) \
+do { \
+ if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+ !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+/*
+ * Log, at info level, the mount options whose state changed between the
+ * previous context and the options now stored in the fs_info.
+ *
+ * @info: fs_info holding the options currently in effect.
+ * @old: options in effect before the change. With a NULL @old the
+ * btrfs_info_if_set() checks report every option that is set and the
+ * btrfs_info_if_unset() checks report nothing.
+ */
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old)
+{
+ /* Options that have just been turned on. */
+ btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
+ btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
+ btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
+ btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
+ btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log");
+ btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time");
+ btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit");
+ btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard");
+ btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard");
+ btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree");
+ btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching");
+ btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache");
+ btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag");
+ btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data");
+ btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata");
+ btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification");
+ btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time");
+ btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots");
+ btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums");
+
+ /* Options that were on in @old and have just been turned off. */
+ btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
+ btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
+ btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
+ /* Clearing NOBARRIER means barriers are back on. */
+ btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
+ btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
+ btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
+ btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
+ btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag");
+ btrfs_info_if_unset(info, old, COMPRESS, "use no compression");
+
+ /* Did the compression settings change? */
+ if (btrfs_test_opt(info, COMPRESS) &&
+ (!old ||
+ old->compress_type != info->compress_type ||
+ old->compress_level != info->compress_level ||
+ (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) &&
+ btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) {
+ const char *compress_type = btrfs_compress_type2str(info->compress_type);
+
+ btrfs_info(info, "%s %s compression, level %d",
+ btrfs_test_opt(info, FORCE_COMPRESS) ? "force" : "use",
+ compress_type, info->compress_level);
+ }
- btrfs_clear_sb_rdonly(sb);
+ /* Reported whenever the value is not the default, independent of @old. */
+ if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
+ btrfs_info(info, "max_inline set to %llu", info->max_inline);
+}
- set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+/*
+ * fs_context ->reconfigure handler: apply the options collected in
+ * fc->fs_private to the mounted filesystem and switch it between read-only
+ * and read-write when fc->sb_flags asks for it.
+ *
+ * It is also called directly from btrfs_reconfigure_for_mount(); in that case
+ * fc->s_fs_info is non-NULL and the existing mount options are kept.
+ *
+ * Returns 0 on success or a negative errno. BTRFS_FS_STATE_REMOUNTING is set
+ * for the duration of the call and is cleared on every exit path.
+ */
+static int btrfs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_context old_ctx;
+ int ret = 0;
+ bool mount_reconfigure = (fc->s_fs_info != NULL);
- /*
- * If we've gone from readonly -> read/write, we need to get
- * our sync/async discard lists in the right state.
- */
- btrfs_discard_resume(fs_info);
+ /* Snapshot the current settings so they can be restored on failure. */
+ btrfs_info_to_ctx(fs_info, &old_ctx);
+
+ /*
+ * This is our "bind mount" trick, we don't want to allow the user to do
+ * anything other than mount a different ro/rw and a different subvol,
+ * all of the mount options should be maintained.
+ */
+ if (mount_reconfigure)
+ ctx->mount_opt = old_ctx.mount_opt;
+
+ sync_filesystem(sb);
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+ /*
+ * Nothing has been applied to fs_info yet, so a failure of the two
+ * checks below only has to drop the REMOUNTING state again.
+ */
+ if (!mount_reconfigure &&
+ !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) {
+ ret = -EINVAL;
+ goto out_clear_remounting;
+ }
+
+ ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY));
+ if (ret < 0)
+ goto out_clear_remounting;
+
+ btrfs_ctx_to_info(fs_info, ctx);
+ btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags);
+ btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size,
+ old_ctx.thread_pool_size);
+
+ if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+ (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) {
+ btrfs_warn(fs_info,
+ "remount supports changing free space tree only from RO to RW");
+ /* Make sure free space cache options match the state on disk. */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ if (btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
}
-out:
+
+ ret = 0;
+ if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY))
+ ret = btrfs_remount_ro(fs_info);
+ else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY))
+ ret = btrfs_remount_rw(fs_info);
+ if (ret)
+ goto restore;
+
/*
- * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
- * since the absence of the flag means it can be toggled off by remount.
+ * If we set the mask during the parameter parsing VFS would reject the
+ * remount. Here we can set the mask and the value will be updated
+ * appropriately.
*/
- *flags |= SB_I_VERSION;
+ if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL))
+ fc->sb_flags_mask |= SB_POSIXACL;
+ btrfs_emit_options(fs_info, &old_ctx);
wake_up_process(fs_info->transaction_kthread);
- btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
btrfs_clear_oneshot_options(fs_info);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
return 0;
-
restore:
- /* We've hit an error - don't reset SB_RDONLY */
- if (sb_rdonly(sb))
- old_flags |= SB_RDONLY;
- if (!(old_flags & SB_RDONLY))
- clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
- sb->s_flags = old_flags;
- fs_info->mount_opt = old_opts;
- fs_info->compress_type = old_compress_type;
- fs_info->max_inline = old_max_inline;
- btrfs_resize_thread_pool(fs_info,
- old_thread_pool_size, fs_info->thread_pool_size);
- fs_info->metadata_ratio = old_metadata_ratio;
- btrfs_remount_cleanup(fs_info, old_opts);
+ /* Put back the settings saved in old_ctx before the new ones were applied. */
+ btrfs_ctx_to_info(fs_info, &old_ctx);
+ btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
+out_clear_remounting:
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
-
return ret;
}
@@ -2074,7 +1764,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
- buf->f_bsize = dentry->d_sb->s_blocksize;
+ buf->f_bsize = fs_info->sectorsize;
buf->f_namelen = BTRFS_NAME_LEN;
/* We treat it as constant endianness (it doesn't matter _which_)
@@ -2083,14 +1773,315 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
- buf->f_fsid.val[0] ^=
- BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
- buf->f_fsid.val[1] ^=
- BTRFS_I(d_inode(dentry))->root->root_key.objectid;
+ buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32;
+ buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root);
return 0;
}
+static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct btrfs_fs_info *p = fc->s_fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ return fs_info->fs_devices == p->fs_devices;
+}
+
+static int btrfs_get_tree_super(struct fs_context *fc)
+{
+ struct btrfs_fs_info *fs_info = fc->s_fs_info;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_devices *fs_devices = NULL;
+ struct block_device *bdev;
+ struct btrfs_device *device;
+ struct super_block *sb;
+ blk_mode_t mode = btrfs_open_mode(fc);
+ int ret;
+
+ btrfs_ctx_to_info(fs_info, ctx);
+ mutex_lock(&uuid_mutex);
+
+ /*
+ * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
+ * either a valid device or an error.
+ */
+ device = btrfs_scan_one_device(fc->source, mode, true);
+ ASSERT(device != NULL);
+ if (IS_ERR(device)) {
+ mutex_unlock(&uuid_mutex);
+ return PTR_ERR(device);
+ }
+
+ fs_devices = device->fs_devices;
+ fs_info->fs_devices = fs_devices;
+
+ ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
+ mutex_unlock(&uuid_mutex);
+ if (ret)
+ return ret;
+
+ if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ ret = -EACCES;
+ goto error;
+ }
+
+ bdev = fs_devices->latest_dev->bdev;
+
+ /*
+ * From now on the error handling is not straightforward.
+ *
+ * If successful, this will transfer the fs_info into the super block,
+ * and fc->s_fs_info will be NULL. However if there's an existing
+ * super, we'll still have fc->s_fs_info populated. If we error
+ * completely out it'll be cleaned up when we drop the fs_context,
+ * otherwise it's tied to the lifetime of the super_block.
+ */
+ sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ goto error;
+ }
+
+ set_device_specific_options(fs_info);
+
+ if (sb->s_root) {
+ btrfs_close_devices(fs_devices);
+ if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)
+ ret = -EBUSY;
+ } else {
+ snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
+ btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
+ ret = btrfs_fill_super(sb, fs_devices, NULL);
+ }
+
+ if (ret) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+
+ btrfs_clear_oneshot_options(fs_info);
+
+ fc->root = dget(sb->s_root);
+ return 0;
+
+error:
+ btrfs_close_devices(fs_devices);
+ return ret;
+}
+
+/*
+ * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes
+ * with different ro/rw options") the following works:
+ *
+ * (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo
+ * (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar
+ *
+ * which looks nice and innocent but is actually pretty intricate and deserves
+ * a long comment.
+ *
+ * On another filesystem a subvolume mount is close to something like:
+ *
+ * (iii) # create rw superblock + initial mount
+ * mount -t xfs /dev/sdb /opt/
+ *
+ * # create ro bind mount
+ * mount --bind -o ro /opt/foo /mnt/foo
+ *
+ * # unmount initial mount
+ * umount /opt
+ *
+ * Of course, there's some special subvolume sauce and there's the fact that the
+ * sb->s_root dentry is really swapped after mount_subtree(). But conceptually
+ * it's very close and will help us understand the issue.
+ *
+ * The old mount API didn't cleanly distinguish between a mount being made ro
+ * and a superblock being made ro. The only way to change the ro state of
+ * either object was by passing MS_RDONLY. If a new mount was created via
+ * mount(2) such as:
+ *
+ * mount("/dev/sdb", "/mnt", "xfs", MS_RDONLY, NULL);
+ *
+ * the MS_RDONLY flag being specified had two effects:
+ *
+ * (1) MNT_READONLY was raised -> the resulting mount got
+ * @mnt->mnt_flags |= MNT_READONLY raised.
+ *
+ * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystem
+ * made the superblock ro. Note how SB_RDONLY has the same value as
+ * MS_RDONLY and is raised whenever MS_RDONLY is passed through mount(2).
+ *
+ * Creating a subtree mount via (iii) ends up leaving a rw superblock with a
+ * subtree mounted ro.
+ *
+ * But consider the effect on the old mount API on btrfs subvolume mounting
+ * which combines the distinct step in (iii) into a single step.
+ *
+ * By issuing (i) both the mount and the superblock are turned ro. Now when (ii)
+ * is issued the superblock is ro and thus even if the mount created for (ii) is
+ * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro
+ * to rw for (ii) which it did using an internal remount call.
+ *
+ * IOW, subvolume mounting was inherently complicated due to the ambiguity of
+ * MS_RDONLY in mount(2). Note, this ambiguity has mount(8) always translate
+ * "ro" to MS_RDONLY. IOW, in both (i) and (ii) "ro" becomes MS_RDONLY when
+ * passed by mount(8) to mount(2).
+ *
+ * Enter the new mount API. The new mount API disambiguates making a mount ro
+ * and making a superblock ro.
+ *
+ * (3) To turn a mount ro the MOUNT_ATTR_RDONLY flag can be used with either
+ * fsmount() or mount_setattr(). This is a pure VFS level change for a
+ * specific mount or mount tree that is never seen by the filesystem itself.
+ *
+ * (4) To turn a superblock ro the "ro" flag must be used with
+ * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem
+ * in fc->sb_flags.
+ *
+ * This disambiguation has rather positive consequences. Mounting a subvolume
+ * ro will not also turn the superblock ro. Only the mount for the subvolume
+ * will become ro.
+ *
+ * So, if the superblock creation request comes from the new mount API the
+ * caller must have explicitly done:
+ *
+ * fsconfig(FSCONFIG_SET_FLAG, "ro")
+ * fsmount/mount_setattr(MOUNT_ATTR_RDONLY)
+ *
+ * IOW, at some point the caller must have explicitly turned the whole
+ * superblock ro and we shouldn't just undo it like we did for the old mount
+ * API. In any case, it lets us avoid the hack in the new mount API.
+ *
+ * Consequently, the remounting hack must only be used for requests originating
+ * from the old mount API and should be marked for full deprecation so it can be
+ * turned off in a couple of years.
+ *
+ * The new mount API has no reason to support this hack.
+ */
+static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc)
+{
+ struct vfsmount *mnt;
+ int ret;
+ const bool ro2rw = !(fc->sb_flags & SB_RDONLY);
+
+ /*
+ * We got an EBUSY because our SB_RDONLY flag didn't match the existing
+ * super block, so invert our setting here and retry the mount so we
+ * can get our vfsmount.
+ */
+ if (ro2rw)
+ fc->sb_flags |= SB_RDONLY;
+ else
+ fc->sb_flags &= ~SB_RDONLY;
+
+ mnt = fc_mount(fc);
+ if (IS_ERR(mnt))
+ return mnt;
+
+ if (!fc->oldapi || !ro2rw)
+ return mnt;
+
+ /* We need to convert to rw, call reconfigure. */
+ fc->sb_flags &= ~SB_RDONLY;
+ down_write(&mnt->mnt_sb->s_umount);
+ ret = btrfs_reconfigure(fc);
+ up_write(&mnt->mnt_sb->s_umount);
+ if (ret) {
+ mntput(mnt);
+ return ERR_PTR(ret);
+ }
+ return mnt;
+}
+
+static int btrfs_get_tree_subvol(struct fs_context *fc)
+{
+ struct btrfs_fs_info *fs_info = NULL;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct fs_context *dup_fc;
+ struct dentry *dentry;
+ struct vfsmount *mnt;
+
+ /*
+ * Set up a dummy fs_info for test/set super. We don't actually fill
+ * this out until open_ctree(), which will properly initialize the file
+ * system specific settings later, but we need one before then.
+ * btrfs_init_fs_info() initializes the static elements of the fs_info
+ * (locks and such) to make cleanup easier if we find a superblock with
+ * our given fs_devices later on at sget() time.
+ */
+ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+ if (!fs_info)
+ return -ENOMEM;
+
+ fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ if (!fs_info->super_copy || !fs_info->super_for_commit) {
+ btrfs_free_fs_info(fs_info);
+ return -ENOMEM;
+ }
+ btrfs_init_fs_info(fs_info);
+
+ dup_fc = vfs_dup_fs_context(fc);
+ if (IS_ERR(dup_fc)) {
+ btrfs_free_fs_info(fs_info);
+ return PTR_ERR(dup_fc);
+ }
+
+ /*
+ * When we do the sget_fc() this gets transferred to the sb, so we only
+ * need to set it on the dup_fc, as that is what creates the super
+ * block. A non-NULL ->s_fs_info is also what makes btrfs_get_tree()
+ * take the btrfs_get_tree_super() path for dup_fc.
+ */
+ dup_fc->s_fs_info = fs_info;
+
+ /*
+ * We'll do the security settings in our btrfs_get_tree_super() mount
+ * loop, they were duplicated into dup_fc, we can drop the originals
+ * here.
+ */
+ security_free_mnt_opts(&fc->security);
+ fc->security = NULL;
+
+ mnt = fc_mount(dup_fc);
+ if (PTR_ERR_OR_ZERO(mnt) == -EBUSY)
+ mnt = btrfs_reconfigure_for_mount(dup_fc);
+ put_fs_context(dup_fc);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ /*
+ * mount_subvol() frees ->subvol_name (if it isn't set, a buffer has to
+ * be allocated to hold the subvol name), so we just drop our reference
+ * to it here.
+ */
+ dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt);
+ ctx->subvol_name = NULL;
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ fc->root = dentry;
+ return 0;
+}
+
+static int btrfs_get_tree(struct fs_context *fc)
+{
+ /*
+ * Since we use mount_subtree to mount the default/specified subvol, we
+ * have to do mounts in two steps.
+ *
+ * First pass through we call btrfs_get_tree_subvol(), this is just a
+ * wrapper around fc_mount() to call back into here again, and this time
+ * we'll call btrfs_get_tree_super(). This will do the open_ctree() and
+ * everything to open the devices and file system. Then we return back
+ * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and
+ * from there we can do our mount_subvol() call, which will lookup
+ * whichever subvol we're mounting and setup this fc with the
+ * appropriate dentry for the subvol.
+ */
+ if (fc->s_fs_info)
+ return btrfs_get_tree_super(fc);
+ return btrfs_get_tree_subvol(fc);
+}
+
static void btrfs_kill_super(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -2098,22 +2089,85 @@ static void btrfs_kill_super(struct super_block *sb)
btrfs_free_fs_info(fs_info);
}
-static struct file_system_type btrfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "btrfs",
- .mount = btrfs_mount,
- .kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
-};
+static void btrfs_free_fs_context(struct fs_context *fc)
+{
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_info *fs_info = fc->s_fs_info;
-static struct file_system_type btrfs_root_fs_type = {
- .owner = THIS_MODULE,
- .name = "btrfs",
- .mount = btrfs_mount_root,
- .kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ if (fs_info)
+ btrfs_free_fs_info(fs_info);
+
+ if (ctx && refcount_dec_and_test(&ctx->refs)) {
+ kfree(ctx->subvol_name);
+ kfree(ctx);
+ }
+}
+
+static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc)
+{
+ struct btrfs_fs_context *ctx = src_fc->fs_private;
+
+ /*
+ * Give a ref to our ctx to this dup, as we want to keep it around for
+ * our original fc so we can have the subvolume name or objectid.
+ *
+ * We unset ->source in the original fc because the dup needs it for
+ * mounting, and then once we free the dup it'll free ->source, so we
+ * need to make sure we're only pointing to it in one fc.
+ */
+ refcount_inc(&ctx->refs);
+ fc->fs_private = ctx;
+ fc->source = src_fc->source;
+ src_fc->source = NULL;
+ return 0;
+}
+
+static const struct fs_context_operations btrfs_fs_context_ops = {
+ .parse_param = btrfs_parse_param,
+ .reconfigure = btrfs_reconfigure,
+ .get_tree = btrfs_get_tree,
+ .dup = btrfs_dup_fs_context,
+ .free = btrfs_free_fs_context,
};
+static int btrfs_init_fs_context(struct fs_context *fc)
+{
+ struct btrfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ refcount_set(&ctx->refs, 1);
+ fc->fs_private = ctx;
+ fc->ops = &btrfs_fs_context_ops;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx);
+ } else {
+ ctx->thread_pool_size =
+ min_t(unsigned long, num_online_cpus() + 2, 8);
+ ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE;
+ ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ }
+
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ fc->sb_flags |= SB_POSIXACL;
+#endif
+ fc->sb_flags |= SB_I_VERSION;
+
+ return 0;
+}
+
+static struct file_system_type btrfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .init_fs_context = btrfs_init_fs_context,
+ .parameters = btrfs_fs_parameters,
+ .kill_sb = btrfs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ };
+
MODULE_ALIAS_FS("btrfs");
static int btrfs_control_open(struct inode *inode, struct file *file)
@@ -2144,7 +2198,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
vol = memdup_user((void __user *)arg, sizeof(*vol));
if (IS_ERR(vol))
return PTR_ERR(vol);
- vol->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol);
+ if (ret < 0)
+ goto out;
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
@@ -2186,6 +2242,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
break;
}
+out:
kfree(vol);
return ret;
}
@@ -2314,6 +2371,24 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
return 0;
}
+static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ const s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_count(fs_info, nr);
+
+ return nr;
+}
+
+static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
+{
+ const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ return btrfs_free_extent_maps(fs_info, nr_to_scan);
+}
+
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
@@ -2325,9 +2400,10 @@ static const struct super_operations btrfs_super_ops = {
.destroy_inode = btrfs_destroy_inode,
.free_inode = btrfs_free_inode,
.statfs = btrfs_statfs,
- .remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
.unfreeze_fs = btrfs_unfreeze,
+ .nr_cached_objects = btrfs_nr_cached_objects,
+ .free_cached_objects = btrfs_free_cached_objects,
};
static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
index 8dbb909b364f..cbcab434b5ec 100644
--- a/fs/btrfs/super.h
+++ b/fs/btrfs/super.h
@@ -3,11 +3,19 @@
#ifndef BTRFS_SUPER_H
#define BTRFS_SUPER_H
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags);
+#include <linux/types.h>
+#include <linux/fs.h>
+#include "fs.h"
+
+struct super_block;
+struct btrfs_fs_info;
+
+bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+ unsigned long flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid);
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info);
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e6b51fb3ddc1..af545b6b1190 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -421,7 +421,7 @@ BTRFS_ATTR(static_feature, supported_sectorsizes,
static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf)
{
- return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL));
+ return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL));
}
BTRFS_ATTR(static_feature, acl, acl_show);
@@ -1228,11 +1228,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy);
ssize_t ret = 0;
int i;
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
- if (fs_devices->read_policy == i)
+ if (policy == i)
ret += sysfs_emit_at(buf, ret, "%s[%s]",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
@@ -1256,8 +1257,8 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
- if (i != fs_devices->read_policy) {
- fs_devices->read_policy = i;
+ if (i != READ_ONCE(fs_devices->read_policy)) {
+ WRITE_ONCE(fs_devices->read_policy, i);
btrfs_info(fs_devices->fs_info,
"read policy set to '%s'",
btrfs_read_policy_name[i]);
@@ -1306,6 +1307,47 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store);
+#ifdef CONFIG_BTRFS_DEBUG
+static ssize_t btrfs_offload_csum_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+
+ switch (READ_ONCE(fs_devices->offload_csum_mode)) {
+ case BTRFS_OFFLOAD_CSUM_AUTO:
+ return sysfs_emit(buf, "auto\n");
+ case BTRFS_OFFLOAD_CSUM_FORCE_ON:
+ return sysfs_emit(buf, "1\n");
+ case BTRFS_OFFLOAD_CSUM_FORCE_OFF:
+ return sysfs_emit(buf, "0\n");
+ default:
+ WARN_ON(1);
+ return -EINVAL;
+ }
+}
+
+static ssize_t btrfs_offload_csum_store(struct kobject *kobj,
+ struct kobj_attribute *a, const char *buf,
+ size_t len)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ int ret;
+ bool val;
+
+ ret = kstrtobool(buf, &val);
+ if (ret == 0)
+ WRITE_ONCE(fs_devices->offload_csum_mode,
+ val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF);
+ else if (ret == -EINVAL && sysfs_streq(buf, "auto"))
+ WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO);
+ else
+ return -EINVAL;
+
+ return len;
+}
+BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store);
+#endif
+
/*
* Per-filesystem information and stats.
*
@@ -1325,6 +1367,9 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
BTRFS_ATTR_PTR(, temp_fsid),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_ATTR_PTR(, offload_csum),
+#endif
NULL,
};
@@ -1783,6 +1828,10 @@ static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
unsigned long long limit;
limit = memparse(buf, &endptr);
+ /* There could be trailing '\n', also catch any typos after the value. */
+ endptr = skip_spaces(endptr);
+ if (*endptr != 0)
+ return -EINVAL;
WRITE_ONCE(device->scrub_speed_max, limit);
return len;
}
@@ -2290,7 +2339,7 @@ int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
struct kobject *qgroups_kobj = fs_info->qgroups_kobj;
int ret;
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return 0;
if (qgroup->kobj.state_initialized)
return 0;
@@ -2311,7 +2360,7 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup *next;
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return;
rbtree_postorder_for_each_entry_safe(qgroup, next,
@@ -2332,7 +2381,7 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
struct btrfs_qgroup *next;
int ret = 0;
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return 0;
ASSERT(fsid_kobj);
@@ -2364,7 +2413,7 @@ out:
void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup)
{
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return;
if (qgroup->kobj.state_initialized) {
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 86c7eef12873..e6a284c59809 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -3,8 +3,17 @@
#ifndef BTRFS_SYSFS_H
#define BTRFS_SYSFS_H
+#include <linux/types.h>
+#include <linux/compiler_types.h>
#include <linux/kobject.h>
+struct btrfs_fs_info;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_block_group;
+struct btrfs_space_info;
+struct btrfs_qgroup;
+
enum btrfs_feature_set {
FEAT_COMPAT,
FEAT_COMPAT_RO,
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ca09cf9afce8..dce0387ef155 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -28,6 +28,7 @@ const char *test_error[] = {
[TEST_ALLOC_INODE] = "cannot allocate inode",
[TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
+ [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
};
static const struct super_operations btrfs_test_super_ops = {
@@ -102,7 +103,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0);
+ extent_io_tree_init(fs_info, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
@@ -159,8 +160,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
if (!fs_info)
return;
- if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
- &fs_info->fs_state)))
+ if (WARN_ON(!btrfs_is_testing(fs_info)))
return;
test_mnt->mnt_sb->s_fs_info = NULL;
@@ -185,7 +185,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->buffer_lock);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
dev_list) {
btrfs_free_dummy_device(dev);
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 7a2d7ffbe30e..dc2f2ab15fa5 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -23,6 +23,7 @@ enum {
TEST_ALLOC_INODE,
TEST_ALLOC_BLOCK_GROUP,
TEST_ALLOC_EXTENT_MAP,
+ TEST_ALLOC_CHUNK_MAP,
};
extern const char *test_error[];
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 1cc86af97dc6..865d4af4b303 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -11,6 +11,7 @@
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../extent_io.h"
+#include "../disk-io.h"
#include "../btrfs_inode.h"
#define PROCESS_UNLOCK (1 << 0)
@@ -105,9 +106,11 @@ static void dump_extent_io_tree(const struct extent_io_tree *tree)
}
}
-static int test_find_delalloc(u32 sectorsize)
+static int test_find_delalloc(u32 sectorsize, u32 nodesize)
{
- struct inode *inode;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root = NULL;
+ struct inode *inode = NULL;
struct extent_io_tree *tmp;
struct page *page;
struct page *locked_page = NULL;
@@ -121,12 +124,27 @@ static int test_find_delalloc(u32 sectorsize)
test_msg("running find delalloc tests");
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
inode = btrfs_new_test_inode();
if (!inode) {
test_std_err(TEST_ALLOC_INODE);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
tmp = &BTRFS_I(inode)->io_tree;
+ BTRFS_I(inode)->root = root;
/*
* Passing NULL as we don't have fs_info but tracepoints are not used
@@ -316,6 +334,8 @@ out:
process_page_range(inode, 0, total_dirty - 1,
PROCESS_UNLOCK | PROCESS_RELEASE);
iput(inode);
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
@@ -652,7 +672,7 @@ static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory,
const char *test_name)
{
for (int i = 0; i < eb->len; i++) {
- struct page *page = eb->pages[i >> PAGE_SHIFT];
+ struct page *page = folio_page(eb->folios[i >> PAGE_SHIFT], 0);
void *addr = page_address(page) + offset_in_page(i);
if (memcmp(addr, memory + i, 1) != 0) {
@@ -668,7 +688,7 @@ static int verify_eb_and_memory(struct extent_buffer *eb, void *memory,
const char *test_name)
{
for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) {
- void *eb_addr = page_address(eb->pages[i]);
+ void *eb_addr = folio_address(eb->folios[i]);
if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) {
dump_eb_and_memory_contents(eb, memory, test_name);
@@ -794,7 +814,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
test_msg("running extent I/O tests");
- ret = test_find_delalloc(sectorsize);
+ ret = test_find_delalloc(sectorsize, nodesize);
if (ret)
goto out;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 29bdd08b241f..ba36794ba2d5 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -11,21 +11,24 @@
#include "../disk-io.h"
#include "../block-group.h"
-static void free_extent_map_tree(struct extent_map_tree *em_tree)
+static int free_extent_map_tree(struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
struct rb_node *node;
+ int ret = 0;
write_lock(&em_tree->lock);
while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
node = rb_first_cached(&em_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
- remove_extent_mapping(em_tree, em);
+ remove_extent_mapping(inode, em);
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
+ ret = -EINVAL;
test_err(
-"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d",
+"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d",
em->start, em->len, em->block_start,
em->block_len, refcount_read(&em->refs));
@@ -35,6 +38,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
free_extent_map(em);
}
write_unlock(&em_tree->lock);
+
+ return ret;
}
/*
@@ -53,13 +58,14 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
* ->add_extent_mapping(0, 16K)
* -> #handle -EEXIST
*/
-static int test_case_1(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 start = 0;
u64 len = SZ_8K;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -73,7 +79,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 16K)");
@@ -94,7 +100,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = SZ_32K; /* avoid merging */
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [16K, 20K)");
@@ -115,15 +121,20 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = start;
em->block_len = len;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
goto out;
}
- if (em &&
- (em->start != 0 || extent_map_end(em) != SZ_16K ||
- em->block_start != 0 || em->block_len != SZ_16K)) {
+ if (!em) {
+ test_err("case1 [%llu %llu]: no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ em->block_start != 0 || em->block_len != SZ_16K) {
test_err(
"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
start, start + len, ret, em->start, em->len,
@@ -132,7 +143,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
@@ -143,11 +156,12 @@ out:
* Reading the inline ending up with EEXIST, ie. read an inline
* extent and discard page cache and read it again.
*/
-static int test_case_2(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -161,7 +175,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 1K)");
@@ -182,7 +196,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -203,15 +217,19 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case2 [0 1K]: ret %d", ret);
goto out;
}
- if (em &&
- (em->start != 0 || extent_map_end(em) != SZ_1K ||
- em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) {
+ if (!em) {
+ test_err("case2 [0 1K]: no extent map returned");
+ ret = -ENOENT;
+ goto out;
+ }
+ if (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) {
test_err(
"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
ret, em->start, em->len, em->block_start,
@@ -220,17 +238,21 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
static int __test_case_3(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree, u64 start)
+ struct btrfs_inode *inode, u64 start)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 len = SZ_4K;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -244,7 +266,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -265,29 +287,36 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
- test_err("case3 [0x%llx 0x%llx): ret %d",
+ test_err("case3 [%llu %llu): ret %d",
start, start + len, ret);
goto out;
}
+ if (!em) {
+ test_err("case3 [%llu %llu): no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
/*
* Since bytes within em are contiguous, em->block_start is identical to
* em->start.
*/
- if (em &&
- (start < em->start || start + len > extent_map_end(em) ||
- em->start != em->block_start || em->len != em->block_len)) {
+ if (start < em->start || start + len > extent_map_end(em) ||
+ em->start != em->block_start || em->len != em->block_len) {
test_err(
-"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
+"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)",
start, start + len, ret, em->start, em->len,
em->block_start, em->block_len);
ret = -EINVAL;
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
@@ -308,28 +337,29 @@ out:
* -> add_extent_mapping()
* -> add_extent_mapping()
*/
-static int test_case_3(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_3(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
int ret;
- ret = __test_case_3(fs_info, em_tree, 0);
+ ret = __test_case_3(fs_info, inode, 0);
if (ret)
return ret;
- ret = __test_case_3(fs_info, em_tree, SZ_8K);
+ ret = __test_case_3(fs_info, inode, SZ_8K);
if (ret)
return ret;
- ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K));
+ ret = __test_case_3(fs_info, inode, (12 * SZ_1K));
return ret;
}
static int __test_case_4(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree, u64 start)
+ struct btrfs_inode *inode, u64 start)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 len = SZ_4K;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -343,7 +373,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_8K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 8K)");
@@ -364,7 +394,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = SZ_16K; /* avoid merging */
em->block_len = 24 * SZ_1K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [8K, 32K)");
@@ -384,23 +414,31 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_32K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
- test_err("case4 [0x%llx 0x%llx): ret %d",
- start, len, ret);
+ test_err("case4 [%llu %llu): ret %d",
+ start, start + len, ret);
goto out;
}
- if (em && (start < em->start || start + len > extent_map_end(em))) {
+ if (!em) {
+ test_err("case4 [%llu %llu): no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (start < em->start || start + len > extent_map_end(em)) {
test_err(
-"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
- start, len, ret, em->start, em->len, em->block_start,
+"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)",
+ start, start + len, ret, em->start, em->len, em->block_start,
em->block_len);
ret = -EINVAL;
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
@@ -430,22 +468,22 @@ out:
* # handle -EEXIST when adding
* # [0, 32K)
*/
-static int test_case_4(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_4(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
int ret;
- ret = __test_case_4(fs_info, em_tree, 0);
+ ret = __test_case_4(fs_info, inode, 0);
if (ret)
return ret;
- ret = __test_case_4(fs_info, em_tree, SZ_4K);
+ ret = __test_case_4(fs_info, inode, SZ_4K);
return ret;
}
-static int add_compressed_extent(struct extent_map_tree *em_tree,
+static int add_compressed_extent(struct btrfs_inode *inode,
u64 start, u64 len, u64 block_start)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
int ret;
@@ -459,9 +497,9 @@ static int add_compressed_extent(struct extent_map_tree *em_tree,
em->len = len;
em->block_start = block_start;
em->block_len = SZ_4K;
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
free_extent_map(em);
if (ret < 0) {
@@ -567,53 +605,44 @@ static int validate_range(struct extent_map_tree *em_tree, int index)
* They'll have the EXTENT_FLAG_COMPRESS_ZLIB flag set to keep the em tree from
* merging the em's.
*/
-static int test_case_5(void)
+static int test_case_5(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
- struct extent_map_tree *em_tree;
- struct inode *inode;
u64 start, end;
int ret;
+ int ret2;
test_msg("Running btrfs_drop_extent_map_range tests");
- inode = btrfs_new_test_inode();
- if (!inode) {
- test_std_err(TEST_ALLOC_INODE);
- return -ENOMEM;
- }
-
- em_tree = &BTRFS_I(inode)->extent_tree;
-
/* [0, 12k) */
- ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0);
+ ret = add_compressed_extent(inode, 0, SZ_4K * 3, 0);
if (ret) {
test_err("cannot add extent range [0, 12K)");
goto out;
}
/* [12k, 24k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
+ ret = add_compressed_extent(inode, SZ_4K * 3, SZ_4K * 3, SZ_4K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [24k, 36k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
+ ret = add_compressed_extent(inode, SZ_4K * 6, SZ_4K * 3, SZ_8K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [36k, 40k) */
- ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
+ ret = add_compressed_extent(inode, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [40k, 64k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
+ ret = add_compressed_extent(inode, SZ_4K * 10, SZ_4K * 6, SZ_16K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
@@ -622,36 +651,39 @@ static int test_case_5(void)
/* Drop [8k, 12k) */
start = SZ_8K;
end = (3 * SZ_4K) - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 0);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 0);
if (ret)
goto out;
/* Drop [12k, 20k) */
start = SZ_4K * 3;
end = SZ_16K + SZ_4K - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 1);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 1);
if (ret)
goto out;
/* Drop [28k, 32k) */
start = SZ_32K - SZ_4K;
end = SZ_32K - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 2);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 2);
if (ret)
goto out;
/* Drop [32k, 64k) */
start = SZ_32K;
end = SZ_64K - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 3);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 3);
if (ret)
goto out;
out:
- iput(inode);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
+
return ret;
}
@@ -660,23 +692,26 @@ out:
* for areas between two existing ems. Validate it doesn't do this when there
* are two unmerged em's side by side.
*/
-static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree)
+static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em = NULL;
int ret;
+ int ret2;
- ret = add_compressed_extent(em_tree, 0, SZ_4K, 0);
+ ret = add_compressed_extent(inode, 0, SZ_4K, 0);
if (ret)
goto out;
- ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0);
+ ret = add_compressed_extent(inode, SZ_4K, SZ_4K, 0);
if (ret)
goto out;
em = alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
em->start = SZ_4K;
@@ -684,7 +719,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em
em->block_start = SZ_16K;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K);
+ ret = btrfs_add_extent_mapping(inode, &em, 0, SZ_8K);
write_unlock(&em_tree->lock);
if (ret != 0) {
@@ -704,7 +739,10 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em
ret = 0;
out:
free_extent_map(em);
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
+
return ret;
}
@@ -713,28 +751,19 @@ out:
* true would mess up the start/end calculations and subsequent splits would be
* incorrect.
*/
-static int test_case_7(void)
+static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
- struct extent_map_tree *em_tree;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
- struct inode *inode;
int ret;
+ int ret2;
test_msg("Running btrfs_drop_extent_cache with pinned");
- inode = btrfs_new_test_inode();
- if (!inode) {
- test_std_err(TEST_ALLOC_INODE);
- return -ENOMEM;
- }
-
- em_tree = &BTRFS_I(inode)->extent_tree;
-
em = alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
/* [0, 16K), pinned */
@@ -742,9 +771,9 @@ static int test_case_7(void)
em->len = SZ_16K;
em->block_start = 0;
em->block_len = SZ_4K;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags |= EXTENT_FLAG_PINNED;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -765,7 +794,7 @@ static int test_case_7(void)
em->block_start = SZ_32K;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -777,7 +806,7 @@ static int test_case_7(void)
* Drop [0, 36K). This should skip the pinned [0, 16K) extent and then
* split the [32K, 48K) extent.
*/
- btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true);
+ btrfs_drop_extent_map_range(inode, 0, (36 * SZ_1K) - 1, true);
/* Make sure our extent maps look sane. */
ret = -EINVAL;
@@ -826,6 +855,11 @@ static int test_case_7(void)
goto out;
}
+ if (em->block_start != SZ_32K + SZ_4K) {
+ test_err("em->block_start is %llu, expected 36K", em->block_start);
+ goto out;
+ }
+
free_extent_map(em);
read_lock(&em_tree->lock);
@@ -839,7 +873,14 @@ static int test_case_7(void)
ret = 0;
out:
free_extent_map(em);
- iput(inode);
+ /* Unpin our extent to prevent warning when removing it below. */
+ ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0);
+ if (ret == 0)
+ ret = ret2;
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
+
return ret;
}
@@ -859,33 +900,21 @@ struct rmap_test_vector {
static int test_rmap_block(struct btrfs_fs_info *fs_info,
struct rmap_test_vector *test)
{
- struct extent_map *em;
- struct map_lookup *map = NULL;
+ struct btrfs_chunk_map *map;
u64 *logical = NULL;
int i, out_ndaddrs, out_stripe_len;
int ret;
- em = alloc_extent_map();
- if (!em) {
- test_std_err(TEST_ALLOC_EXTENT_MAP);
- return -ENOMEM;
- }
-
- map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL);
+ map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL);
if (!map) {
- kfree(em);
- test_std_err(TEST_ALLOC_EXTENT_MAP);
+ test_std_err(TEST_ALLOC_CHUNK_MAP);
return -ENOMEM;
}
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
/* Start at 4GiB logical address */
- em->start = SZ_4G;
- em->len = test->data_stripe_size * test->num_data_stripes;
- em->block_len = em->len;
- em->orig_block_len = test->data_stripe_size;
- em->map_lookup = map;
-
+ map->start = SZ_4G;
+ map->chunk_len = test->data_stripe_size * test->num_data_stripes;
+ map->stripe_size = test->data_stripe_size;
map->num_stripes = test->num_stripes;
map->type = test->raid_type;
@@ -901,15 +930,13 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
map->stripes[i].physical = test->data_stripe_phys_start[i];
}
- write_lock(&fs_info->mapping_tree.lock);
- ret = add_extent_mapping(&fs_info->mapping_tree, em, 0);
- write_unlock(&fs_info->mapping_tree.lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret) {
- test_err("error adding block group mapping to mapping tree");
+ test_err("error adding chunk map to mapping tree");
goto out_free;
}
- ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+ ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1),
&logical, &out_ndaddrs, &out_stripe_len);
if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
test_err("didn't rmap anything but expected %d",
@@ -938,14 +965,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = 0;
out:
- write_lock(&fs_info->mapping_tree.lock);
- remove_extent_mapping(&fs_info->mapping_tree, em);
- write_unlock(&fs_info->mapping_tree.lock);
- /* For us */
- free_extent_map(em);
+ btrfs_remove_chunk_map(fs_info, map);
out_free:
- /* For the tree */
- free_extent_map(em);
kfree(logical);
return ret;
}
@@ -953,7 +974,8 @@ out_free:
int btrfs_test_extent_map(void)
{
struct btrfs_fs_info *fs_info = NULL;
- struct extent_map_tree *em_tree;
+ struct inode *inode;
+ struct btrfs_root *root = NULL;
int ret = 0, i;
struct rmap_test_vector rmap_tests[] = {
{
@@ -1002,33 +1024,42 @@ int btrfs_test_extent_map(void)
return -ENOMEM;
}
- em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
- if (!em_tree) {
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_std_err(TEST_ALLOC_INODE);
ret = -ENOMEM;
goto out;
}
- extent_map_tree_init(em_tree);
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(root);
+ root = NULL;
+ goto out;
+ }
- ret = test_case_1(fs_info, em_tree);
+ BTRFS_I(inode)->root = root;
+
+ ret = test_case_1(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_2(fs_info, em_tree);
+ ret = test_case_2(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_3(fs_info, em_tree);
+ ret = test_case_3(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_4(fs_info, em_tree);
+ ret = test_case_4(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_5();
+ ret = test_case_5(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_6(fs_info, em_tree);
+ ret = test_case_6(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_7();
+ ret = test_case_7(fs_info, BTRFS_I(inode));
if (ret)
goto out;
@@ -1040,7 +1071,8 @@ int btrfs_test_extent_map(void)
}
out:
- kfree(em_tree);
+ iput(inode);
+ btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
return ret;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 492d69d2fa73..99da9d34b77a 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -211,9 +211,9 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
}
-static unsigned long prealloc_only = 0;
-static unsigned long compressed_only = 0;
-static unsigned long vacancy_only = 0;
+static u32 prealloc_only = 0;
+static u32 compressed_only = 0;
+static u32 vacancy_only = 0;
static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
{
@@ -258,7 +258,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* First with no extents */
BTRFS_I(inode)->root = root;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize);
if (IS_ERR(em)) {
em = NULL;
test_err("got an error when we shouldn't have");
@@ -278,7 +278,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
*/
setup_file_extents(root, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
/*
@@ -316,7 +316,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -332,14 +332,14 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
/* Regular extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -355,7 +355,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -367,7 +367,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* The next 3 are split extents */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -383,7 +383,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -396,7 +396,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -412,13 +412,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -434,7 +434,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
@@ -452,7 +452,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Prealloc extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -468,7 +468,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -481,7 +481,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* The next 3 are a half written prealloc extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -497,7 +497,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -511,7 +511,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -527,7 +527,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
@@ -544,7 +544,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -560,7 +560,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -579,7 +579,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Now for the compressed extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -595,7 +595,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -604,16 +604,16 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
/* Split compressed extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -629,7 +629,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -638,9 +638,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
disk_bytenr = em->block_start;
@@ -648,7 +648,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -664,7 +664,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -675,7 +675,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -701,16 +701,16 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
/* A hole between regular extents but no hole extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -726,7 +726,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -737,7 +737,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -758,7 +758,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != vacancy_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
vacancy_only, em->flags);
goto out;
}
@@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -786,7 +786,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -850,7 +850,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
insert_inode_item_key(root);
insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize,
sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -866,13 +866,13 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != vacancy_only) {
- test_err("wrong flags, wanted %lu, have %lu", vacancy_only,
+ test_err("wrong flags, wanted %u, have %u", vacancy_only,
em->flags);
goto out;
}
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -888,7 +888,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, wanted 0 got %lu",
+ test_err("unexpected flags set, wanted 0 got %u",
em->flags);
goto out;
}
@@ -1095,8 +1095,8 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
test_msg("running inode tests");
- set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
- set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+ compressed_only |= EXTENT_FLAG_COMPRESS_ZLIB;
+ prealloc_only |= EXTENT_FLAG_PREALLOC;
ret = test_btrfs_get_extent(sectorsize, nodesize);
if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6e63816dddcb..3388c836b9a5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -23,12 +23,10 @@
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
-#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
-#include "defrag.h"
#include "dir-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
@@ -37,8 +35,6 @@
static struct kmem_cache *btrfs_trans_handle_cachep;
-#define BTRFS_ROOT_TRANS_TAG 0
-
/*
* Transaction states and transitions
*
@@ -430,7 +426,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
}
radix_tree_tag_set(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
root->last_trans = trans->transid;
@@ -476,7 +472,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
/* Make sure we don't try to update the root at commit time */
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
}
@@ -554,7 +550,7 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
if (!fs_info->reloc_ctl ||
!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
root->reloc_root)
return false;
@@ -566,56 +562,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
u64 num_bytes,
u64 *delayed_refs_bytes)
{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
- u64 extra_delayed_refs_bytes = 0;
- u64 bytes;
+ u64 bytes = num_bytes + *delayed_refs_bytes;
int ret;
/*
- * If there's a gap between the size of the delayed refs reserve and
- * its reserved space, than some tasks have added delayed refs or bumped
- * its size otherwise (due to block group creation or removal, or block
- * group item update). Also try to allocate that gap in order to prevent
- * using (and possibly abusing) the global reserve when committing the
- * transaction.
- */
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- !btrfs_block_rsv_full(delayed_refs_rsv)) {
- spin_lock(&delayed_refs_rsv->lock);
- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
- extra_delayed_refs_bytes = delayed_refs_rsv->size -
- delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
- }
-
- bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
-
- /*
* We want to reserve all the bytes we may need all at once, so we only
* do 1 enospc flushing cycle per transaction start.
*/
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0) {
- if (extra_delayed_refs_bytes > 0)
- btrfs_migrate_to_delayed_refs_rsv(fs_info,
- extra_delayed_refs_bytes);
- return 0;
- }
-
- if (extra_delayed_refs_bytes > 0) {
- bytes -= extra_delayed_refs_bytes;
- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0)
- return 0;
- }
/*
* If we are an emergency flush, which can steal from the global block
* reserve, then attempt to not reserve space for the delayed refs, as
* we will consume space for them from the global block reserve.
*/
- if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
bytes -= *delayed_refs_bytes;
*delayed_refs_bytes = 0;
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
@@ -783,14 +745,6 @@ again:
h->reloc_reserved = reloc_reserved;
}
- /*
- * Now that we have found a transaction to be a part of, convert the
- * qgroup reservation from prealloc to pertrans. A different transaction
- * can't race in and free our pertrans out from under us.
- */
- if (qgroup_reserved)
- btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
-
got_it:
if (!current->journal_info)
current->journal_info = h;
@@ -824,8 +778,15 @@ got_it:
* not just freed.
*/
btrfs_end_transaction(h);
- return ERR_PTR(ret);
+ goto reserve_fail;
}
+ /*
+ * Now that we have found a transaction to be a part of, convert the
+ * qgroup reservation from prealloc to pertrans. A different transaction
+ * can't race in and free our pertrans out from under us.
+ */
+ if (qgroup_reserved)
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
return h;
@@ -1091,7 +1052,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- int err = 0;
+ int ret = 0;
if (refcount_read(&trans->use_count) > 1) {
refcount_dec(&trans->use_count);
@@ -1130,13 +1091,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
wake_up_process(info->transaction_kthread);
if (TRANS_ABORTED(trans))
- err = trans->aborted;
+ ret = trans->aborted;
else
- err = -EROFS;
+ ret = -EROFS;
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- return err;
+ return ret;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans)
@@ -1157,8 +1118,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark)
{
- int err = 0;
- int werr = 0;
+ int ret = 0;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
@@ -1168,7 +1128,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
mark, &cached_state)) {
bool wait_writeback = false;
- err = convert_extent_bit(dirty_pages, start, end,
+ ret = convert_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT,
mark, &cached_state);
/*
@@ -1184,22 +1144,22 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
* We cleanup any entries left in the io tree when committing
* the transaction (through extent_io_tree_release()).
*/
- if (err == -ENOMEM) {
- err = 0;
+ if (ret == -ENOMEM) {
+ ret = 0;
wait_writeback = true;
}
- if (!err)
- err = filemap_fdatawrite_range(mapping, start, end);
- if (err)
- werr = err;
- else if (wait_writeback)
- werr = filemap_fdatawait_range(mapping, start, end);
+ if (!ret)
+ ret = filemap_fdatawrite_range(mapping, start, end);
+ if (!ret && wait_writeback)
+ ret = filemap_fdatawait_range(mapping, start, end);
free_extent_state(cached_state);
+ if (ret)
+ break;
cached_state = NULL;
cond_resched();
start = end + 1;
}
- return werr;
+ return ret;
}
/*
@@ -1211,12 +1171,11 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
- int err = 0;
- int werr = 0;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
+ int ret = 0;
while (find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
@@ -1228,22 +1187,20 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* concurrently - we do it only at transaction commit time when
* it's safe to do it (through extent_io_tree_release()).
*/
- err = clear_extent_bit(dirty_pages, start, end,
+ ret = clear_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT, &cached_state);
- if (err == -ENOMEM)
- err = 0;
- if (!err)
- err = filemap_fdatawait_range(mapping, start, end);
- if (err)
- werr = err;
+ if (ret == -ENOMEM)
+ ret = 0;
+ if (!ret)
+ ret = filemap_fdatawait_range(mapping, start, end);
free_extent_state(cached_state);
+ if (ret)
+ break;
cached_state = NULL;
cond_resched();
start = end + 1;
}
- if (err)
- werr = err;
- return werr;
+ return ret;
}
static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
@@ -1268,7 +1225,7 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
bool errors = false;
int err;
- ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID);
err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
if ((mark & EXTENT_DIRTY) &&
@@ -1531,8 +1488,9 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
ASSERT(atomic_read(&root->log_commit[1]) == 0);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
+ btrfs_qgroup_free_meta_all_pertrans(root);
spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log(trans, root);
@@ -1557,7 +1515,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
if (ret2)
return ret2;
spin_lock(&fs_info->fs_roots_radix_lock);
- btrfs_qgroup_free_meta_all_pertrans(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1622,8 +1579,8 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
goto out;
/* Now qgroup are all updated, we can inherit it to new qgroups */
- ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
- parent->root_key.objectid, inherit);
+ ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid,
+ btrfs_root_id(parent), inherit);
if (ret < 0)
goto out;
@@ -1774,7 +1731,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_create_qgroup(trans, objectid);
- if (ret) {
+ if (ret && ret != -EEXIST) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1861,7 +1818,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* insert root back/forward references
*/
ret = btrfs_add_root_ref(trans, objectid,
- parent_root->root_key.objectid,
+ btrfs_root_id(parent_root),
btrfs_ino(BTRFS_I(parent_inode)), index,
&fname.disk_name);
if (ret) {
@@ -1870,7 +1827,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
- pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
+ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
pending->snap = NULL;
@@ -1894,16 +1851,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = qgroup_account_snapshot(trans, root, parent_root,
pending->inherit, objectid);
else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
- ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid,
- parent_root->root_key.objectid, pending->inherit);
+ ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid,
+ btrfs_root_id(parent_root), pending->inherit);
if (ret < 0)
goto fail;
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
index);
- /* We have check then name at the beginning, so it is impossible. */
- BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1995,19 +1950,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->uuid_tree_generation = root_item->generation;
}
-int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
-{
- struct btrfs_transaction *trans;
- int ret = 0;
-
- spin_lock(&info->trans_lock);
- trans = info->running_transaction;
- if (trans)
- ret = (trans->state >= TRANS_STATE_COMMIT_START);
- spin_unlock(&info->trans_lock);
- return ret;
-}
-
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
struct btrfs_transaction *trans;
@@ -2677,7 +2619,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
- btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
+ btrfs_debug(fs_info, "cleaner removing %llu", btrfs_root_id(root));
btrfs_kill_all_delayed_nodes(root);
@@ -2722,9 +2664,7 @@ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
int __init btrfs_transaction_init(void)
{
- btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
- sizeof(struct btrfs_trans_handle), 0,
- SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
+ btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY);
if (!btrfs_trans_handle_cachep)
return -ENOMEM;
return 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 18c4f6e83b78..4e451ab173b1 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -6,12 +6,30 @@
#ifndef BTRFS_TRANSACTION_H
#define BTRFS_TRANSACTION_H
+#include <linux/atomic.h>
#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/time64.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
#include "btrfs_inode.h"
#include "delayed-ref.h"
-#include "ctree.h"
+#include "extent-io-tree.h"
+#include "block-rsv.h"
+#include "messages.h"
#include "misc.h"
+struct dentry;
+struct inode;
+struct btrfs_pending_snapshot;
+struct btrfs_fs_info;
+struct btrfs_root_item;
+struct btrfs_root;
+struct btrfs_path;
+
+/* Radix-tree tag for roots that are part of the transaction. */
+#define BTRFS_ROOT_TRANS_TAG 0
+
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
TRANS_STATE_COMMIT_PREP,
@@ -259,7 +277,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
-int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index a416cbea75d1..a2c3651a3d8f 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -21,7 +21,6 @@
#include "messages.h"
#include "ctree.h"
#include "tree-checker.h"
-#include "disk-io.h"
#include "compression.h"
#include "volumes.h"
#include "misc.h"
@@ -30,7 +29,7 @@
#include "file-item.h"
#include "inode-item.h"
#include "dir-item.h"
-#include "raid-stripe-tree.h"
+#include "extent-tree.h"
/*
* Error message should follow the following format:
@@ -66,6 +65,7 @@ static void generic_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -93,6 +93,7 @@ static void file_extent_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -153,6 +154,7 @@ static void dir_item_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -648,6 +650,7 @@ static void block_group_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -1004,6 +1007,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(eb->fs_info,
"corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -1259,6 +1263,7 @@ static void extent_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(eb->fs_info,
"corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -1276,6 +1281,8 @@ static int check_extent_item(struct extent_buffer *leaf,
unsigned long ptr; /* Current pointer inside inline refs */
unsigned long end; /* Extent item end */
const u32 item_size = btrfs_item_size(leaf, slot);
+ u8 last_type = 0;
+ u64 last_seq = U64_MAX;
u64 flags;
u64 generation;
u64 total_refs; /* Total refs in btrfs_extent_item */
@@ -1322,6 +1329,18 @@ static int check_extent_item(struct extent_buffer *leaf,
* 2.2) Ref type specific data
* Either using btrfs_extent_inline_ref::offset, or specific
* data structure.
+ *
+ * All above inline items should follow the order:
+ *
+ * - All btrfs_extent_inline_ref::type should be in an ascending
+ * order
+ *
+ * - Within the same type, the items should follow a descending
+ * order by their sequence number. The sequence number is
+ * determined by:
+ * * btrfs_extent_inline_ref::offset for all types other than
+ * EXTENT_DATA_REF
+ * * hash_extent_data_ref() for EXTENT_DATA_REF
*/
if (unlikely(item_size < sizeof(*ei))) {
extent_err(leaf, slot,
@@ -1403,6 +1422,7 @@ static int check_extent_item(struct extent_buffer *leaf,
struct btrfs_extent_inline_ref *iref;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
+ u64 seq;
u64 dref_offset;
u64 inline_offset;
u8 inline_type;
@@ -1416,10 +1436,11 @@ static int check_extent_item(struct extent_buffer *leaf,
iref = (struct btrfs_extent_inline_ref *)ptr;
inline_type = btrfs_extent_inline_ref_type(leaf, iref);
inline_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+ seq = inline_offset;
if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
- ptr, inline_type, end);
+ ptr, btrfs_extent_inline_ref_size(inline_type), end);
return -EUCLEAN;
}
@@ -1446,6 +1467,10 @@ static int check_extent_item(struct extent_buffer *leaf,
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
+ seq = hash_extent_data_ref(
+ btrfs_extent_data_ref_root(leaf, dref),
+ btrfs_extent_data_ref_objectid(leaf, dref),
+ btrfs_extent_data_ref_offset(leaf, dref));
if (unlikely(!IS_ALIGNED(dref_offset,
fs_info->sectorsize))) {
extent_err(leaf, slot,
@@ -1475,6 +1500,24 @@ static int check_extent_item(struct extent_buffer *leaf,
inline_type);
return -EUCLEAN;
}
+ if (inline_type < last_type) {
+ extent_err(leaf, slot,
+ "inline ref out-of-order: has type %u, prev type %u",
+ inline_type, last_type);
+ return -EUCLEAN;
+ }
+ /* Type changed, allow the sequence to start from U64_MAX again. */
+ if (inline_type > last_type)
+ last_seq = U64_MAX;
+ if (seq > last_seq) {
+ extent_err(leaf, slot,
+"inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx",
+ inline_type, inline_offset, seq,
+ last_type, last_seq);
+ return -EUCLEAN;
+ }
+ last_type = inline_type;
+ last_seq = seq;
ptr += btrfs_extent_inline_ref_size(inline_type);
}
/* No padding is allowed */
@@ -1754,6 +1797,11 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
return BTRFS_TREE_BLOCK_INVALID_LEVEL;
}
+ if (unlikely(!btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN))) {
+ generic_err(leaf, 0, "invalid flag for leaf, WRITTEN not set");
+ return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET;
+ }
+
/*
* Extent buffers from a relocation tree have a owner field that
* corresponds to the subvolume tree they are based on. So just from an
@@ -1815,6 +1863,7 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
for (slot = 0; slot < nritems; slot++) {
u32 item_end_expected;
u64 item_data_end;
+ enum btrfs_tree_block_status ret;
btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -1870,21 +1919,10 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
return BTRFS_TREE_BLOCK_INVALID_OFFSETS;
}
- /*
- * We only want to do this if WRITTEN is set, otherwise the leaf
- * may be in some intermediate state and won't appear valid.
- */
- if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) {
- enum btrfs_tree_block_status ret;
-
- /*
- * Check if the item size and content meet other
- * criteria
- */
- ret = check_leaf_item(leaf, &key, slot, &prev_key);
- if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN))
- return ret;
- }
+ /* Check if the item size and content meet other criteria. */
+ ret = check_leaf_item(leaf, &key, slot, &prev_key);
+ if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN))
+ return ret;
prev_key.objectid = key.objectid;
prev_key.type = key.type;
@@ -1914,6 +1952,11 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node)
int level = btrfs_header_level(node);
u64 bytenr;
+ if (unlikely(!btrfs_header_flag(node, BTRFS_HEADER_FLAG_WRITTEN))) {
+ generic_err(node, 0, "invalid flag for node, WRITTEN not set");
+ return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET;
+ }
+
if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) {
generic_err(node, 0,
"invalid level for node, have %d expect [1, %d]",
@@ -1978,7 +2021,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
* Skip dummy fs, as selftests don't create unique ebs for each dummy
* root.
*/
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state))
+ if (btrfs_is_testing(eb->fs_info))
return 0;
/*
* There are several call sites (backref walking, qgroup, and data
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 3c2a02a72f64..01669cfa6578 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -6,10 +6,12 @@
#ifndef BTRFS_TREE_CHECKER_H
#define BTRFS_TREE_CHECKER_H
+#include <linux/types.h>
#include <uapi/linux/btrfs_tree.h>
struct extent_buffer;
struct btrfs_chunk;
+struct btrfs_key;
/* All the extra info needed to verify the parentness of a tree block. */
struct btrfs_tree_parent_check {
@@ -22,7 +24,7 @@ struct btrfs_tree_parent_check {
/*
* Expected transid, can be 0 to skip the check, but such skip
- * should only be utlized for backref walk related code.
+ * should only be utilized for backref walk related code.
*/
u64 transid;
@@ -51,6 +53,7 @@ enum btrfs_tree_block_status {
BTRFS_TREE_BLOCK_INVALID_BLOCKPTR,
BTRFS_TREE_BLOCK_INVALID_ITEM,
BTRFS_TREE_BLOCK_INVALID_OWNER,
+ BTRFS_TREE_BLOCK_WRITTEN_NOT_SET,
};
/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7d6729d9fd2f..5146387b416b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -13,13 +13,11 @@
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
-#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
-#include "zoned.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
@@ -393,7 +391,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* the leaf before writing into the log tree. See the comments at
* copy_items() for more details.
*/
- ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
item_size = btrfs_item_size(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
@@ -750,7 +748,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
if (ins.objectid > 0) {
- struct btrfs_ref ref = { 0 };
u64 csum_start;
u64 csum_end;
LIST_HEAD(ordered_sums);
@@ -764,13 +761,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
if (ret < 0) {
goto out;
} else if (ret == 0) {
- btrfs_init_generic_ref(&ref,
- BTRFS_ADD_DELAYED_REF,
- ins.objectid, ins.offset, 0,
- root->root_key.objectid);
- btrfs_init_data_ref(&ref,
- root->root_key.objectid,
- key->objectid, offset, 0, false);
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+ btrfs_init_data_ref(&ref, key->objectid, offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
@@ -780,7 +779,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* allocation tree
*/
ret = btrfs_alloc_logged_file_extent(trans,
- root->root_key.objectid,
+ btrfs_root_id(root),
key->objectid, offset, &ins);
if (ret)
goto out;
@@ -799,9 +798,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_list(root->log_root,
csum_start, csum_end - 1,
- &ordered_sums, 0, false);
- if (ret)
+ &ordered_sums, false);
+ if (ret < 0)
goto out;
+ ret = 0;
/*
* Now delete all existing cums in the csum root that
* cover our range. We do this because we can have an
@@ -2575,7 +2575,6 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
ret = btrfs_pin_reserved_extent(trans, eb);
if (ret)
return ret;
- btrfs_redirty_list_add(trans->transaction, eb);
} else {
unaccount_log_buffer(eb->fs_info, eb->start);
}
@@ -2821,6 +2820,52 @@ static void wait_for_writer(struct btrfs_root *root)
finish_wait(&root->log_writer_wait, &wait);
}
+/*
+ * Put a log context into its initial state before it is used to log @inode.
+ *
+ * Clears the result/transid fields and all boolean state flags, stores @inode
+ * in the context, initializes the three list heads (list, ordered_extents,
+ * conflict_inodes) as empty and leaves the scratch extent buffer unset (NULL).
+ * No memory is allocated here; btrfs_init_log_ctx_scratch_eb() is the helper
+ * that may attach a scratch extent buffer afterwards.
+ */
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
+ ctx->logging_new_delayed_dentries = false;
+ ctx->logged_before = false;
+ ctx->inode = inode;
+ INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+ INIT_LIST_HEAD(&ctx->conflict_inodes);
+ ctx->num_conflict_inodes = 0;
+ ctx->logging_conflict_inodes = false;
+ ctx->scratch_eb = NULL;
+}
+
+/*
+ * Try to pre-allocate the scratch extent buffer of an initialized log context.
+ *
+ * Nothing is allocated unless the inode stored in @ctx has at least one of
+ * BTRFS_INODE_NEEDS_FULL_SYNC or BTRFS_INODE_COPY_EVERYTHING set in its
+ * runtime flags. The allocation result is stored unchecked, so
+ * ctx->scratch_eb may still be NULL on return; clone_leaf() copes with that
+ * by cloning the leaf on demand.
+ */
+void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+
+ if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return;
+
+ /*
+ * Don't care about allocation failure. This is just for optimization,
+ * if we fail to allocate here, we will try again later if needed.
+ */
+ ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
+}
+
+/*
+ * Drop every ordered extent still queued on ctx->ordered_extents.
+ *
+ * Each entry is unlinked from the list through its log_list member and then
+ * released with btrfs_put_ordered_extent(). The inode stored in @ctx is
+ * asserted to be locked.
+ */
+void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ ASSERT(inode_is_locked(ctx->inode));
+
+ /* Entries are removed while walking, hence the _safe iterator. */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
@@ -3002,7 +3047,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (ret != -ENOSPC)
btrfs_err(fs_info,
"failed to update log for root %llu ret %d",
- root->root_key.objectid, ret);
+ btrfs_root_id(root), ret);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
@@ -3620,6 +3665,30 @@ out:
return ret;
}
+/*
+ * Replace the leaf at path->nodes[0] with a private copy kept in
+ * ctx->scratch_eb.
+ *
+ * If the context already has a scratch extent buffer, the full contents of
+ * the leaf are copied into it; otherwise the leaf is cloned and the clone
+ * becomes the scratch buffer. The path is then released and re-pointed at the
+ * scratch buffer, keeping the slot it had on entry.
+ *
+ * Returns 0 on success, or -ENOMEM if a clone was needed and could not be
+ * allocated (the path is left untouched in that case).
+ */
+static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
+{
+ /* Remember the slot now; it is written back after the path is released. */
+ const int slot = path->slots[0];
+
+ if (ctx->scratch_eb) {
+ copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
+ } else {
+ ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!ctx->scratch_eb)
+ return -ENOMEM;
+ }
+
+ btrfs_release_path(path);
+ path->nodes[0] = ctx->scratch_eb;
+ path->slots[0] = slot;
+ /*
+ * Add extra ref to scratch eb so that it is not freed when callers
+ * release the path, so we can reuse it later if needed.
+ */
+ atomic_inc(&ctx->scratch_eb->refs);
+
+ return 0;
+}
+
static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
@@ -3634,23 +3703,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
bool last_found = false;
int batch_start = 0;
int batch_size = 0;
- int i;
+ int ret;
/*
* We need to clone the leaf, release the read lock on it, and use the
* clone before modifying the log tree. See the comment at copy_items()
* about why we need to do this.
*/
- src = btrfs_clone_extent_buffer(path->nodes[0]);
- if (!src)
- return -ENOMEM;
+ ret = clone_leaf(path, ctx);
+ if (ret < 0)
+ return ret;
- i = path->slots[0];
- btrfs_release_path(path);
- path->nodes[0] = src;
- path->slots[0] = i;
+ src = path->nodes[0];
- for (; i < nritems; i++) {
+ for (int i = path->slots[0]; i < nritems; i++) {
struct btrfs_dir_item *di;
struct btrfs_key key;
int ret;
@@ -4260,17 +4326,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_path *dst_path,
struct btrfs_path *src_path,
int start_slot, int nr, int inode_only,
- u64 logged_isize)
+ u64 logged_isize, struct btrfs_log_ctx *ctx)
{
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
struct extent_buffer *src;
- int ret = 0;
+ int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
struct btrfs_item_batch batch;
char *ins_data;
- int i;
int dst_index;
const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
const u64 i_size = i_size_read(&inode->vfs_inode);
@@ -4303,14 +4368,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* while the other is holding the delayed node's mutex and wants to
* write lock the same subvolume leaf for flushing delayed items.
*/
- src = btrfs_clone_extent_buffer(src_path->nodes[0]);
- if (!src)
- return -ENOMEM;
+ ret = clone_leaf(src_path, ctx);
+ if (ret < 0)
+ return ret;
- i = src_path->slots[0];
- btrfs_release_path(src_path);
- src_path->nodes[0] = src;
- src_path->slots[0] = i;
+ src = src_path->nodes[0];
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
@@ -4325,7 +4387,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
batch.nr = 0;
dst_index = 0;
- for (i = 0; i < nr; i++) {
+ for (int i = 0; i < nr; i++) {
const int src_slot = start_slot + i;
struct btrfs_root *csum_root;
struct btrfs_ordered_sum *sums;
@@ -4400,9 +4462,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
disk_bytenr += extent_offset;
ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
disk_bytenr + extent_num_bytes - 1,
- &ordered_sums, 0, false);
- if (ret)
+ &ordered_sums, false);
+ if (ret < 0)
goto out;
+ ret = 0;
list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
if (!ret)
@@ -4432,7 +4495,7 @@ add_to_batch:
goto out;
dst_index = 0;
- for (i = 0; i < nr; i++) {
+ for (int i = 0; i < nr; i++) {
const int src_slot = start_slot + i;
const int dst_slot = dst_path->slots[0] + dst_index;
struct btrfs_key key;
@@ -4514,13 +4577,13 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *csum_root;
u64 csum_offset;
u64 csum_len;
- u64 mod_start = em->mod_start;
- u64 mod_len = em->mod_len;
+ u64 mod_start = em->start;
+ u64 mod_len = em->len;
LIST_HEAD(ordered_sums);
int ret = 0;
if (inode->flags & BTRFS_INODE_NODATASUM ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ (em->flags & EXTENT_FLAG_PREALLOC) ||
em->block_start == EXTENT_MAP_HOLE)
return 0;
@@ -4583,7 +4646,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
return 0;
/* If we're compressed we have to save the entire range of csums. */
- if (em->compress_type) {
+ if (extent_map_is_compressed(em)) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
@@ -4595,9 +4658,10 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, 0, false);
- if (ret)
+ csum_len - 1, &ordered_sums, false);
+ if (ret < 0)
return ret;
+ ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
@@ -4623,18 +4687,20 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item fi = { 0 };
struct extent_buffer *leaf;
struct btrfs_key key;
+ enum btrfs_compression_type compress_type;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
btrfs_set_stack_file_extent_generation(&fi, trans->transid);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
else
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ compress_type = extent_map_compression(em);
+ if (compress_type != BTRFS_COMPRESS_NONE) {
btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
@@ -4646,7 +4712,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_offset(&fi, extent_offset);
btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
- btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
+ btrfs_set_stack_file_extent_compression(&fi, compress_type);
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
@@ -4703,7 +4769,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
*/
static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
struct btrfs_key key;
@@ -4769,7 +4836,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
goto out;
ins_nr = 0;
@@ -4819,7 +4886,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
}
if (ins_nr > 0)
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
out:
btrfs_release_path(path);
btrfs_free_path(dst_path);
@@ -4859,13 +4926,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
continue;
/* We log prealloc extents beyond eof later. */
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+ if ((em->flags & EXTENT_FLAG_PREALLOC) &&
em->start >= i_size_read(&inode->vfs_inode))
continue;
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
- set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags |= EXTENT_FLAG_LOGGING;
list_add_tail(&em->list, &extents);
num++;
}
@@ -4882,7 +4949,7 @@ process:
* private list.
*/
if (ret) {
- clear_em_logging(tree, em);
+ clear_em_logging(inode, em);
free_extent_map(em);
continue;
}
@@ -4891,14 +4958,14 @@ process:
ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
- clear_em_logging(tree, em);
+ clear_em_logging(inode, em);
free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
if (!ret)
- ret = btrfs_log_prealloc_extents(trans, inode, path);
+ ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
if (ret)
return ret;
@@ -4979,7 +5046,8 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path)
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
int ret;
@@ -5008,7 +5076,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
if (slot >= nritems) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5034,7 +5102,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
}
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
return ret;
}
@@ -5846,7 +5914,7 @@ again:
}
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr,
- inode_only, logged_isize);
+ inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5865,7 +5933,7 @@ again:
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5882,7 +5950,7 @@ again:
}
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 1;
@@ -5897,7 +5965,7 @@ next_slot:
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
- logged_isize);
+ logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5922,7 +5990,7 @@ next_key:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret)
return ret;
}
@@ -5933,7 +6001,7 @@ next_key:
* lock the same leaf with btrfs_log_prealloc_extents() below.
*/
btrfs_release_path(path);
- ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
}
return ret;
@@ -6525,7 +6593,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
btrfs_release_path(dst_path);
- ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
if (ret)
goto out_unlock;
xattrs_logged = true;
@@ -6552,7 +6620,7 @@ log_extents:
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) {
- ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
if (ret)
goto out_unlock;
btrfs_release_path(path);
@@ -7501,6 +7569,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
ctx.logging_new_name = true;
+ btrfs_init_log_ctx_scratch_eb(&ctx);
/*
* We don't care about the return value. If we fail to log the new name
* then we know the next attempt to sync the log will fallback to a full
@@ -7509,6 +7578,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index a550a8a375cd..22e9cbc81577 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -6,10 +6,18 @@
#ifndef BTRFS_TREE_LOG_H
#define BTRFS_TREE_LOG_H
+#include <linux/list.h>
+#include <linux/fs.h>
#include "messages.h"
#include "ctree.h"
#include "transaction.h"
+struct inode;
+struct dentry;
+struct btrfs_ordered_extent;
+struct btrfs_root;
+struct btrfs_trans_handle;
+
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
@@ -36,37 +44,20 @@ struct btrfs_log_ctx {
struct list_head conflict_inodes;
int num_conflict_inodes;
bool logging_conflict_inodes;
+ /*
+ * Used for fsyncs that need to copy items from the subvolume tree to
+ * the log tree (full sync flag set or copy everything flag set) to
+ * avoid allocating a temporary extent buffer while holding a lock on
+ * an extent buffer of the subvolume tree and under the log transaction.
+ * Also helps to avoid allocating and freeing a temporary extent buffer
+ * in case we need to process multiple leaves from the subvolume tree.
+ */
+ struct extent_buffer *scratch_eb;
};
-static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
- struct inode *inode)
-{
- ctx->log_ret = 0;
- ctx->log_transid = 0;
- ctx->log_new_dentries = false;
- ctx->logging_new_name = false;
- ctx->logging_new_delayed_dentries = false;
- ctx->logged_before = false;
- ctx->inode = inode;
- INIT_LIST_HEAD(&ctx->list);
- INIT_LIST_HEAD(&ctx->ordered_extents);
- INIT_LIST_HEAD(&ctx->conflict_inodes);
- ctx->num_conflict_inodes = 0;
- ctx->logging_conflict_inodes = false;
-}
-
-static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
-{
- struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_extent *tmp;
-
- ASSERT(inode_is_locked(ctx->inode));
-
- list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
- list_del_init(&ordered->log_list);
- btrfs_put_ordered_extent(ordered);
- }
-}
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode);
+void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx);
+void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx);
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
{
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index 3df6153d5d5a..fa45b5fb9683 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -44,7 +44,7 @@ struct tree_mod_elem {
/*
* Pull a new tree mod seq number for our operation.
*/
-static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
return atomic64_inc_return(&fs_info->tree_mod_seq);
}
@@ -170,8 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
* this until all tree mod log insertions are recorded in the rb tree and then
* write unlock fs_info::tree_mod_log_lock.
*/
-static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
+static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return true;
@@ -188,7 +187,7 @@ static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
}
/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
-static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
@@ -367,9 +366,9 @@ free_tms:
return ret;
}
-static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct tree_mod_elem **tm_list,
- int nritems)
+static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem **tm_list,
+ int nritems)
{
int i, j;
int ret;
@@ -1005,7 +1004,7 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
check.level = level;
- check.owner_root = root->root_key.objectid;
+ check.owner_root = btrfs_root_id(root);
old = read_tree_block(fs_info, logical, &check);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
index 94f10afeee97..ff00c8e8a393 100644
--- a/fs/btrfs/tree-mod-log.h
+++ b/fs/btrfs/tree-mod-log.h
@@ -3,7 +3,13 @@
#ifndef BTRFS_TREE_MOD_LOG_H
#define BTRFS_TREE_MOD_LOG_H
-#include "ctree.h"
+#include <linux/list.h>
+
+struct extent_buffer;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_root;
+struct btrfs_seq_list;
/* Represents a tree mod log user. */
struct btrfs_seq_list {
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index b4ac2b0cd235..183863f4bfa4 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -7,7 +7,6 @@
#include <linux/slab.h>
#include "messages.h"
#include "ulist.h"
-#include "ctree.h"
/*
* ulist is a generic data structure to hold a collection of unique u64
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index b2cef187ea8e..8e200fe1a2dd 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -7,6 +7,7 @@
#ifndef BTRFS_ULIST_H
#define BTRFS_ULIST_H
+#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 5be74f9e47eb..b0aff297d67d 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -9,7 +9,6 @@
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
-#include "print-tree.h"
#include "fs.h"
#include "accessors.h"
#include "uuid-tree.h"
@@ -114,7 +113,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
sizeof(subid_le));
- if (ret >= 0) {
+ if (ret == 0) {
/* Add an item for the type for the first time */
eb = path->nodes[0];
slot = path->slots[0];
diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h
index 5350c87fe2ca..080ede0227ae 100644
--- a/fs/btrfs/uuid-tree.h
+++ b/fs/btrfs/uuid-tree.h
@@ -3,6 +3,11 @@
#ifndef BTRFS_UUID_TREE_H
#define BTRFS_UUID_TREE_H
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_fs_info;
+
int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 66e2270b0dae..4042dd6437ae 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -14,7 +14,6 @@
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
-#include "disk-io.h"
#include "locking.h"
#include "fs.h"
#include "accessors.h"
diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h
index 91c10f7d0a46..d696659e43e4 100644
--- a/fs/btrfs/verity.h
+++ b/fs/btrfs/verity.h
@@ -3,8 +3,13 @@
#ifndef BTRFS_VERITY_H
#define BTRFS_VERITY_H
+struct inode;
+struct btrfs_inode;
+
#ifdef CONFIG_FS_VERITY
+#include <linux/fsverity.h>
+
extern const struct fsverity_operations btrfs_verityops;
int btrfs_drop_verity_items(struct btrfs_inode *inode);
@@ -12,6 +17,8 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
#else
+#include <linux/errno.h>
+
static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
{
return 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c87e18827a0a..b6a701011fb0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,10 +14,8 @@
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
-#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
-#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "rcu-string.h"
@@ -41,6 +39,17 @@
BTRFS_BLOCK_GROUP_RAID10 | \
BTRFS_BLOCK_GROUP_RAID56_MASK)
+/*
+ * Bundle of stripe, mirror and RAID56 values together with the map operation
+ * they belong to.
+ *
+ * NOTE(review): the users of this structure are outside this part of the
+ * patch, so the exact meaning and units of each field are not documented
+ * here - confirm against the I/O mapping code in volumes.c.
+ */
+struct btrfs_io_geometry {
+ u32 stripe_index;
+ u32 stripe_nr;
+ int mirror_num;
+ int num_stripes;
+ u64 stripe_offset;
+ u64 raid56_full_stripe_start;
+ int max_errors;
+ enum btrfs_map_op op;
+};
+
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
@@ -457,39 +466,39 @@ static noinline struct btrfs_fs_devices *find_fsid(
+/*
+ * Open the block device at @device_path and read its btrfs super block.
+ *
+ * The device is opened with @flags and @holder. When @flush is non-zero the
+ * device is synced first. Its block size is then set to
+ * BTRFS_BDEV_BLOCKSIZE and the device is invalidated before the super block
+ * is read.
+ *
+ * Returns 0 with the opened file in *@bdev_file and the super block in
+ * *@disk_super. On failure the error of the failing step is returned, the
+ * file (if it was opened) is released and *@bdev_file is set to NULL;
+ * *@disk_super is not reset and must not be used.
+ */
static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
- int flush, struct bdev_handle **bdev_handle,
+ int flush, struct file **bdev_file,
struct btrfs_super_block **disk_super)
{
struct block_device *bdev;
int ret;
- *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
- if (IS_ERR(*bdev_handle)) {
- ret = PTR_ERR(*bdev_handle);
+ if (IS_ERR(*bdev_file)) {
+ ret = PTR_ERR(*bdev_file);
goto error;
}
- bdev = (*bdev_handle)->bdev;
+ bdev = file_bdev(*bdev_file);
if (flush)
sync_blockdev(bdev);
ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- bdev_release(*bdev_handle);
+ fput(*bdev_file);
goto error;
}
invalidate_bdev(bdev);
*disk_super = btrfs_read_dev_super(bdev);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- bdev_release(*bdev_handle);
+ fput(*bdev_file);
goto error;
}
return 0;
error:
- *bdev_handle = NULL;
+ *bdev_file = NULL;
return ret;
}
@@ -632,7 +641,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, blk_mode_t flags,
void *holder)
{
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -643,7 +652,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev_handle, &disk_super);
+ &bdev_file, &disk_super);
if (ret)
return ret;
@@ -667,22 +676,32 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
- if (bdev_read_only(bdev_handle->bdev))
+ if (bdev_read_only(file_bdev(bdev_file)))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!bdev_nonrot(bdev_handle->bdev))
+ if (!bdev_nonrot(file_bdev(bdev_file)))
fs_devices->rotating = true;
- if (bdev_max_discard_sectors(bdev_handle->bdev))
+ if (bdev_max_discard_sectors(file_bdev(bdev_file)))
fs_devices->discardable = true;
- device->bdev_handle = bdev_handle;
- device->bdev = bdev_handle->bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ if (device->devt != device->bdev->bd_dev) {
+ btrfs_warn(NULL,
+ "device %s maj:min changed from %d:%d to %d:%d",
+ device->name->str, MAJOR(device->devt),
+ MINOR(device->devt), MAJOR(device->bdev->bd_dev),
+ MINOR(device->bdev->bd_dev));
+
+ device->devt = device->bdev->bd_dev;
+ }
+
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -695,7 +714,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return -EINVAL;
}
@@ -748,18 +767,19 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (!fs_devices) {
fs_devices = alloc_fs_devices(disk_super->fsid);
+ if (IS_ERR(fs_devices))
+ return ERR_CAST(fs_devices);
+
if (has_metadata_uuid)
memcpy(fs_devices->metadata_uuid,
disk_super->metadata_uuid, BTRFS_FSID_SIZE);
- if (IS_ERR(fs_devices))
- return ERR_CAST(fs_devices);
-
if (same_fsid_diff_dev) {
generate_random_uuid(fs_devices->fsid);
fs_devices->temp_fsid = true;
- pr_info("BTRFS: device %s using temp-fsid %pU\n",
- path, fs_devices->fsid);
+ pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n",
+ path, MAJOR(path_devt), MINOR(path_devt),
+ fs_devices->fsid);
}
mutex_lock(&fs_devices->device_list_mutex);
@@ -788,8 +808,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (fs_devices->opened) {
btrfs_err(NULL,
-"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
- path, fs_devices->fsid, current->comm,
+"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
+ path, MAJOR(path_devt), MINOR(path_devt),
+ fs_devices->fsid, current->comm,
task_pid_nr(current));
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EBUSY);
@@ -815,13 +836,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (disk_super->label[0])
pr_info(
- "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
+"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
disk_super->label, devid, found_transid, path,
+ MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
else
pr_info(
- "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
+"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
disk_super->fsid, devid, found_transid, path,
+ MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
} else if (!device->name || strcmp(device->name->str, path)) {
@@ -1004,10 +1027,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
- if (device->bdev_handle) {
- bdev_release(device->bdev_handle);
+ if (device->bdev_file) {
+ fput(device->bdev_file);
device->bdev = NULL;
- device->bdev_handle = NULL;
+ device->bdev_file = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1052,7 +1075,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- bdev_release(device->bdev_handle);
+ fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1161,23 +1184,30 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
struct btrfs_device *tmp_device;
+ int ret = 0;
list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
dev_list) {
- int ret;
+ int ret2;
- ret = btrfs_open_one_device(fs_devices, device, flags, holder);
- if (ret == 0 &&
+ ret2 = btrfs_open_one_device(fs_devices, device, flags, holder);
+ if (ret2 == 0 &&
(!latest_dev || device->generation > latest_dev->generation)) {
latest_dev = device;
- } else if (ret == -ENODATA) {
+ } else if (ret2 == -ENODATA) {
fs_devices->num_devices--;
list_del(&device->dev_list);
btrfs_free_device(device);
}
+ if (ret == 0 && ret2 != 0)
+ ret = ret2;
}
- if (fs_devices->open_devices == 0)
+
+ if (fs_devices->open_devices == 0) {
+ if (ret)
+ return ret;
return -EINVAL;
+ }
fs_devices->opened = 1;
fs_devices->latest_dev = latest_dev;
@@ -1290,6 +1320,47 @@ int btrfs_forget_devices(dev_t devt)
return ret;
}
+static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
+ const char *path, dev_t devt,
+ bool mount_arg_dev)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Do not skip device registration for mounted devices with matching
+ * maj:min but different paths. Booting without initrd relies on
+ * /dev/root initially, later replaced with the actual root device.
+ * A successful scan ensures grub2-probe selects the correct device.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ struct btrfs_device *device;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ if (!fs_devices->opened) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ continue;
+ }
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->bdev && (device->bdev->bd_dev == devt) &&
+ strcmp(device->name->str, path) != 0) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ /* Do not skip registration. */
+ return false;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+ }
+
+ if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+ !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING))
+ return true;
+
+ return false;
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
@@ -1305,8 +1376,9 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
u64 bytenr, bytenr_orig;
+ dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
@@ -1328,36 +1400,31 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev_handle = bdev_open_by_path(path, flags, NULL, NULL);
- if (IS_ERR(bdev_handle))
- return ERR_CAST(bdev_handle);
+ bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
bytenr_orig = btrfs_sb_offset(0);
- ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr);
+ ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
goto error_bdev_put;
}
- disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr,
+ disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
}
- if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
- !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
- dev_t devt;
+ devt = file_bdev(bdev_file)->bd_dev;
+ if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
+ pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
+ path, MAJOR(devt), MINOR(devt));
- ret = lookup_bdev(path, &devt);
- if (ret)
- btrfs_warn(NULL, "lookup bdev failed for path %s: %d",
- path, ret);
- else
- btrfs_free_stale_devices(devt, NULL);
+ btrfs_free_stale_devices(devt, NULL);
- pr_debug("BTRFS: skip registering single non-seed device %s\n", path);
device = NULL;
goto free_disk_super;
}
@@ -1370,7 +1437,7 @@ free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return device;
}
@@ -1392,7 +1459,7 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
if (in_range(physical_start, *start, len) ||
in_range(*start, physical_start,
- physical_end - physical_start)) {
+ physical_end + 1 - physical_start)) {
*start = physical_end + 1;
return true;
}
@@ -1742,19 +1809,18 @@ out:
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
struct rb_node *n;
u64 ret = 0;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- n = rb_last(&em_tree->map.rb_root);
+ read_lock(&fs_info->mapping_tree_lock);
+ n = rb_last(&fs_info->mapping_tree.rb_root);
if (n) {
- em = rb_entry(n, struct extent_map, rb_node);
- ret = em->start + em->len;
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(n, struct btrfs_chunk_map, rb_node);
+ ret = map->start + map->chunk_len;
}
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
@@ -2022,11 +2088,10 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
copy_num, ret);
}
-void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path)
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device)
{
int copy_num;
+ struct block_device *bdev = device->bdev;
if (!bdev)
return;
@@ -2042,12 +2107,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
+ update_dev_time(device->name->str);
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct bdev_handle **bdev_handle)
+ struct file **bdev_file)
{
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
@@ -2156,7 +2221,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev_handle) {
+ if (device->bdev_file) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
@@ -2172,20 +2237,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and bdev_release() will pull in the ->open_mutex on
- * the block device and it's dependencies. Instead just flush the
- * device and let the caller do the final bdev_release.
+ * write lock, and fput() on the block device will pull in the
+ * ->open_mutex on the block device and its dependencies. Instead
+ * just flush the device and let the caller do the final fput().
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
- btrfs_scratch_superblocks(fs_info, device->bdev,
- device->name->str);
+ btrfs_scratch_superblocks(fs_info, device);
if (device->bdev) {
sync_blockdev(device->bdev);
invalidate_bdev(device->bdev);
}
}
- *bdev_handle = device->bdev_handle;
+ *bdev_file = device->bdev_file;
synchronize_rcu();
btrfs_free_device(device);
@@ -2291,8 +2355,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
- tgtdev->name->str);
+ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev);
btrfs_close_bdev(tgtdev);
synchronize_rcu();
@@ -2322,7 +2385,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
const char *path)
{
struct btrfs_super_block *disk_super;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int ret;
if (!path || !path[0])
@@ -2340,7 +2403,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
}
ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
- &bdev_handle, &disk_super);
+ &bdev_file, &disk_super);
if (ret) {
btrfs_put_dev_args_from_path(args);
return ret;
@@ -2353,7 +2416,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return 0;
}
@@ -2573,7 +2636,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct super_block *sb = fs_info->sb;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices = NULL;
@@ -2586,12 +2649,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
- if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) {
+ if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
ret = -EINVAL;
goto error;
}
@@ -2603,11 +2666,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
locked = true;
}
- sync_blockdev(bdev_handle->bdev);
+ sync_blockdev(file_bdev(bdev_file));
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (device->bdev == bdev_handle->bdev) {
+ if (device->bdev == file_bdev(bdev_file)) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
@@ -2623,8 +2686,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
device->fs_info = fs_info;
- device->bdev_handle = bdev_handle;
- device->bdev = bdev_handle->bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error_free_device;
@@ -2807,7 +2870,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- bdev_release(bdev_handle);
+ fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -2986,6 +3049,81 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
return ret;
}
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev;
+ struct btrfs_chunk_map *map;
+ struct btrfs_chunk_map *prev_map = NULL;
+
+ while (node) {
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ prev = node;
+ prev_map = map;
+
+ if (logical < map->start) {
+ node = node->rb_left;
+ } else if (logical >= map->start + map->chunk_len) {
+ node = node->rb_right;
+ } else {
+ refcount_inc(&map->refs);
+ return map;
+ }
+ }
+
+ if (!prev)
+ return NULL;
+
+ orig_prev = prev;
+ while (prev && logical >= prev_map->start + prev_map->chunk_len) {
+ prev = rb_next(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+
+ if (!prev) {
+ prev = orig_prev;
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ while (prev && logical < prev_map->start) {
+ prev = rb_prev(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+ }
+
+ if (prev) {
+ u64 end = logical + length;
+
+ /*
+ * Caller can pass a U64_MAX length when it wants to get any
+ * chunk starting at an offset of 'logical' or higher, so deal
+ * with overflow by resetting the end offset to U64_MAX.
+ */
+ if (end < logical)
+ end = U64_MAX;
+
+ if (end > prev_map->start &&
+ logical < prev_map->start + prev_map->chunk_len) {
+ refcount_inc(&prev_map->refs);
+ return prev_map;
+ }
+ }
+
+ return NULL;
+}
+
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct btrfs_chunk_map *map;
+
+ read_lock(&fs_info->mapping_tree_lock);
+ map = btrfs_find_chunk_map_nolock(fs_info, logical, length);
+ read_unlock(&fs_info->mapping_tree_lock);
+
+ return map;
+}
+
/*
* Find the mapping containing the given logical extent.
*
@@ -2994,37 +3132,35 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
*
* Return: Chunk mapping or ERR_PTR.
*/
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length)
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, length);
- if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ if (unlikely(!map)) {
+ btrfs_crit(fs_info,
+ "unable to find chunk map for logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
- if (em->start > logical || em->start + em->len < logical) {
+ if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
btrfs_crit(fs_info,
- "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
- logical, length, em->start, em->start + em->len);
- free_extent_map(em);
+ "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
+ logical, logical + length, map->start,
+ map->start + map->chunk_len);
+ btrfs_free_chunk_map(map);
return ERR_PTR(-EINVAL);
}
- /* callers are responsible for dropping em's ref. */
- return em;
+ /* Callers are responsible for dropping the reference. */
+ return map;
}
static int remove_chunk_item(struct btrfs_trans_handle *trans,
- struct map_lookup *map, u64 chunk_offset)
+ struct btrfs_chunk_map *map, u64 chunk_offset)
{
int i;
@@ -3049,23 +3185,21 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_extent_len = 0;
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em)) {
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
- return PTR_ERR(em);
+ return PTR_ERR(map);
}
- map = em->map_lookup;
/*
* First delete the device extent items from the devices btree.
@@ -3168,7 +3302,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
goto out;
}
- trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
+ trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
@@ -3187,7 +3321,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
*/
btrfs_trans_release_chunk_metadata(trans);
- ret = btrfs_remove_block_group(trans, chunk_offset, em);
+ ret = btrfs_remove_block_group(trans, map);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3199,7 +3333,7 @@ out:
trans->removing_chunk = false;
}
/* once for us */
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -3312,7 +3446,18 @@ again:
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /*
+ * On the first search we would find a chunk tree item with
+ * offset -1, which is not possible. On subsequent
+ * loops this would find an existing item on an invalid
+ * offset (one less than the previous one, wrong
+ * alignment and size).
+ */
+ ret = -EUCLEAN;
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto error;
+ }
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
@@ -3399,6 +3544,44 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
return 0;
}
+static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ const struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
+ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
+ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
+}
+
+static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ const struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
+ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
+ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
+}
+
static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl)
{
@@ -3543,7 +3726,7 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info)
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
int ret;
- BUG_ON(!fs_info->balance_ctl);
+ ASSERT(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = NULL;
@@ -5346,24 +5529,116 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
}
}
+static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ set_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
+ }
+}
+
+static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ __clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT,
+ NULL, NULL);
+ }
+}
+
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ write_lock(&fs_info->mapping_tree_lock);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ /* Once for the tree reference. */
+ btrfs_free_chunk_map(map);
+}
+
+EXPORT_FOR_TESTS
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ write_lock(&fs_info->mapping_tree_lock);
+ p = &fs_info->mapping_tree.rb_root.rb_node;
+ while (*p) {
+ struct btrfs_chunk_map *entry;
+
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_chunk_map, rb_node);
+
+ if (map->start < entry->start) {
+ p = &(*p)->rb_left;
+ } else if (map->start > entry->start) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else {
+ write_unlock(&fs_info->mapping_tree_lock);
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&map->rb_node, parent, p);
+ rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);
+ chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
+ chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ return 0;
+}
+
+EXPORT_FOR_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
+{
+ struct btrfs_chunk_map *map;
+
+ map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);
+ if (!map)
+ return NULL;
+
+ refcount_set(&map->refs, 1);
+ RB_CLEAR_NODE(&map->rb_node);
+
+ return map;
+}
+
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *block_group;
- struct extent_map *em;
u64 start = ctl->start;
u64 type = ctl->type;
int ret;
int i;
int j;
- map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
if (!map)
return ERR_PTR(-ENOMEM);
+
+ map->start = start;
+ map->chunk_len = ctl->chunk_size;
+ map->stripe_size = ctl->stripe_size;
+ map->type = type;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
+ map->sub_stripes = ctl->sub_stripes;
map->num_stripes = ctl->num_stripes;
for (i = 0; i < ctl->ndevs; ++i) {
@@ -5374,41 +5649,22 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
j * ctl->stripe_size;
}
}
- map->io_align = BTRFS_STRIPE_LEN;
- map->io_width = BTRFS_STRIPE_LEN;
- map->type = type;
- map->sub_stripes = ctl->sub_stripes;
trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
- em = alloc_extent_map();
- if (!em) {
- kfree(map);
- return ERR_PTR(-ENOMEM);
- }
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = start;
- em->len = ctl->chunk_size;
- em->block_start = 0;
- em->block_len = em->len;
- em->orig_block_len = ctl->stripe_size;
-
- em_tree = &info->mapping_tree;
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_chunk_map(info, map);
if (ret) {
- write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
- write_unlock(&em_tree->lock);
block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
- if (IS_ERR(block_group))
- goto error_del_extent;
+ if (IS_ERR(block_group)) {
+ btrfs_remove_chunk_map(info, map);
+ return block_group;
+ }
- for (i = 0; i < map->num_stripes; i++) {
+ for (int i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
btrfs_device_set_bytes_used(dev,
@@ -5421,23 +5677,10 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
atomic64_sub(ctl->stripe_size * map->num_stripes,
&info->free_chunk_space);
- free_extent_map(em);
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
return block_group;
-
-error_del_extent:
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* One for our allocation */
- free_extent_map(em);
- /* One for the tree reference */
- free_extent_map(em);
-
- return block_group;
}
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
@@ -5513,8 +5756,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
size_t item_size;
int i;
int ret;
@@ -5543,14 +5785,13 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
*/
lockdep_assert_held(&fs_info->chunk_mutex);
- em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
btrfs_abort_transaction(trans, ret);
return ret;
}
- map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
@@ -5607,7 +5848,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
out:
kfree(chunk);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -5652,7 +5893,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
return 0;
}
-static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
{
const int index = btrfs_bg_flags_to_raid_index(map->type);
@@ -5661,17 +5902,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int miss_ndevs = 0;
int i;
bool ret = true;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map))
return false;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (test_bit(BTRFS_DEV_STATE_MISSING,
&map->stripes[i].dev->dev_state)) {
@@ -5692,38 +5931,37 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (miss_ndevs > btrfs_chunk_max_errors(map))
ret = false;
end:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
-void btrfs_mapping_tree_free(struct extent_map_tree *tree)
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
{
- struct extent_map *em;
+ write_lock(&fs_info->mapping_tree_lock);
+ while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
+ struct btrfs_chunk_map *map;
+ struct rb_node *node;
- while (1) {
- write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, 0, (u64)-1);
- if (em)
- remove_extent_mapping(tree, em);
- write_unlock(&tree->lock);
- if (!em)
- break;
- /* once for us */
- free_extent_map(em);
- /* once for the tree */
- free_extent_map(em);
+ node = rb_first_cached(&fs_info->mapping_tree);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ /* Once for the tree ref. */
+ btrfs_free_chunk_map(map);
+ cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
}
+ write_unlock(&fs_info->mapping_tree_lock);
}
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
enum btrfs_raid_types index;
int ret = 1;
- em = btrfs_get_chunk_map(fs_info, logical, len);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(map))
/*
* We could return errors for these cases, but that could get
* ugly and we'd probably do the same thing which is just not do
@@ -5732,7 +5970,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
*/
return 1;
- map = em->map_lookup;
index = btrfs_bg_flags_to_raid_index(map->type);
/* Non-RAID56, use their ncopies from btrfs_raid_array. */
@@ -5749,55 +5986,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* stripe under reconstruction.
*/
ret = map->num_stripes;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned long len = fs_info->sectorsize;
if (!btrfs_fs_incompat(fs_info, RAID56))
return len;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ map = btrfs_get_chunk_map(fs_info, logical, len);
- if (!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
}
return len;
}
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int ret = 0;
if (!btrfs_fs_incompat(fs_info, RAID56))
return 0;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ map = btrfs_get_chunk_map(fs_info, logical, len);
- if(!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
}
return ret;
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
- struct map_lookup *map, int first,
+ struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
{
+ const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
int i;
int num_stripes;
int preferred_mirror;
@@ -5812,13 +6046,12 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
else
num_stripes = map->num_stripes;
- switch (fs_info->fs_devices->read_policy) {
+ switch (policy) {
default:
/* Shouldn't happen, just warn and use pid instead of failing */
- btrfs_warn_rl(fs_info,
- "unknown read_policy type %u, reset to pid",
- fs_info->fs_devices->read_policy);
- fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid",
+ policy);
+ WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID);
fallthrough;
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
@@ -5902,8 +6135,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
u32 *num_stripes)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
@@ -5921,11 +6153,9 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
int ret;
int i;
- em = btrfs_get_chunk_map(fs_info, logical, length);
- if (IS_ERR(em))
- return ERR_CAST(em);
-
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(map))
+ return ERR_CAST(map);
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -5933,8 +6163,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
goto out_free_map;
}
- offset = logical - em->start;
- length = min_t(u64, em->start + em->len - logical, length);
+ offset = logical - map->start;
+ length = min_t(u64, map->start + map->chunk_len - logical, length);
*length_ret = length;
/*
@@ -6031,10 +6261,10 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
}
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return stripes;
out_free_map:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
@@ -6132,17 +6362,16 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->replace_nr_stripes = nr_extra_stripes;
}
-static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
- u64 offset, u32 *stripe_nr, u64 *stripe_offset,
- u64 *full_stripe_start)
+static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
+ struct btrfs_io_geometry *io_geom)
{
/*
* Stripe_nr is the stripe where this block falls. stripe_offset is
* the offset of this block in its stripe.
*/
- *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
- *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
- ASSERT(*stripe_offset < U32_MAX);
+ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
+ io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
+ ASSERT(io_geom->stripe_offset < U32_MAX);
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len =
@@ -6157,18 +6386,17 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* to go rounddown(), not round_down(), as nr_data_stripes is
* not ensured to be power of 2.
*/
- *full_stripe_start =
- btrfs_stripe_nr_to_offset(
- rounddown(*stripe_nr, nr_data_stripes(map)));
+ io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
+ rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
- ASSERT(*full_stripe_start + full_stripe_len > offset);
- ASSERT(*full_stripe_start <= offset);
+ ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
+ ASSERT(io_geom->raid56_full_stripe_start <= offset);
/*
* For writes to RAID56, allow to write a full stripe set, but
* no straddling of stripe sets.
*/
- if (op == BTRFS_MAP_WRITE)
- return full_stripe_len - (offset - *full_stripe_start);
+ if (io_geom->op == BTRFS_MAP_WRITE)
+ return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
}
/*
@@ -6176,26 +6404,180 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* a single disk).
*/
if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
- return BTRFS_STRIPE_LEN - *stripe_offset;
+ return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
return U64_MAX;
}
-static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length, struct btrfs_io_stripe *dst,
- struct map_lookup *map, u32 stripe_index,
- u64 stripe_offset, u64 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 *length, struct btrfs_io_stripe *dst,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
{
- dst->dev = map->stripes[stripe_index].dev;
+ dst->dev = map->stripes[io_geom->stripe_index].dev;
- if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type))
+ if (io_geom->op == BTRFS_MAP_READ &&
+ btrfs_need_stripe_tree_update(fs_info, map->type))
return btrfs_get_raid_extent_offset(fs_info, logical, length,
- map->type, stripe_index, dst);
+ map->type,
+ io_geom->stripe_index, dst);
- dst->physical = map->stripes[stripe_index].physical +
- stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+ dst->physical = map->stripes[io_geom->stripe_index].physical +
+ io_geom->stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
return 0;
}
+/*
+ * Check whether a mapping can be handed back through the caller-provided
+ * @smap instead of an allocated I/O context.
+ *
+ * This holds only when all of the following are true:
+ * - the caller passed an @smap,
+ * - exactly one stripe would be allocated (@num_alloc_stripes == 1),
+ * - it is not a non-read operation on a chunk type for which
+ *   btrfs_need_stripe_tree_update() returns true,
+ * - it is not a RAID56 chunk accessed with @mirror_num above 1.
+ *
+ * Return true if the single stripe path can be used, false otherwise.
+ */
+static bool is_single_device_io(struct btrfs_fs_info *fs_info,
+				const struct btrfs_io_stripe *smap,
+				const struct btrfs_chunk_map *map,
+				int num_alloc_stripes,
+				enum btrfs_map_op op, int mirror_num)
+{
+	if (!smap || num_alloc_stripes != 1)
+		return false;
+
+	if (btrfs_need_stripe_tree_update(fs_info, map->type) &&
+	    op != BTRFS_MAP_READ)
+		return false;
+
+	/* Only a RAID56 chunk with mirror_num above 1 is rejected here. */
+	return !(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) || mirror_num <= 1;
+}
+
+/*
+ * RAID0: split the chunk-wide stripe number held in @io_geom->stripe_nr.
+ *
+ * The remainder modulo map->num_stripes becomes @io_geom->stripe_index and
+ * the quotient is stored back into @io_geom->stripe_nr.  For reads the mirror
+ * number is always set to 1.
+ */
+static void map_blocks_raid0(const struct btrfs_chunk_map *map,
+			     struct btrfs_io_geometry *io_geom)
+{
+	const int nr_stripes = map->num_stripes;
+
+	io_geom->stripe_index = io_geom->stripe_nr % nr_stripes;
+	io_geom->stripe_nr = io_geom->stripe_nr / nr_stripes;
+
+	if (io_geom->op == BTRFS_MAP_READ)
+		io_geom->mirror_num = 1;
+}
+
+/*
+ * RAID1 family: choose the stripe(s) an operation has to touch.
+ *
+ * - Anything but a read: report all map->num_stripes stripes in
+ *   @io_geom->num_stripes.
+ * - Read with a mirror already selected: mirror numbers are 1-based, so the
+ *   stripe index is mirror_num - 1.
+ * - Read without a mirror: let find_live_mirror() pick one, searching from
+ *   stripe 0, and record the matching 1-based mirror number.
+ */
+static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
+			     struct btrfs_chunk_map *map,
+			     struct btrfs_io_geometry *io_geom,
+			     bool dev_replace_is_ongoing)
+{
+	if (io_geom->op != BTRFS_MAP_READ) {
+		io_geom->num_stripes = map->num_stripes;
+	} else if (io_geom->mirror_num) {
+		io_geom->stripe_index = io_geom->mirror_num - 1;
+	} else {
+		io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
+							 dev_replace_is_ongoing);
+		io_geom->mirror_num = io_geom->stripe_index + 1;
+	}
+}
+
+/*
+ * DUP: choose the stripe(s) an operation has to touch.
+ *
+ * - Anything but a read: report all map->num_stripes stripes in
+ *   @io_geom->num_stripes.
+ * - Read with a mirror already selected: mirror numbers are 1-based, so the
+ *   stripe index is mirror_num - 1.
+ * - Read without a mirror: only the mirror number is set (to 1);
+ *   @io_geom->stripe_index keeps whatever value the caller put there.
+ */
+static void map_blocks_dup(const struct btrfs_chunk_map *map,
+			   struct btrfs_io_geometry *io_geom)
+{
+	if (io_geom->op != BTRFS_MAP_READ)
+		io_geom->num_stripes = map->num_stripes;
+	else if (io_geom->mirror_num)
+		io_geom->stripe_index = io_geom->mirror_num - 1;
+	else
+		io_geom->mirror_num = 1;
+}
+
+/*
+ * RAID10: locate the group of sub stripes holding the block, then choose the
+ * stripe(s) inside that group.
+ *
+ * The stripes are handled as num_stripes / sub_stripes groups of
+ * map->sub_stripes entries each.  @io_geom->stripe_index is first set to the
+ * start of the group selected by the stripe number, and @io_geom->stripe_nr is
+ * divided by the number of groups.
+ *
+ * - Anything but a read: report map->sub_stripes stripes in
+ *   @io_geom->num_stripes.
+ * - Read with a mirror already selected: advance the index by mirror_num - 1
+ *   within the group.
+ * - Read without a mirror: let find_live_mirror() pick one, searching from the
+ *   start of the group, and record the mirror number relative to that start.
+ */
+static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
+			      struct btrfs_chunk_map *map,
+			      struct btrfs_io_geometry *io_geom,
+			      bool dev_replace_is_ongoing)
+{
+	const u32 factor = map->num_stripes / map->sub_stripes;
+
+	io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes;
+	io_geom->stripe_nr = io_geom->stripe_nr / factor;
+
+	if (io_geom->op != BTRFS_MAP_READ) {
+		io_geom->num_stripes = map->sub_stripes;
+	} else if (io_geom->mirror_num) {
+		io_geom->stripe_index += io_geom->mirror_num - 1;
+	} else {
+		/* Remember where the group starts to derive the mirror number. */
+		int group_start = io_geom->stripe_index;
+
+		io_geom->stripe_index = find_live_mirror(fs_info, map,
+							 group_start,
+							 dev_replace_is_ongoing);
+		io_geom->mirror_num = io_geom->stripe_index - group_start + 1;
+	}
+}
+
+/*
+ * RAID56 mapping for the cases that need the whole full stripe (the caller
+ * uses it for non-read operations and for reads with mirror_num above 1).
+ *
+ * All stripes of the chunk are reported, @io_geom->stripe_nr becomes the full
+ * stripe number, the stripe index and in-stripe offset are reset to 0, and
+ * *@length is clamped so the range ends no later than the full stripe does.
+ */
+static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
+				    struct btrfs_io_geometry *io_geom,
+				    u64 logical, u64 *length)
+{
+	int data_stripes = nr_data_stripes(map);
+	u64 range_end;
+	u64 full_stripe_end;
+
+	/*
+	 * Needs full stripe mapping.
+	 *
+	 * Push stripe_nr back to the start of the full stripe.  For those
+	 * cases needing a full stripe, @stripe_nr is the full stripe number.
+	 *
+	 * Originally we go raid56_full_stripe_start / full_stripe_len, but
+	 * that can be expensive.  Here we just divide @stripe_nr with
+	 * @data_stripes.
+	 */
+	io_geom->stripe_nr = io_geom->stripe_nr / data_stripes;
+
+	/* RAID[56] write or recovery. Return all stripes. */
+	io_geom->num_stripes = map->num_stripes;
+	io_geom->max_errors = btrfs_chunk_max_errors(map);
+
+	/* Return the length to the full stripe end. */
+	range_end = logical + *length;
+	full_stripe_end = io_geom->raid56_full_stripe_start + map->start +
+			  btrfs_stripe_nr_to_offset(data_stripes);
+	*length = min(range_end, full_stripe_end) - logical;
+
+	/* The mapping starts at the first stripe, at offset 0 inside it. */
+	io_geom->stripe_index = 0;
+	io_geom->stripe_offset = 0;
+}
+
+/*
+ * RAID56 mapping for a plain read of a single data stripe (mirror_num is
+ * asserted to be at most 1).
+ *
+ * @io_geom->stripe_nr is split by the number of data stripes into a data
+ * stripe index and a full stripe number; the index is then rotated by the full
+ * stripe number, modulo map->num_stripes.  A read with no mirror selected is
+ * reported as mirror 1.
+ */
+static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
+				   struct btrfs_io_geometry *io_geom)
+{
+	int data_stripes = nr_data_stripes(map);
+
+	ASSERT(io_geom->mirror_num <= 1);
+
+	/* Just grab the data stripe directly. */
+	io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
+	io_geom->stripe_nr = io_geom->stripe_nr / data_stripes;
+
+	/* We distribute the parity blocks across stripes. */
+	io_geom->stripe_index = (io_geom->stripe_nr + io_geom->stripe_index) %
+				map->num_stripes;
+
+	if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
+		io_geom->mirror_num = 1;
+}
+
+/*
+ * Mapping used by btrfs_map_block() for chunk types without a dedicated
+ * profile helper.
+ *
+ * After this, @io_geom->stripe_nr is the number of stripes on this device we
+ * have to walk to find the data, @io_geom->stripe_index is the number of our
+ * device in the stripe array, and the mirror number is stripe_index + 1.
+ */
+static void map_blocks_single(const struct btrfs_chunk_map *map,
+			      struct btrfs_io_geometry *io_geom)
+{
+	const int nr_stripes = map->num_stripes;
+
+	io_geom->stripe_index = io_geom->stripe_nr % nr_stripes;
+	io_geom->stripe_nr = io_geom->stripe_nr / nr_stripes;
+	io_geom->mirror_num = io_geom->stripe_index + 1;
+}
+
/*
* Map one logical range to one or more physical ranges.
*
@@ -6236,43 +6618,37 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_io_context **bioc_ret,
struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
+ struct btrfs_io_geometry io_geom = { 0 };
u64 map_offset;
- u64 stripe_offset;
- u32 stripe_nr;
- u32 stripe_index;
- int data_stripes;
int i;
int ret = 0;
- int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
- int num_stripes;
int num_copies;
- int max_errors = 0;
struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
u16 num_alloc_stripes;
- u64 raid56_full_stripe_start = (u64)-1;
u64 max_len;
ASSERT(bioc_ret);
+ io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
+ io_geom.num_stripes = 1;
+ io_geom.stripe_index = 0;
+ io_geom.op = op;
+
num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
- if (mirror_num > num_copies)
+ if (io_geom.mirror_num > num_copies)
return -EINVAL;
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- map = em->map_lookup;
- data_stripes = nr_data_stripes(map);
+ map = btrfs_get_chunk_map(fs_info, logical, *length);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
- map_offset = logical - em->start;
- max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
- &stripe_offset, &raid56_full_stripe_start);
- *length = min_t(u64, em->len - map_offset, max_len);
+ map_offset = logical - map->start;
+ io_geom.raid56_full_stripe_start = (u64)-1;
+ max_len = btrfs_max_io_len(map, map_offset, &io_geom);
+ *length = min_t(u64, map->chunk_len - map_offset, max_len);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
@@ -6283,107 +6659,46 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (!dev_replace_is_ongoing)
up_read(&dev_replace->rwsem);
- num_stripes = 1;
- stripe_index = 0;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_index = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
- if (op == BTRFS_MAP_READ)
- mirror_num = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- if (op != BTRFS_MAP_READ) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- stripe_index = find_live_mirror(fs_info, map, 0,
- dev_replace_is_ongoing);
- mirror_num = stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op != BTRFS_MAP_READ) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- mirror_num = 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- u32 factor = map->num_stripes / map->sub_stripes;
-
- stripe_index = (stripe_nr % factor) * map->sub_stripes;
- stripe_nr /= factor;
-
- if (op != BTRFS_MAP_READ)
- num_stripes = map->sub_stripes;
- else if (mirror_num)
- stripe_index += mirror_num - 1;
- else {
- int old_stripe_index = stripe_index;
- stripe_index = find_live_mirror(fs_info, map,
- stripe_index,
- dev_replace_is_ongoing);
- mirror_num = stripe_index - old_stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (op != BTRFS_MAP_READ || mirror_num > 1) {
- /*
- * Needs full stripe mapping.
- *
- * Push stripe_nr back to the start of the full stripe
- * For those cases needing a full stripe, @stripe_nr
- * is the full stripe number.
- *
- * Originally we go raid56_full_stripe_start / full_stripe_len,
- * but that can be expensive. Here we just divide
- * @stripe_nr with @data_stripes.
- */
- stripe_nr /= data_stripes;
-
- /* RAID[56] write or recovery. Return all stripes */
- num_stripes = map->num_stripes;
- max_errors = btrfs_chunk_max_errors(map);
-
- /* Return the length to the full stripe end */
- *length = min(logical + *length,
- raid56_full_stripe_start + em->start +
- btrfs_stripe_nr_to_offset(data_stripes)) -
- logical;
- stripe_index = 0;
- stripe_offset = 0;
- } else {
- ASSERT(mirror_num <= 1);
- /* Just grab the data stripe directly. */
- stripe_index = stripe_nr % data_stripes;
- stripe_nr /= data_stripes;
-
- /* We distribute the parity blocks across stripes */
- stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
- if (op == BTRFS_MAP_READ && mirror_num < 1)
- mirror_num = 1;
- }
- } else {
+ switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case BTRFS_BLOCK_GROUP_RAID0:
+ map_blocks_raid0(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ map_blocks_dup(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID10:
+ map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID5:
+ case BTRFS_BLOCK_GROUP_RAID6:
+ if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
+ map_blocks_raid56_write(map, &io_geom, logical, length);
+ else
+ map_blocks_raid56_read(map, &io_geom);
+ break;
+ default:
/*
* After this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- stripe_index = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
- mirror_num = stripe_index + 1;
+ map_blocks_single(map, &io_geom);
+ break;
}
- if (stripe_index >= map->num_stripes) {
+ if (io_geom.stripe_index >= map->num_stripes) {
btrfs_crit(fs_info,
"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
- stripe_index, map->num_stripes);
+ io_geom.stripe_index, map->num_stripes);
ret = -EINVAL;
goto out;
}
- num_alloc_stripes = num_stripes;
+ num_alloc_stripes = io_geom.num_stripes;
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ)
/*
@@ -6400,14 +6715,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* physical block information on the stack instead of allocating an
* I/O context structure.
*/
- if (smap && num_alloc_stripes == 1 &&
- !(btrfs_need_stripe_tree_update(fs_info, map->type) &&
- op != BTRFS_MAP_READ) &&
- !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
- ret = set_io_stripe(fs_info, op, logical, length, smap, map,
- stripe_index, stripe_offset, stripe_nr);
+ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
+ io_geom.mirror_num)) {
+ ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
if (mirror_num_ret)
- *mirror_num_ret = mirror_num;
+ *mirror_num_ret = io_geom.mirror_num;
*bioc_ret = NULL;
goto out;
}
@@ -6427,7 +6739,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* It's still mostly the same as other profiles, just with extra rotation.
*/
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
- (op != BTRFS_MAP_READ || mirror_num > 1)) {
+ (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
/*
* For RAID56 @stripe_nr is already the number of full stripes
* before us, which is also the rotation value (needs to modulo
@@ -6436,28 +6748,31 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* In this case, we just add @stripe_nr with @i, then do the
* modulo, to reduce one modulo call.
*/
- bioc->full_stripe_logical = em->start +
- btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
- for (int i = 0; i < num_stripes; i++) {
- ret = set_io_stripe(fs_info, op, logical, length,
- &bioc->stripes[i], map,
- (i + stripe_nr) % num_stripes,
- stripe_offset, stripe_nr);
- if (ret < 0)
- break;
+ bioc->full_stripe_logical = map->start +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
+ nr_data_stripes(map));
+ for (int i = 0; i < io_geom.num_stripes; i++) {
+ struct btrfs_io_stripe *dst = &bioc->stripes[i];
+ u32 stripe_index;
+
+ stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical =
+ map->stripes[stripe_index].physical +
+ io_geom.stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
}
} else {
/*
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
- for (i = 0; i < num_stripes; i++) {
- ret = set_io_stripe(fs_info, op, logical, length,
- &bioc->stripes[i], map, stripe_index,
- stripe_offset, stripe_nr);
+ for (i = 0; i < io_geom.num_stripes; i++) {
+ ret = set_io_stripe(fs_info, logical, length,
+ &bioc->stripes[i], map, &io_geom);
if (ret < 0)
break;
- stripe_index++;
+ io_geom.stripe_index++;
}
}
@@ -6468,18 +6783,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
if (op != BTRFS_MAP_READ)
- max_errors = btrfs_chunk_max_errors(map);
+ io_geom.max_errors = btrfs_chunk_max_errors(map);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ) {
handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
- &num_stripes, &max_errors);
+ &io_geom.num_stripes, &io_geom.max_errors);
}
*bioc_ret = bioc;
- bioc->num_stripes = num_stripes;
- bioc->max_errors = max_errors;
- bioc->mirror_num = mirror_num;
+ bioc->num_stripes = io_geom.num_stripes;
+ bioc->max_errors = io_geom.max_errors;
+ bioc->mirror_num = io_geom.mirror_num;
out:
if (dev_replace_is_ongoing) {
@@ -6487,7 +6802,7 @@ out:
/* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem);
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -6659,12 +6974,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-u64 btrfs_calc_stripe_length(const struct extent_map *em)
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
{
- const struct map_lookup *map = em->map_lookup;
const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
- return div_u64(em->len, data_stripes);
+ return div_u64(map->chunk_len, data_stripes);
}
#if BITS_PER_LONG == 32
@@ -6733,9 +7047,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
u64 logical;
u64 length;
u64 devid;
@@ -6769,35 +7081,22 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
return ret;
}
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, logical, 1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, 1);
/* already mapped? */
- if (em && em->start <= logical && em->start + em->len > logical) {
- free_extent_map(em);
+ if (map && map->start <= logical && map->start + map->chunk_len > logical) {
+ btrfs_free_chunk_map(map);
return 0;
- } else if (em) {
- free_extent_map(em);
+ } else if (map) {
+ btrfs_free_chunk_map(map);
}
- em = alloc_extent_map();
- if (!em)
- return -ENOMEM;
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map) {
- free_extent_map(em);
+ map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
+ if (!map)
return -ENOMEM;
- }
-
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = logical;
- em->len = length;
- em->orig_start = 0;
- em->block_start = 0;
- em->block_len = em->len;
+ map->start = logical;
+ map->chunk_len = length;
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
@@ -6812,7 +7111,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
*/
map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
- em->orig_block_len = btrfs_calc_stripe_length(em);
+ map->stripe_size = btrfs_calc_stripe_length(map);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -6828,7 +7127,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
ret = PTR_ERR(map->stripes[i].dev);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
}
@@ -6837,15 +7136,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
&(map->stripes[i].dev->dev_state));
}
- write_lock(&map_tree->lock);
- ret = add_extent_mapping(map_tree, em, 0);
- write_unlock(&map_tree->lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret < 0) {
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
- em->start, em->len, ret);
+ map->start, map->chunk_len, ret);
}
- free_extent_map(em);
return ret;
}
@@ -7155,26 +7451,21 @@ out_short_read:
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- u64 next_start = 0;
+ struct btrfs_chunk_map *map;
+ u64 next_start;
bool ret = true;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, 0, (u64)-1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
/* No chunk at all? Return false anyway */
- if (!em) {
+ if (!map) {
ret = false;
goto out;
}
- while (em) {
- struct map_lookup *map;
+ while (map) {
int missing = 0;
int max_tolerated;
int i;
- map = em->map_lookup;
max_tolerated =
btrfs_get_num_tolerated_disk_barrier_failures(
map->type);
@@ -7192,18 +7483,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
if (!failing_dev)
btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writable mount",
- em->start, missing, max_tolerated);
- free_extent_map(em);
+ map->start, missing, max_tolerated);
+ btrfs_free_chunk_map(map);
ret = false;
goto out;
}
- next_start = extent_map_end(em);
- free_extent_map(em);
+ next_start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, next_start,
- (u64)(-1) - next_start);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
}
out:
return ret;
@@ -7696,20 +7984,15 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 physical_offset, u64 physical_len)
{
struct btrfs_dev_lookup_args args = { .devid = devid };
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *dev;
u64 stripe_len;
bool found = false;
int ret = 0;
int i;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ if (!map) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
@@ -7717,12 +8000,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
goto out;
}
- map = em->map_lookup;
- stripe_len = btrfs_calc_stripe_length(em);
+ stripe_len = btrfs_calc_stripe_length(map);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
- physical_offset, devid, em->start, physical_len,
+ physical_offset, devid, map->start, physical_len,
stripe_len);
ret = -EUCLEAN;
goto out;
@@ -7745,7 +8027,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
"too many dev extents for chunk %llu found",
- em->start);
+ map->start);
ret = -EUCLEAN;
goto out;
}
@@ -7791,32 +8073,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
struct rb_node *node;
int ret = 0;
- read_lock(&em_tree->lock);
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- em = rb_entry(node, struct extent_map, rb_node);
- if (em->map_lookup->num_stripes !=
- em->map_lookup->verified_stripes) {
+ read_lock(&fs_info->mapping_tree_lock);
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ if (map->num_stripes != map->verified_stripes) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
- em->start, em->map_lookup->verified_stripes,
- em->map_lookup->num_stripes);
+ map->start, map->verified_stripes, map->num_stripes);
ret = -EUCLEAN;
goto out;
}
}
out:
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9cc374864a79..66e6fc481ecd 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -6,13 +6,28 @@
#ifndef BTRFS_VOLUMES_H
#define BTRFS_VOLUMES_H
+#include <linux/blk_types.h>
+#include <linux/sizes.h>
+#include <linux/atomic.h>
#include <linux/sort.h>
-#include <linux/btrfs.h>
-#include "async-thread.h"
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/log2.h>
+#include <linux/kobject.h>
+#include <linux/refcount.h>
+#include <linux/completion.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/btrfs.h>
#include "messages.h"
-#include "tree-checker.h"
#include "rcu-string.h"
+struct block_device;
+struct bdev_handle;
+struct btrfs_fs_info;
+struct btrfs_block_group;
+struct btrfs_trans_handle;
+struct btrfs_zoned_device_info;
+
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
extern struct mutex uuid_mutex;
@@ -77,7 +92,10 @@ enum btrfs_raid_types {
#define BTRFS_DEV_STATE_FLUSH_SENT (4)
#define BTRFS_DEV_STATE_NO_READA (5)
-struct btrfs_zoned_device_info;
+/* Special value encoding failure to write primary super block. */
+#define BTRFS_SUPER_PRIMARY_WRITE_ERROR (INT_MAX / 2)
+
+struct btrfs_fs_devices;
struct btrfs_device {
struct list_head dev_list; /* device_list_mutex */
@@ -90,7 +108,7 @@ struct btrfs_device {
u64 generation;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
struct btrfs_zoned_device_info *zone_info;
@@ -127,6 +145,12 @@ struct btrfs_device {
/* type and info about this device */
u64 type;
+ /*
+ * Counter of super block write errors, values larger than
+ * BTRFS_SUPER_PRIMARY_WRITE_ERROR encode primary super block write failure.
+ */
+ atomic_t sb_write_errors;
+
/* minimal io size for this device */
u32 sector_size;
@@ -276,6 +300,25 @@ enum btrfs_read_policy {
BTRFS_NR_READ_POLICY,
};
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Checksum mode - offload it to workqueues or do it synchronously in
+ * btrfs_submit_chunk().
+ *
+ * This type only exists when CONFIG_BTRFS_DEBUG is enabled.  The enumerators
+ * have no explicit values, so BTRFS_OFFLOAD_CSUM_AUTO is 0 and the others
+ * follow in declaration order.
+ */
+enum btrfs_offload_csum_mode {
+	/*
+	 * Choose offloading checksum or do it synchronously automatically.
+	 * Do it synchronously if the checksum is fast, or offload to workqueues
+	 * otherwise.
+	 */
+	BTRFS_OFFLOAD_CSUM_AUTO,
+	/* Always offload checksum to workqueues. */
+	BTRFS_OFFLOAD_CSUM_FORCE_ON,
+	/* Never offload checksum to workqueues. */
+	BTRFS_OFFLOAD_CSUM_FORCE_OFF,
+};
+#endif
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
@@ -380,6 +423,11 @@ struct btrfs_fs_devices {
/* Policy used to read the mirrored stripes. */
enum btrfs_read_policy read_policy;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /* Checksum mode - offload it or do it synchronously. */
+ enum btrfs_offload_csum_mode offload_csum_mode;
+#endif
};
#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
@@ -426,7 +474,8 @@ struct btrfs_discard_stripe {
struct btrfs_io_context {
refcount_t refs;
struct btrfs_fs_info *fs_info;
- u64 map_type; /* get from map_lookup->type */
+ /* Taken from struct btrfs_chunk_map::type. */
+ u64 map_type;
struct bio *orig_bio;
atomic_t error;
u16 max_errors;
@@ -529,21 +578,33 @@ struct btrfs_raid_attr {
extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-struct map_lookup {
+struct btrfs_chunk_map {
+ struct rb_node rb_node;
+ /* For mount time dev extent verification. */
+ int verified_stripes;
+ refcount_t refs;
+ u64 start;
+ u64 chunk_len;
+ u64 stripe_size;
u64 type;
int io_align;
int io_width;
int num_stripes;
int sub_stripes;
- int verified_stripes; /* For mount time dev extent verification */
struct btrfs_io_stripe stripes[];
};
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_io_stripe) * (n)))
+#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \
+ (sizeof(struct btrfs_io_stripe) * (n)))
+
+/*
+ * Drop one reference on a chunk map and free it when that was the last one.
+ *
+ * @map may be NULL, in which case nothing happens.  When the final reference
+ * is dropped the map is expected to be off the rb-tree already (this is
+ * asserted) and its memory is released with kfree().
+ */
+static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map)
+{
+	if (!map)
+		return;
+
+	/* Other holders remain, nothing more to do. */
+	if (!refcount_dec_and_test(&map->refs))
+		return;
+
+	ASSERT(RB_EMPTY_NODE(&map->rb_node));
+	kfree(map);
+}
-struct btrfs_balance_args;
-struct btrfs_balance_progress;
struct btrfs_balance_control {
struct btrfs_balance_args data;
struct btrfs_balance_args meta;
@@ -598,7 +659,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
}
/*
- * Do the type safe converstion from stripe_nr to offset inside the chunk.
+ * Do the type safe conversion from stripe_nr to offset inside the chunk.
*
* @stripe_nr is u32, with left shift it can overflow u32 for chunks larger
* than 4G. This does the proper type cast to avoid overflow.
@@ -624,7 +685,7 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
-void btrfs_mapping_tree_free(struct extent_map_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
@@ -646,7 +707,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct bdev_handle **bdev_handle);
+ struct file **bdev_file);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
@@ -680,13 +741,24 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
-u64 btrfs_calc_stripe_length(const struct extent_map *em);
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp);
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
+#endif
+
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
@@ -753,9 +825,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
-void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device);
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
int btrfs_bg_type_to_factor(u64 flags);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3cf236fb40a4..15d0999e340e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -382,6 +382,53 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
+static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name, void *buffer,
+ size_t size)
+{
+ int ret;
+ bool is_cap = false;
+
+ name = xattr_full_name(handler, name);
+
+ /*
+ * security.capability doesn't cache the results, so calls into us
+ * constantly to see if there's a capability xattr. Cache the result
+ * here in order to avoid wasting time doing lookups for xattrs we know
+ * don't exist.
+ */
+ if (strcmp(name, XATTR_NAME_CAPS) == 0) {
+ is_cap = true;
+ if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags))
+ return -ENODATA;
+ }
+
+ ret = btrfs_getxattr(inode, name, buffer, size);
+ if (ret == -ENODATA && is_cap)
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+ return ret;
+}
+
+static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name,
+ const void *buffer,
+ size_t size, int flags)
+{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
+ name = xattr_full_name(handler, name);
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
+ clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ return btrfs_setxattr_trans(inode, name, buffer, size, flags);
+}
+
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
@@ -420,8 +467,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
static const struct xattr_handler btrfs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .get = btrfs_xattr_handler_get,
- .set = btrfs_xattr_handler_set,
+ .get = btrfs_xattr_handler_get_security,
+ .set = btrfs_xattr_handler_set_security,
};
static const struct xattr_handler btrfs_trusted_xattr_handler = {
@@ -457,7 +504,7 @@ static int btrfs_initxattrs(struct inode *inode,
const struct xattr *xattr;
unsigned int nofs_flag;
char *name;
- int err = 0;
+ int ret = 0;
/*
* We're holding a transaction handle, so use a NOFS memory allocation
@@ -468,19 +515,23 @@ static int btrfs_initxattrs(struct inode *inode,
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
strlen(xattr->name) + 1, GFP_KERNEL);
if (!name) {
- err = -ENOMEM;
+ ret = -ENOMEM;
break;
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
- err = btrfs_setxattr(trans, inode, name, xattr->value,
+
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
+ clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ ret = btrfs_setxattr(trans, inode, name, xattr->value,
xattr->value_len, 0);
kfree(name);
- if (err < 0)
+ if (ret < 0)
break;
}
memalloc_nofs_restore(nofs_flag);
- return err;
+ return ret;
}
int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 118118ca3e1d..b9376ea258ff 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -6,7 +6,11 @@
#ifndef BTRFS_XATTR_H
#define BTRFS_XATTR_H
-#include <linux/xattr.h>
+struct dentry;
+struct inode;
+struct qstr;
+struct xattr_handler;
+struct btrfs_trans_handle;
extern const struct xattr_handler * const btrfs_xattr_handlers[];
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 6c231a116a29..d9e5c88a0f85 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -91,24 +91,24 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out)
+int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
char *data_in = NULL;
- char *cpage_out;
- int nr_pages = 0;
- struct page *in_page = NULL;
- struct page *out_page = NULL;
+ char *cfolio_out;
+ int nr_folios = 0;
+ struct folio *in_folio = NULL;
+ struct folio *out_folio = NULL;
unsigned long bytes_left;
- unsigned int in_buf_pages;
+ unsigned int in_buf_folios;
unsigned long len = *total_out;
- unsigned long nr_dest_pages = *out_pages;
- const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+ unsigned long nr_dest_folios = *out_folios;
+ const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
- *out_pages = 0;
+ *out_folios = 0;
*total_out = 0;
*total_in = 0;
@@ -121,18 +121,18 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
- pages[0] = out_page;
- nr_pages = 1;
+ cfolio_out = folio_address(out_folio);
+ folios[0] = out_folio;
+ nr_folios = 1;
workspace->strm.next_in = workspace->buf;
workspace->strm.avail_in = 0;
- workspace->strm.next_out = cpage_out;
+ workspace->strm.next_out = cfolio_out;
workspace->strm.avail_out = PAGE_SIZE;
while (workspace->strm.total_in < len) {
@@ -142,19 +142,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*/
if (workspace->strm.avail_in == 0) {
bytes_left = len - workspace->strm.total_in;
- in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
- workspace->buf_size / PAGE_SIZE);
- if (in_buf_pages > 1) {
+ in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
+ workspace->buf_size / PAGE_SIZE);
+ if (in_buf_folios > 1) {
int i;
- for (i = 0; i < in_buf_pages; i++) {
+ for (i = 0; i < in_buf_folios; i++) {
if (data_in) {
kunmap_local(data_in);
- put_page(in_page);
+ folio_put(in_folio);
+ data_in = NULL;
}
- in_page = find_get_page(mapping,
- start >> PAGE_SHIFT);
- data_in = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping,
+ start, &in_folio);
+ if (ret < 0)
+ goto out;
+ data_in = kmap_local_folio(in_folio, 0);
copy_page(workspace->buf + i * PAGE_SIZE,
data_in);
start += PAGE_SIZE;
@@ -163,11 +166,14 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
} else {
if (data_in) {
kunmap_local(data_in);
- put_page(in_page);
+ folio_put(in_folio);
+ data_in = NULL;
}
- in_page = find_get_page(mapping,
- start >> PAGE_SHIFT);
- data_in = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping,
+ start, &in_folio);
+ if (ret < 0)
+ goto out;
+ data_in = kmap_local_folio(in_folio, 0);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -196,20 +202,20 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
- if (nr_pages == nr_dest_pages) {
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
- pages[nr_pages] = out_page;
- nr_pages++;
+ cfolio_out = folio_address(out_folio);
+ folios[nr_folios] = out_folio;
+ nr_folios++;
workspace->strm.avail_out = PAGE_SIZE;
- workspace->strm.next_out = cpage_out;
+ workspace->strm.next_out = cfolio_out;
}
/* we're all done */
if (workspace->strm.total_in >= len)
@@ -231,21 +237,21 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -EIO;
goto out;
} else if (workspace->strm.avail_out == 0) {
- /* get another page for the stream end */
- if (nr_pages == nr_dest_pages) {
+ /* Get another folio for the stream end. */
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
- pages[nr_pages] = out_page;
- nr_pages++;
+ cfolio_out = folio_address(out_folio);
+ folios[nr_folios] = out_folio;
+ nr_folios++;
workspace->strm.avail_out = PAGE_SIZE;
- workspace->strm.next_out = cpage_out;
+ workspace->strm.next_out = cfolio_out;
}
}
zlib_deflateEnd(&workspace->strm);
@@ -259,10 +265,10 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_out = workspace->strm.total_out;
*total_in = workspace->strm.total_in;
out:
- *out_pages = nr_pages;
+ *out_folios = nr_folios;
if (data_in) {
kunmap_local(data_in);
- put_page(in_page);
+ folio_put(in_folio);
}
return ret;
@@ -275,13 +281,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
int wbits = MAX_WBITS;
char *data_in;
size_t total_out = 0;
- unsigned long page_in_index = 0;
+ unsigned long folio_in_index = 0;
size_t srclen = cb->compressed_len;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
- struct page **pages_in = cb->compressed_pages;
+ struct folio **folios_in = cb->compressed_folios;
- data_in = kmap_local_page(pages_in[page_in_index]);
+ data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -331,12 +337,12 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
kunmap_local(data_in);
- page_in_index++;
- if (page_in_index >= total_pages_in) {
+ folio_in_index++;
+ if (folio_in_index >= total_folios_in) {
data_in = NULL;
break;
}
- data_in = kmap_local_page(pages_in[page_in_index]);
+ data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
@@ -354,18 +360,13 @@ done:
}
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
int wbits = MAX_WBITS;
- unsigned long bytes_left;
- unsigned long total_out = 0;
- unsigned long pg_offset = 0;
-
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes_left = destlen;
+ unsigned long to_copy;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = srclen;
@@ -390,60 +391,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
return -EIO;
}
- while (bytes_left > 0) {
- unsigned long buf_start;
- unsigned long buf_offset;
- unsigned long bytes;
-
- ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
- if (ret != Z_OK && ret != Z_STREAM_END)
- break;
-
- buf_start = total_out;
- total_out = workspace->strm.total_out;
-
- if (total_out == buf_start) {
- ret = -EIO;
- break;
- }
-
- if (total_out <= start_byte)
- goto next;
-
- if (total_out > start_byte && buf_start < start_byte)
- buf_offset = start_byte - buf_start;
- else
- buf_offset = 0;
-
- bytes = min(PAGE_SIZE - pg_offset,
- PAGE_SIZE - (buf_offset % PAGE_SIZE));
- bytes = min(bytes, bytes_left);
+ /*
+ * Everything (in/out buf) should be at most one sector, there should
+ * be no need to switch any input/output buffer.
+ */
+ ret = zlib_inflate(&workspace->strm, Z_FINISH);
+ to_copy = min(workspace->strm.total_out, destlen);
+ if (ret != Z_STREAM_END)
+ goto out;
- memcpy_to_page(dest_page, pg_offset,
- workspace->buf + buf_offset, bytes);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy);
- pg_offset += bytes;
- bytes_left -= bytes;
-next:
- workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = workspace->buf_size;
- }
-
- if (ret != Z_STREAM_END && bytes_left != 0)
+out:
+ if (unlikely(to_copy != destlen)) {
+ pr_warn_ratelimited("BTRFS: inflate failed, decompressed=%lu expected=%zu\n",
+ to_copy, destlen);
ret = -EIO;
- else
+ } else {
ret = 0;
+ }
zlib_inflateEnd(&workspace->strm);
- /*
- * this should only happen if zlib returned fewer bytes than we
- * expected. btrfs_get_block is responsible for zeroing from the
- * end of the inline extent (destlen) to the end of the page
- */
- if (pg_offset < destlen) {
- memzero_page(dest_page, pg_offset, destlen - pg_offset);
- }
+ if (unlikely(to_copy < destlen))
+ memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 3504ade30cb0..4cba80b34387 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -12,10 +12,8 @@
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
-#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"
-#include "super.h"
#include "fs.h"
#include "accessors.h"
#include "bio.h"
@@ -578,26 +576,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
kvfree(zones);
- switch (bdev_zoned_model(bdev)) {
- case BLK_ZONED_HM:
+ if (bdev_is_zoned(bdev)) {
model = "host-managed zoned";
emulated = "";
- break;
- case BLK_ZONED_HA:
- model = "host-aware zoned";
- emulated = "";
- break;
- case BLK_ZONED_NONE:
+ } else {
model = "regular";
emulated = "emulated ";
- break;
- default:
- /* Just in case */
- btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
- bdev_zoned_model(bdev),
- rcu_str_deref(device->name));
- ret = -EOPNOTSUPP;
- goto out_free_zone_info;
}
btrfs_info_in_rcu(fs_info,
@@ -609,9 +593,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
out:
kvfree(zones);
-out_free_zone_info:
btrfs_destroy_dev_zone_info(device);
-
return ret;
}
@@ -688,8 +670,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
- if (device->bdev &&
- bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+ if (device->bdev && bdev_is_zoned(device->bdev)) {
btrfs_err(fs_info,
"zoned: mode not enabled but zoned device found: %pg",
device->bdev);
@@ -781,7 +762,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
* Check mount options here, because we might change fs_info->zoned
* from fs_info->zone_size.
*/
- ret = btrfs_check_mountopts_zoned(fs_info);
+ ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
if (ret)
return ret;
@@ -789,7 +770,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt)
{
if (!btrfs_is_zoned(info))
return 0;
@@ -798,18 +779,21 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
* Space cache writing is not COWed. Disable that to avoid write errors
* in sequential zones.
*/
- if (btrfs_test_opt(info, SPACE_CACHE)) {
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
btrfs_err(info, "zoned: space cache v1 is not supported");
return -EINVAL;
}
- if (btrfs_test_opt(info, NODATACOW)) {
+ if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
btrfs_err(info, "zoned: NODATACOW not supported");
return -EINVAL;
}
- btrfs_clear_and_info(info, DISCARD_ASYNC,
- "zoned: async discard ignored and disabled for zoned mode");
+ if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
+ btrfs_info(info,
+ "zoned: async discard ignored and disabled for zoned mode");
+ btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
+ }
return 0;
}
@@ -838,11 +822,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ unsigned int nofs_flags;
+
ASSERT(sb_zone_is_full(reset));
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- reset->start, reset->len,
- GFP_NOFS);
+ reset->start, reset->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -988,11 +975,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
* explicit ZONE_FINISH is not necessary.
*/
if (zone->wp != zone->start + zone->capacity) {
+ unsigned int nofs_flags;
int ret;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev,
REQ_OP_ZONE_FINISH, zone->start,
- zone->len, GFP_NOFS);
+ zone->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
}
@@ -1010,11 +1000,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
+ unsigned int nofs_flags;
sector_t zone_sectors;
sector_t nr_sectors;
u8 zone_sectors_shift;
u32 sb_zone;
u32 nr_zones;
+ int ret;
zone_sectors = bdev_zone_sectors(bdev);
zone_sectors_shift = ilog2(zone_sectors);
@@ -1025,9 +1017,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- zone_start_sector(sb_zone, bdev),
- zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ zone_start_sector(sb_zone, bdev),
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}
/*
@@ -1138,12 +1133,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
+ unsigned int nofs_flags;
int ret;
*bytes = 0;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
- physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
- GFP_NOFS);
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -1290,7 +1287,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct map_lookup *map)
+ struct btrfs_chunk_map *map)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device = map->stripes[zone_idx].dev;
@@ -1393,7 +1390,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1435,7 +1432,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1483,7 +1480,7 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1515,7 +1512,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1552,9 +1549,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 logical = cache->start;
u64 length = cache->length;
struct zone_info *zone_info = NULL;
@@ -1575,21 +1570,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
return -EIO;
}
- /* Get the chunk mapping */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
-
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, logical, length);
+ if (!map)
return -EINVAL;
- map = em->map_lookup;
-
- cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
- if (!cache->physical_map) {
- ret = -ENOMEM;
- goto out;
- }
+ cache->physical_map = map;
zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
if (!zone_info) {
@@ -1661,11 +1646,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
- if (cache->alloc_offset > fs_info->zone_size) {
- btrfs_err(fs_info,
- "zoned: invalid write pointer %llu in block group %llu",
- cache->alloc_offset, cache->start);
- ret = -EIO;
+ /* Reject non SINGLE data profiles without RST */
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
+ (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
}
if (cache->alloc_offset > cache->zone_capacity) {
@@ -1694,12 +1681,11 @@ out:
spin_unlock(&fs_info->zone_active_bgs_lock);
}
} else {
- kfree(cache->physical_map);
+ btrfs_free_chunk_map(cache->physical_map);
cache->physical_map = NULL;
}
bitmap_free(active);
kfree(zone_info);
- free_extent_map(em);
return ret;
}
@@ -1722,22 +1708,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
cache->zone_unusable = unusable;
}
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb)
-{
- if (!btrfs_is_zoned(eb->fs_info) ||
- btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
- return;
-
- ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-
- memzero_extent_buffer(eb, 0, eb->len);
- set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
- set_extent_buffer_dirty(eb);
- set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
- EXTENT_DIRTY, NULL);
-}
-
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
@@ -2089,7 +2059,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *device;
u64 physical;
const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
@@ -2101,6 +2071,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
map = block_group->physical_map;
+ spin_lock(&fs_info->zone_active_bgs_lock);
spin_lock(&block_group->lock);
if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
@@ -2113,7 +2084,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- spin_lock(&fs_info->zone_active_bgs_lock);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_zoned_device_info *zinfo;
int reserved = 0;
@@ -2133,20 +2103,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
*/
if (atomic_read(&zinfo->active_zones_left) <= reserved) {
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!is_data)
zinfo->reserved_active_zones--;
}
- spin_unlock(&fs_info->zone_active_bgs_lock);
/* Successfully activated all the zones */
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
@@ -2154,8 +2121,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
/* For the active block group list */
btrfs_get_block_group(block_group);
-
- spin_lock(&fs_info->zone_active_bgs_lock);
list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
spin_unlock(&fs_info->zone_active_bgs_lock);
@@ -2163,6 +2128,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
out_unlock:
spin_unlock(&block_group->lock);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
return ret;
}
@@ -2201,9 +2167,10 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
const bool is_metadata = (block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int ret = 0;
int i;
@@ -2279,27 +2246,33 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
+ down_read(&dev_replace->rwsem);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ unsigned int nofs_flags;
if (zinfo->max_active_zones == 0)
continue;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
- if (ret)
+ if (ret) {
+ up_read(&dev_replace->rwsem);
return ret;
+ }
if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
zinfo->reserved_active_zones++;
btrfs_dev_clear_active_zone(device, physical);
}
+ up_read(&dev_replace->rwsem);
if (!fully_written)
btrfs_dec_block_group_ro(block_group);
@@ -2650,7 +2623,7 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
/* Release reservation for currently active block groups. */
spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
- struct map_lookup *map = block_group->physical_map;
+ struct btrfs_chunk_map *map = block_group->physical_map;
if (!(block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b9cec523b778..77c4321e331f 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -4,12 +4,27 @@
#define BTRFS_ZONED_H
#include <linux/types.h>
+#include <linux/atomic.h>
#include <linux/blkdev.h>
+#include <linux/blkzoned.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
#include "messages.h"
#include "volumes.h"
#include "disk-io.h"
#include "block-group.h"
#include "btrfs_inode.h"
+#include "fs.h"
+
+struct block_device;
+struct extent_buffer;
+struct btrfs_bio;
+struct btrfs_ordered_extent;
+struct btrfs_fs_info;
+struct btrfs_space_info;
+struct btrfs_eb_write_context;
+struct btrfs_fs_devices;
#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
@@ -45,7 +60,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
@@ -59,8 +74,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb);
bool btrfs_use_zone_append(struct btrfs_bio *bbio);
void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
@@ -123,7 +136,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
return -EOPNOTSUPP;
}
-static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info,
+ unsigned long *mount_opt)
{
return 0;
}
@@ -180,9 +194,6 @@ static inline int btrfs_load_block_group_zone_info(
static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }
-static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb) { }
-
static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
return false;
@@ -323,8 +334,8 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i
(bdev_zone_sectors(bdev) << SECTOR_SHIFT);
}
- /* Do not allow Host Manged zoned device */
- return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+ /* Do not allow Host Managed zoned device. */
+ return !bdev_is_zoned(bdev);
}
static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 5511766485cd..2b232b82c3a8 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -18,8 +18,9 @@
#include <linux/slab.h>
#include <linux/zstd.h>
#include "misc.h"
+#include "fs.h"
#include "compression.h"
-#include "ctree.h"
+#include "super.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
@@ -373,25 +374,25 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out)
+int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
zstd_cstream *stream;
int ret = 0;
- int nr_pages = 0;
- struct page *in_page = NULL; /* The current page to read */
- struct page *out_page = NULL; /* The current page to write to */
+ int nr_folios = 0;
+ struct folio *in_folio = NULL; /* The current folio to read. */
+ struct folio *out_folio = NULL; /* The current folio to write to. */
unsigned long tot_in = 0;
unsigned long tot_out = 0;
unsigned long len = *total_out;
- const unsigned long nr_dest_pages = *out_pages;
- unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+ const unsigned long nr_dest_folios = *out_folios;
+ unsigned long max_out = nr_dest_folios * PAGE_SIZE;
zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
len);
- *out_pages = 0;
+ *out_folios = 0;
*total_out = 0;
*total_in = 0;
@@ -405,20 +406,21 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
}
/* map in the first page of input data */
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
+ if (ret < 0)
+ goto out;
+ workspace->in_buf.src = kmap_local_folio(in_folio, 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
-
/* Allocate and map in the output buffer */
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ folios[nr_folios++] = out_folio;
+ workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -453,17 +455,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- if (nr_pages == nr_dest_pages) {
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ folios[nr_folios++] = out_folio;
+ workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -479,11 +481,14 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
kunmap_local(workspace->in_buf.src);
- put_page(in_page);
+ workspace->in_buf.src = NULL;
+ folio_put(in_folio);
start += PAGE_SIZE;
len -= PAGE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
+ if (ret < 0)
+ goto out;
+ workspace->in_buf.src = kmap_local_folio(in_folio, 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -510,17 +515,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- if (nr_pages == nr_dest_pages) {
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ folios[nr_folios++] = out_folio;
+ workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -534,10 +539,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = tot_in;
*total_out = tot_out;
out:
- *out_pages = nr_pages;
+ *out_folios = nr_folios;
if (workspace->in_buf.src) {
kunmap_local(workspace->in_buf.src);
- put_page(in_page);
+ folio_put(in_folio);
}
return ret;
}
@@ -545,12 +550,12 @@ out:
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- struct page **pages_in = cb->compressed_pages;
+ struct folio **folios_in = cb->compressed_folios;
size_t srclen = cb->compressed_len;
zstd_dstream *stream;
int ret = 0;
- unsigned long page_in_index = 0;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long folio_in_index = 0;
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long total_out = 0;
@@ -562,7 +567,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -599,14 +604,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->in_buf.pos == workspace->in_buf.size) {
kunmap_local(workspace->in_buf.src);
- page_in_index++;
- if (page_in_index >= total_pages_in) {
+ folio_in_index++;
+ if (folio_in_index >= total_folios_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
+ workspace->in_buf.src =
+ kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
@@ -619,80 +625,48 @@ done:
}
int zstd_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
zstd_dstream *stream;
int ret = 0;
- size_t ret2;
- unsigned long total_out = 0;
- unsigned long pg_offset = 0;
+ unsigned long to_copy = 0;
stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
if (!stream) {
pr_warn("BTRFS: zstd_init_dstream failed\n");
- ret = -EIO;
goto finish;
}
- destlen = min_t(size_t, destlen, PAGE_SIZE);
-
workspace->in_buf.src = data_in;
workspace->in_buf.pos = 0;
workspace->in_buf.size = srclen;
workspace->out_buf.dst = workspace->buf;
workspace->out_buf.pos = 0;
- workspace->out_buf.size = PAGE_SIZE;
-
- ret2 = 1;
- while (pg_offset < destlen
- && workspace->in_buf.pos < workspace->in_buf.size) {
- unsigned long buf_start;
- unsigned long buf_offset;
- unsigned long bytes;
-
- /* Check if the frame is over and we still need more input */
- if (ret2 == 0) {
- pr_debug("BTRFS: zstd_decompress_stream ended early\n");
- ret = -EIO;
- goto finish;
- }
- ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
- &workspace->in_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
- zstd_get_error_code(ret2));
- ret = -EIO;
- goto finish;
- }
-
- buf_start = total_out;
- total_out += workspace->out_buf.pos;
- workspace->out_buf.pos = 0;
-
- if (total_out <= start_byte)
- continue;
-
- if (total_out > start_byte && buf_start < start_byte)
- buf_offset = start_byte - buf_start;
- else
- buf_offset = 0;
-
- bytes = min_t(unsigned long, destlen - pg_offset,
- workspace->out_buf.size - buf_offset);
-
- memcpy_to_page(dest_page, pg_offset,
- workspace->out_buf.dst + buf_offset, bytes);
-
- pg_offset += bytes;
+ workspace->out_buf.size = sectorsize;
+
+ /*
+ * Since both input and output buffers should not exceed one sector,
+ * one call should end the decompression.
+ */
+ ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf);
+ if (zstd_is_error(ret)) {
+ pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n",
+ zstd_get_error_code(ret));
+ goto finish;
}
- ret = 0;
+ to_copy = workspace->out_buf.pos;
+ memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy);
finish:
- if (pg_offset < destlen) {
- memzero_page(dest_page, pg_offset, destlen - pg_offset);
+ /* Error or early end. */
+ if (unlikely(to_copy < destlen)) {
+ ret = -EIO;
+ memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
}
return ret;
}