aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2019-10-27 11:00:19 -0700
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2019-10-27 11:00:19 -0700
commit728d90bdc9e480dc93913e59a0aa3c896c7aa697 (patch)
tree258b1b6ee711f0ef67fd225700d84eccec285194 /fs/ext4
parentcb3efd5a38855eabd26c2b631dd027169678d60f (diff)
parentd6d5df1db6e9d7f8f76d2911707f7d5877251b02 (diff)
Merge tag 'v5.4-rc5' into next
Sync up with mainline.
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/Makefile1
-rw-r--r--fs/ext4/block_validity.c189
-rw-r--r--fs/ext4/dir.c7
-rw-r--r--fs/ext4/ext4.h95
-rw-r--r--fs/ext4/extents.c98
-rw-r--r--fs/ext4/extents_status.c521
-rw-r--r--fs/ext4/extents_status.h8
-rw-r--r--fs/ext4/file.c6
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/inline.c2
-rw-r--r--fs/ext4/inode.c158
-rw-r--r--fs/ext4/ioctl.c143
-rw-r--r--fs/ext4/namei.c4
-rw-r--r--fs/ext4/readpage.c211
-rw-r--r--fs/ext4/super.c45
-rw-r--r--fs/ext4/sysfs.c6
-rw-r--r--fs/ext4/verity.c367
17 files changed, 1532 insertions, 331 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8fdfcd3c3e04..b17ddc229ac5 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -13,3 +13,4 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
+ext4-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 8e83741b02e0..d4d4fdfac1a6 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -38,6 +38,7 @@ int __init ext4_init_system_zone(void)
void ext4_exit_system_zone(void)
{
+ rcu_barrier();
kmem_cache_destroy(ext4_system_zone_cachep);
}
@@ -49,17 +50,26 @@ static inline int can_merge(struct ext4_system_zone *entry1,
return 0;
}
+static void release_system_zone(struct ext4_system_blocks *system_blks)
+{
+ struct ext4_system_zone *entry, *n;
+
+ rbtree_postorder_for_each_entry_safe(entry, n,
+ &system_blks->root, node)
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+}
+
/*
* Mark a range of blocks as belonging to the "system zone" --- that
* is, filesystem metadata blocks which should never be used by
* inodes.
*/
-static int add_system_zone(struct ext4_sb_info *sbi,
+static int add_system_zone(struct ext4_system_blocks *system_blks,
ext4_fsblk_t start_blk,
unsigned int count)
{
struct ext4_system_zone *new_entry = NULL, *entry;
- struct rb_node **n = &sbi->system_blks.rb_node, *node;
+ struct rb_node **n = &system_blks->root.rb_node, *node;
struct rb_node *parent = NULL, *new_node = NULL;
while (*n) {
@@ -91,7 +101,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
new_node = &new_entry->node;
rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &sbi->system_blks);
+ rb_insert_color(new_node, &system_blks->root);
}
/* Can we merge to the left? */
@@ -101,7 +111,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
if (can_merge(entry, new_entry)) {
new_entry->start_blk = entry->start_blk;
new_entry->count += entry->count;
- rb_erase(node, &sbi->system_blks);
+ rb_erase(node, &system_blks->root);
kmem_cache_free(ext4_system_zone_cachep, entry);
}
}
@@ -112,7 +122,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
entry = rb_entry(node, struct ext4_system_zone, node);
if (can_merge(new_entry, entry)) {
new_entry->count += entry->count;
- rb_erase(node, &sbi->system_blks);
+ rb_erase(node, &system_blks->root);
kmem_cache_free(ext4_system_zone_cachep, entry);
}
}
@@ -126,7 +136,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
int first = 1;
printk(KERN_INFO "System zones: ");
- node = rb_first(&sbi->system_blks);
+ node = rb_first(&sbi->system_blks->root);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ",
@@ -137,7 +147,47 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
printk(KERN_CONT "\n");
}
-static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ */
+static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
+ struct ext4_system_blocks *system_blks,
+ ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ struct ext4_system_zone *entry;
+ struct rb_node *n;
+
+ if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
+ (start_blk + count > ext4_blocks_count(sbi->s_es))) {
+ sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ return 0;
+ }
+
+ if (system_blks == NULL)
+ return 1;
+
+ n = system_blks->root.rb_node;
+ while (n) {
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ if (start_blk + count - 1 < entry->start_blk)
+ n = n->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = n->rb_right;
+ else {
+ sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int ext4_protect_reserved_inode(struct super_block *sb,
+ struct ext4_system_blocks *system_blks,
+ u32 ino)
{
struct inode *inode;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -163,14 +213,15 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
if (n == 0) {
i++;
} else {
- if (!ext4_data_block_valid(sbi, map.m_pblk, n)) {
+ if (!ext4_data_block_valid_rcu(sbi, system_blks,
+ map.m_pblk, n)) {
ext4_error(sb, "blocks %llu-%llu from inode %u "
"overlap system zone", map.m_pblk,
map.m_pblk + map.m_len - 1, ino);
err = -EFSCORRUPTED;
break;
}
- err = add_system_zone(sbi, map.m_pblk, n);
+ err = add_system_zone(system_blks, map.m_pblk, n);
if (err < 0)
break;
i += n;
@@ -180,94 +231,130 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
return err;
}
+static void ext4_destroy_system_zone(struct rcu_head *rcu)
+{
+ struct ext4_system_blocks *system_blks;
+
+ system_blks = container_of(rcu, struct ext4_system_blocks, rcu);
+ release_system_zone(system_blks);
+ kfree(system_blks);
+}
+
+/*
+ * Build system zone rbtree which is used for block validity checking.
+ *
+ * The update of system_blks pointer in this function is protected by
+ * sb->s_umount semaphore. However we have to be careful as we can be
+ * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * protected only by RCU. That's why we first build the rbtree and then
+ * swap it in place.
+ */
int ext4_setup_system_zone(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_system_blocks *system_blks;
struct ext4_group_desc *gdp;
ext4_group_t i;
int flex_size = ext4_flex_bg_size(sbi);
int ret;
if (!test_opt(sb, BLOCK_VALIDITY)) {
- if (sbi->system_blks.rb_node)
+ if (sbi->system_blks)
ext4_release_system_zone(sb);
return 0;
}
- if (sbi->system_blks.rb_node)
+ if (sbi->system_blks)
return 0;
+ system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
+ if (!system_blks)
+ return -ENOMEM;
+
for (i=0; i < ngroups; i++) {
cond_resched();
if (ext4_bg_has_super(sb, i) &&
((i < 5) || ((i % flex_size) == 0)))
- add_system_zone(sbi, ext4_group_first_block_no(sb, i),
+ add_system_zone(system_blks,
+ ext4_group_first_block_no(sb, i),
ext4_bg_num_gdb(sb, i) + 1);
gdp = ext4_get_group_desc(sb, i, NULL);
- ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+ ret = add_system_zone(system_blks,
+ ext4_block_bitmap(sb, gdp), 1);
if (ret)
- return ret;
- ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+ goto err;
+ ret = add_system_zone(system_blks,
+ ext4_inode_bitmap(sb, gdp), 1);
if (ret)
- return ret;
- ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+ goto err;
+ ret = add_system_zone(system_blks,
+ ext4_inode_table(sb, gdp),
sbi->s_itb_per_group);
if (ret)
- return ret;
+ goto err;
}
if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
- ret = ext4_protect_reserved_inode(sb,
+ ret = ext4_protect_reserved_inode(sb, system_blks,
le32_to_cpu(sbi->s_es->s_journal_inum));
if (ret)
- return ret;
+ goto err;
}
+ /*
+ * System blks rbtree complete, announce it once to prevent racing
+ * with ext4_data_block_valid() accessing the rbtree at the same
+ * time.
+ */
+ rcu_assign_pointer(sbi->system_blks, system_blks);
+
if (test_opt(sb, DEBUG))
debug_print_tree(sbi);
return 0;
+err:
+ release_system_zone(system_blks);
+ kfree(system_blks);
+ return ret;
}
-/* Called when the filesystem is unmounted */
+/*
+ * Called when the filesystem is unmounted or when remounting it with
+ * noblock_validity specified.
+ *
+ * The update of system_blks pointer in this function is protected by
+ * sb->s_umount semaphore. However we have to be careful as we can be
+ * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * protected only by RCU. So we first clear the system_blks pointer and
+ * then free the rbtree only after RCU grace period expires.
+ */
void ext4_release_system_zone(struct super_block *sb)
{
- struct ext4_system_zone *entry, *n;
+ struct ext4_system_blocks *system_blks;
- rbtree_postorder_for_each_entry_safe(entry, n,
- &EXT4_SB(sb)->system_blks, node)
- kmem_cache_free(ext4_system_zone_cachep, entry);
+ system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks,
+ lockdep_is_held(&sb->s_umount));
+ rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL);
- EXT4_SB(sb)->system_blks = RB_ROOT;
+ if (system_blks)
+ call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with filesystem metadata blocks.
- */
int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
unsigned int count)
{
- struct ext4_system_zone *entry;
- struct rb_node *n = sbi->system_blks.rb_node;
+ struct ext4_system_blocks *system_blks;
+ int ret;
- if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
- (start_blk + count < start_blk) ||
- (start_blk + count > ext4_blocks_count(sbi->s_es))) {
- sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
- return 0;
- }
- while (n) {
- entry = rb_entry(n, struct ext4_system_zone, node);
- if (start_blk + count - 1 < entry->start_blk)
- n = n->rb_left;
- else if (start_blk >= (entry->start_blk + entry->count))
- n = n->rb_right;
- else {
- sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
- return 0;
- }
- }
- return 1;
+ /*
+ * Lock the system zone to prevent it being released concurrently
+ * when doing a remount which inverse current "[no]block_validity"
+ * mount option.
+ */
+ rcu_read_lock();
+ system_blks = rcu_dereference(sbi->system_blks);
+ ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
+ count);
+ rcu_read_unlock();
+ return ret;
}
int ext4_check_blockref(const char *function, unsigned int line,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86054f31fe4d..9fdd2b269d61 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -668,14 +668,15 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
struct qstr qstr = {.name = str, .len = len };
+ struct inode *inode = dentry->d_parent->d_inode;
- if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) {
+ if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) {
if (len != name->len)
return -1;
return memcmp(str, name->name, len);
}
- return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr, false);
+ return ext4_ci_compare(inode, name, &qstr, false);
}
static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
@@ -685,7 +686,7 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
unsigned char *norm;
int len, ret = 0;
- if (!IS_CASEFOLDED(dentry->d_inode))
+ if (!IS_CASEFOLDED(dentry->d_inode) || !um)
return 0;
norm = kmalloc(PATH_MAX, GFP_ATOMIC);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf660aa7a9e0..03db3e71676c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -41,6 +41,7 @@
#endif
#include <linux/fscrypt.h>
+#include <linux/fsverity.h>
#include <linux/compiler.h>
@@ -185,6 +186,14 @@ struct ext4_map_blocks {
};
/*
+ * Block validity checking, system zone rbtree.
+ */
+struct ext4_system_blocks {
+ struct rb_root root;
+ struct rcu_head rcu;
+};
+
+/*
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
@@ -284,6 +293,9 @@ struct ext4_io_submit {
~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \
~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Fill in the low bits to get the last block of the cluster */
+#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \
+ ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))
/* Get the cluster offset */
#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \
((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
@@ -395,6 +407,7 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
@@ -402,7 +415,7 @@ struct flex_groups {
#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x704BDFFF /* User visible flags */
+#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
@@ -467,6 +480,7 @@ enum {
EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
+ EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
@@ -512,6 +526,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(TOPDIR);
CHECK_FLAG_VALUE(HUGE_FILE);
CHECK_FLAG_VALUE(EXTENTS);
+ CHECK_FLAG_VALUE(VERITY);
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
@@ -649,6 +664,10 @@ enum {
#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
+/* ioctl codes 19--39 are reserved for fscrypt */
+#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
+#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
+#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
@@ -662,6 +681,16 @@ enum {
#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+/*
+ * Flags returned by EXT4_IOC_GETSTATE
+ *
+ * We only expose to userspace a subset of the state flags in
+ * i_state_flags
+ */
+#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001
+#define EXT4_STATE_FLAG_NEW 0x00000002
+#define EXT4_STATE_FLAG_NEWENTRY 0x00000004
+#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -679,6 +708,12 @@ enum {
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
#endif
+/*
+ * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
+ * It indicates that the entry in extent status cache is for a hole.
+ */
+#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000
+
/* Max physical block we can address w/o extents */
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
@@ -808,31 +843,20 @@ static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
static inline void ext4_decode_extra_time(struct timespec64 *time,
__le32 extra)
{
- if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) {
-
-#if 1
- /* Handle legacy encoding of pre-1970 dates with epoch
- * bits 1,1. (This backwards compatibility may be removed
- * at the discretion of the ext4 developers.)
- */
- u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
- if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
- extra_bits = 0;
- time->tv_sec += extra_bits << 32;
-#else
+ if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK)))
time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
-#endif
- }
time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
do { \
- (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\
+ (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
(raw_inode)->xtime ## _extra = \
ext4_encode_extra_time(&(inode)->xtime); \
} \
+ else \
+ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \
} while (0)
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
@@ -1421,7 +1445,7 @@ struct ext4_sb_info {
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
- struct rb_root system_blks;
+ struct ext4_system_blocks __rcu *system_blks;
#ifdef EXTENTS_STATS
/* ext4 extents stats */
@@ -1560,6 +1584,7 @@ enum {
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
+ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1610,6 +1635,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_SB(sb) (sb)
#endif
+static inline bool ext4_verity_in_progress(struct inode *inode)
+{
+ return IS_ENABLED(CONFIG_FS_VERITY) &&
+ ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+}
+
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
@@ -1632,6 +1663,10 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_GOOD_OLD_INODE_SIZE 128
+#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN)
+#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX
+#define EXT4_TIMESTAMP_MIN S32_MIN
+
/*
* Feature set definitions
*/
@@ -1662,6 +1697,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
+#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1756,6 +1792,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC)
EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
+EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY)
EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
@@ -1813,7 +1850,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA |\
- EXT4_FEATURE_RO_COMPAT_PROJECT)
+ EXT4_FEATURE_RO_COMPAT_PROJECT |\
+ EXT4_FEATURE_RO_COMPAT_VERITY)
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -3177,6 +3215,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
extern int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages, bool is_readahead);
+extern int __init ext4_init_post_read_processing(void);
+extern void ext4_exit_post_read_processing(void);
/* symlink.c */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
@@ -3245,6 +3285,9 @@ extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+extern int ext4_get_es_cache(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
@@ -3283,6 +3326,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+/* verity.c */
+extern const struct fsverity_operations ext4_verityops;
+
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
@@ -3334,6 +3380,19 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
extern const struct iomap_ops ext4_iomap_ops;
+static inline int ext4_buffer_uptodate(struct buffer_head *bh)
+{
+ /*
+ * If the buffer has the write error flag, we have failed
+ * to write out data in the block. In this case, we don't
+ * have to read the block because we may read the old data
+ * successfully.
+ */
+ if (!buffer_uptodate(bh) && buffer_write_io_error(bh))
+ set_buffer_uptodate(bh);
+ return buffer_uptodate(bh);
+}
+
#endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92266a2da7d6..fb0f99dc8c22 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2315,6 +2315,52 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
return err;
}
+static int ext4_fill_es_cache_info(struct inode *inode,
+ ext4_lblk_t block, ext4_lblk_t num,
+ struct fiemap_extent_info *fieinfo)
+{
+ ext4_lblk_t next, end = block + num - 1;
+ struct extent_status es;
+ unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
+ unsigned int flags;
+ int err;
+
+ while (block <= end) {
+ next = 0;
+ flags = 0;
+ if (!ext4_es_lookup_extent(inode, block, &next, &es))
+ break;
+ if (ext4_es_is_unwritten(&es))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+ if (ext4_es_is_delayed(&es))
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ext4_es_is_hole(&es))
+ flags |= EXT4_FIEMAP_EXTENT_HOLE;
+ if (next == 0)
+ flags |= FIEMAP_EXTENT_LAST;
+ if (flags & (FIEMAP_EXTENT_DELALLOC|
+ EXT4_FIEMAP_EXTENT_HOLE))
+ es.es_pblk = 0;
+ else
+ es.es_pblk = ext4_es_pblock(&es);
+ err = fiemap_fill_next_extent(fieinfo,
+ (__u64)es.es_lblk << blksize_bits,
+ (__u64)es.es_pblk << blksize_bits,
+ (__u64)es.es_len << blksize_bits,
+ flags);
+ if (next == 0)
+ break;
+ block = next;
+ if (err < 0)
+ return err;
+ if (err == 1)
+ return 0;
+ }
+ return 0;
+}
+
+
/*
* ext4_ext_determine_hole - determine hole around given block
* @inode: inode we lookup in
@@ -3813,8 +3859,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
* illegal.
*/
if (ee_block != map->m_lblk || ee_len > map->m_len) {
-#ifdef EXT4_DEBUG
- ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+#ifdef CONFIG_EXT4_DEBUG
+ ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
" len %u; IO logical block %llu, len %u",
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
@@ -5017,8 +5063,6 @@ static int ext4_find_delayed_extent(struct inode *inode,
return next_del;
}
-/* fiemap flags we can handle specified here */
-#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
static int ext4_xattr_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo)
@@ -5055,10 +5099,16 @@ static int ext4_xattr_fiemap(struct inode *inode,
return (error < 0 ? error : 0);
}
-int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+static int _ext4_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len,
+ int (*fill)(struct inode *, ext4_lblk_t,
+ ext4_lblk_t,
+ struct fiemap_extent_info *))
{
ext4_lblk_t start_blk;
+ u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
+
int error = 0;
if (ext4_has_inline_data(inode)) {
@@ -5075,14 +5125,18 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
error = ext4_ext_precache(inode);
if (error)
return error;
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
/* fallback to generic here if not in extents fmt */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
+ fill == ext4_fill_fiemap_extents)
return generic_block_fiemap(inode, fieinfo, start, len,
ext4_get_block);
- if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+ if (fill == ext4_fill_es_cache_info)
+ ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
+ if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
return -EBADR;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
@@ -5101,12 +5155,36 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Walk the extent tree gathering extent information
* and pushing extents back to the user.
*/
- error = ext4_fill_fiemap_extents(inode, start_blk,
- len_blks, fieinfo);
+ error = fill(inode, start_blk, len_blks, fieinfo);
}
return error;
}
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ return _ext4_fiemap(inode, fieinfo, start, len,
+ ext4_fill_fiemap_extents);
+}
+
+int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ if (ext4_has_inline_data(inode)) {
+ int has_inline;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ has_inline = ext4_has_inline_data(inode);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (has_inline)
+ return 0;
+ }
+
+ return _ext4_fiemap(inode, fieinfo, start, len,
+ ext4_fill_es_cache_info);
+}
+
+
/*
* ext4_access_path:
* Function to access the path buffer for marking it dirty.
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 7521de2dcf3a..d996b44d2265 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -146,7 +146,7 @@ static struct kmem_cache *ext4_pending_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end);
+ ext4_lblk_t end, int *reserved);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
@@ -836,7 +836,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end);
+ err = __es_remove_extent(inode, lblk, end, NULL);
if (err != 0)
goto error;
retry:
@@ -899,6 +899,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
* Return: 1 on found, 0 on not
*/
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t *next_lblk,
struct extent_status *es)
{
struct ext4_es_tree *tree;
@@ -947,9 +948,18 @@ out:
es->es_pblk = es1->es_pblk;
if (!ext4_es_is_referenced(es1))
ext4_es_set_referenced(es1);
- stats->es_stats_cache_hits++;
+ percpu_counter_inc(&stats->es_stats_cache_hits);
+ if (next_lblk) {
+ node = rb_next(&es1->rb_node);
+ if (node) {
+ es1 = rb_entry(node, struct extent_status,
+ rb_node);
+ *next_lblk = es1->es_lblk;
+ } else
+ *next_lblk = 0;
+ }
} else {
- stats->es_stats_cache_misses++;
+ percpu_counter_inc(&stats->es_stats_cache_misses);
}
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -958,8 +968,322 @@ out:
return found;
}
+struct rsvd_count {
+ int ndelonly;
+ bool first_do_lblk_found;
+ ext4_lblk_t first_do_lblk;
+ ext4_lblk_t last_do_lblk;
+ struct extent_status *left_es;
+ bool partial;
+ ext4_lblk_t lclu;
+};
+
+/*
+ * init_rsvd - initialize reserved count data before removing block range
+ * in file from extent status tree
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @es - pointer to first extent in range
+ * @rc - pointer to reserved count data
+ *
+ * Assumes es is not NULL
+ */
+static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es, struct rsvd_count *rc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct rb_node *node;
+
+ rc->ndelonly = 0;
+
+ /*
+ * for bigalloc, note the first delonly block in the range has not
+ * been found, record the extent containing the block to the left of
+ * the region to be removed, if any, and note that there's no partial
+ * cluster to track
+ */
+ if (sbi->s_cluster_ratio > 1) {
+ rc->first_do_lblk_found = false;
+ if (lblk > es->es_lblk) {
+ rc->left_es = es;
+ } else {
+ node = rb_prev(&es->rb_node);
+ rc->left_es = node ? rb_entry(node,
+ struct extent_status,
+ rb_node) : NULL;
+ }
+ rc->partial = false;
+ }
+}
+
+/*
+ * count_rsvd - count the clusters containing delayed and not unwritten
+ * (delonly) blocks in a range within an extent and add to
+ * the running tally in rsvd_count
+ *
+ * @inode - file containing extent
+ * @lblk - first block in range
+ * @len - length of range in blocks
+ * @es - pointer to extent containing clusters to be counted
+ * @rc - pointer to reserved count data
+ *
+ * Tracks partial clusters found at the beginning and end of extents so
+ * they aren't overcounted when they span adjacent extents
+ */
+static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
+ struct extent_status *es, struct rsvd_count *rc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t i, end, nclu;
+
+ if (!ext4_es_is_delonly(es))
+ return;
+
+ WARN_ON(len <= 0);
+
+ if (sbi->s_cluster_ratio == 1) {
+ rc->ndelonly += (int) len;
+ return;
+ }
+
+ /* bigalloc */
+
+ i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
+ end = lblk + (ext4_lblk_t) len - 1;
+ end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
+
+ /* record the first block of the first delonly extent seen */
+ if (rc->first_do_lblk_found == false) {
+ rc->first_do_lblk = i;
+ rc->first_do_lblk_found = true;
+ }
+
+ /* update the last lblk in the region seen so far */
+ rc->last_do_lblk = end;
+
+ /*
+ * if we're tracking a partial cluster and the current extent
+ * doesn't start with it, count it and stop tracking
+ */
+ if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
+ rc->ndelonly++;
+ rc->partial = false;
+ }
+
+ /*
+ * if the first cluster doesn't start on a cluster boundary but
+ * ends on one, count it
+ */
+ if (EXT4_LBLK_COFF(sbi, i) != 0) {
+ if (end >= EXT4_LBLK_CFILL(sbi, i)) {
+ rc->ndelonly++;
+ rc->partial = false;
+ i = EXT4_LBLK_CFILL(sbi, i) + 1;
+ }
+ }
+
+ /*
+ * if the current cluster starts on a cluster boundary, count the
+ * number of whole delonly clusters in the extent
+ */
+ if ((i + sbi->s_cluster_ratio - 1) <= end) {
+ nclu = (end - i + 1) >> sbi->s_cluster_bits;
+ rc->ndelonly += nclu;
+ i += nclu << sbi->s_cluster_bits;
+ }
+
+ /*
+ * start tracking a partial cluster if there's a partial at the end
+ * of the current extent and we're not already tracking one
+ */
+ if (!rc->partial && i <= end) {
+ rc->partial = true;
+ rc->lclu = EXT4_B2C(sbi, i);
+ }
+}
+
+/*
+ * __pr_tree_search - search for a pending cluster reservation
+ *
+ * @root - root of pending reservation tree
+ * @lclu - logical cluster to search for
+ *
+ * Returns the pending reservation for the cluster identified by @lclu
+ * if found. If not, returns a reservation for the next cluster if any,
+ * and if not, returns NULL.
+ */
+static struct pending_reservation *__pr_tree_search(struct rb_root *root,
+ ext4_lblk_t lclu)
+{
+ struct rb_node *node = root->rb_node;
+ struct pending_reservation *pr = NULL;
+
+ while (node) {
+ pr = rb_entry(node, struct pending_reservation, rb_node);
+ if (lclu < pr->lclu)
+ node = node->rb_left;
+ else if (lclu > pr->lclu)
+ node = node->rb_right;
+ else
+ return pr;
+ }
+ if (pr && lclu < pr->lclu)
+ return pr;
+ if (pr && lclu > pr->lclu) {
+ node = rb_next(&pr->rb_node);
+ return node ? rb_entry(node, struct pending_reservation,
+ rb_node) : NULL;
+ }
+ return NULL;
+}
+
+/*
+ * get_rsvd - calculates and returns the number of cluster reservations to be
+ * released when removing a block range from the extent status tree
+ * and releases any pending reservations within the range
+ *
+ * @inode - file containing block range
+ * @end - last block in range
+ * @right_es - pointer to extent containing next block beyond end or NULL
+ * @rc - pointer to reserved count data
+ *
+ * The number of reservations to be released is equal to the number of
+ * clusters containing delayed and not unwritten (delonly) blocks within
+ * the range, minus the number of clusters still containing delonly blocks
+ * at the ends of the range, and minus the number of pending reservations
+ * within the range.
+ */
+static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
+ struct extent_status *right_es,
+ struct rsvd_count *rc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct pending_reservation *pr;
+ struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
+ struct rb_node *node;
+ ext4_lblk_t first_lclu, last_lclu;
+ bool left_delonly, right_delonly, count_pending;
+ struct extent_status *es;
+
+ if (sbi->s_cluster_ratio > 1) {
+ /* count any remaining partial cluster */
+ if (rc->partial)
+ rc->ndelonly++;
+
+ if (rc->ndelonly == 0)
+ return 0;
+
+ first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
+ last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
+
+ /*
+ * decrease the delonly count by the number of clusters at the
+ * ends of the range that still contain delonly blocks -
+ * these clusters still need to be reserved
+ */
+ left_delonly = right_delonly = false;
+
+ es = rc->left_es;
+ while (es && ext4_es_end(es) >=
+ EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
+ if (ext4_es_is_delonly(es)) {
+ rc->ndelonly--;
+ left_delonly = true;
+ break;
+ }
+ node = rb_prev(&es->rb_node);
+ if (!node)
+ break;
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+ if (right_es && (!left_delonly || first_lclu != last_lclu)) {
+ if (end < ext4_es_end(right_es)) {
+ es = right_es;
+ } else {
+ node = rb_next(&right_es->rb_node);
+ es = node ? rb_entry(node, struct extent_status,
+ rb_node) : NULL;
+ }
+ while (es && es->es_lblk <=
+ EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
+ if (ext4_es_is_delonly(es)) {
+ rc->ndelonly--;
+ right_delonly = true;
+ break;
+ }
+ node = rb_next(&es->rb_node);
+ if (!node)
+ break;
+ es = rb_entry(node, struct extent_status,
+ rb_node);
+ }
+ }
+
+ /*
+ * Determine the block range that should be searched for
+ * pending reservations, if any. Clusters on the ends of the
+ * original removed range containing delonly blocks are
+ * excluded. They've already been accounted for and it's not
+ * possible to determine if an associated pending reservation
+ * should be released with the information available in the
+ * extents status tree.
+ */
+ if (first_lclu == last_lclu) {
+ if (left_delonly | right_delonly)
+ count_pending = false;
+ else
+ count_pending = true;
+ } else {
+ if (left_delonly)
+ first_lclu++;
+ if (right_delonly)
+ last_lclu--;
+ if (first_lclu <= last_lclu)
+ count_pending = true;
+ else
+ count_pending = false;
+ }
+
+ /*
+ * a pending reservation found between first_lclu and last_lclu
+ * represents an allocated cluster that contained at least one
+ * delonly block, so the delonly total must be reduced by one
+ * for each pending reservation found and released
+ */
+ if (count_pending) {
+ pr = __pr_tree_search(&tree->root, first_lclu);
+ while (pr && pr->lclu <= last_lclu) {
+ rc->ndelonly--;
+ node = rb_next(&pr->rb_node);
+ rb_erase(&pr->rb_node, &tree->root);
+ kmem_cache_free(ext4_pending_cachep, pr);
+ if (!node)
+ break;
+ pr = rb_entry(node, struct pending_reservation,
+ rb_node);
+ }
+ }
+ }
+ return rc->ndelonly;
+}
+
+
+/*
+ * __es_remove_extent - removes block range from extent status tree
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @end - last block in range
+ * @reserved - number of cluster reservations released
+ *
+ * If @reserved is not NULL and delayed allocation is enabled, counts
+ * block/cluster reservations freed by removing range and if bigalloc
+ * enabled cancels pending reservations as needed. Returns 0 on success,
+ * error code on failure.
+ */
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end)
+ ext4_lblk_t end, int *reserved)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node *node;
@@ -968,9 +1292,14 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
int err;
+ bool count_reserved = true;
+ struct rsvd_count rc;
+ if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
+ count_reserved = false;
retry:
err = 0;
+
es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
@@ -979,6 +1308,8 @@ retry:
/* Simply invalidate cache_es. */
tree->cache_es = NULL;
+ if (count_reserved)
+ init_rsvd(inode, lblk, es, &rc);
orig_es.es_lblk = es->es_lblk;
orig_es.es_len = es->es_len;
@@ -1020,10 +1351,16 @@ retry:
ext4_es_store_pblock(es, block);
}
}
+ if (count_reserved)
+ count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
+ &orig_es, &rc);
goto out;
}
if (len1 > 0) {
+ if (count_reserved)
+ count_rsvd(inode, lblk, orig_es.es_len - len1,
+ &orig_es, &rc);
node = rb_next(&es->rb_node);
if (node)
es = rb_entry(node, struct extent_status, rb_node);
@@ -1032,6 +1369,8 @@ retry:
}
while (es && ext4_es_end(es) <= end) {
+ if (count_reserved)
+ count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
node = rb_next(&es->rb_node);
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
@@ -1046,6 +1385,9 @@ retry:
ext4_lblk_t orig_len = es->es_len;
len1 = ext4_es_end(es) - end;
+ if (count_reserved)
+ count_rsvd(inode, es->es_lblk, orig_len - len1,
+ es, &rc);
es->es_lblk = end + 1;
es->es_len = len1;
if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
@@ -1054,20 +1396,28 @@ retry:
}
}
+ if (count_reserved)
+ *reserved = get_rsvd(inode, end, es, &rc);
out:
return err;
}
/*
- * ext4_es_remove_extent() removes a space from a extent status tree.
+ * ext4_es_remove_extent - removes block range from extent status tree
*
- * Return 0 on success, error code on failure.
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @len - number of blocks to remove
+ *
+ * Reduces block/cluster reservation count and for bigalloc cancels pending
+ * reservations as needed. Returns 0 on success, error code on failure.
*/
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len)
{
ext4_lblk_t end;
int err = 0;
+ int reserved = 0;
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
@@ -1085,9 +1435,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end);
+ err = __es_remove_extent(inode, lblk, end, &reserved);
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_print_tree(inode);
+ ext4_da_release_space(inode, reserved);
return err;
}
@@ -1235,9 +1586,9 @@ int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
- seq_printf(seq, " %lu/%lu cache hits/misses\n",
- es_stats->es_stats_cache_hits,
- es_stats->es_stats_cache_misses);
+ seq_printf(seq, " %lld/%lld cache hits/misses\n",
+ percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
+ percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
if (inode_cnt)
seq_printf(seq, " %d inodes on list\n", inode_cnt);
@@ -1264,35 +1615,46 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_nr_inode = 0;
spin_lock_init(&sbi->s_es_lock);
sbi->s_es_stats.es_stats_shrunk = 0;
- sbi->s_es_stats.es_stats_cache_hits = 0;
- sbi->s_es_stats.es_stats_cache_misses = 0;
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
+ GFP_KERNEL);
+ if (err)
+ return err;
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
+ GFP_KERNEL);
+ if (err)
+ goto err1;
sbi->s_es_stats.es_stats_scan_time = 0;
sbi->s_es_stats.es_stats_max_scan_time = 0;
err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
if (err)
- return err;
+ goto err2;
err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
if (err)
- goto err1;
+ goto err3;
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
err = register_shrinker(&sbi->s_es_shrinker);
if (err)
- goto err2;
+ goto err4;
return 0;
-
-err2:
+err4:
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
-err1:
+err3:
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+err2:
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
+err1:
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
return err;
}
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
unregister_shrinker(&sbi->s_es_shrinker);
@@ -1317,6 +1679,7 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
if (!es)
goto out_wrap;
+
while (*nr_to_scan > 0) {
if (es->es_lblk > end) {
ei->i_es_shrink_lblk = end + 1;
@@ -1374,6 +1737,34 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
return nr_shrunk;
}
+/*
+ * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove
+ * discretionary entries from the extent status cache. (Some entries
+ * must be present for proper operations.)
+ */
+void ext4_clear_inode_es(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct extent_status *es;
+ struct ext4_es_tree *tree;
+ struct rb_node *node;
+
+ write_lock(&ei->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+ tree->cache_es = NULL;
+ node = rb_first(&tree->root);
+ while (node) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ node = rb_next(node);
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ }
+ }
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+ write_unlock(&ei->i_es_lock);
+}
+
#ifdef ES_DEBUG__
static void ext4_print_pending_tree(struct inode *inode)
{
@@ -1590,7 +1981,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, lblk);
+ err = __es_remove_extent(inode, lblk, lblk, NULL);
if (err != 0)
goto error;
retry:
@@ -1779,93 +2170,3 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
__remove_pending(inode, last);
}
}
-
-/*
- * ext4_es_remove_blks - remove block range from extents status tree and
- * reduce reservation count or cancel pending
- * reservation as needed
- *
- * @inode - file containing range
- * @lblk - first block in range
- * @len - number of blocks to remove
- *
- */
-void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
-{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned int clu_size, reserved = 0;
- ext4_lblk_t last_lclu, first, length, remainder, last;
- bool delonly;
- int err = 0;
- struct pending_reservation *pr;
- struct ext4_pending_tree *tree;
-
- /*
- * Process cluster by cluster for bigalloc - there may be up to
- * two clusters in a 4k page with a 1k block size and two blocks
- * per cluster. Also necessary for systems with larger page sizes
- * and potentially larger block sizes.
- */
- clu_size = sbi->s_cluster_ratio;
- last_lclu = EXT4_B2C(sbi, lblk + len - 1);
-
- write_lock(&EXT4_I(inode)->i_es_lock);
-
- for (first = lblk, remainder = len;
- remainder > 0;
- first += length, remainder -= length) {
-
- if (EXT4_B2C(sbi, first) == last_lclu)
- length = remainder;
- else
- length = clu_size - EXT4_LBLK_COFF(sbi, first);
-
- /*
- * The BH_Delay flag, which triggers calls to this function,
- * and the contents of the extents status tree can be
- * inconsistent due to writepages activity. So, note whether
- * the blocks to be removed actually belong to an extent with
- * delayed only status.
- */
- delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first);
-
- /*
- * because of the writepages effect, written and unwritten
- * blocks could be removed here
- */
- last = first + length - 1;
- err = __es_remove_extent(inode, first, last);
- if (err)
- ext4_warning(inode->i_sb,
- "%s: couldn't remove page (err = %d)",
- __func__, err);
-
- /* non-bigalloc case: simply count the cluster for release */
- if (sbi->s_cluster_ratio == 1 && delonly) {
- reserved++;
- continue;
- }
-
- /*
- * bigalloc case: if all delayed allocated only blocks have
- * just been removed from a cluster, either cancel a pending
- * reservation if it exists or count a cluster for release
- */
- if (delonly &&
- !__es_scan_clu(inode, &ext4_es_is_delonly, first)) {
- pr = __get_pending(inode, EXT4_B2C(sbi, first));
- if (pr != NULL) {
- tree = &EXT4_I(inode)->i_pending_tree;
- rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
- } else {
- reserved++;
- }
- }
- }
-
- write_unlock(&EXT4_I(inode)->i_es_lock);
-
- ext4_da_release_space(inode, reserved);
-}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 131a8b7df265..825313c59752 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -70,8 +70,8 @@ struct ext4_es_tree {
struct ext4_es_stats {
unsigned long es_stats_shrunk;
- unsigned long es_stats_cache_hits;
- unsigned long es_stats_cache_misses;
+ struct percpu_counter es_stats_cache_hits;
+ struct percpu_counter es_stats_cache_misses;
u64 es_stats_scan_time;
u64 es_stats_max_scan_time;
struct percpu_counter es_stats_all_cnt;
@@ -140,6 +140,7 @@ extern void ext4_es_find_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t *next_lblk,
struct extent_status *es);
extern bool ext4_es_scan_range(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
@@ -246,7 +247,6 @@ extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
bool allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
-extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+extern void ext4_clear_inode_es(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 70b0438dbc94..8d2bbcc2d813 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -230,8 +230,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
- if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
- return -EOPNOTSUPP;
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -457,6 +455,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
if (ret)
return ret;
+ ret = fsverity_file_open(inode, filp);
+ if (ret)
+ return ret;
+
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index d358bfcb6b3f..3e133793a5a3 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -280,7 +280,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
unsigned char *buff;
struct qstr qstr = {.name = name, .len = len };
- if (len && IS_CASEFOLDED(dir)) {
+ if (len && IS_CASEFOLDED(dir) && um) {
buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 88cdf3c90bd1..2fec62d764fa 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1416,7 +1416,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
err = ext4_htree_store_dirent(dir_file, hinfo->hash,
hinfo->minor_hash, de, &tmp_str);
if (err) {
- count = err;
+ ret = err;
goto out;
}
count++;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 420fe3deed39..516faa280ced 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -527,7 +527,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -695,7 +695,7 @@ found:
* extent status tree.
*/
if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
- ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es))
goto out_sem;
}
@@ -1024,7 +1024,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
- if (!bh || buffer_uptodate(bh))
+ if (!bh || ext4_buffer_uptodate(bh))
return bh;
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
@@ -1051,7 +1051,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
for (i = 0; i < bh_count; i++)
/* Note that NULL bhs[i] is valid because of holes. */
- if (bhs[i] && !buffer_uptodate(bhs[i]))
+ if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
&bhs[i]);
@@ -1340,6 +1340,9 @@ retry_journal:
}
if (ret) {
+ bool extended = (pos + len > inode->i_size) &&
+ !ext4_verity_in_progress(inode);
+
unlock_page(page);
/*
* __block_write_begin may have instantiated a few blocks
@@ -1349,11 +1352,11 @@ retry_journal:
* Add inode to orphan list in case we crash before
* truncate finishes
*/
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (extended && ext4_can_truncate(inode))
ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
- if (pos + len > inode->i_size) {
+ if (extended) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
@@ -1406,6 +1409,7 @@ static int ext4_write_end(struct file *file,
int ret = 0, ret2;
int i_size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ bool verity = ext4_verity_in_progress(inode);
trace_ext4_write_end(inode, pos, len, copied);
if (inline_data) {
@@ -1423,12 +1427,16 @@ static int ext4_write_end(struct file *file,
/*
* it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
+ *
+ * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
+ * blocks are being written past EOF, so skip the i_size update.
*/
- i_size_changed = ext4_update_inode_size(inode, pos + copied);
+ if (!verity)
+ i_size_changed = ext4_update_inode_size(inode, pos + copied);
unlock_page(page);
put_page(page);
- if (old_size < pos)
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
@@ -1439,7 +1447,7 @@ static int ext4_write_end(struct file *file,
if (i_size_changed || inline_data)
ext4_mark_inode_dirty(handle, inode);
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
@@ -1450,7 +1458,7 @@ errout:
if (!ret)
ret = ret2;
- if (pos + len > inode->i_size) {
+ if (pos + len > inode->i_size && !verity) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
@@ -1511,6 +1519,7 @@ static int ext4_journalled_write_end(struct file *file,
unsigned from, to;
int size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ bool verity = ext4_verity_in_progress(inode);
trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_SIZE - 1);
@@ -1540,13 +1549,14 @@ static int ext4_journalled_write_end(struct file *file,
if (!partial)
SetPageUptodate(page);
}
- size_changed = ext4_update_inode_size(inode, pos + copied);
+ if (!verity)
+ size_changed = ext4_update_inode_size(inode, pos + copied);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
unlock_page(page);
put_page(page);
- if (old_size < pos)
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
if (size_changed || inline_data) {
@@ -1555,7 +1565,7 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
@@ -1566,7 +1576,7 @@ errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- if (pos + len > inode->i_size) {
+ if (pos + len > inode->i_size && !verity) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
@@ -1646,49 +1656,6 @@ void ext4_da_release_space(struct inode *inode, int to_free)
dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}
-static void ext4_da_page_release_reservation(struct page *page,
- unsigned int offset,
- unsigned int length)
-{
- int contiguous_blks = 0;
- struct buffer_head *head, *bh;
- unsigned int curr_off = 0;
- struct inode *inode = page->mapping->host;
- unsigned int stop = offset + length;
- ext4_fsblk_t lblk;
-
- BUG_ON(stop > PAGE_SIZE || stop < length);
-
- head = page_buffers(page);
- bh = head;
- do {
- unsigned int next_off = curr_off + bh->b_size;
-
- if (next_off > stop)
- break;
-
- if ((offset <= curr_off) && (buffer_delay(bh))) {
- contiguous_blks++;
- clear_buffer_delay(bh);
- } else if (contiguous_blks) {
- lblk = page->index <<
- (PAGE_SHIFT - inode->i_blkbits);
- lblk += (curr_off >> inode->i_blkbits) -
- contiguous_blks;
- ext4_es_remove_blks(inode, lblk, contiguous_blks);
- contiguous_blks = 0;
- }
- curr_off = next_off;
- } while ((bh = bh->b_this_page) != head);
-
- if (contiguous_blks) {
- lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
- lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
- ext4_es_remove_blks(inode, lblk, contiguous_blks);
- }
-
-}
-
/*
* Delayed allocation stuff
*/
@@ -1868,7 +1835,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
(unsigned long) map->m_lblk);
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, iblock, &es)) {
+ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
if (ext4_es_is_hole(&es)) {
retval = 0;
down_read(&EXT4_I(inode)->i_data_sem);
@@ -2162,7 +2129,8 @@ static int ext4_writepage(struct page *page,
trace_ext4_writepage(page);
size = i_size_read(inode);
- if (page->index == size >> PAGE_SHIFT)
+ if (page->index == size >> PAGE_SHIFT &&
+ !ext4_verity_in_progress(inode))
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
@@ -2246,7 +2214,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
* after page tables are updated.
*/
size = i_size_read(mpd->inode);
- if (page->index == size >> PAGE_SHIFT)
+ if (page->index == size >> PAGE_SHIFT &&
+ !ext4_verity_in_progress(mpd->inode))
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
@@ -2345,6 +2314,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
>> inode->i_blkbits;
+ if (ext4_verity_in_progress(inode))
+ blocks = EXT_MAX_BLOCKS;
+
do {
BUG_ON(buffer_locked(bh));
@@ -2785,15 +2757,6 @@ static int ext4_writepages(struct address_space *mapping,
goto out_writepages;
}
- if (ext4_should_dioread_nolock(inode)) {
- /*
- * We may need to convert up to one extent per block in
- * the page and we may dirty the inode.
- */
- rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
- PAGE_SIZE >> inode->i_blkbits);
- }
-
/*
* If we have inline data and arrive here, it means that
* we will soon create the block for the 1st page, so
@@ -2812,6 +2775,15 @@ static int ext4_writepages(struct address_space *mapping,
ext4_journal_stop(handle);
}
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert up to one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+ PAGE_SIZE >> inode->i_blkbits);
+ }
+
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
@@ -3061,8 +3033,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_SHIFT;
- if (ext4_nonda_switch(inode->i_sb) ||
- S_ISLNK(inode->i_mode)) {
+ if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
+ ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
len, flags, pagep, fsdata);
@@ -3227,24 +3199,6 @@ static int ext4_da_write_end(struct file *file,
return ret ? ret : copied;
}
-static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- /*
- * Drop reserved blocks
- */
- BUG_ON(!PageLocked(page));
- if (!page_has_buffers(page))
- goto out;
-
- ext4_da_page_release_reservation(page, offset, length);
-
-out:
- ext4_invalidatepage(page, offset, length);
-
- return;
-}
-
/*
* Force all delayed allocation blocks to be allocated for a given inode.
*/
@@ -3897,6 +3851,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return 0;
#endif
+ if (fsverity_active(inode))
+ return 0;
/*
* If we are doing data journalling we don't support O_DIRECT
@@ -3985,7 +3941,7 @@ static const struct address_space_operations ext4_da_aops = {
.write_end = ext4_da_write_end,
.set_page_dirty = ext4_set_page_dirty,
.bmap = ext4_bmap,
- .invalidatepage = ext4_da_invalidatepage,
+ .invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
@@ -4297,6 +4253,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
trace_ext4_punch_hole(inode, offset, length, 0);
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ if (ext4_has_inline_data(inode)) {
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ ret = ext4_convert_inline_data(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ if (ret)
+ return ret;
+ }
+
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
@@ -4739,6 +4704,8 @@ static bool ext4_should_use_dax(struct inode *inode)
return false;
if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
return false;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
+ return false;
return true;
}
@@ -4763,9 +4730,11 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_ENCRYPTED;
if (flags & EXT4_CASEFOLD_FL)
new_fl |= S_CASEFOLD;
+ if (flags & EXT4_VERITY_FL)
+ new_fl |= S_VERITY;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
- S_ENCRYPTED|S_CASEFOLD);
+ S_ENCRYPTED|S_CASEFOLD|S_VERITY);
}
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -5119,6 +5088,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
"iget: bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
+ if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
+ ext4_error_inode(inode, function, line, 0,
+ "casefold flag without casefold feature");
brelse(iloc.bh);
unlock_new_inode(inode);
@@ -5555,6 +5527,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ error = fsverity_prepare_setattr(dentry, attr);
+ if (error)
+ return error;
+
if (is_quota_modification(inode, attr)) {
error = dquot_initialize(inode);
if (error)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 442f7ef873fc..0b7f316fd30f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -745,6 +745,74 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
}
+/* copied from fs/ioctl.c */
+static int fiemap_check_ranges(struct super_block *sb,
+ u64 start, u64 len, u64 *new_len)
+{
+ u64 maxbytes = (u64) sb->s_maxbytes;
+
+ *new_len = len;
+
+ if (len == 0)
+ return -EINVAL;
+
+ if (start > maxbytes)
+ return -EFBIG;
+
+ /*
+ * Shrink request scope to what the fs can actually handle.
+ */
+ if (len > maxbytes || (maxbytes - len) < start)
+ *new_len = maxbytes - start;
+
+ return 0;
+}
+
+/* So that the fiemap access checks can't overflow on 32 bit machines. */
+#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
+
+static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
+{
+ struct fiemap fiemap;
+ struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
+ struct fiemap_extent_info fieinfo = { 0, };
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ u64 len;
+ int error;
+
+ if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
+ return -EFAULT;
+
+ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
+ return -EINVAL;
+
+ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
+ &len);
+ if (error)
+ return error;
+
+ fieinfo.fi_flags = fiemap.fm_flags;
+ fieinfo.fi_extents_max = fiemap.fm_extent_count;
+ fieinfo.fi_extents_start = ufiemap->fm_extents;
+
+ if (fiemap.fm_extent_count != 0 &&
+ !access_ok(fieinfo.fi_extents_start,
+ fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
+ return -EFAULT;
+
+ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
+ filemap_write_and_wait(inode->i_mapping);
+
+ error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len);
+ fiemap.fm_flags = fieinfo.fi_flags;
+ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+ if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
+ error = -EFAULT;
+
+ return error;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1113,8 +1181,62 @@ resizefs_out:
#endif
}
case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg);
+
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_add_key(filp, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_remove_key(filp, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_remove_key_all_users(filp,
+ (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+
+ case EXT4_IOC_CLEAR_ES_CACHE:
+ {
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+ ext4_clear_inode_es(inode);
+ return 0;
+ }
+
+ case EXT4_IOC_GETSTATE:
+ {
+ __u32 state = 0;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED))
+ state |= EXT4_STATE_FLAG_EXT_PRECACHED;
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
+ state |= EXT4_STATE_FLAG_NEW;
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+ state |= EXT4_STATE_FLAG_NEWENTRY;
+ if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE))
+ state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE;
+
+ return put_user(state, (__u32 __user *) arg);
+ }
+
+ case EXT4_IOC_GET_ES_CACHE:
+ return ext4_ioctl_get_es_cache(filp, arg);
+
case EXT4_IOC_FSGETXATTR:
{
struct fsxattr fa;
@@ -1171,6 +1293,17 @@ out:
}
case EXT4_IOC_SHUTDOWN:
return ext4_shutdown(sb, arg);
+
+ case FS_IOC_ENABLE_VERITY:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_enable(filp, (const void __user *)arg);
+
+ case FS_IOC_MEASURE_VERITY:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_measure(filp, (void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -1231,8 +1364,18 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_SET_ENCRYPTION_POLICY:
case EXT4_IOC_GET_ENCRYPTION_PWSALT:
case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
case EXT4_IOC_SHUTDOWN:
case FS_IOC_GETFSMAP:
+ case FS_IOC_ENABLE_VERITY:
+ case FS_IOC_MEASURE_VERITY:
+ case EXT4_IOC_CLEAR_ES_CACHE:
+ case EXT4_IOC_GETSTATE:
+ case EXT4_IOC_GET_ES_CACHE:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 129029534075..a427d2031a8d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1312,7 +1312,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
{
int len;
- if (!IS_CASEFOLDED(dir)) {
+ if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) {
cf_name->name = NULL;
return;
}
@@ -2183,7 +2183,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
#ifdef CONFIG_UNICODE
if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
- utf8_validate(sbi->s_encoding, &dentry->d_name))
+ sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
return -EINVAL;
#endif
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index c916017db334..a30b203fa461 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -47,13 +47,103 @@
#include "ext4.h"
-static inline bool ext4_bio_encrypted(struct bio *bio)
+#define NUM_PREALLOC_POST_READ_CTXS 128
+
+static struct kmem_cache *bio_post_read_ctx_cache;
+static mempool_t *bio_post_read_ctx_pool;
+
+/* postprocessing steps for read bios */
+enum bio_post_read_step {
+ STEP_INITIAL = 0,
+ STEP_DECRYPT,
+ STEP_VERITY,
+};
+
+struct bio_post_read_ctx {
+ struct bio *bio;
+ struct work_struct work;
+ unsigned int cur_step;
+ unsigned int enabled_steps;
+};
+
+static void __read_end_io(struct bio *bio)
{
-#ifdef CONFIG_FS_ENCRYPTION
- return unlikely(bio->bi_private != NULL);
-#else
- return false;
-#endif
+ struct page *page;
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bv, bio, iter_all) {
+ page = bv->bv_page;
+
+ /* PG_error was set if any post_read step failed */
+ if (bio->bi_status || PageError(page)) {
+ ClearPageUptodate(page);
+ /* will re-read again later */
+ ClearPageError(page);
+ } else {
+ SetPageUptodate(page);
+ }
+ unlock_page(page);
+ }
+ if (bio->bi_private)
+ mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+ bio_put(bio);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
+
+static void decrypt_work(struct work_struct *work)
+{
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+
+ fscrypt_decrypt_bio(ctx->bio);
+
+ bio_post_read_processing(ctx);
+}
+
+static void verity_work(struct work_struct *work)
+{
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+
+ fsverity_verify_bio(ctx->bio);
+
+ bio_post_read_processing(ctx);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
+{
+ /*
+ * We use different work queues for decryption and for verity because
+ * verity may require reading metadata pages that need decryption, and
+ * we shouldn't recurse to the same workqueue.
+ */
+ switch (++ctx->cur_step) {
+ case STEP_DECRYPT:
+ if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
+ INIT_WORK(&ctx->work, decrypt_work);
+ fscrypt_enqueue_decrypt_work(&ctx->work);
+ return;
+ }
+ ctx->cur_step++;
+ /* fall-through */
+ case STEP_VERITY:
+ if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+ INIT_WORK(&ctx->work, verity_work);
+ fsverity_enqueue_verify_work(&ctx->work);
+ return;
+ }
+ ctx->cur_step++;
+ /* fall-through */
+ default:
+ __read_end_io(ctx->bio);
+ }
+}
+
+static bool bio_post_read_required(struct bio *bio)
+{
+ return bio->bi_private && !bio->bi_status;
}
/*
@@ -70,30 +160,53 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
*/
static void mpage_end_io(struct bio *bio)
{
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ if (bio_post_read_required(bio)) {
+ struct bio_post_read_ctx *ctx = bio->bi_private;
- if (ext4_bio_encrypted(bio)) {
- if (bio->bi_status) {
- fscrypt_release_ctx(bio->bi_private);
- } else {
- fscrypt_enqueue_decrypt_bio(bio->bi_private, bio);
- return;
- }
+ ctx->cur_step = STEP_INITIAL;
+ bio_post_read_processing(ctx);
+ return;
}
- bio_for_each_segment_all(bv, bio, iter_all) {
- struct page *page = bv->bv_page;
+ __read_end_io(bio);
+}
- if (!bio->bi_status) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
+static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
+{
+ return fsverity_active(inode) &&
+ idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
+static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode,
+ struct bio *bio,
+ pgoff_t first_idx)
+{
+ unsigned int post_read_steps = 0;
+ struct bio_post_read_ctx *ctx = NULL;
+
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+ post_read_steps |= 1 << STEP_DECRYPT;
+
+ if (ext4_need_verity(inode, first_idx))
+ post_read_steps |= 1 << STEP_VERITY;
+
+ if (post_read_steps) {
+ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ ctx->bio = bio;
+ ctx->enabled_steps = post_read_steps;
+ bio->bi_private = ctx;
}
+ return ctx;
+}
- bio_put(bio);
+static inline loff_t ext4_readpage_limit(struct inode *inode)
+{
+ if (IS_ENABLED(CONFIG_FS_VERITY) &&
+ (IS_VERITY(inode) || ext4_verity_in_progress(inode)))
+ return inode->i_sb->s_maxbytes;
+
+ return i_size_read(inode);
}
int ext4_mpage_readpages(struct address_space *mapping,
@@ -141,7 +254,8 @@ int ext4_mpage_readpages(struct address_space *mapping,
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ last_block_in_file = (ext4_readpage_limit(inode) +
+ blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
@@ -218,6 +332,9 @@ int ext4_mpage_readpages(struct address_space *mapping,
zero_user_segment(page, first_hole << blkbits,
PAGE_SIZE);
if (first_hole == 0) {
+ if (ext4_need_verity(inode, page->index) &&
+ !fsverity_verify_page(page))
+ goto set_error_page;
SetPageUptodate(page);
unlock_page(page);
goto next_page;
@@ -241,18 +358,16 @@ int ext4_mpage_readpages(struct address_space *mapping,
bio = NULL;
}
if (bio == NULL) {
- struct fscrypt_ctx *ctx = NULL;
+ struct bio_post_read_ctx *ctx;
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
- ctx = fscrypt_get_ctx(GFP_NOFS);
- if (IS_ERR(ctx))
- goto set_error_page;
- }
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio) {
- if (ctx)
- fscrypt_release_ctx(ctx);
+ if (!bio)
+ goto set_error_page;
+ ctx = get_bio_post_read_ctx(inode, bio, page->index);
+ if (IS_ERR(ctx)) {
+ bio_put(bio);
+ bio = NULL;
goto set_error_page;
}
bio_set_dev(bio, bdev);
@@ -293,3 +408,29 @@ int ext4_mpage_readpages(struct address_space *mapping,
submit_bio(bio);
return 0;
}
+
+int __init ext4_init_post_read_processing(void)
+{
+ bio_post_read_ctx_cache =
+ kmem_cache_create("ext4_bio_post_read_ctx",
+ sizeof(struct bio_post_read_ctx), 0, 0, NULL);
+ if (!bio_post_read_ctx_cache)
+ goto fail;
+ bio_post_read_ctx_pool =
+ mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
+ bio_post_read_ctx_cache);
+ if (!bio_post_read_ctx_pool)
+ goto fail_free_cache;
+ return 0;
+
+fail_free_cache:
+ kmem_cache_destroy(bio_post_read_ctx_cache);
+fail:
+ return -ENOMEM;
+}
+
+void ext4_exit_post_read_processing(void)
+{
+ mempool_destroy(bio_post_read_ctx_pool);
+ kmem_cache_destroy(bio_post_read_ctx_cache);
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4079605d437a..dd654e53ba3d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1107,6 +1107,9 @@ static int ext4_drop_inode(struct inode *inode)
{
int drop = generic_drop_inode(inode);
+ if (!drop)
+ drop = fscrypt_drop_inode(inode);
+
trace_ext4_drop_inode(inode, drop);
return drop;
}
@@ -1179,6 +1182,7 @@ void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->jinode = NULL;
}
fscrypt_put_encryption_info(inode);
+ fsverity_cleanup_inode(inode);
}
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1874,6 +1878,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
} else if (token == Opt_commit) {
if (arg == 0)
arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ else if (arg > INT_MAX / HZ) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid commit interval %d, "
+ "must be smaller than %d",
+ arg, INT_MAX / HZ);
+ return -1;
+ }
sbi->s_commit_interval = HZ * arg;
} else if (token == Opt_debug_want_extra_isize) {
sbi->s_want_extra_isize = arg;
@@ -4035,8 +4046,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_inode_size);
goto failed_mount;
}
- if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
- sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
+ /*
+ * i_atime_extra is the last extra field available for [acm]times in
+ * struct ext4_inode. Checking for that field should suffice to ensure
+ * we have extra space for all three.
+ */
+ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
+ sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
+ sb->s_time_gran = 1;
+ sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
+ } else {
+ sb->s_time_gran = NSEC_PER_SEC;
+ sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
+ }
+
+ sb->s_time_min = EXT4_TIMESTAMP_MIN;
}
sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
@@ -4272,6 +4296,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &ext4_cryptops;
#endif
+#ifdef CONFIG_FS_VERITY
+ sb->s_vop = &ext4_verityops;
+#endif
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
if (ext4_has_feature_quota(sb))
@@ -4419,6 +4446,11 @@ no_journal:
goto failed_mount_wq;
}
+ if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
+ goto failed_mount_wq;
+ }
+
if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
!ext4_has_feature_encrypt(sb)) {
ext4_set_feature_encrypt(sb);
@@ -6095,6 +6127,10 @@ static int __init ext4_init_fs(void)
err = ext4_init_pending();
if (err)
+ goto out7;
+
+ err = ext4_init_post_read_processing();
+ if (err)
goto out6;
err = ext4_init_pageio();
@@ -6135,8 +6171,10 @@ out3:
out4:
ext4_exit_pageio();
out5:
- ext4_exit_pending();
+ ext4_exit_post_read_processing();
out6:
+ ext4_exit_pending();
+out7:
ext4_exit_es();
return err;
@@ -6153,6 +6191,7 @@ static void __exit ext4_exit_fs(void)
ext4_exit_sysfs();
ext4_exit_system_zone();
ext4_exit_pageio();
+ ext4_exit_post_read_processing();
ext4_exit_es();
ext4_exit_pending();
}
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index b3cd7655a6ff..eb1efad0e20a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -242,6 +242,9 @@ EXT4_ATTR_FEATURE(encryption);
#ifdef CONFIG_UNICODE
EXT4_ATTR_FEATURE(casefold);
#endif
+#ifdef CONFIG_FS_VERITY
+EXT4_ATTR_FEATURE(verity);
+#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
static struct attribute *ext4_feat_attrs[] = {
@@ -254,6 +257,9 @@ static struct attribute *ext4_feat_attrs[] = {
#ifdef CONFIG_UNICODE
ATTR_LIST(casefold),
#endif
+#ifdef CONFIG_FS_VERITY
+ ATTR_LIST(verity),
+#endif
ATTR_LIST(metadata_csum_seed),
NULL,
};
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
new file mode 100644
index 000000000000..d0d8a9795dd6
--- /dev/null
+++ b/fs/ext4/verity.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/ext4/verity.c: fs-verity support for ext4
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Implementation of fsverity_operations for ext4.
+ *
+ * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past
+ * the end of the file, starting at the first 64K boundary beyond i_size. This
+ * approach works because (a) verity files are readonly, and (b) pages fully
+ * beyond i_size aren't visible to userspace but can be read/written internally
+ * by ext4 with only some relatively small changes to ext4. This approach
+ * avoids having to depend on the EA_INODE feature and on rearchitecturing
+ * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and
+ * to support encrypting xattrs. Note that the verity metadata *must* be
+ * encrypted when the file is, since it contains hashes of the plaintext data.
+ *
+ * Using a 64K boundary rather than a 4K one keeps things ready for
+ * architectures with 64K pages, and it doesn't necessarily waste space on-disk
+ * since there can be a hole between i_size and the start of the Merkle tree.
+ */
+
+#include <linux/quotaops.h>
+
+#include "ext4.h"
+#include "ext4_extents.h"
+#include "ext4_jbd2.h"
+
+static inline loff_t ext4_verity_metadata_pos(const struct inode *inode)
+{
+ return round_up(inode->i_size, 65536);
+}
+
+/*
+ * Read some verity metadata from the inode. __vfs_read() can't be used because
+ * we need to read beyond i_size.
+ */
+static int pagecache_read(struct inode *inode, void *buf, size_t count,
+ loff_t pos)
+{
+ while (count) {
+ size_t n = min_t(size_t, count,
+ PAGE_SIZE - offset_in_page(pos));
+ struct page *page;
+ void *addr;
+
+ page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
+ NULL);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ addr = kmap_atomic(page);
+ memcpy(buf, addr + offset_in_page(pos), n);
+ kunmap_atomic(addr);
+
+ put_page(page);
+
+ buf += n;
+ pos += n;
+ count -= n;
+ }
+ return 0;
+}
+
+/*
+ * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
+ * kernel_write() can't be used because the file descriptor is readonly.
+ */
+static int pagecache_write(struct inode *inode, const void *buf, size_t count,
+ loff_t pos)
+{
+ if (pos + count > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+
+ while (count) {
+ size_t n = min_t(size_t, count,
+ PAGE_SIZE - offset_in_page(pos));
+ struct page *page;
+ void *fsdata;
+ void *addr;
+ int res;
+
+ res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
+ &page, &fsdata);
+ if (res)
+ return res;
+
+ addr = kmap_atomic(page);
+ memcpy(addr + offset_in_page(pos), buf, n);
+ kunmap_atomic(addr);
+
+ res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
+ page, fsdata);
+ if (res < 0)
+ return res;
+ if (res != n)
+ return -EIO;
+
+ buf += n;
+ pos += n;
+ count -= n;
+ }
+ return 0;
+}
+
+static int ext4_begin_enable_verity(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ const int credits = 2; /* superblock and inode for ext4_orphan_add() */
+ handle_t *handle;
+ int err;
+
+ if (ext4_verity_in_progress(inode))
+ return -EBUSY;
+
+ /*
+ * Since the file was opened readonly, we have to initialize the jbd
+ * inode and quotas here and not rely on ->open() doing it. This must
+ * be done before evicting the inline data.
+ */
+
+ err = ext4_inode_attach_jinode(inode);
+ if (err)
+ return err;
+
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
+ err = ext4_convert_inline_data(inode);
+ if (err)
+ return err;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ext4_warning_inode(inode,
+ "verity is only allowed on extent-based files");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * ext4 uses the last allocated block to find the verity descriptor, so
+ * we must remove any other blocks past EOF which might confuse things.
+ */
+ err = ext4_truncate(inode);
+ if (err)
+ return err;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_orphan_add(handle, inode);
+ if (err == 0)
+ ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+ ext4_journal_stop(handle);
+ return err;
+}
+
+/*
+ * ext4 stores the verity descriptor beginning on the next filesystem block
+ * boundary after the Merkle tree. Then, the descriptor size is stored in the
+ * last 4 bytes of the last allocated filesystem block --- which is either the
+ * block in which the descriptor ends, or the next block after that if there
+ * weren't at least 4 bytes remaining.
+ *
+ * We can't simply store the descriptor in an xattr because it *must* be
+ * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt
+ * xattrs. Also, if the descriptor includes a large signature blob it may be
+ * too large to store in an xattr without the EA_INODE feature.
+ */
+static int ext4_write_verity_descriptor(struct inode *inode, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) +
+ merkle_tree_size, i_blocksize(inode));
+ const u64 desc_end = desc_pos + desc_size;
+ const __le32 desc_size_disk = cpu_to_le32(desc_size);
+ const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk),
+ i_blocksize(inode)) -
+ sizeof(desc_size_disk);
+ int err;
+
+ err = pagecache_write(inode, desc, desc_size, desc_pos);
+ if (err)
+ return err;
+
+ return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk),
+ desc_size_pos);
+}
+
+static int ext4_end_enable_verity(struct file *filp, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ struct inode *inode = file_inode(filp);
+ const int credits = 2; /* superblock and inode for ext4_orphan_del() */
+ handle_t *handle;
+ int err = 0;
+ int err2;
+
+ if (desc != NULL) {
+ /* Succeeded; write the verity descriptor. */
+ err = ext4_write_verity_descriptor(inode, desc, desc_size,
+ merkle_tree_size);
+
+ /* Write all pages before clearing VERITY_IN_PROGRESS. */
+ if (!err)
+ err = filemap_write_and_wait(inode->i_mapping);
+ }
+
+ /* If we failed, truncate anything we wrote past i_size. */
+ if (desc == NULL || err)
+ ext4_truncate(inode);
+
+ /*
+ * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
+ * deleting the inode from the orphan list, even if something failed.
+ * If everything succeeded, we'll also set the verity bit in the same
+ * transaction.
+ */
+
+ ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ return PTR_ERR(handle);
+ }
+
+ err2 = ext4_orphan_del(handle, inode);
+ if (err2)
+ goto out_stop;
+
+ if (desc != NULL && !err) {
+ struct ext4_iloc iloc;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out_stop;
+ ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+ ext4_set_inode_flags(inode);
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ }
+out_stop:
+ ext4_journal_stop(handle);
+ return err ?: err2;
+}
+
+static int ext4_get_verity_descriptor_location(struct inode *inode,
+ size_t *desc_size_ret,
+ u64 *desc_pos_ret)
+{
+ struct ext4_ext_path *path;
+ struct ext4_extent *last_extent;
+ u32 end_lblk;
+ u64 desc_size_pos;
+ __le32 desc_size_disk;
+ u32 desc_size;
+ u64 desc_pos;
+ int err;
+
+ /*
+ * Descriptor size is in last 4 bytes of last allocated block.
+ * See ext4_write_verity_descriptor().
+ */
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ EXT4_ERROR_INODE(inode, "verity file doesn't use extents");
+ return -EFSCORRUPTED;
+ }
+
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ last_extent = path[path->p_depth].p_ext;
+ if (!last_extent) {
+ EXT4_ERROR_INODE(inode, "verity file has no extents");
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return -EFSCORRUPTED;
+ }
+
+ end_lblk = le32_to_cpu(last_extent->ee_block) +
+ ext4_ext_get_actual_len(last_extent);
+ desc_size_pos = (u64)end_lblk << inode->i_blkbits;
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ if (desc_size_pos < sizeof(desc_size_disk))
+ goto bad;
+ desc_size_pos -= sizeof(desc_size_disk);
+
+ err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk),
+ desc_size_pos);
+ if (err)
+ return err;
+ desc_size = le32_to_cpu(desc_size_disk);
+
+ /*
+ * The descriptor is stored just before the desc_size_disk, but starting
+ * on a filesystem block boundary.
+ */
+
+ if (desc_size > INT_MAX || desc_size > desc_size_pos)
+ goto bad;
+
+ desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode));
+ if (desc_pos < ext4_verity_metadata_pos(inode))
+ goto bad;
+
+ *desc_size_ret = desc_size;
+ *desc_pos_ret = desc_pos;
+ return 0;
+
+bad:
+ EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor");
+ return -EFSCORRUPTED;
+}
+
+static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ size_t desc_size = 0;
+ u64 desc_pos = 0;
+ int err;
+
+ err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos);
+ if (err)
+ return err;
+
+ if (buf_size) {
+ if (desc_size > buf_size)
+ return -ERANGE;
+ err = pagecache_read(inode, buf, desc_size, desc_pos);
+ if (err)
+ return err;
+ }
+ return desc_size;
+}
+
+static struct page *ext4_read_merkle_tree_page(struct inode *inode,
+ pgoff_t index)
+{
+ index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
+
+ return read_mapping_page(inode->i_mapping, index, NULL);
+}
+
+static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
+ u64 index, int log_blocksize)
+{
+ loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize);
+
+ return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+}
+
+const struct fsverity_operations ext4_verityops = {
+ .begin_enable_verity = ext4_begin_enable_verity,
+ .end_enable_verity = ext4_end_enable_verity,
+ .get_verity_descriptor = ext4_get_verity_descriptor,
+ .read_merkle_tree_page = ext4_read_merkle_tree_page,
+ .write_merkle_tree_block = ext4_write_merkle_tree_block,
+};