author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2019-10-27 11:00:19 -0700
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2019-10-27 11:00:19 -0700
commit | 728d90bdc9e480dc93913e59a0aa3c896c7aa697 (patch)
tree | 258b1b6ee711f0ef67fd225700d84eccec285194 /fs/ext4
parent | cb3efd5a38855eabd26c2b631dd027169678d60f (diff)
parent | d6d5df1db6e9d7f8f76d2911707f7d5877251b02 (diff)
Merge tag 'v5.4-rc5' into next
Sync up with mainline.
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/Makefile         |   1
-rw-r--r-- | fs/ext4/block_validity.c | 189
-rw-r--r-- | fs/ext4/dir.c            |   7
-rw-r--r-- | fs/ext4/ext4.h           |  95
-rw-r--r-- | fs/ext4/extents.c        |  98
-rw-r--r-- | fs/ext4/extents_status.c | 521
-rw-r--r-- | fs/ext4/extents_status.h |   8
-rw-r--r-- | fs/ext4/file.c           |   6
-rw-r--r-- | fs/ext4/hash.c           |   2
-rw-r--r-- | fs/ext4/inline.c         |   2
-rw-r--r-- | fs/ext4/inode.c          | 158
-rw-r--r-- | fs/ext4/ioctl.c          | 143
-rw-r--r-- | fs/ext4/namei.c          |   4
-rw-r--r-- | fs/ext4/readpage.c       | 211
-rw-r--r-- | fs/ext4/super.c          |  45
-rw-r--r-- | fs/ext4/sysfs.c          |   6
-rw-r--r-- | fs/ext4/verity.c         | 367
17 files changed, 1532 insertions, 331 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8fdfcd3c3e04..b17ddc229ac5 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -13,3 +13,4 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
+ext4-$(CONFIG_FS_VERITY)		+= verity.o
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 8e83741b02e0..d4d4fdfac1a6 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -38,6 +38,7 @@ int __init ext4_init_system_zone(void)
 
 void ext4_exit_system_zone(void)
 {
+	rcu_barrier();
 	kmem_cache_destroy(ext4_system_zone_cachep);
 }
 
@@ -49,17 +50,26 @@ static inline int can_merge(struct ext4_system_zone *entry1,
 	return 0;
 }
 
+static void release_system_zone(struct ext4_system_blocks *system_blks)
+{
+	struct ext4_system_zone *entry, *n;
+
+	rbtree_postorder_for_each_entry_safe(entry, n,
+				&system_blks->root, node)
+		kmem_cache_free(ext4_system_zone_cachep, entry);
+}
+
 /*
  * Mark a range of blocks as belonging to the "system zone" --- that
  * is, filesystem metadata blocks which should never be used by
  * inodes.
  */
-static int add_system_zone(struct ext4_sb_info *sbi,
+static int add_system_zone(struct ext4_system_blocks *system_blks,
 			   ext4_fsblk_t start_blk,
 			   unsigned int count)
 {
 	struct ext4_system_zone *new_entry = NULL, *entry;
-	struct rb_node **n = &sbi->system_blks.rb_node, *node;
+	struct rb_node **n = &system_blks->root.rb_node, *node;
 	struct rb_node *parent = NULL, *new_node = NULL;
 
 	while (*n) {
@@ -91,7 +101,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
 		new_node = &new_entry->node;
 
 		rb_link_node(new_node, parent, n);
-		rb_insert_color(new_node, &sbi->system_blks);
+		rb_insert_color(new_node, &system_blks->root);
 	}
 
 	/* Can we merge to the left? */
@@ -101,7 +111,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
 		if (can_merge(entry, new_entry)) {
 			new_entry->start_blk = entry->start_blk;
 			new_entry->count += entry->count;
-			rb_erase(node, &sbi->system_blks);
+			rb_erase(node, &system_blks->root);
 			kmem_cache_free(ext4_system_zone_cachep, entry);
 		}
 	}
@@ -112,7 +122,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
 		entry = rb_entry(node, struct ext4_system_zone, node);
 		if (can_merge(new_entry, entry)) {
 			new_entry->count += entry->count;
-			rb_erase(node, &sbi->system_blks);
+			rb_erase(node, &system_blks->root);
 			kmem_cache_free(ext4_system_zone_cachep, entry);
 		}
 	}
@@ -126,7 +136,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
 	int first = 1;
 
 	printk(KERN_INFO "System zones: ");
-	node = rb_first(&sbi->system_blks);
+	node = rb_first(&sbi->system_blks->root);
 	while (node) {
 		entry = rb_entry(node, struct ext4_system_zone, node);
 		printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ",
@@ -137,7 +147,47 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
 	printk(KERN_CONT "\n");
 }
 
-static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ */
+static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
+				     struct ext4_system_blocks *system_blks,
+				     ext4_fsblk_t start_blk,
+				     unsigned int count)
+{
+	struct ext4_system_zone *entry;
+	struct rb_node *n;
+
+	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+	    (start_blk + count < start_blk) ||
+	    (start_blk + count > ext4_blocks_count(sbi->s_es))) {
+		sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+		return 0;
+	}
+
+	if (system_blks == NULL)
+		return 1;
+
+	n = system_blks->root.rb_node;
+	while (n) {
+		entry = rb_entry(n, struct ext4_system_zone, node);
+		if (start_blk + count - 1 < entry->start_blk)
+			n = n->rb_left;
+		else if (start_blk >= (entry->start_blk + entry->count))
+			n = n->rb_right;
+		else {
+			sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static int ext4_protect_reserved_inode(struct super_block *sb,
+				       struct ext4_system_blocks *system_blks,
+				       u32 ino)
 {
 	struct inode *inode;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -163,14 +213,15 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
 		if (n == 0) {
 			i++;
 		} else {
-			if (!ext4_data_block_valid(sbi, map.m_pblk, n)) {
+			if (!ext4_data_block_valid_rcu(sbi, system_blks,
+						map.m_pblk, n)) {
 				ext4_error(sb, "blocks %llu-%llu from inode %u "
 					   "overlap system zone", map.m_pblk,
 					   map.m_pblk + map.m_len - 1, ino);
 				err = -EFSCORRUPTED;
 				break;
 			}
-			err = add_system_zone(sbi, map.m_pblk, n);
+			err = add_system_zone(system_blks, map.m_pblk, n);
 			if (err < 0)
 				break;
 			i += n;
@@ -180,94 +231,130 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
 	return err;
 }
 
+static void ext4_destroy_system_zone(struct rcu_head *rcu)
+{
+	struct ext4_system_blocks *system_blks;
+
+	system_blks = container_of(rcu, struct ext4_system_blocks, rcu);
+	release_system_zone(system_blks);
+	kfree(system_blks);
+}
+
+/*
+ * Build system zone rbtree which is used for block validity checking.
+ *
+ * The update of system_blks pointer in this function is protected by
+ * sb->s_umount semaphore. However we have to be careful as we can be
+ * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * protected only by RCU. That's why we first build the rbtree and then
+ * swap it in place.
+ */
 int ext4_setup_system_zone(struct super_block *sb)
 {
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_system_blocks *system_blks;
 	struct ext4_group_desc *gdp;
 	ext4_group_t i;
 	int flex_size = ext4_flex_bg_size(sbi);
 	int ret;
 
 	if (!test_opt(sb, BLOCK_VALIDITY)) {
-		if (sbi->system_blks.rb_node)
+		if (sbi->system_blks)
 			ext4_release_system_zone(sb);
 		return 0;
 	}
 
-	if (sbi->system_blks.rb_node)
+	if (sbi->system_blks)
 		return 0;
 
+	system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
+	if (!system_blks)
+		return -ENOMEM;
+
 	for (i=0; i < ngroups; i++) {
 		cond_resched();
 		if (ext4_bg_has_super(sb, i) &&
 		    ((i < 5) || ((i % flex_size) == 0)))
-			add_system_zone(sbi, ext4_group_first_block_no(sb, i),
+			add_system_zone(system_blks,
+					ext4_group_first_block_no(sb, i),
 					ext4_bg_num_gdb(sb, i) + 1);
 		gdp = ext4_get_group_desc(sb, i, NULL);
-		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+		ret = add_system_zone(system_blks,
+				ext4_block_bitmap(sb, gdp), 1);
 		if (ret)
-			return ret;
-		ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+			goto err;
+		ret = add_system_zone(system_blks,
+				ext4_inode_bitmap(sb, gdp), 1);
 		if (ret)
-			return ret;
-		ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+			goto err;
+		ret = add_system_zone(system_blks,
+				ext4_inode_table(sb, gdp),
 				sbi->s_itb_per_group);
 		if (ret)
-			return ret;
+			goto err;
 	}
 	if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
-		ret = ext4_protect_reserved_inode(sb,
+		ret = ext4_protect_reserved_inode(sb, system_blks,
 				le32_to_cpu(sbi->s_es->s_journal_inum));
 		if (ret)
-			return ret;
+			goto err;
 	}
 
+	/*
+	 * System blks rbtree complete, announce it once to prevent racing
+	 * with ext4_data_block_valid() accessing the rbtree at the same
+	 * time.
+	 */
+	rcu_assign_pointer(sbi->system_blks, system_blks);
+
 	if (test_opt(sb, DEBUG))
 		debug_print_tree(sbi);
 	return 0;
+err:
+	release_system_zone(system_blks);
+	kfree(system_blks);
+	return ret;
 }
 
-/* Called when the filesystem is unmounted */
+/*
+ * Called when the filesystem is unmounted or when remounting it with
+ * noblock_validity specified.
+ *
+ * The update of system_blks pointer in this function is protected by
+ * sb->s_umount semaphore. However we have to be careful as we can be
+ * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * protected only by RCU. So we first clear the system_blks pointer and
+ * then free the rbtree only after RCU grace period expires.
+ */
 void ext4_release_system_zone(struct super_block *sb)
 {
-	struct ext4_system_zone *entry, *n;
+	struct ext4_system_blocks *system_blks;
 
-	rbtree_postorder_for_each_entry_safe(entry, n,
-			&EXT4_SB(sb)->system_blks, node)
-		kmem_cache_free(ext4_system_zone_cachep, entry);
+	system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks,
+					lockdep_is_held(&sb->s_umount));
+	rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL);
 
-	EXT4_SB(sb)->system_blks = RB_ROOT;
+	if (system_blks)
+		call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
 }
 
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with filesystem metadata blocks.
- */
 int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
 			  unsigned int count)
 {
-	struct ext4_system_zone *entry;
-	struct rb_node *n = sbi->system_blks.rb_node;
+	struct ext4_system_blocks *system_blks;
+	int ret;
 
-	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
-	    (start_blk + count < start_blk) ||
-	    (start_blk + count > ext4_blocks_count(sbi->s_es))) {
-		sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
-		return 0;
-	}
-	while (n) {
-		entry = rb_entry(n, struct ext4_system_zone, node);
-		if (start_blk + count - 1 < entry->start_blk)
-			n = n->rb_left;
-		else if (start_blk >= (entry->start_blk + entry->count))
-			n = n->rb_right;
-		else {
-			sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
-			return 0;
-		}
-	}
-	return 1;
+	/*
+	 * Lock the system zone to prevent it being released concurrently
+	 * when doing a remount which inverse current "[no]block_validity"
+	 * mount option.
+	 */
+	rcu_read_lock();
+	system_blks = rcu_dereference(sbi->system_blks);
+	ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
+					count);
+	rcu_read_unlock();
+	return ret;
 }
 
 int ext4_check_blockref(const char *function, unsigned int line,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86054f31fe4d..9fdd2b269d61 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -668,14 +668,15 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
 			  const char *str, const struct qstr *name)
 {
 	struct qstr qstr = {.name = str, .len = len };
+	struct inode *inode = dentry->d_parent->d_inode;
 
-	if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) {
+	if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) {
 		if (len != name->len)
 			return -1;
 		return memcmp(str, name->name, len);
 	}
 
-	return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr, false);
+	return ext4_ci_compare(inode, name, &qstr, false);
 }
 
 static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
@@ -685,7 +686,7 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
 	unsigned char *norm;
 	int len, ret = 0;
 
-	if (!IS_CASEFOLDED(dentry->d_inode))
+	if (!IS_CASEFOLDED(dentry->d_inode) || !um)
 		return 0;
 
 	norm = kmalloc(PATH_MAX, GFP_ATOMIC);
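The block_validity.c rework above is the classic RCU read/copy/update pattern: lookups walk the tree locklessly, while a writer builds a complete replacement and publishes it with a single pointer swap, freeing the old copy only after a grace period. A distilled, illustrative sketch follows; the names (zones, replace_zones, block_is_reserved) are hypothetical stand-ins, not code from the patch, and the rbtree is reduced to a single range for brevity:

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct zones {
	unsigned long long first, last;	/* stand-in for the rbtree payload */
	struct rcu_head rcu;
};

static struct zones __rcu *active_zones;

/* reader side: lock-free, must tolerate NULL (no tree built yet) */
static bool block_is_reserved(unsigned long long blk)
{
	struct zones *z;
	bool hit = false;

	rcu_read_lock();
	z = rcu_dereference(active_zones);
	if (z)
		hit = blk >= z->first && blk <= z->last;
	rcu_read_unlock();
	return hit;
}

/* writer side (serialized externally, as s_umount does in the patch):
 * build the new structure fully, publish it once, and free the old copy
 * only after all pre-existing readers have left their critical sections */
static int replace_zones(unsigned long long first, unsigned long long last)
{
	struct zones *nz, *old;

	nz = kzalloc(sizeof(*nz), GFP_KERNEL);
	if (!nz)
		return -ENOMEM;
	nz->first = first;
	nz->last = last;
	old = rcu_dereference_protected(active_zones, 1);
	rcu_assign_pointer(active_zones, nz);
	if (old)
		kfree_rcu(old, rcu);
	return 0;
}

The rcu_barrier() added to ext4_exit_system_zone() above exists for the same reason: it waits for any in-flight call_rcu() callbacks before the kmem cache they free into is destroyed.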
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf660aa7a9e0..03db3e71676c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -41,6 +41,7 @@
 #endif
 
 #include <linux/fscrypt.h>
+#include <linux/fsverity.h>
 
 #include <linux/compiler.h>
 
@@ -185,6 +186,14 @@ struct ext4_map_blocks {
 };
 
 /*
+ * Block validity checking, system zone rbtree.
+ */
+struct ext4_system_blocks {
+	struct rb_root root;
+	struct rcu_head rcu;
+};
+
+/*
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
 
@@ -284,6 +293,9 @@ struct ext4_io_submit {
 				 ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
 #define EXT4_LBLK_CMASK(s, lblk)	((lblk) & \
 					 ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Fill in the low bits to get the last block of the cluster */
+#define EXT4_LBLK_CFILL(sbi, lblk)	((lblk) | \
+					 ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))
 /* Get the cluster offset */
 #define EXT4_PBLK_COFF(s, pblk)	((pblk) & \
 				 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
@@ -395,6 +407,7 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL		0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
+#define EXT4_VERITY_FL			0x00100000 /* Verity protected inode */
 #define EXT4_EA_INODE_FL		0x00200000 /* Inode used for large EA */
 #define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
 #define EXT4_INLINE_DATA_FL		0x10000000 /* Inode has inline data. */
@@ -402,7 +415,7 @@ struct flex_groups {
 #define EXT4_CASEFOLD_FL		0x40000000 /* Casefolded file */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE		0x704BDFFF /* User visible flags */
+#define EXT4_FL_USER_VISIBLE		0x705BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE		0x604BC0FF /* User modifiable flags */
 
 /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
@@ -467,6 +480,7 @@ enum {
 	EXT4_INODE_TOPDIR	= 17,	/* Top of directory hierarchies*/
 	EXT4_INODE_HUGE_FILE	= 18,	/* Set to each huge file */
 	EXT4_INODE_EXTENTS	= 19,	/* Inode uses extents */
+	EXT4_INODE_VERITY	= 20,	/* Verity protected inode */
 	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
 	EXT4_INODE_EOFBLOCKS	= 22,	/* Blocks allocated beyond EOF */
 	EXT4_INODE_INLINE_DATA	= 28,	/* Data in inode. */
@@ -512,6 +526,7 @@ static inline void ext4_check_flag_values(void)
 	CHECK_FLAG_VALUE(TOPDIR);
 	CHECK_FLAG_VALUE(HUGE_FILE);
 	CHECK_FLAG_VALUE(EXTENTS);
+	CHECK_FLAG_VALUE(VERITY);
 	CHECK_FLAG_VALUE(EA_INODE);
 	CHECK_FLAG_VALUE(EOFBLOCKS);
 	CHECK_FLAG_VALUE(INLINE_DATA);
@@ -649,6 +664,10 @@ enum {
 #define EXT4_IOC_SET_ENCRYPTION_POLICY	FS_IOC_SET_ENCRYPTION_POLICY
 #define EXT4_IOC_GET_ENCRYPTION_PWSALT	FS_IOC_GET_ENCRYPTION_PWSALT
 #define EXT4_IOC_GET_ENCRYPTION_POLICY	FS_IOC_GET_ENCRYPTION_POLICY
+/* ioctl codes 19--39 are reserved for fscrypt */
+#define EXT4_IOC_CLEAR_ES_CACHE		_IO('f', 40)
+#define EXT4_IOC_GETSTATE		_IOW('f', 41, __u32)
+#define EXT4_IOC_GET_ES_CACHE		_IOWR('f', 42, struct fiemap)
 
 #define EXT4_IOC_FSGETXATTR		FS_IOC_FSGETXATTR
 #define EXT4_IOC_FSSETXATTR		FS_IOC_FSSETXATTR
@@ -662,6 +681,16 @@ enum {
 #define EXT4_GOING_FLAGS_LOGFLUSH		0x1	/* flush log but not data */
 #define EXT4_GOING_FLAGS_NOLOGFLUSH		0x2	/* don't flush log nor data */
 
+/*
+ * Flags returned by EXT4_IOC_GETSTATE
+ *
+ * We only expose to userspace a subset of the state flags in
+ * i_state_flags
+ */
+#define EXT4_STATE_FLAG_EXT_PRECACHED	0x00000001
+#define EXT4_STATE_FLAG_NEW		0x00000002
+#define EXT4_STATE_FLAG_NEWENTRY	0x00000004
+#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE	0x00000008
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -679,6 +708,12 @@ enum {
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
 #endif
 
+/*
+ * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
+ * It indicates that the entry in extent status cache is for a hole.
+ */
+#define EXT4_FIEMAP_EXTENT_HOLE		0x08000000
+
 /* Max physical block we can address w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
 
@@ -808,31 +843,20 @@ static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
 static inline void ext4_decode_extra_time(struct timespec64 *time,
 					  __le32 extra)
 {
-	if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) {
-
-#if 1
-		/* Handle legacy encoding of pre-1970 dates with epoch
-		 * bits 1,1. (This backwards compatibility may be removed
-		 * at the discretion of the ext4 developers.)
-		 */
-		u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
-		if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
-			extra_bits = 0;
-		time->tv_sec += extra_bits << 32;
-#else
+	if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK)))
 		time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
-#endif
-	}
 	time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
 }
 
 #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
 do { \
-	(raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
 	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\
+		(raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
 		(raw_inode)->xtime ## _extra = \
 				ext4_encode_extra_time(&(inode)->xtime); \
 	} \
+	else \
+		(raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \
 } while (0)
 
 #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
@@ -1421,7 +1445,7 @@ struct ext4_sb_info {
 	int s_jquota_fmt;			/* Format of quota to use */
 #endif
 	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
-	struct rb_root system_blks;
+	struct ext4_system_blocks __rcu *system_blks;
 
 #ifdef EXTENTS_STATS
 	/* ext4 extents stats */
@@ -1560,6 +1584,7 @@ enum {
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
 	EXT4_STATE_LUSTRE_EA_INODE,	/* Lustre-style ea_inode */
+	EXT4_STATE_VERITY_IN_PROGRESS,	/* building fs-verity Merkle tree */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1610,6 +1635,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_SB(sb)	(sb)
 #endif
 
+static inline bool ext4_verity_in_progress(struct inode *inode)
+{
+	return IS_ENABLED(CONFIG_FS_VERITY) &&
+	       ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+}
+
 #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
 
 /*
@@ -1632,6 +1663,10 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 
 #define EXT4_GOOD_OLD_INODE_SIZE 128
 
+#define EXT4_EXTRA_TIMESTAMP_MAX	(((s64)1 << 34) - 1 + S32_MIN)
+#define EXT4_NON_EXTRA_TIMESTAMP_MAX	S32_MAX
+#define EXT4_TIMESTAMP_MIN		S32_MIN
+
 /*
  * Feature set definitions
  */
@@ -1662,6 +1697,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM	0x0400
 #define EXT4_FEATURE_RO_COMPAT_READONLY		0x1000
 #define EXT4_FEATURE_RO_COMPAT_PROJECT		0x2000
+#define EXT4_FEATURE_RO_COMPAT_VERITY		0x8000
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1756,6 +1792,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc,		BIGALLOC)
 EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum,	METADATA_CSUM)
 EXT4_FEATURE_RO_COMPAT_FUNCS(readonly,		READONLY)
 EXT4_FEATURE_RO_COMPAT_FUNCS(project,		PROJECT)
+EXT4_FEATURE_RO_COMPAT_FUNCS(verity,		VERITY)
 
 EXT4_FEATURE_INCOMPAT_FUNCS(compression,	COMPRESSION)
 EXT4_FEATURE_INCOMPAT_FUNCS(filetype,		FILETYPE)
@@ -1813,7 +1850,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold,		CASEFOLD)
 					 EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
 					 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
 					 EXT4_FEATURE_RO_COMPAT_QUOTA |\
-					 EXT4_FEATURE_RO_COMPAT_PROJECT)
+					 EXT4_FEATURE_RO_COMPAT_PROJECT |\
+					 EXT4_FEATURE_RO_COMPAT_VERITY)
 
 #define EXTN_FEATURE_FUNCS(ver) \
 static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -3177,6 +3215,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
 extern int ext4_mpage_readpages(struct address_space *mapping,
 				struct list_head *pages, struct page *page,
 				unsigned nr_pages, bool is_readahead);
+extern int __init ext4_init_post_read_processing(void);
+extern void ext4_exit_post_read_processing(void);
 
 /* symlink.c */
 extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
@@ -3245,6 +3285,9 @@ extern int ext4_ext_check_inode(struct inode *inode);
 extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       __u64 start, __u64 len);
+extern int ext4_get_es_cache(struct inode *inode,
+			     struct fiemap_extent_info *fieinfo,
+			     __u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
 extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
 extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
@@ -3283,6 +3326,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
 
+/* verity.c */
+extern const struct fsverity_operations ext4_verityops;
+
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
@@ -3334,6 +3380,19 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 
 extern const struct iomap_ops ext4_iomap_ops;
 
+static inline int ext4_buffer_uptodate(struct buffer_head *bh)
+{
+	/*
+	 * If the buffer has the write error flag, we have failed
+	 * to write out data in the block. In this case, we don't
+	 * have to read the block because we may read the old data
+	 * successfully.
+	 */
+	if (!buffer_uptodate(bh) && buffer_write_io_error(bh))
+		set_buffer_uptodate(bh);
+	return buffer_uptodate(bh);
+}
+
 #endif	/* __KERNEL__ */
 
 #define EFSBADCRC	EBADMSG		/* Bad CRC detected */
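The timestamp hunks above are easy to sanity-check by hand: the two epoch bits stored in the _extra field extend the on-disk 32-bit seconds, giving the EXT4_EXTRA_TIMESTAMP_MAX limit defined in the hunk. A standalone userspace sketch (decode_sec is a hypothetical helper mirroring the simplified decode path the patch keeps, not an ext4 function):

#include <stdint.h>
#include <stdio.h>

#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)

static int64_t decode_sec(int32_t raw_sec, uint32_t extra)
{
	int64_t sec = raw_sec;	/* sign-extended 32-bit on-disk seconds */

	/* the low two bits of "extra" extend tv_sec beyond 2038 */
	sec += (int64_t)(extra & EXT4_EPOCH_MASK) << 32;
	return sec;
}

int main(void)
{
	/* largest encodable time: epoch bits 3, raw seconds INT32_MAX */
	printf("max = %lld\n", (long long)decode_sec(INT32_MAX, 3));
	/* prints 15032385535 = ((1 << 34) - 1) + INT32_MIN, matching
	 * EXT4_EXTRA_TIMESTAMP_MAX in the hunk above; inodes without the
	 * extra field clamp to INT32_MAX, as EXT4_INODE_SET_XTIME now does */
	return 0;
}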
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92266a2da7d6..fb0f99dc8c22 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2315,6 +2315,52 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 	return err;
 }
 
+static int ext4_fill_es_cache_info(struct inode *inode,
+				   ext4_lblk_t block, ext4_lblk_t num,
+				   struct fiemap_extent_info *fieinfo)
+{
+	ext4_lblk_t next, end = block + num - 1;
+	struct extent_status es;
+	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
+	unsigned int flags;
+	int err;
+
+	while (block <= end) {
+		next = 0;
+		flags = 0;
+		if (!ext4_es_lookup_extent(inode, block, &next, &es))
+			break;
+		if (ext4_es_is_unwritten(&es))
+			flags |= FIEMAP_EXTENT_UNWRITTEN;
+		if (ext4_es_is_delayed(&es))
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
+		if (ext4_es_is_hole(&es))
+			flags |= EXT4_FIEMAP_EXTENT_HOLE;
+		if (next == 0)
+			flags |= FIEMAP_EXTENT_LAST;
+		if (flags & (FIEMAP_EXTENT_DELALLOC|
+			     EXT4_FIEMAP_EXTENT_HOLE))
+			es.es_pblk = 0;
+		else
+			es.es_pblk = ext4_es_pblock(&es);
+		err = fiemap_fill_next_extent(fieinfo,
+				(__u64)es.es_lblk << blksize_bits,
+				(__u64)es.es_pblk << blksize_bits,
+				(__u64)es.es_len << blksize_bits,
+				flags);
+		if (next == 0)
+			break;
+		block = next;
+		if (err < 0)
+			return err;
+		if (err == 1)
+			return 0;
+	}
+	return 0;
+}
+
+
 /*
  * ext4_ext_determine_hole - determine hole around given block
  * @inode:	inode we lookup in
@@ -3813,8 +3859,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 	 * illegal.
 	 */
 	if (ee_block != map->m_lblk || ee_len > map->m_len) {
-#ifdef EXT4_DEBUG
-		ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+#ifdef CONFIG_EXT4_DEBUG
+		ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
 			     " len %u; IO logical block %llu, len %u",
 			     inode->i_ino, (unsigned long long)ee_block, ee_len,
 			     (unsigned long long)map->m_lblk, map->m_len);
@@ -5017,8 +5063,6 @@ static int ext4_find_delayed_extent(struct inode *inode,
 	return next_del;
 }
 
-/* fiemap flags we can handle specified here */
-#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
 
 static int ext4_xattr_fiemap(struct inode *inode,
 				struct fiemap_extent_info *fieinfo)
@@ -5055,10 +5099,16 @@ static int ext4_xattr_fiemap(struct inode *inode,
 	return (error < 0 ? error : 0);
 }
 
-int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		__u64 start, __u64 len)
+static int _ext4_fiemap(struct inode *inode,
+			struct fiemap_extent_info *fieinfo,
+			__u64 start, __u64 len,
+			int (*fill)(struct inode *, ext4_lblk_t,
+				    ext4_lblk_t,
+				    struct fiemap_extent_info *))
 {
 	ext4_lblk_t start_blk;
+	u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
+
 	int error = 0;
 
 	if (ext4_has_inline_data(inode)) {
@@ -5075,14 +5125,18 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		error = ext4_ext_precache(inode);
 		if (error)
 			return error;
+		fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
 	}
 
 	/* fallback to generic here if not in extents fmt */
-	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
+	    fill == ext4_fill_fiemap_extents)
 		return generic_block_fiemap(inode, fieinfo, start, len,
 			ext4_get_block);
 
-	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+	if (fill == ext4_fill_es_cache_info)
+		ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
+	if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
 		return -EBADR;
 
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
@@ -5101,12 +5155,36 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 * Walk the extent tree gathering extent information
 		 * and pushing extents back to the user.
 		 */
-		error = ext4_fill_fiemap_extents(inode, start_blk,
-						 len_blks, fieinfo);
+		error = fill(inode, start_blk, len_blks, fieinfo);
 	}
 	return error;
 }
 
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	return _ext4_fiemap(inode, fieinfo, start, len,
+			    ext4_fill_fiemap_extents);
+}
+
+int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		      __u64 start, __u64 len)
+{
+	if (ext4_has_inline_data(inode)) {
+		int has_inline;
+
+		down_read(&EXT4_I(inode)->xattr_sem);
+		has_inline = ext4_has_inline_data(inode);
+		up_read(&EXT4_I(inode)->xattr_sem);
+		if (has_inline)
+			return 0;
+	}
+
+	return _ext4_fiemap(inode, fieinfo, start, len,
+			    ext4_fill_es_cache_info);
+}
+
+
 /*
  * ext4_access_path:
  * Function to access the path buffer for marking it dirty.
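ext4_get_es_cache() above is reached through the new EXT4_IOC_GET_ES_CACHE ioctl declared in ext4.h, which deliberately reuses the fiemap structures. A hypothetical userspace sketch of how the interface could be exercised — the ioctl number and the hole flag come from the ext4.h hunk, everything else is illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fiemap.h>

#define EXT4_IOC_GET_ES_CACHE	_IOWR('f', 42, struct fiemap)
#define EXT4_FIEMAP_EXTENT_HOLE	0x08000000

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	/* room for 32 extents after the fixed header */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;		/* whole file */
	fm->fm_extent_count = 32;
	if (ioctl(fd, EXT4_IOC_GET_ES_CACHE, fm) < 0) {
		perror("EXT4_IOC_GET_ES_CACHE");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("lblk %llu len %llu%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_length,
		       (fe->fe_flags & EXT4_FIEMAP_EXTENT_HOLE) ?
				" (hole)" : "");
	}
	return 0;
}

Unlike plain FIEMAP, hole entries are reported explicitly here (with a zero physical block), since the point is to dump the extent status cache rather than the on-disk extent tree.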
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 7521de2dcf3a..d996b44d2265 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -146,7 +146,7 @@ static struct kmem_cache *ext4_pending_cachep;
 
 static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
-			      ext4_lblk_t end);
+			      ext4_lblk_t end, int *reserved);
 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 		       struct ext4_inode_info *locked_ei);
@@ -836,7 +836,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 	ext4_es_insert_extent_check(inode, &newes);
 
 	write_lock(&EXT4_I(inode)->i_es_lock);
-	err = __es_remove_extent(inode, lblk, end);
+	err = __es_remove_extent(inode, lblk, end, NULL);
 	if (err != 0)
 		goto error;
 retry:
@@ -899,6 +899,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
  * Return: 1 on found, 0 on not
 */
 int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+			  ext4_lblk_t *next_lblk,
 			  struct extent_status *es)
 {
 	struct ext4_es_tree *tree;
@@ -947,9 +948,18 @@ out:
 		es->es_pblk = es1->es_pblk;
 		if (!ext4_es_is_referenced(es1))
 			ext4_es_set_referenced(es1);
-		stats->es_stats_cache_hits++;
+		percpu_counter_inc(&stats->es_stats_cache_hits);
+		if (next_lblk) {
+			node = rb_next(&es1->rb_node);
+			if (node) {
+				es1 = rb_entry(node, struct extent_status,
+					       rb_node);
+				*next_lblk = es1->es_lblk;
+			} else
+				*next_lblk = 0;
+		}
 	} else {
-		stats->es_stats_cache_misses++;
+		percpu_counter_inc(&stats->es_stats_cache_misses);
 	}
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -958,8 +968,322 @@ out:
 	return found;
 }
 
+struct rsvd_count {
+	int ndelonly;
+	bool first_do_lblk_found;
+	ext4_lblk_t first_do_lblk;
+	ext4_lblk_t last_do_lblk;
+	struct extent_status *left_es;
+	bool partial;
+	ext4_lblk_t lclu;
+};
+
+/*
+ * init_rsvd - initialize reserved count data before removing block range
+ *	       in file from extent status tree
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @es - pointer to first extent in range
+ * @rc - pointer to reserved count data
+ *
+ * Assumes es is not NULL
+ */
+static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
+		      struct extent_status *es, struct rsvd_count *rc)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct rb_node *node;
+
+	rc->ndelonly = 0;
+
+	/*
+	 * for bigalloc, note the first delonly block in the range has not
+	 * been found, record the extent containing the block to the left of
+	 * the region to be removed, if any, and note that there's no partial
+	 * cluster to track
+	 */
+	if (sbi->s_cluster_ratio > 1) {
+		rc->first_do_lblk_found = false;
+		if (lblk > es->es_lblk) {
+			rc->left_es = es;
+		} else {
+			node = rb_prev(&es->rb_node);
+			rc->left_es = node ? rb_entry(node,
+						      struct extent_status,
+						      rb_node) : NULL;
+		}
+		rc->partial = false;
+	}
+}
+
+/*
+ * count_rsvd - count the clusters containing delayed and not unwritten
+ *		(delonly) blocks in a range within an extent and add to
+ *		the running tally in rsvd_count
+ *
+ * @inode - file containing extent
+ * @lblk - first block in range
+ * @len - length of range in blocks
+ * @es - pointer to extent containing clusters to be counted
+ * @rc - pointer to reserved count data
+ *
+ * Tracks partial clusters found at the beginning and end of extents so
+ * they aren't overcounted when they span adjacent extents
+ */
+static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
+		       struct extent_status *es, struct rsvd_count *rc)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t i, end, nclu;
+
+	if (!ext4_es_is_delonly(es))
+		return;
+
+	WARN_ON(len <= 0);
+
+	if (sbi->s_cluster_ratio == 1) {
+		rc->ndelonly += (int) len;
+		return;
+	}
+
+	/* bigalloc */
+
+	i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
+	end = lblk + (ext4_lblk_t) len - 1;
+	end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
+
+	/* record the first block of the first delonly extent seen */
+	if (rc->first_do_lblk_found == false) {
+		rc->first_do_lblk = i;
+		rc->first_do_lblk_found = true;
+	}
+
+	/* update the last lblk in the region seen so far */
+	rc->last_do_lblk = end;
+
+	/*
+	 * if we're tracking a partial cluster and the current extent
+	 * doesn't start with it, count it and stop tracking
+	 */
+	if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
+		rc->ndelonly++;
+		rc->partial = false;
+	}
+
+	/*
+	 * if the first cluster doesn't start on a cluster boundary but
+	 * ends on one, count it
+	 */
+	if (EXT4_LBLK_COFF(sbi, i) != 0) {
+		if (end >= EXT4_LBLK_CFILL(sbi, i)) {
+			rc->ndelonly++;
+			rc->partial = false;
+			i = EXT4_LBLK_CFILL(sbi, i) + 1;
+		}
+	}
+
+	/*
+	 * if the current cluster starts on a cluster boundary, count the
+	 * number of whole delonly clusters in the extent
+	 */
+	if ((i + sbi->s_cluster_ratio - 1) <= end) {
+		nclu = (end - i + 1) >> sbi->s_cluster_bits;
+		rc->ndelonly += nclu;
+		i += nclu << sbi->s_cluster_bits;
+	}
+
+	/*
+	 * start tracking a partial cluster if there's a partial at the end
+	 * of the current extent and we're not already tracking one
+	 */
+	if (!rc->partial && i <= end) {
+		rc->partial = true;
+		rc->lclu = EXT4_B2C(sbi, i);
+	}
+}
+
+/*
+ * __pr_tree_search - search for a pending cluster reservation
+ *
+ * @root - root of pending reservation tree
+ * @lclu - logical cluster to search for
+ *
+ * Returns the pending reservation for the cluster identified by @lclu
+ * if found.  If not, returns a reservation for the next cluster if any,
+ * and if not, returns NULL.
+ */
+static struct pending_reservation *__pr_tree_search(struct rb_root *root,
+						    ext4_lblk_t lclu)
+{
+	struct rb_node *node = root->rb_node;
+	struct pending_reservation *pr = NULL;
+
+	while (node) {
+		pr = rb_entry(node, struct pending_reservation, rb_node);
+		if (lclu < pr->lclu)
+			node = node->rb_left;
+		else if (lclu > pr->lclu)
+			node = node->rb_right;
+		else
+			return pr;
+	}
+	if (pr && lclu < pr->lclu)
+		return pr;
+	if (pr && lclu > pr->lclu) {
+		node = rb_next(&pr->rb_node);
+		return node ? rb_entry(node, struct pending_reservation,
+				       rb_node) : NULL;
+	}
+	return NULL;
+}
+
+/*
+ * get_rsvd - calculates and returns the number of cluster reservations to be
+ *	      released when removing a block range from the extent status tree
+ *	      and releases any pending reservations within the range
+ *
+ * @inode - file containing block range
+ * @end - last block in range
+ * @right_es - pointer to extent containing next block beyond end or NULL
+ * @rc - pointer to reserved count data
+ *
+ * The number of reservations to be released is equal to the number of
+ * clusters containing delayed and not unwritten (delonly) blocks within
+ * the range, minus the number of clusters still containing delonly blocks
+ * at the ends of the range, and minus the number of pending reservations
+ * within the range.
+ */
+static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
+			     struct extent_status *right_es,
+			     struct rsvd_count *rc)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct pending_reservation *pr;
+	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
+	struct rb_node *node;
+	ext4_lblk_t first_lclu, last_lclu;
+	bool left_delonly, right_delonly, count_pending;
+	struct extent_status *es;
+
+	if (sbi->s_cluster_ratio > 1) {
+		/* count any remaining partial cluster */
+		if (rc->partial)
+			rc->ndelonly++;
+
+		if (rc->ndelonly == 0)
+			return 0;
+
+		first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
+		last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
+
+		/*
+		 * decrease the delonly count by the number of clusters at the
+		 * ends of the range that still contain delonly blocks -
+		 * these clusters still need to be reserved
+		 */
+		left_delonly = right_delonly = false;
+
+		es = rc->left_es;
+		while (es && ext4_es_end(es) >=
+		       EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
+			if (ext4_es_is_delonly(es)) {
+				rc->ndelonly--;
+				left_delonly = true;
+				break;
+			}
+			node = rb_prev(&es->rb_node);
+			if (!node)
+				break;
+			es = rb_entry(node, struct extent_status, rb_node);
+		}
+		if (right_es && (!left_delonly || first_lclu != last_lclu)) {
+			if (end < ext4_es_end(right_es)) {
+				es = right_es;
+			} else {
+				node = rb_next(&right_es->rb_node);
+				es = node ? rb_entry(node, struct extent_status,
+						     rb_node) : NULL;
+			}
+			while (es && es->es_lblk <=
+			       EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
+				if (ext4_es_is_delonly(es)) {
+					rc->ndelonly--;
+					right_delonly = true;
+					break;
+				}
+				node = rb_next(&es->rb_node);
+				if (!node)
+					break;
+				es = rb_entry(node, struct extent_status,
+					      rb_node);
+			}
+		}
+
+		/*
+		 * Determine the block range that should be searched for
+		 * pending reservations, if any.  Clusters on the ends of the
+		 * original removed range containing delonly blocks are
+		 * excluded.  They've already been accounted for and it's not
+		 * possible to determine if an associated pending reservation
+		 * should be released with the information available in the
+		 * extents status tree.
+		 */
+		if (first_lclu == last_lclu) {
+			if (left_delonly | right_delonly)
+				count_pending = false;
+			else
+				count_pending = true;
+		} else {
+			if (left_delonly)
+				first_lclu++;
+			if (right_delonly)
+				last_lclu--;
+			if (first_lclu <= last_lclu)
+				count_pending = true;
+			else
+				count_pending = false;
+		}
+
+		/*
+		 * a pending reservation found between first_lclu and last_lclu
+		 * represents an allocated cluster that contained at least one
+		 * delonly block, so the delonly total must be reduced by one
+		 * for each pending reservation found and released
+		 */
+		if (count_pending) {
+			pr = __pr_tree_search(&tree->root, first_lclu);
+			while (pr && pr->lclu <= last_lclu) {
+				rc->ndelonly--;
+				node = rb_next(&pr->rb_node);
+				rb_erase(&pr->rb_node, &tree->root);
+				kmem_cache_free(ext4_pending_cachep, pr);
+				if (!node)
+					break;
+				pr = rb_entry(node, struct pending_reservation,
+					      rb_node);
+			}
+		}
+	}
+	return rc->ndelonly;
+}
+
+
+/*
+ * __es_remove_extent - removes block range from extent status tree
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @end - last block in range
+ * @reserved - number of cluster reservations released
+ *
+ * If @reserved is not NULL and delayed allocation is enabled, counts
+ * block/cluster reservations freed by removing range and if bigalloc
+ * enabled cancels pending reservations as needed. Returns 0 on success,
+ * error code on failure.
+ */
 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
-			      ext4_lblk_t end)
+			      ext4_lblk_t end, int *reserved)
 {
 	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 	struct rb_node *node;
@@ -968,9 +1292,14 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	ext4_lblk_t len1, len2;
 	ext4_fsblk_t block;
 	int err;
+	bool count_reserved = true;
+	struct rsvd_count rc;
 
+	if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
+		count_reserved = false;
 retry:
 	err = 0;
+
 	es = __es_tree_search(&tree->root, lblk);
 	if (!es)
 		goto out;
@@ -979,6 +1308,8 @@ retry:
 
 	/* Simply invalidate cache_es. */
 	tree->cache_es = NULL;
+	if (count_reserved)
+		init_rsvd(inode, lblk, es, &rc);
 
 	orig_es.es_lblk = es->es_lblk;
 	orig_es.es_len = es->es_len;
@@ -1020,10 +1351,16 @@ retry:
 			ext4_es_store_pblock(es, block);
 		}
 	}
+	if (count_reserved)
+		count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
+			   &orig_es, &rc);
 		goto out;
 	}
 
 	if (len1 > 0) {
+		if (count_reserved)
+			count_rsvd(inode, lblk, orig_es.es_len - len1,
+				   &orig_es, &rc);
 		node = rb_next(&es->rb_node);
 		if (node)
 			es = rb_entry(node, struct extent_status, rb_node);
@@ -1032,6 +1369,8 @@ retry:
 	}
 
 	while (es && ext4_es_end(es) <= end) {
+		if (count_reserved)
+			count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
 		node = rb_next(&es->rb_node);
 		rb_erase(&es->rb_node, &tree->root);
 		ext4_es_free_extent(inode, es);
@@ -1046,6 +1385,9 @@ retry:
 		ext4_lblk_t orig_len = es->es_len;
 
 		len1 = ext4_es_end(es) - end;
+		if (count_reserved)
+			count_rsvd(inode, es->es_lblk, orig_len - len1,
+				   es, &rc);
 		es->es_lblk = end + 1;
 		es->es_len = len1;
 		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
@@ -1054,20 +1396,28 @@ retry:
 		}
 	}
 
+	if (count_reserved)
+		*reserved = get_rsvd(inode, end, es, &rc);
 out:
 	return err;
 }
 
 /*
- * ext4_es_remove_extent() removes a space from a extent status tree.
+ * ext4_es_remove_extent - removes block range from extent status tree
 *
- * Return 0 on success, error code on failure.
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @len - number of blocks to remove
+ *
+ * Reduces block/cluster reservation count and for bigalloc cancels pending
+ * reservations as needed. Returns 0 on success, error code on failure.
 */
 int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 			  ext4_lblk_t len)
 {
 	ext4_lblk_t end;
 	int err = 0;
+	int reserved = 0;
 
 	trace_ext4_es_remove_extent(inode, lblk, len);
 	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
@@ -1085,9 +1435,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	 * is reclaimed.
 	 */
 	write_lock(&EXT4_I(inode)->i_es_lock);
-	err = __es_remove_extent(inode, lblk, end);
+	err = __es_remove_extent(inode, lblk, end, &reserved);
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 	ext4_es_print_tree(inode);
+	ext4_da_release_space(inode, reserved);
 	return err;
 }
 
@@ -1235,9 +1586,9 @@ int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
 		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
 		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
-	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
-		   es_stats->es_stats_cache_hits,
-		   es_stats->es_stats_cache_misses);
+	seq_printf(seq, "  %lld/%lld cache hits/misses\n",
+		   percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
+		   percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
 	if (inode_cnt)
 		seq_printf(seq, "  %d inodes on list\n", inode_cnt);
 
@@ -1264,35 +1615,46 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 	sbi->s_es_nr_inode = 0;
 	spin_lock_init(&sbi->s_es_lock);
 	sbi->s_es_stats.es_stats_shrunk = 0;
-	sbi->s_es_stats.es_stats_cache_hits = 0;
-	sbi->s_es_stats.es_stats_cache_misses = 0;
+	err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
+				  GFP_KERNEL);
+	if (err)
+		return err;
+	err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
+				  GFP_KERNEL);
+	if (err)
+		goto err1;
 	sbi->s_es_stats.es_stats_scan_time = 0;
 	sbi->s_es_stats.es_stats_max_scan_time = 0;
 	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
 	if (err)
-		return err;
+		goto err2;
 	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
 	if (err)
-		goto err1;
+		goto err3;
 
 	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
 	sbi->s_es_shrinker.count_objects = ext4_es_count;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	err = register_shrinker(&sbi->s_es_shrinker);
 	if (err)
-		goto err2;
+		goto err4;
 
 	return 0;
-
-err2:
+err4:
 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
-err1:
+err3:
 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+err2:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
+err1:
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
 	return err;
 }
 
 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
+	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
 	unregister_shrinker(&sbi->s_es_shrinker);
@@ -1317,6 +1679,7 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
 	es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
 	if (!es)
 		goto out_wrap;
+
 	while (*nr_to_scan > 0) {
 		if (es->es_lblk > end) {
 			ei->i_es_shrink_lblk = end + 1;
@@ -1374,6 +1737,34 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
 	return nr_shrunk;
 }
 
+/*
+ * Called to support EXT4_IOC_CLEAR_ES_CACHE.  We can only remove
+ * discretionary entries from the extent status cache.  (Some entries
+ * must be present for proper operations.)
+ */
+void ext4_clear_inode_es(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct extent_status *es;
+	struct ext4_es_tree *tree;
+	struct rb_node *node;
+
+	write_lock(&ei->i_es_lock);
+	tree = &EXT4_I(inode)->i_es_tree;
+	tree->cache_es = NULL;
+	node = rb_first(&tree->root);
+	while (node) {
+		es = rb_entry(node, struct extent_status, rb_node);
+		node = rb_next(node);
+		if (!ext4_es_is_delayed(es)) {
+			rb_erase(&es->rb_node, &tree->root);
+			ext4_es_free_extent(inode, es);
+		}
+	}
+	ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+	write_unlock(&ei->i_es_lock);
+}
+
 #ifdef ES_DEBUG__
 static void ext4_print_pending_tree(struct inode *inode)
 {
@@ -1590,7 +1981,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
 
 	write_lock(&EXT4_I(inode)->i_es_lock);
 
-	err = __es_remove_extent(inode, lblk, lblk);
+	err = __es_remove_extent(inode, lblk, lblk, NULL);
 	if (err != 0)
 		goto error;
 retry:
@@ -1779,93 +2170,3 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
 			__remove_pending(inode, last);
 	}
 }
-
-/*
- * ext4_es_remove_blks - remove block range from extents status tree and
- *                       reduce reservation count or cancel pending
- *                       reservation as needed
- *
- * @inode - file containing range
- * @lblk - first block in range
- * @len - number of blocks to remove
- *
- */
-void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
-			 ext4_lblk_t len)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	unsigned int clu_size, reserved = 0;
-	ext4_lblk_t last_lclu, first, length, remainder, last;
-	bool delonly;
-	int err = 0;
-	struct pending_reservation *pr;
-	struct ext4_pending_tree *tree;
-
-	/*
-	 * Process cluster by cluster for bigalloc - there may be up to
-	 * two clusters in a 4k page with a 1k block size and two blocks
-	 * per cluster. Also necessary for systems with larger page sizes
-	 * and potentially larger block sizes.
-	 */
-	clu_size = sbi->s_cluster_ratio;
-	last_lclu = EXT4_B2C(sbi, lblk + len - 1);
-
-	write_lock(&EXT4_I(inode)->i_es_lock);
-
-	for (first = lblk, remainder = len;
-	     remainder > 0;
-	     first += length, remainder -= length) {
-
-		if (EXT4_B2C(sbi, first) == last_lclu)
-			length = remainder;
-		else
-			length = clu_size - EXT4_LBLK_COFF(sbi, first);
-
-		/*
-		 * The BH_Delay flag, which triggers calls to this function,
-		 * and the contents of the extents status tree can be
-		 * inconsistent due to writepages activity. So, note whether
-		 * the blocks to be removed actually belong to an extent with
-		 * delayed only status.
-		 */
-		delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first);
-
-		/*
-		 * because of the writepages effect, written and unwritten
-		 * blocks could be removed here
-		 */
-		last = first + length - 1;
-		err = __es_remove_extent(inode, first, last);
-		if (err)
-			ext4_warning(inode->i_sb,
-				     "%s: couldn't remove page (err = %d)",
-				     __func__, err);
-
-		/* non-bigalloc case: simply count the cluster for release */
-		if (sbi->s_cluster_ratio == 1 && delonly) {
-			reserved++;
-			continue;
-		}
-
-		/*
-		 * bigalloc case: if all delayed allocated only blocks have
-		 * just been removed from a cluster, either cancel a pending
-		 * reservation if it exists or count a cluster for release
-		 */
-		if (delonly &&
-		    !__es_scan_clu(inode, &ext4_es_is_delonly, first)) {
-			pr = __get_pending(inode, EXT4_B2C(sbi, first));
-			if (pr != NULL) {
-				tree = &EXT4_I(inode)->i_pending_tree;
-				rb_erase(&pr->rb_node, &tree->root);
-				kmem_cache_free(ext4_pending_cachep, pr);
-			} else {
-				reserved++;
-			}
-		}
-	}
-
-	write_unlock(&EXT4_I(inode)->i_es_lock);
-
-	ext4_da_release_space(inode, reserved);
-}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 131a8b7df265..825313c59752 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -70,8 +70,8 @@ struct ext4_es_tree {
 
 struct ext4_es_stats {
 	unsigned long es_stats_shrunk;
-	unsigned long es_stats_cache_hits;
-	unsigned long es_stats_cache_misses;
+	struct percpu_counter es_stats_cache_hits;
+	struct percpu_counter es_stats_cache_misses;
 	u64 es_stats_scan_time;
 	u64 es_stats_max_scan_time;
 	struct percpu_counter es_stats_all_cnt;
@@ -140,6 +140,7 @@ extern void ext4_es_find_extent_range(struct inode *inode,
 				      ext4_lblk_t lblk, ext4_lblk_t end,
 				      struct extent_status *es);
 extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+				 ext4_lblk_t *next_lblk,
 				 struct extent_status *es);
 extern bool ext4_es_scan_range(struct inode *inode,
 			       int (*matching_fn)(struct extent_status *es),
@@ -246,7 +247,6 @@ extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
 					bool allocated);
 extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
 					ext4_lblk_t len);
-extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
-				ext4_lblk_t len);
+extern void ext4_clear_inode_es(struct inode *inode);
 
 #endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 70b0438dbc94..8d2bbcc2d813 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -230,8 +230,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (IS_DAX(inode))
 		return ext4_dax_write_iter(iocb, from);
 #endif
-	if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
-		return -EOPNOTSUPP;
 
 	if (!inode_trylock(inode)) {
 		if (iocb->ki_flags & IOCB_NOWAIT)
@@ -457,6 +455,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 	if (ret)
 		return ret;
 
+	ret = fsverity_file_open(inode, filp);
+	if (ret)
+		return ret;
+
 	/*
	 * Set up the jbd2_inode if we are opening the inode for
	 * writing and the journal is present
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index d358bfcb6b3f..3e133793a5a3 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -280,7 +280,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
 	unsigned char *buff;
 	struct qstr qstr = {.name = name, .len = len };
 
-	if (len && IS_CASEFOLDED(dir)) {
+	if (len && IS_CASEFOLDED(dir) && um) {
 		buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
 		if (!buff)
 			return -ENOMEM;
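count_rsvd() in the extents_status.c changes above leans on the new EXT4_LBLK_CFILL() helper, together with EXT4_B2C() and EXT4_LBLK_COFF(), to walk a delayed-only extent cluster by cluster. A standalone arithmetic sketch, assuming a hypothetical bigalloc ratio of 16 blocks per cluster (the kernel macros are reduced to plain macros of that constant; none of these names are from the patch):

#include <stdio.h>

#define CLUSTER_RATIO	16U			/* hypothetical blocks/cluster */
#define B2C(b)		((b) / CLUSTER_RATIO)	/* block -> logical cluster */
#define LBLK_COFF(b)	((b) & (CLUSTER_RATIO - 1))	/* offset in cluster */
#define LBLK_CFILL(b)	((b) | (CLUSTER_RATIO - 1))	/* last blk of cluster */

int main(void)
{
	unsigned int i = 21, end = 70;	/* a delonly extent of blocks 21..70 */
	unsigned int nclu = 0;

	/* leading partial cluster: starts mid-cluster but is fully covered */
	if (LBLK_COFF(i) && end >= LBLK_CFILL(i)) {
		nclu++;				/* cluster 1 (blocks 16..31) */
		i = LBLK_CFILL(i) + 1;		/* advance to block 32 */
	}
	/* whole clusters fully inside the extent */
	if (i + CLUSTER_RATIO - 1 <= end)
		nclu += (end - i + 1) / CLUSTER_RATIO;	/* clusters 2 and 3 */
	/* blocks 64..70 remain as a trailing partial, tracked separately so
	 * it isn't double-counted if the next extent continues the cluster */
	printf("clusters counted so far: %u\n", nclu);	/* prints 3 */
	return 0;
}

That trailing-partial bookkeeping (rc->partial / rc->lclu) is exactly what lets adjacent extents sharing a cluster be tallied once rather than twice.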
88cdf3c90bd1..2fec62d764fa 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1416,7 +1416,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); if (err) { - count = err; + ret = err; goto out; } count++; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 420fe3deed39..516faa280ced 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -527,7 +527,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, return -EFSCORRUPTED; /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -695,7 +695,7 @@ found: * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) goto out_sem; } @@ -1024,7 +1024,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; - if (!bh || buffer_uptodate(bh)) + if (!bh || ext4_buffer_uptodate(bh)) return bh; ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); @@ -1051,7 +1051,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ - if (bhs[i] && !buffer_uptodate(bhs[i])) + if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bhs[i]); @@ -1340,6 +1340,9 @@ retry_journal: } if (ret) { + bool extended = (pos + len > inode->i_size) && + !ext4_verity_in_progress(inode); + unlock_page(page); /* * __block_write_begin may have instantiated a few blocks @@ -1349,11 +1352,11 @@ retry_journal: * Add inode to orphan list in case we crash before * truncate finishes */ - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); - if (pos + len > inode->i_size) { + if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might @@ -1406,6 +1409,7 @@ static int ext4_write_end(struct file *file, int ret = 0, ret2; int i_size_changed = 0; int inline_data = ext4_has_inline_data(inode); + bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (inline_data) { @@ -1423,12 +1427,16 @@ static int ext4_write_end(struct file *file, /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. + * + * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree + * blocks are being written past EOF, so skip the i_size update. */ - i_size_changed = ext4_update_inode_size(inode, pos + copied); + if (!verity) + i_size_changed = ext4_update_inode_size(inode, pos + copied); unlock_page(page); put_page(page); - if (old_size < pos) + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. 
First, it unnecessarily @@ -1439,7 +1447,7 @@ static int ext4_write_end(struct file *file, if (i_size_changed || inline_data) ext4_mark_inode_dirty(handle, inode); - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1450,7 +1458,7 @@ errout: if (!ret) ret = ret2; - if (pos + len > inode->i_size) { + if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be @@ -1511,6 +1519,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; int size_changed = 0; int inline_data = ext4_has_inline_data(inode); + bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); @@ -1540,13 +1549,14 @@ static int ext4_journalled_write_end(struct file *file, if (!partial) SetPageUptodate(page); } - size_changed = ext4_update_inode_size(inode, pos + copied); + if (!verity) + size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; unlock_page(page); put_page(page); - if (old_size < pos) + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); if (size_changed || inline_data) { @@ -1555,7 +1565,7 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. 
So truncate them @@ -1566,7 +1576,7 @@ errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - if (pos + len > inode->i_size) { + if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be @@ -1646,49 +1656,6 @@ void ext4_da_release_space(struct inode *inode, int to_free) dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } -static void ext4_da_page_release_reservation(struct page *page, - unsigned int offset, - unsigned int length) -{ - int contiguous_blks = 0; - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - struct inode *inode = page->mapping->host; - unsigned int stop = offset + length; - ext4_fsblk_t lblk; - - BUG_ON(stop > PAGE_SIZE || stop < length); - - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - - if (next_off > stop) - break; - - if ((offset <= curr_off) && (buffer_delay(bh))) { - contiguous_blks++; - clear_buffer_delay(bh); - } else if (contiguous_blks) { - lblk = page->index << - (PAGE_SHIFT - inode->i_blkbits); - lblk += (curr_off >> inode->i_blkbits) - - contiguous_blks; - ext4_es_remove_blks(inode, lblk, contiguous_blks); - contiguous_blks = 0; - } - curr_off = next_off; - } while ((bh = bh->b_this_page) != head); - - if (contiguous_blks) { - lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); - lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_blks(inode, lblk, contiguous_blks); - } - -} - /* * Delayed allocation stuff */ @@ -1868,7 +1835,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, &es)) { + if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); @@ -2162,7 +2129,8 @@ static int ext4_writepage(struct page *page, trace_ext4_writepage(page); size = i_size_read(inode); - if (page->index == size >> PAGE_SHIFT) + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; @@ -2246,7 +2214,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) * after page tables are updated. */ size = i_size_read(mpd->inode); - if (page->index == size >> PAGE_SHIFT) + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; @@ -2345,6 +2314,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; + if (ext4_verity_in_progress(inode)) + blocks = EXT_MAX_BLOCKS; + do { BUG_ON(buffer_locked(bh)); @@ -2785,15 +2757,6 @@ static int ext4_writepages(struct address_space *mapping, goto out_writepages; } - if (ext4_should_dioread_nolock(inode)) { - /* - * We may need to convert up to one extent per block in - * the page and we may dirty the inode. - */ - rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, - PAGE_SIZE >> inode->i_blkbits); - } - /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so @@ -2812,6 +2775,15 @@ static int ext4_writepages(struct address_space *mapping, ext4_journal_stop(handle); } + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert up to one extent per block in + * the page and we may dirty the inode. 
+ */ + rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, + PAGE_SIZE >> inode->i_blkbits); + } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; @@ -3061,8 +3033,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb) || - S_ISLNK(inode->i_mode)) { + if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) || + ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -3227,24 +3199,6 @@ static int ext4_da_write_end(struct file *file, return ret ? ret : copied; } -static void ext4_da_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - /* - * Drop reserved blocks - */ - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - goto out; - - ext4_da_page_release_reservation(page, offset, length); - -out: - ext4_invalidatepage(page, offset, length); - - return; -} - /* * Force all delayed allocation blocks to be allocated for a given inode. */ @@ -3897,6 +3851,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return 0; #endif + if (fsverity_active(inode)) + return 0; /* * If we are doing data journalling we don't support O_DIRECT @@ -3985,7 +3941,7 @@ static const struct address_space_operations ext4_da_aops = { .write_end = ext4_da_write_end, .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, - .invalidatepage = ext4_da_invalidatepage, + .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, @@ -4297,6 +4253,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + if (ext4_has_inline_data(inode)) { + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_convert_inline_data(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); + if (ret) + return ret; + } + /* * Write out all dirty pages to avoid race conditions * Then release them. 
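The recurring "!verity" tests added to ext4_write_end() and ext4_journalled_write_end() above all guard one invariant: while FS_IOC_ENABLE_VERITY is running, ext4 deliberately writes Merkle-tree pages beyond i_size (see fs/ext4/verity.c later in this diff), so a write landing past EOF must neither advance i_size nor be cleaned up as a failed short write. The same reasoning drives the EXT_MAX_BLOCKS override in mpage_process_page_bufs() and the length changes in ext4_writepage()/mpage_submit_page(): pages wholly past i_size are normally skipped or zero-tailed, but during verity construction they carry tree data and must be written out in full. A minimal standalone sketch of the predicate, using hypothetical stand-in types rather than the real kernel structures:

/*
 * Illustration only, not kernel code: the effect of the "!verity"
 * guards in the hunks above.  "demo_inode" and the helper below are
 * hypothetical stand-ins.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct demo_inode {
	uint64_t i_size;
	bool verity_in_progress;  /* EXT4_STATE_VERITY_IN_PROGRESS analogue */
};

/*
 * Mirrors "pos + len > inode->i_size && !verity": should blocks
 * allocated past EOF by this write be truncated away as leftovers of a
 * short copy, or are they in-flight Merkle-tree data to be kept?
 */
static bool demo_should_trim_past_eof(const struct demo_inode *inode,
				      uint64_t pos, uint64_t len)
{
	return pos + len > inode->i_size && !inode->verity_in_progress;
}

int main(void)
{
	struct demo_inode normal = { .i_size = 4096, .verity_in_progress = false };
	struct demo_inode verity = { .i_size = 4096, .verity_in_progress = true };

	/* A short copy past EOF is cleaned up on an ordinary file... */
	assert(demo_should_trim_past_eof(&normal, 4096, 100));
	/* ...but left alone mid-FS_IOC_ENABLE_VERITY: those are tree blocks. */
	assert(!demo_should_trim_past_eof(&verity, 4096, 100));
	return 0;
}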
@@ -4739,6 +4704,8 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; + if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) + return false; return true; } @@ -4763,9 +4730,11 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; + if (flags & EXT4_VERITY_FL) + new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| - S_ENCRYPTED|S_CASEFOLD); + S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, @@ -5119,6 +5088,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + ext4_error_inode(inode, function, line, 0, + "casefold flag without casefold feature"); brelse(iloc.bh); unlock_new_inode(inode); @@ -5555,6 +5527,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + error = fsverity_prepare_setattr(dentry, attr); + if (error) + return error; + if (is_quota_modification(inode, attr)) { error = dquot_initialize(inode); if (error) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 442f7ef873fc..0b7f316fd30f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -745,6 +745,74 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); } +/* copied from fs/ioctl.c */ +static int fiemap_check_ranges(struct super_block *sb, + u64 start, u64 len, u64 *new_len) +{ + u64 maxbytes = (u64) sb->s_maxbytes; + + *new_len = len; + + if (len == 0) + return -EINVAL; + + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (len > maxbytes || (maxbytes - len) < start) + *new_len = maxbytes - start; + + return 0; +} + +/* So that the fiemap access checks can't overflow on 32 bit machines. 
*/ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + +static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + u64 len; + int error; + + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, + &len); + if (error) + return error; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = ufiemap->fm_extents; + + if (fiemap.fm_extent_count != 0 && + !access_ok(fieinfo.fi_extents_start, + fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) + return -EFAULT; + + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + + error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1113,8 +1181,62 @@ resizefs_out: #endif } case EXT4_IOC_GET_ENCRYPTION_POLICY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_add_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key_all_users(filp, + (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); + + case EXT4_IOC_CLEAR_ES_CACHE: + { + if (!inode_owner_or_capable(inode)) + return -EACCES; + ext4_clear_inode_es(inode); + return 0; + } + + case EXT4_IOC_GETSTATE: + { + __u32 state = 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED)) + state |= EXT4_STATE_FLAG_EXT_PRECACHED; + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + state |= EXT4_STATE_FLAG_NEW; + if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + state |= EXT4_STATE_FLAG_NEWENTRY; + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) + state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE; + + return put_user(state, (__u32 __user *) arg); + } + + case EXT4_IOC_GET_ES_CACHE: + return ext4_ioctl_get_es_cache(filp, arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1171,6 +1293,17 @@ out: } case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); + + case FS_IOC_ENABLE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_enable(filp, (const void __user *)arg); + + case FS_IOC_MEASURE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_measure(filp, 
(void __user *)arg); + default: return -ENOTTY; } @@ -1231,8 +1364,18 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_SET_ENCRYPTION_POLICY: case EXT4_IOC_GET_ENCRYPTION_PWSALT: case EXT4_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: + case EXT4_IOC_CLEAR_ES_CACHE: + case EXT4_IOC_GETSTATE: + case EXT4_IOC_GET_ES_CACHE: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 129029534075..a427d2031a8d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1312,7 +1312,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, { int len; - if (!IS_CASEFOLDED(dir)) { + if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) { cf_name->name = NULL; return; } @@ -2183,7 +2183,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, #ifdef CONFIG_UNICODE if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && - utf8_validate(sbi->s_encoding, &dentry->d_name)) + sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) return -EINVAL; #endif diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c916017db334..a30b203fa461 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -47,13 +47,103 @@ #include "ext4.h" -static inline bool ext4_bio_encrypted(struct bio *bio) +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache *bio_post_read_ctx_cache; +static mempool_t *bio_post_read_ctx_pool; + +/* postprocessing steps for read bios */ +enum bio_post_read_step { + STEP_INITIAL = 0, + STEP_DECRYPT, + STEP_VERITY, +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct work_struct work; + unsigned int cur_step; + unsigned int enabled_steps; +}; + +static void __read_end_io(struct bio *bio) { -#ifdef CONFIG_FS_ENCRYPTION - return unlikely(bio->bi_private != NULL); -#else - return false; -#endif + struct page *page; + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, bio, iter_all) { + page = bv->bv_page; + + /* PG_error was set if any post_read step failed */ + if (bio->bi_status || PageError(page)) { + ClearPageUptodate(page); + /* will re-read again later */ + ClearPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx); + +static void decrypt_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fscrypt_decrypt_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void verity_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fsverity_verify_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx) +{ + /* + * We use different work queues for decryption and for verity because + * verity may require reading metadata pages that need decryption, and + * we shouldn't recurse to the same workqueue. 
+ */ + switch (++ctx->cur_step) { + case STEP_DECRYPT: + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { + INIT_WORK(&ctx->work, decrypt_work); + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + case STEP_VERITY: + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + default: + __read_end_io(ctx->bio); + } +} + +static bool bio_post_read_required(struct bio *bio) +{ + return bio->bi_private && !bio->bi_status; } /* @@ -70,30 +160,53 @@ static inline bool ext4_bio_encrypted(struct bio *bio) */ static void mpage_end_io(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + if (bio_post_read_required(bio)) { + struct bio_post_read_ctx *ctx = bio->bi_private; - if (ext4_bio_encrypted(bio)) { - if (bio->bi_status) { - fscrypt_release_ctx(bio->bi_private); - } else { - fscrypt_enqueue_decrypt_bio(bio->bi_private, bio); - return; - } + ctx->cur_step = STEP_INITIAL; + bio_post_read_processing(ctx); + return; } - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; + __read_end_io(bio); +} - if (!bio->bi_status) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); +static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + +static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, + struct bio *bio, + pgoff_t first_idx) +{ + unsigned int post_read_steps = 0; + struct bio_post_read_ctx *ctx = NULL; + + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) + post_read_steps |= 1 << STEP_DECRYPT; + + if (ext4_need_verity(inode, first_idx)) + post_read_steps |= 1 << STEP_VERITY; + + if (post_read_steps) { + ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->bio = bio; + ctx->enabled_steps = post_read_steps; + bio->bi_private = ctx; } + return ctx; +} - bio_put(bio); +static inline loff_t ext4_readpage_limit(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_VERITY) && + (IS_VERITY(inode) || ext4_verity_in_progress(inode))) + return inode->i_sb->s_maxbytes; + + return i_size_read(inode); } int ext4_mpage_readpages(struct address_space *mapping, @@ -141,7 +254,8 @@ int ext4_mpage_readpages(struct address_space *mapping, block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; - last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + last_block_in_file = (ext4_readpage_limit(inode) + + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; @@ -218,6 +332,9 @@ int ext4_mpage_readpages(struct address_space *mapping, zero_user_segment(page, first_hole << blkbits, PAGE_SIZE); if (first_hole == 0) { + if (ext4_need_verity(inode, page->index) && + !fsverity_verify_page(page)) + goto set_error_page; SetPageUptodate(page); unlock_page(page); goto next_page; @@ -241,18 +358,16 @@ int ext4_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - struct fscrypt_ctx *ctx = NULL; + struct bio_post_read_ctx *ctx; - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(GFP_NOFS); - if (IS_ERR(ctx)) - goto set_error_page; - } bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, 
BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); + if (!bio) + goto set_error_page; + ctx = get_bio_post_read_ctx(inode, bio, page->index); + if (IS_ERR(ctx)) { + bio_put(bio); + bio = NULL; goto set_error_page; } bio_set_dev(bio, bdev); @@ -293,3 +408,29 @@ int ext4_mpage_readpages(struct address_space *mapping, submit_bio(bio); return 0; } + +int __init ext4_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = + kmem_cache_create("ext4_bio_post_read_ctx", + sizeof(struct bio_post_read_ctx), 0, 0, NULL); + if (!bio_post_read_ctx_cache) + goto fail; + bio_post_read_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void ext4_exit_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4079605d437a..dd654e53ba3d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1107,6 +1107,9 @@ static int ext4_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); + if (!drop) + drop = fscrypt_drop_inode(inode); + trace_ext4_drop_inode(inode, drop); return drop; } @@ -1179,6 +1182,7 @@ void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); + fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, @@ -1874,6 +1878,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } else if (token == Opt_commit) { if (arg == 0) arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + else if (arg > INT_MAX / HZ) { + ext4_msg(sb, KERN_ERR, + "Invalid commit interval %d, " + "must be smaller than %d", + arg, INT_MAX / HZ); + return -1; + } sbi->s_commit_interval = HZ * arg; } else if (token == Opt_debug_want_extra_isize) { sbi->s_want_extra_isize = arg; @@ -4035,8 +4046,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_inode_size); goto failed_mount; } - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) - sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); + /* + * i_atime_extra is the last extra field available for [acm]times in + * struct ext4_inode. Checking for that field should suffice to ensure + * we have extra space for all three. 
+ */ + if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { + sb->s_time_gran = 1; + sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; + } else { + sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; + } + + sb->s_time_min = EXT4_TIMESTAMP_MIN; } sbi->s_desc_size = le16_to_cpu(es->s_desc_size); @@ -4272,6 +4296,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif +#ifdef CONFIG_FS_VERITY + sb->s_vop = &ext4_verityops; +#endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) @@ -4419,6 +4446,11 @@ no_journal: goto failed_mount_wq; } + if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); + goto failed_mount_wq; + } + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && !ext4_has_feature_encrypt(sb)) { ext4_set_feature_encrypt(sb); @@ -6095,6 +6127,10 @@ static int __init ext4_init_fs(void) err = ext4_init_pending(); if (err) + goto out7; + + err = ext4_init_post_read_processing(); + if (err) goto out6; err = ext4_init_pageio(); @@ -6135,8 +6171,10 @@ out3: out4: ext4_exit_pageio(); out5: - ext4_exit_pending(); + ext4_exit_post_read_processing(); out6: + ext4_exit_pending(); +out7: ext4_exit_es(); return err; @@ -6153,6 +6191,7 @@ static void __exit ext4_exit_fs(void) ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); + ext4_exit_post_read_processing(); ext4_exit_es(); ext4_exit_pending(); } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index b3cd7655a6ff..eb1efad0e20a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -242,6 +242,9 @@ EXT4_ATTR_FEATURE(encryption); #ifdef CONFIG_UNICODE EXT4_ATTR_FEATURE(casefold); #endif +#ifdef CONFIG_FS_VERITY +EXT4_ATTR_FEATURE(verity); +#endif EXT4_ATTR_FEATURE(metadata_csum_seed); static struct attribute *ext4_feat_attrs[] = { @@ -254,6 +257,9 @@ static struct attribute *ext4_feat_attrs[] = { #ifdef CONFIG_UNICODE ATTR_LIST(casefold), #endif +#ifdef CONFIG_FS_VERITY + ATTR_LIST(verity), +#endif ATTR_LIST(metadata_csum_seed), NULL, }; diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c new file mode 100644 index 000000000000..d0d8a9795dd6 --- /dev/null +++ b/fs/ext4/verity.c @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/verity.c: fs-verity support for ext4 + * + * Copyright 2019 Google LLC + */ + +/* + * Implementation of fsverity_operations for ext4. + * + * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past + * the end of the file, starting at the first 64K boundary beyond i_size. This + * approach works because (a) verity files are readonly, and (b) pages fully + * beyond i_size aren't visible to userspace but can be read/written internally + * by ext4 with only some relatively small changes to ext4. This approach + * avoids having to depend on the EA_INODE feature and on rearchitecturing + * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and + * to support encrypting xattrs. Note that the verity metadata *must* be + * encrypted when the file is, since it contains hashes of the plaintext data. + * + * Using a 64K boundary rather than a 4K one keeps things ready for + * architectures with 64K pages, and it doesn't necessarily waste space on-disk + * since there can be a hole between i_size and the start of the Merkle tree. 
+ */ + +#include <linux/quotaops.h> + +#include "ext4.h" +#include "ext4_extents.h" +#include "ext4_jbd2.h" + +static inline loff_t ext4_verity_metadata_pos(const struct inode *inode) +{ + return round_up(inode->i_size, 65536); +} + +/* + * Read some verity metadata from the inode. __vfs_read() can't be used because + * we need to read beyond i_size. + */ +static int pagecache_read(struct inode *inode, void *buf, size_t count, + loff_t pos) +{ + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *addr; + + page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + addr = kmap_atomic(page); + memcpy(buf, addr + offset_in_page(pos), n); + kunmap_atomic(addr); + + put_page(page); + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY. + * kernel_write() can't be used because the file descriptor is readonly. + */ +static int pagecache_write(struct inode *inode, const void *buf, size_t count, + loff_t pos) +{ + if (pos + count > inode->i_sb->s_maxbytes) + return -EFBIG; + + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *fsdata; + void *addr; + int res; + + res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, + &page, &fsdata); + if (res) + return res; + + addr = kmap_atomic(page); + memcpy(addr + offset_in_page(pos), buf, n); + kunmap_atomic(addr); + + res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, + page, fsdata); + if (res < 0) + return res; + if (res != n) + return -EIO; + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +static int ext4_begin_enable_verity(struct file *filp) +{ + struct inode *inode = file_inode(filp); + const int credits = 2; /* superblock and inode for ext4_orphan_add() */ + handle_t *handle; + int err; + + if (ext4_verity_in_progress(inode)) + return -EBUSY; + + /* + * Since the file was opened readonly, we have to initialize the jbd + * inode and quotas here and not rely on ->open() doing it. This must + * be done before evicting the inline data. + */ + + err = ext4_inode_attach_jinode(inode); + if (err) + return err; + + err = dquot_initialize(inode); + if (err) + return err; + + err = ext4_convert_inline_data(inode); + if (err) + return err; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_warning_inode(inode, + "verity is only allowed on extent-based files"); + return -EOPNOTSUPP; + } + + /* + * ext4 uses the last allocated block to find the verity descriptor, so + * we must remove any other blocks past EOF which might confuse things. + */ + err = ext4_truncate(inode); + if (err) + return err; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_orphan_add(handle, inode); + if (err == 0) + ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + + ext4_journal_stop(handle); + return err; +} + +/* + * ext4 stores the verity descriptor beginning on the next filesystem block + * boundary after the Merkle tree. Then, the descriptor size is stored in the + * last 4 bytes of the last allocated filesystem block --- which is either the + * block in which the descriptor ends, or the next block after that if there + * weren't at least 4 bytes remaining. 
+ * + * We can't simply store the descriptor in an xattr because it *must* be + * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt + * xattrs. Also, if the descriptor includes a large signature blob it may be + * too large to store in an xattr without the EA_INODE feature. + */ +static int ext4_write_verity_descriptor(struct inode *inode, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) + + merkle_tree_size, i_blocksize(inode)); + const u64 desc_end = desc_pos + desc_size; + const __le32 desc_size_disk = cpu_to_le32(desc_size); + const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk), + i_blocksize(inode)) - + sizeof(desc_size_disk); + int err; + + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + return err; + + return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk), + desc_size_pos); +} + +static int ext4_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct inode *inode = file_inode(filp); + const int credits = 2; /* superblock and inode for ext4_orphan_del() */ + handle_t *handle; + int err = 0; + int err2; + + if (desc != NULL) { + /* Succeeded; write the verity descriptor. */ + err = ext4_write_verity_descriptor(inode, desc, desc_size, + merkle_tree_size); + + /* Write all pages before clearing VERITY_IN_PROGRESS. */ + if (!err) + err = filemap_write_and_wait(inode->i_mapping); + } + + /* If we failed, truncate anything we wrote past i_size. */ + if (desc == NULL || err) + ext4_truncate(inode); + + /* + * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and + * deleting the inode from the orphan list, even if something failed. + * If everything succeeded, we'll also set the verity bit in the same + * transaction. + */ + + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + + handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + return PTR_ERR(handle); + } + + err2 = ext4_orphan_del(handle, inode); + if (err2) + goto out_stop; + + if (desc != NULL && !err) { + struct ext4_iloc iloc; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + ext4_set_inode_flag(inode, EXT4_INODE_VERITY); + ext4_set_inode_flags(inode); + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } +out_stop: + ext4_journal_stop(handle); + return err ?: err2; +} + +static int ext4_get_verity_descriptor_location(struct inode *inode, + size_t *desc_size_ret, + u64 *desc_pos_ret) +{ + struct ext4_ext_path *path; + struct ext4_extent *last_extent; + u32 end_lblk; + u64 desc_size_pos; + __le32 desc_size_disk; + u32 desc_size; + u64 desc_pos; + int err; + + /* + * Descriptor size is in last 4 bytes of last allocated block. + * See ext4_write_verity_descriptor(). 
+ */ + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + EXT4_ERROR_INODE(inode, "verity file doesn't use extents"); + return -EFSCORRUPTED; + } + + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + last_extent = path[path->p_depth].p_ext; + if (!last_extent) { + EXT4_ERROR_INODE(inode, "verity file has no extents"); + ext4_ext_drop_refs(path); + kfree(path); + return -EFSCORRUPTED; + } + + end_lblk = le32_to_cpu(last_extent->ee_block) + + ext4_ext_get_actual_len(last_extent); + desc_size_pos = (u64)end_lblk << inode->i_blkbits; + ext4_ext_drop_refs(path); + kfree(path); + + if (desc_size_pos < sizeof(desc_size_disk)) + goto bad; + desc_size_pos -= sizeof(desc_size_disk); + + err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk), + desc_size_pos); + if (err) + return err; + desc_size = le32_to_cpu(desc_size_disk); + + /* + * The descriptor is stored just before the desc_size_disk, but starting + * on a filesystem block boundary. + */ + + if (desc_size > INT_MAX || desc_size > desc_size_pos) + goto bad; + + desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode)); + if (desc_pos < ext4_verity_metadata_pos(inode)) + goto bad; + + *desc_size_ret = desc_size; + *desc_pos_ret = desc_pos; + return 0; + +bad: + EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor"); + return -EFSCORRUPTED; +} + +static int ext4_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + size_t desc_size = 0; + u64 desc_pos = 0; + int err; + + err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos); + if (err) + return err; + + if (buf_size) { + if (desc_size > buf_size) + return -ERANGE; + err = pagecache_read(inode, buf, desc_size, desc_pos); + if (err) + return err; + } + return desc_size; +} + +static struct page *ext4_read_merkle_tree_page(struct inode *inode, + pgoff_t index) +{ + index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; + + return read_mapping_page(inode->i_mapping, index, NULL); +} + +static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 index, int log_blocksize) +{ + loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize); + + return pagecache_write(inode, buf, 1 << log_blocksize, pos); +} + +const struct fsverity_operations ext4_verityops = { + .begin_enable_verity = ext4_begin_enable_verity, + .end_enable_verity = ext4_end_enable_verity, + .get_verity_descriptor = ext4_get_verity_descriptor, + .read_merkle_tree_page = ext4_read_merkle_tree_page, + .write_merkle_tree_block = ext4_write_merkle_tree_block, +}; |
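The ext4_verityops table above is driven entirely by the generic FS_IOC_ENABLE_VERITY and FS_IOC_MEASURE_VERITY ioctls wired up in fs/ext4/ioctl.c earlier in this diff. A hedged userspace sketch of the round trip, assuming a v5.4+ kernel built with CONFIG_FS_VERITY, an ext4 filesystem with the verity feature enabled (mkfs.ext4 -O verity), and, per the super.c check above, a block size equal to the page size:

/* cc -o verity-demo verity-demo.c ; run against a closed, fully written file */
#include <fcntl.h>
#include <linux/fsverity.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{
	struct fsverity_enable_arg arg;
	struct fsverity_digest *d;
	int fd, i;

	if (argc != 2) {
		fprintf(stderr, "usage: %s FILE\n", argv[0]);
		return 1;
	}

	/* The fd must be readonly; ext4_begin_enable_verity() relies on it. */
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&arg, 0, sizeof(arg));
	arg.version = 1;
	arg.hash_algorithm = FS_VERITY_HASH_ALG_SHA256;
	arg.block_size = 4096;	/* assumed; v5.4 requires tree block == page size */

	if (ioctl(fd, FS_IOC_ENABLE_VERITY, &arg) != 0) {
		perror("FS_IOC_ENABLE_VERITY");
		return 1;
	}

	/* Read back the measurement derived from the Merkle tree root. */
	d = malloc(sizeof(*d) + 32);
	if (!d)
		return 1;
	memset(d, 0, sizeof(*d) + 32);
	d->digest_size = 32;	/* SHA-256 digest length */

	if (ioctl(fd, FS_IOC_MEASURE_VERITY, d) != 0) {
		perror("FS_IOC_MEASURE_VERITY");
		return 1;
	}

	printf("sha256:");
	for (i = 0; i < d->digest_size; i++)
		printf("%02x", d->digest[i]);
	putchar('\n');
	free(d);
	return 0;
}

To make the on-disk layout implemented by ext4_write_verity_descriptor() and ext4_get_verity_descriptor_location() concrete (illustrative numbers, 4096-byte blocks): for i_size = 1000 the metadata area begins at round_up(1000, 65536) = 65536; with a 12000-byte Merkle tree, desc_pos = round_up(65536 + 12000, 4096) = 77824; a 100-byte descriptor then ends at 77924, and its __le32 size is stored at round_up(77924 + 4, 4096) - 4 = 81916, the last 4 bytes of the last allocated block, which is exactly where ext4_get_verity_descriptor_location() starts looking. Once FS_IOC_ENABLE_VERITY succeeds the state is permanent: the inode gains S_VERITY via ext4_set_inode_flags(), and subsequent opens for writing are refused.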